From 1ca750471a89d183ad20610cd0fe4db1e8d0f25b Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Sun, 10 Apr 2016 11:28:20 +0200 Subject: [PATCH 01/70] added cholesky benchmarks to Makefile for ESSL --- benchmark/Makefile | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/benchmark/Makefile b/benchmark/Makefile index 8166f3863..0a1359254 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -261,7 +261,8 @@ endif essl :: sgemm.essl strmm.essl dgemm.essl dtrmm.essl \ cgemm.essl ctrmm.essl zgemm.essl ztrmm.essl \ - slinpack.essl clinpack.essl dlinpack.essl zlinpack.essl + slinpack.essl clinpack.essl dlinpack.essl zlinpack.essl \ + scholesky.essl ccholesky.essl dcholesky.essl zcholesky.essl veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ scholesky.veclib dcholesky.veclib ccholesky.veclib zcholesky.veclib \ @@ -393,6 +394,9 @@ scholesky.mkl : scholesky.$(SUFFIX) scholesky.veclib : scholesky.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +scholesky.essl : scholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Dcholesky ################################################### dcholesky.goto : dcholesky.$(SUFFIX) ../$(LIBNAME) @@ -410,6 +414,9 @@ dcholesky.mkl : dcholesky.$(SUFFIX) dcholesky.veclib : dcholesky.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +dcholesky.essl : dcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Ccholesky ################################################### ccholesky.goto : ccholesky.$(SUFFIX) ../$(LIBNAME) @@ -427,6 +434,9 @@ ccholesky.mkl : ccholesky.$(SUFFIX) ccholesky.veclib : ccholesky.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +ccholesky.essl : ccholesky.$(SUFFIX) + -$(CC) 
$(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Zcholesky ################################################### @@ -445,6 +455,9 @@ zcholesky.mkl : zcholesky.$(SUFFIX) zcholesky.veclib : zcholesky.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +zcholesky.essl : zcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Sgemm #################################################### sgemm.goto : sgemm.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm From 8037d78eed1c1685a5ded60c388756c2eda6357a Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Mon, 11 Apr 2016 11:21:36 +0200 Subject: [PATCH 02/70] bugfix for arm scal.c and zscal.c --- kernel/arm/scal.c | 4 ++++ kernel/arm/zscal.c | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/kernel/arm/scal.c b/kernel/arm/scal.c index 91ca76569..4ef49e293 100644 --- a/kernel/arm/scal.c +++ b/kernel/arm/scal.c @@ -40,6 +40,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS { BLASLONG i=0,j=0; + if ( (n <= 0) || (inc_x <= 0)) + return(0); + + while(j < n) { diff --git a/kernel/arm/zscal.c b/kernel/arm/zscal.c index f543edc04..0521aaa0b 100644 --- a/kernel/arm/zscal.c +++ b/kernel/arm/zscal.c @@ -43,6 +43,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F BLASLONG ip = 0; FLOAT temp; + if ( (n <= 0) || (inc_x <= 0)) + return(0); + + inc_x2 = 2 * inc_x; for ( i=0; i Date: Tue, 12 Apr 2016 15:32:10 -0400 Subject: [PATCH 03/70] Bump to 0.2.19.dev. 
--- CMakeLists.txt | 2 +- Makefile.rule | 2 +- appveyor.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ead63bff8..f5dfb8187 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.4) project(OpenBLAS) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 2) -set(OpenBLAS_PATCH_VERSION 18) +set(OpenBLAS_PATCH_VERSION 19.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") enable_language(ASM) diff --git a/Makefile.rule b/Makefile.rule index d8db6102c..55c335311 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.2.18 +VERSION = 0.2.19.dev # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library diff --git a/appveyor.yml b/appveyor.yml index 5360a9ef9..c9d8e47ac 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -1,4 +1,4 @@ -version: 0.2.18.{build} +version: 0.2.19.{build} #environment: From 3c6294ca3d4ff28cc4f3cba7bf98ce9e175bcb7c Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Tue, 19 Apr 2016 16:08:54 +0200 Subject: [PATCH 04/70] added optimized sgemm_tcopy for power8 --- Makefile.power | 1 - kernel/power/KERNEL.POWER8 | 2 +- kernel/power/sgemm_tcopy_16_power8.S | 176 +++++++++ kernel/power/sgemm_tcopy_logic_16_power8.S | 268 ++++++++++++++ kernel/power/sgemm_tcopy_macros_16_power8.S | 381 ++++++++++++++++++++ param.h | 7 +- 6 files changed, 827 insertions(+), 8 deletions(-) create mode 100644 kernel/power/sgemm_tcopy_16_power8.S create mode 100644 kernel/power/sgemm_tcopy_logic_16_power8.S create mode 100644 kernel/power/sgemm_tcopy_macros_16_power8.S diff --git a/Makefile.power b/Makefile.power index 7e2b47386..cc138e60a 100644 --- a/Makefile.power +++ b/Makefile.power @@ -1,4 +1,3 @@ -# CCOMMON_OPT += -DALLOC_SHM FLAMEPATH = $(HOME)/flame/lib diff 
--git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index b37a4213b..fa5cc5e63 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -10,7 +10,7 @@ ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S SGEMMKERNEL = sgemm_kernel_16x8_power8.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c -SGEMMITCOPY = ../generic/gemm_tcopy_16.c +SGEMMITCOPY = sgemm_tcopy_16_power8.S SGEMMONCOPY = ../generic/gemm_ncopy_8.c SGEMMOTCOPY = ../generic/gemm_tcopy_8.c SGEMMINCOPYOBJ = sgemm_incopy.o diff --git a/kernel/power/sgemm_tcopy_16_power8.S b/kernel/power/sgemm_tcopy_16_power8.S new file mode 100644 index 000000000..d690aab8e --- /dev/null +++ b/kernel/power/sgemm_tcopy_16_power8.S @@ -0,0 +1,176 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#define M r3 +#define N r4 +#define A r5 +#define LDA r6 +#define B r7 + +#define A0 r8 +#define A1 r9 +#define A2 r10 +#define A3 r11 + +#define J r12 + +#define PREA r14 +#define PREB r15 +#define BO r16 +#define B8 r17 +#define B4 r18 +#define B2 r19 +#define B1 r20 +#define o4 r21 +#define T2 r22 +#define I r23 +#define o16 r24 +#define o32 r25 +#define o48 r26 +#define B16 r29 +#define M16 r30 +#define T1 r31 + +#define o0 0 + +#include "sgemm_tcopy_macros_16_power8.S" + +#define STACKSIZE 384 + + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + cmpwi cr0, M, 0 + ble- L999 + cmpwi cr0, N, 0 + ble- L999 + + slwi LDA, LDA, 
BASE_SHIFT + slwi M16, M, 4 + BASE_SHIFT + + li T1, -16 + li T2, -8 + li PREA, -4 + li PREB, -2 + + and B8, N, T1 + and B4, N, T2 + and B2, N, PREA + and B1, N, PREB + + mullw B8, B8, M + mullw B4, B4, M + mullw B2, B2, M + mullw B1, B1, M + + slwi B8, B8, BASE_SHIFT + slwi B4, B4, BASE_SHIFT + slwi B2, B2, BASE_SHIFT + slwi B1, B1, BASE_SHIFT + + add B8, B8, B + add B4, B4, B + add B2, B2, B + add B1, B1, B + + li PREA, 384 + li PREB, 384 + + li o4, 4 + li o16, 16 + li o32, 32 + li o48, 48 + +#include "sgemm_tcopy_logic_16_power8.S" + +L999: + + li r3, 0 + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + addi SP, SP, STACKSIZE + + blr + EPILOGUE + + diff --git a/kernel/power/sgemm_tcopy_logic_16_power8.S b/kernel/power/sgemm_tcopy_logic_16_power8.S new file mode 100644 index 000000000..c0969eec6 --- /dev/null +++ b/kernel/power/sgemm_tcopy_logic_16_power8.S @@ -0,0 +1,268 @@ + srawi. I, M, 2 + ble SCOPYT_L2_BEGIN + + +SCOPYT_L4_BEGIN: + + mr A0, A + add A1, A0, LDA + add A2, A1, LDA + add A3, A2, LDA + add A, A3, LDA + mr B16, B + addi B, B, 64*SIZE + + sradi. J, N, 4 + ble SCOPYT_L4x8_BEGIN + + mr BO, B16 + +SCOPYT_L4x16_LOOP: + + COPY_4x16 + + addi A0, A0, 16*SIZE + addi A1, A1, 16*SIZE + addi A2, A2, 16*SIZE + addi A3, A3, 16*SIZE + add BO, BO, M16 + + addic. J, J, -1 + bgt SCOPYT_L4x16_LOOP + +SCOPYT_L4x8_BEGIN: + + andi. T1, N, 8 + ble SCOPYT_L4x4_BEGIN + + mr BO, B8 + + COPY_4x8 + + addi A0, A0, 8*SIZE + addi A1, A1, 8*SIZE + addi A2, A2, 8*SIZE + addi A3, A3, 8*SIZE + + addi B8, B8, 32*SIZE + +SCOPYT_L4x4_BEGIN: + + andi. 
T1, N, 4 + ble SCOPYT_L4x2_BEGIN + + mr BO, B4 + + COPY_4x4 + + addi A0, A0, 4*SIZE + addi A1, A1, 4*SIZE + addi A2, A2, 4*SIZE + addi A3, A3, 4*SIZE + + addi B4, B4, 16*SIZE + +SCOPYT_L4x2_BEGIN: + + andi. T1, N, 2 + ble SCOPYT_L4x1_BEGIN + + mr BO, B2 + + COPY_4x2 + + addi A0, A0, 2*SIZE + addi A1, A1, 2*SIZE + addi A2, A2, 2*SIZE + addi A3, A3, 2*SIZE + + addi B2, B2, 8*SIZE + +SCOPYT_L4x1_BEGIN: + + andi. T1, N, 1 + ble SCOPYT_L4_END + + mr BO, B1 + + COPY_4x1 + + addi A0, A0, 1*SIZE + addi A1, A1, 1*SIZE + addi A2, A2, 1*SIZE + addi A3, A3, 1*SIZE + + addi B1, B1, 4*SIZE + +SCOPYT_L4_END: + + addic. I, I, -1 + bgt SCOPYT_L4_BEGIN + + + +SCOPYT_L2_BEGIN: + + andi. T1, M, 2 + ble SCOPYT_L1_BEGIN + + mr A0, A + add A1, A0, LDA + add A, A1, LDA + mr B16, B + addi B, B, 32*SIZE + + sradi. J, N, 4 + ble SCOPYT_L2x8_BEGIN + + mr BO, B16 + +SCOPYT_L2x16_LOOP: + + COPY_2x16 + + addi A0, A0, 16*SIZE + addi A1, A1, 16*SIZE + add BO, BO, M16 + + addic. J, J, -1 + bgt SCOPYT_L2x16_LOOP + +SCOPYT_L2x8_BEGIN: + + andi. T1, N, 8 + ble SCOPYT_L2x4_BEGIN + + mr BO, B8 + + COPY_2x8 + + addi A0, A0, 8*SIZE + addi A1, A1, 8*SIZE + + addi B8, B8, 16*SIZE + +SCOPYT_L2x4_BEGIN: + + andi. T1, N, 4 + ble SCOPYT_L2x2_BEGIN + + mr BO, B4 + + COPY_2x4 + + addi A0, A0, 4*SIZE + addi A1, A1, 4*SIZE + + addi B4, B4, 8*SIZE + +SCOPYT_L2x2_BEGIN: + + andi. T1, N, 2 + ble SCOPYT_L2x1_BEGIN + + mr BO, B2 + + COPY_2x2 + + addi A0, A0, 2*SIZE + addi A1, A1, 2*SIZE + + addi B2, B2, 4*SIZE + +SCOPYT_L2x1_BEGIN: + + andi. T1, N, 1 + ble SCOPYT_L2_END + + mr BO, B1 + + COPY_2x1 + + addi A0, A0, 1*SIZE + addi A1, A1, 1*SIZE + + addi B1, B1, 2*SIZE + +SCOPYT_L2_END: + + +SCOPYT_L1_BEGIN: + + andi. T1, M, 1 + ble L999 + + mr A0, A + add A, A0, LDA + mr B16, B + addi B, B, 16*SIZE + + sradi. J, N, 4 + ble SCOPYT_L1x8_BEGIN + + mr BO, B16 + +SCOPYT_L1x16_LOOP: + + COPY_1x16 + + addi A0, A0, 16*SIZE + add BO, BO, M16 + + addic. J, J, -1 + bgt SCOPYT_L1x16_LOOP + +SCOPYT_L1x8_BEGIN: + + andi. 
T1, N, 8 + ble SCOPYT_L1x4_BEGIN + + mr BO, B8 + + COPY_1x8 + + addi A0, A0, 8*SIZE + + addi B8, B8, 8*SIZE + +SCOPYT_L1x4_BEGIN: + + andi. T1, N, 4 + ble SCOPYT_L1x2_BEGIN + + mr BO, B4 + + COPY_1x4 + + addi A0, A0, 4*SIZE + + addi B4, B4, 4*SIZE + +SCOPYT_L1x2_BEGIN: + + andi. T1, N, 2 + ble SCOPYT_L1x1_BEGIN + + mr BO, B2 + + COPY_1x2 + + addi A0, A0, 2*SIZE + + addi B2, B2, 2*SIZE + +SCOPYT_L1x1_BEGIN: + + andi. T1, N, 1 + ble SCOPYT_L1_END + + mr BO, B1 + + COPY_1x1 + + addi A0, A0, 1*SIZE + + addi B1, B1, 1*SIZE + +SCOPYT_L1_END: + diff --git a/kernel/power/sgemm_tcopy_macros_16_power8.S b/kernel/power/sgemm_tcopy_macros_16_power8.S new file mode 100644 index 000000000..36c4593b5 --- /dev/null +++ b/kernel/power/sgemm_tcopy_macros_16_power8.S @@ -0,0 +1,381 @@ + +/********************************************************************************************** +* Macros for N=4 and M=16 +**********************************************************************************************/ + +.macro COPY_4x16 + + lxvw4x vs32, o0, A0 + lxvw4x vs33, o16, A0 + lxvw4x vs34, o32, A0 + lxvw4x vs35, o48, A0 + + lxvw4x vs36, o0, A1 + lxvw4x vs37, o16, A1 + lxvw4x vs38, o32, A1 + lxvw4x vs39, o48, A1 + + lxvw4x vs40, o0, A2 + lxvw4x vs41, o16, A2 + lxvw4x vs42, o32, A2 + lxvw4x vs43, o48, A2 + + lxvw4x vs44, o0, A3 + lxvw4x vs45, o16, A3 + lxvw4x vs46, o32, A3 + lxvw4x vs47, o48, A3 + + mr T1, BO + + stxvw4x vs32, o0, T1 + stxvw4x vs33, o16, T1 + stxvw4x vs34, o32, T1 + stxvw4x vs35, o48, T1 + + addi T1, T1, 64 + + stxvw4x vs36, o0, T1 + stxvw4x vs37, o16, T1 + stxvw4x vs38, o32, T1 + stxvw4x vs39, o48, T1 + + addi T1, T1, 64 + + stxvw4x vs40, o0, T1 + stxvw4x vs41, o16, T1 + stxvw4x vs42, o32, T1 + stxvw4x vs43, o48, T1 + + addi T1, T1, 64 + + stxvw4x vs44, o0, T1 + stxvw4x vs45, o16, T1 + stxvw4x vs46, o32, T1 + stxvw4x vs47, o48, T1 + +.endm + +/********************************************************************************************** +* Macros for N=4 and M=8 
+**********************************************************************************************/ + +.macro COPY_4x8 + + lxvw4x vs32, o0, A0 + lxvw4x vs33, o16, A0 + + lxvw4x vs34, o0, A1 + lxvw4x vs35, o16, A1 + + lxvw4x vs36, o0, A2 + lxvw4x vs37, o16, A2 + + lxvw4x vs38, o0, A3 + lxvw4x vs39, o16, A3 + + mr T1, BO + + stxvw4x vs32, o0, T1 + stxvw4x vs33, o16, T1 + + stxvw4x vs34, o32, T1 + stxvw4x vs35, o48, T1 + + addi T1, T1, 64 + + stxvw4x vs36, o0, T1 + stxvw4x vs37, o16, T1 + + stxvw4x vs38, o32, T1 + stxvw4x vs39, o48, T1 + +.endm + +/********************************************************************************************** +* Macros for N=4 and M=4 +**********************************************************************************************/ + +.macro COPY_4x4 + + lxvw4x vs32, o0, A0 + + lxvw4x vs33, o0, A1 + + lxvw4x vs34, o0, A2 + + lxvw4x vs35, o0, A3 + + mr T1, BO + + stxvw4x vs32, o0, T1 + + stxvw4x vs33, o16, T1 + + stxvw4x vs34, o32, T1 + + stxvw4x vs35, o48, T1 + +.endm + +/********************************************************************************************** +* Macros for N=4 and M=2 +**********************************************************************************************/ + +.macro COPY_4x2 + + lxsspx vs32, o0, A0 + lxsspx vs33, o4, A0 + + lxsspx vs34, o0, A1 + lxsspx vs35, o4, A1 + + lxsspx vs36, o0, A2 + lxsspx vs37, o4, A2 + + lxsspx vs38, o0, A3 + lxsspx vs39, o4, A3 + + mr T1, BO + + stxsspx vs32, o0, T1 + stxsspx vs33, o4, T1 + + addi T1, T1, 8 + + stxsspx vs34, o0, T1 + stxsspx vs35, o4, T1 + + addi T1, T1, 8 + + stxsspx vs36, o0, T1 + stxsspx vs37, o4, T1 + + addi T1, T1, 8 + + stxsspx vs38, o0, T1 + stxsspx vs39, o4, T1 + +.endm + +/********************************************************************************************** +* Macros for N=4 and M=1 +**********************************************************************************************/ + +.macro COPY_4x1 + + lxsspx vs32, o0, A0 + + lxsspx vs33, o0, A1 + + 
lxsspx vs34, o0, A2 + + lxsspx vs35, o0, A3 + + mr T1, BO + + stxsspx vs32, o0, T1 + + stxsspx vs33, o4, T1 + + addi T1, T1, 8 + + stxsspx vs34, o0, T1 + + stxsspx vs35, o4, T1 + +.endm + +/********************************************************************************************** +* Macros for N=2 and M=16 +**********************************************************************************************/ + +.macro COPY_2x16 + + lxvw4x vs32, o0, A0 + lxvw4x vs33, o16, A0 + lxvw4x vs34, o32, A0 + lxvw4x vs35, o48, A0 + + lxvw4x vs36, o0, A1 + lxvw4x vs37, o16, A1 + lxvw4x vs38, o32, A1 + lxvw4x vs39, o48, A1 + + mr T1, BO + + stxvw4x vs32, o0, T1 + stxvw4x vs33, o16, T1 + stxvw4x vs34, o32, T1 + stxvw4x vs35, o48, T1 + + addi T1, T1, 64 + + stxvw4x vs36, o0, T1 + stxvw4x vs37, o16, T1 + stxvw4x vs38, o32, T1 + stxvw4x vs39, o48, T1 + +.endm + +/********************************************************************************************** +* Macros for N=2 and M=8 +**********************************************************************************************/ + +.macro COPY_2x8 + + lxvw4x vs32, o0, A0 + lxvw4x vs33, o16, A0 + + lxvw4x vs34, o0, A1 + lxvw4x vs35, o16, A1 + + mr T1, BO + + stxvw4x vs32, o0, T1 + stxvw4x vs33, o16, T1 + + stxvw4x vs34, o32, T1 + stxvw4x vs35, o48, T1 + +.endm + +/********************************************************************************************** +* Macros for N=2 and M=4 +**********************************************************************************************/ + +.macro COPY_2x4 + + lxvw4x vs32, o0, A0 + + lxvw4x vs33, o0, A1 + + mr T1, BO + + stxvw4x vs32, o0, T1 + + stxvw4x vs33, o16, T1 + +.endm + +/********************************************************************************************** +* Macros for N=2 and M=2 +**********************************************************************************************/ + +.macro COPY_2x2 + + lxsspx vs32, o0, A0 + lxsspx vs33, o4, A0 + + lxsspx vs34, o0, A1 + lxsspx vs35, 
o4, A1 + + mr T1, BO + + stxsspx vs32, o0, T1 + stxsspx vs33, o4, T1 + + addi T1, T1, 8 + + stxsspx vs34, o0, T1 + stxsspx vs35, o4, T1 + +.endm + +/********************************************************************************************** +* Macros for N=2 and M=1 +**********************************************************************************************/ + +.macro COPY_2x1 + + lxsspx vs32, o0, A0 + + lxsspx vs33, o0, A1 + + mr T1, BO + + stxsspx vs32, o0, T1 + + stxsspx vs33, o4, T1 + +.endm + +/********************************************************************************************** +* Macros for N=1 and M=16 +**********************************************************************************************/ + +.macro COPY_1x16 + + lxvw4x vs32, o0, A0 + lxvw4x vs33, o16, A0 + lxvw4x vs34, o32, A0 + lxvw4x vs35, o48, A0 + + mr T1, BO + + stxvw4x vs32, o0, T1 + stxvw4x vs33, o16, T1 + stxvw4x vs34, o32, T1 + stxvw4x vs35, o48, T1 + +.endm + +/********************************************************************************************** +* Macros for N=1 and M=8 +**********************************************************************************************/ + +.macro COPY_1x8 + + lxvw4x vs32, o0, A0 + lxvw4x vs33, o16, A0 + + mr T1, BO + + stxvw4x vs32, o0, T1 + stxvw4x vs33, o16, T1 + +.endm + +/********************************************************************************************** +* Macros for N=1 and M=4 +**********************************************************************************************/ + +.macro COPY_1x4 + + lxvw4x vs32, o0, A0 + + mr T1, BO + + stxvw4x vs32, o0, T1 + +.endm + +/********************************************************************************************** +* Macros for N=1 and M=2 +**********************************************************************************************/ + +.macro COPY_1x2 + + lxsspx vs32, o0, A0 + lxsspx vs33, o4, A0 + + mr T1, BO + + stxsspx vs32, o0, T1 + stxsspx vs33, o4, T1 + +.endm + 
+/********************************************************************************************** +* Macros for N=1 and M=1 +**********************************************************************************************/ + +.macro COPY_1x1 + + lxsspx vs32, o0, A0 + + mr T1, BO + + stxsspx vs32, o0, T1 + +.endm + diff --git a/param.h b/param.h index a6ead4b64..8ecc812dc 100644 --- a/param.h +++ b/param.h @@ -1964,7 +1964,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SNUMOPT 16 #define DNUMOPT 8 -#define GEMM_DEFAULT_OFFSET_A 4096 +#define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 4096 #define GEMM_DEFAULT_ALIGN 0x03fffUL @@ -1987,11 +1987,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CGEMM_DEFAULT_Q 720 #define ZGEMM_DEFAULT_Q 720 -#define SGEMM_DEFAULT_R 21600 -#define DGEMM_DEFAULT_R 14400 -#define CGEMM_DEFAULT_R 16200 -#define ZGEMM_DEFAULT_R 21600 - #define SYMV_P 8 #endif From 0001260f4bc4c8bb5509ec29e002ad6f42e84345 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Wed, 20 Apr 2016 13:06:38 +0200 Subject: [PATCH 05/70] optimized sgemm --- kernel/power/sgemm_logic_16x8_power8.S | 119 ++++++-------- kernel/power/sgemm_macros_16x8_power8.S | 177 +++++++++++++++++---- kernel/power/sgemm_tcopy_16_power8.S | 2 +- kernel/power/sgemm_tcopy_logic_16_power8.S | 20 +++ 4 files changed, 210 insertions(+), 108 deletions(-) diff --git a/kernel/power/sgemm_logic_16x8_power8.S b/kernel/power/sgemm_logic_16x8_power8.S index 06bb79ea3..6ba999024 100644 --- a/kernel/power/sgemm_logic_16x8_power8.S +++ b/kernel/power/sgemm_logic_16x8_power8.S @@ -1,38 +1,3 @@ -/*************************************************************************** -Copyright (c) 2013-2016, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. 
Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -/************************************************************************************** -* 2016/04/02 Werner Saar (wernsaar@googlemail.com) -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* LAPACK-TEST : OK -**************************************************************************************/ - srawi. J, N, 3 ble SGEMM_L8_END @@ -40,35 +5,48 @@ SGEMM_L8_BEGIN: mr BO, B mr BBO, BBUFFER - slwi T1, K, 3 + srawi. T1, K, 2 + ble SGEMM_L8_COPYB1 + + +SGEMM_L8_COPYB4: -SGEMM_L8_COPYB: + dcbt BO, PRE dcbtst BBO, PRE + COPYB_4x8 + addic. 
T1, T1, -1 + ble SGEMM_L8_COPYB1 - lxvw4x vs3, o0, BO - lxvw4x vs11, o16, BO - xxspltw vs4, vs3, 0 - xxspltw vs5, vs3, 1 - xxspltw vs6, vs3, 2 - xxspltw vs7, vs3, 3 - xxspltw vs12, vs11, 0 - xxspltw vs13, vs11, 1 - xxspltw vs14, vs11, 2 - xxspltw vs15, vs11, 3 - stxvw4x vs4, o0, BBO - stxvw4x vs5, o16, BBO - stxvw4x vs6, o32, BBO - stxvw4x vs7, o48, BBO - addi BO, BO, 32 - addi BBO, BBO, 64 - stxvw4x vs12, o0, BBO - stxvw4x vs13, o16, BBO - stxvw4x vs14, o32, BBO - stxvw4x vs15, o48, BBO - addic. T1, T1, -8 - addi BBO, BBO, 64 + dcbtst BBO, PRE + COPYB_4x8 + addic. T1, T1, -1 + ble SGEMM_L8_COPYB1 + + dcbtst BBO, PRE + COPYB_4x8 + addic. T1, T1, -1 + ble SGEMM_L8_COPYB1 - bge SGEMM_L8_COPYB + dcbtst BBO, PRE + COPYB_4x8 + addic. T1, T1, -1 + + bgt SGEMM_L8_COPYB4 + +SGEMM_L8_COPYB1: + + andi. T1, K, 3 + ble SGEMM_L8_COPYB_END + +SGEMM_L8_COPYB1_LOOP: + + + COPYB_1x8 + addic. T1, T1, -1 + + bgt SGEMM_L8_COPYB1_LOOP + +SGEMM_L8_COPYB_END: mr CO, C mr AO, A @@ -93,24 +71,24 @@ SGEMM_L8x16_LOOP_START: LOAD8x16_1 dcbt BO, PRE KERNEL8x16_I1 - dcbt BO, PRE dcbt AO, PRE + dcbt BO, PRE KERNEL8x16_2 dcbt BO, PRE KERNEL8x16_1 - dcbt BO, PRE dcbt AO, PRE + dcbt BO, PRE KERNEL8x16_2 dcbt BO, PRE KERNEL8x16_1 - dcbt BO, PRE dcbt AO, PRE + dcbt BO, PRE KERNEL8x16_2 dcbt BO, PRE KERNEL8x16_1 - dcbt BO, PRE dcbt AO, PRE + dcbt BO, PRE KERNEL8x16_2 addic. L, L, -2 @@ -122,24 +100,24 @@ SGEMM_L8x16_LOOP: dcbt BO, PRE KERNEL8x16_1 - dcbt BO, PRE dcbt AO, PRE + dcbt BO, PRE KERNEL8x16_2 dcbt BO, PRE KERNEL8x16_1 - dcbt BO, PRE dcbt AO, PRE + dcbt BO, PRE KERNEL8x16_2 dcbt BO, PRE KERNEL8x16_1 - dcbt BO, PRE dcbt AO, PRE + dcbt BO, PRE KERNEL8x16_2 dcbt BO, PRE KERNEL8x16_1 - dcbt BO, PRE dcbt AO, PRE + dcbt BO, PRE KERNEL8x16_2 addic. 
L, L, -1 @@ -149,18 +127,15 @@ SGEMM_L8x16_LOOP_END: dcbt BO, PRE KERNEL8x16_1 - dcbt BO, PRE dcbt AO, PRE + dcbt BO, PRE KERNEL8x16_2 dcbt BO, PRE KERNEL8x16_1 - dcbt BO, PRE dcbt AO, PRE KERNEL8x16_2 - dcbt BO, PRE KERNEL8x16_1 - dcbt BO, PRE dcbt AO, PRE KERNEL8x16_2 KERNEL8x16_1 diff --git a/kernel/power/sgemm_macros_16x8_power8.S b/kernel/power/sgemm_macros_16x8_power8.S index 71dc52979..94e7d7d02 100644 --- a/kernel/power/sgemm_macros_16x8_power8.S +++ b/kernel/power/sgemm_macros_16x8_power8.S @@ -1,38 +1,3 @@ -/*************************************************************************** -Copyright (c) 2013-2016, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -/************************************************************************************** -* 2016/04/02 Werner Saar (wernsaar@googlemail.com) -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* LAPACK-TEST : OK -**************************************************************************************/ - /********************************************************************************************** * Macros for N=8 and M=16 @@ -5886,3 +5851,145 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm + + + + +.macro COPYB_4x8 + + + lxvw4x vs5, o0, BO + xxspltw vs6, vs5, 0 + xxspltw vs7, vs5, 1 + xxspltw vs8, vs5, 2 + xxspltw vs9, vs5, 3 + + lxvw4x vs10, o16, BO + xxspltw vs11, vs10, 0 + xxspltw vs12, vs10, 1 + xxspltw vs13, vs10, 2 + xxspltw vs14, vs10, 3 + + lxvw4x vs15, o32, BO + xxspltw vs16, vs15, 0 + xxspltw vs17, vs15, 1 + xxspltw vs18, vs15, 2 + xxspltw vs19, vs15, 3 + + lxvw4x vs20, o48, BO + xxspltw vs21, vs20, 0 + xxspltw vs22, vs20, 1 + xxspltw vs23, vs20, 2 + xxspltw vs24, vs20, 3 + + addi BO, BO, 64 + lxvw4x vs35, o0, BO + xxspltw vs36, vs35, 0 + xxspltw vs37, vs35, 1 + xxspltw vs38, vs35, 2 + xxspltw vs39, vs35, 3 + + lxvw4x vs40, o16, BO + xxspltw vs41, vs40, 0 + xxspltw vs42, vs40, 1 + xxspltw vs43, vs40, 2 + xxspltw vs44, vs40, 3 + + lxvw4x vs45, o32, BO + xxspltw vs46, vs45, 0 + xxspltw vs47, vs45, 1 + xxspltw vs48, vs45, 2 + xxspltw vs49, vs45, 3 + + lxvw4x vs50, o48, BO + xxspltw vs51, vs50, 0 + xxspltw vs52, vs50, 1 + xxspltw vs53, vs50, 2 + xxspltw vs54, vs50, 3 + + addi BO, BO, 64 + + + stxvw4x vs6, o0, BBO + stxvw4x vs7, o16, BBO + stxvw4x vs8, o32, BBO + stxvw4x vs9, o48, BBO + + addi BBO, BBO, 64 + stxvw4x vs11, o0, BBO + stxvw4x vs12, o16, BBO + stxvw4x vs13, o32, BBO + stxvw4x vs14, o48, BBO + + addi BBO, BBO, 64 + stxvw4x vs16, o0, BBO + stxvw4x vs17, o16, BBO + stxvw4x vs18, o32, BBO + stxvw4x vs19, o48, BBO + + addi BBO, BBO, 64 + stxvw4x vs21, o0, BBO + stxvw4x vs22, o16, BBO + stxvw4x vs23, o32, BBO + stxvw4x vs24, o48, BBO + + addi BBO, BBO, 64 + stxvw4x vs36, o0, BBO + stxvw4x vs37, o16, BBO + stxvw4x vs38, o32, BBO + stxvw4x vs39, o48, BBO + + addi BBO, BBO, 64 + stxvw4x vs41, o0, BBO + stxvw4x vs42, o16, BBO + stxvw4x vs43, o32, BBO + stxvw4x vs44, o48, BBO + + addi BBO, BBO, 64 + stxvw4x vs46, o0, BBO + stxvw4x vs47, o16, BBO + stxvw4x vs48, o32, BBO + stxvw4x vs49, o48, BBO + + addi BBO, BBO, 64 + stxvw4x vs51, o0, BBO + stxvw4x vs52, o16, BBO + stxvw4x vs53, o32, BBO + stxvw4x vs54, o48, BBO + + addi BBO, BBO, 64 
+.endm + + +.macro COPYB_1x8 + + + lxvw4x vs5, o0, BO + xxspltw vs6, vs5, 0 + xxspltw vs7, vs5, 1 + xxspltw vs8, vs5, 2 + xxspltw vs9, vs5, 3 + + lxvw4x vs10, o16, BO + xxspltw vs11, vs10, 0 + xxspltw vs12, vs10, 1 + xxspltw vs13, vs10, 2 + xxspltw vs14, vs10, 3 + + + addi BO, BO, 32 + + stxvw4x vs6, o0, BBO + stxvw4x vs7, o16, BBO + stxvw4x vs8, o32, BBO + stxvw4x vs9, o48, BBO + + addi BBO, BBO, 64 + stxvw4x vs11, o0, BBO + stxvw4x vs12, o16, BBO + stxvw4x vs13, o32, BBO + stxvw4x vs14, o48, BBO + + addi BBO, BBO, 64 +.endm + diff --git a/kernel/power/sgemm_tcopy_16_power8.S b/kernel/power/sgemm_tcopy_16_power8.S index d690aab8e..c31784d6f 100644 --- a/kernel/power/sgemm_tcopy_16_power8.S +++ b/kernel/power/sgemm_tcopy_16_power8.S @@ -136,7 +136,7 @@ add B1, B1, B li PREA, 384 - li PREB, 384 + addi PREB, M16, 128 li o4, 4 li o16, 16 diff --git a/kernel/power/sgemm_tcopy_logic_16_power8.S b/kernel/power/sgemm_tcopy_logic_16_power8.S index c0969eec6..5a715c8dd 100644 --- a/kernel/power/sgemm_tcopy_logic_16_power8.S +++ b/kernel/power/sgemm_tcopy_logic_16_power8.S @@ -19,6 +19,26 @@ SCOPYT_L4_BEGIN: SCOPYT_L4x16_LOOP: + dcbtst BO, M16 + dcbtst BO, PREB + dcbt A0, PREA + dcbt A1, PREA + dcbt A2, PREA + dcbt A3, PREA + COPY_4x16 + + addi A0, A0, 16*SIZE + addi A1, A1, 16*SIZE + addi A2, A2, 16*SIZE + addi A3, A3, 16*SIZE + add BO, BO, M16 + + addic. 
J, J, -1 + ble SCOPYT_L4x8_BEGIN + + + dcbtst BO, M16 + dcbtst BO, PREB COPY_4x16 addi A0, A0, 16*SIZE From 391584af858b25dfb240ea92d1adc526af03773b Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Wed, 20 Apr 2016 15:28:28 +0200 Subject: [PATCH 06/70] optimized Makefile.power for POWER8 --- Makefile.power | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile.power b/Makefile.power index cc138e60a..ff7d8edb1 100644 --- a/Makefile.power +++ b/Makefile.power @@ -1,3 +1,7 @@ +ifeq ($(CORE), POWER8) +COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -DALLOC_SHM -fno-fast-math +FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fast-math +endif FLAMEPATH = $(HOME)/flame/lib From 9276c9012ffea6fef81aaa7349596967b5564a64 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Thu, 21 Apr 2016 11:37:57 +0200 Subject: [PATCH 07/70] Optimized sgemm and dgemm and tested again. --- Makefile.rule | 3 + common_power.h | 5 + kernel/power/KERNEL.POWER8 | 2 +- kernel/power/dgemm_kernel_16x4_power8.S | 2 +- kernel/power/dgemm_tcopy_16_power8.S | 211 +++++++ kernel/power/dgemm_tcopy_logic_16_power8.S | 281 +++++++++ kernel/power/dgemm_tcopy_macros_16_power8.S | 612 ++++++++++++++++++++ kernel/power/sgemm_kernel_16x8_power8.S | 2 +- kernel/power/sgemm_logic_16x8_power8.S | 35 ++ kernel/power/sgemm_macros_16x8_power8.S | 35 ++ kernel/power/sgemm_tcopy_16_power8.S | 38 +- kernel/power/sgemm_tcopy_logic_16_power8.S | 36 ++ kernel/power/sgemm_tcopy_macros_16_power8.S | 35 ++ 13 files changed, 1293 insertions(+), 4 deletions(-) create mode 100644 kernel/power/dgemm_tcopy_16_power8.S create mode 100644 kernel/power/dgemm_tcopy_logic_16_power8.S create mode 100644 kernel/power/dgemm_tcopy_macros_16_power8.S diff --git a/Makefile.rule b/Makefile.rule index 55c335311..4b9d36bf6 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -52,6 +52,7 @@ VERSION = 0.2.19.dev # USE_THREAD = 0 # If you're going to use this library with OpenMP, please comment it 
in. +# always use this flag for POWER8 # USE_OPENMP = 1 # You can define maximum number of threads. Basically it should be @@ -153,10 +154,12 @@ NO_AFFINITY = 1 # Common Optimization Flag; # The default -O2 is enough. +# Flags for POWER8 are defined in Makefile.power. Don't modify COMMON_OPT # COMMON_OPT = -O2 # gfortran option for LAPACK # enable this flag only on 64bit Linux and if you need a thread safe lapack library +# Flags for POWER8 are defined in Makefile.power. Don't modify FCOMMON_OPT # FCOMMON_OPT = -frecursive # Profiling flags diff --git a/common_power.h b/common_power.h index 723d949f2..b62aca303 100644 --- a/common_power.h +++ b/common_power.h @@ -39,8 +39,13 @@ #ifndef COMMON_POWER #define COMMON_POWER +#if defined(POWER8) +#define MB __asm__ __volatile__ ("eieio":::"memory") +#define WMB __asm__ __volatile__ ("eieio":::"memory") +#else #define MB __asm__ __volatile__ ("sync") #define WMB __asm__ __volatile__ ("sync") +#endif #define INLINE inline diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index fa5cc5e63..e1b89cc97 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -20,7 +20,7 @@ SGEMMOTCOPYOBJ = sgemm_otcopy.o DGEMMKERNEL = dgemm_kernel_16x4_power8.S DGEMMINCOPY = ../generic/gemm_ncopy_16.c -DGEMMITCOPY = ../generic/gemm_tcopy_16.c +DGEMMITCOPY = dgemm_tcopy_16_power8.S DGEMMONCOPY = gemm_ncopy_4.S DGEMMOTCOPY = gemm_tcopy_4.S DGEMMINCOPYOBJ = dgemm_incopy.o diff --git a/kernel/power/dgemm_kernel_16x4_power8.S b/kernel/power/dgemm_kernel_16x4_power8.S index c67f31160..4c14b0c6f 100644 --- a/kernel/power/dgemm_kernel_16x4_power8.S +++ b/kernel/power/dgemm_kernel_16x4_power8.S @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** -* 2016/03/05 Werner Saar (wernsaar@googlemail.com) +* 2016/04/21 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK diff --git a/kernel/power/dgemm_tcopy_16_power8.S b/kernel/power/dgemm_tcopy_16_power8.S new file mode 100644 index 000000000..f87af535d --- /dev/null +++ b/kernel/power/dgemm_tcopy_16_power8.S @@ -0,0 +1,211 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/21 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#define M r3 +#define N r4 +#define A r5 +#define LDA r6 +#define B r7 + +#define A0 r8 +#define A1 r9 +#define A2 r10 +#define A3 r11 + +#define J r12 + +#define PREA r14 +#define PREB r15 +#define BO r16 +#define B8 r17 +#define B4 r18 +#define B2 r19 +#define B1 r20 +#define o8 r21 +#define T2 r22 +#define I r23 +#define o16 r24 +#define o32 r25 +#define o48 r26 +#define B16 r29 +#define M16 r30 +#define T1 r31 + +#define o0 0 + +#include "dgemm_tcopy_macros_16_power8.S" + +#define STACKSIZE 384 + + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + cmpwi cr0, M, 0 + ble- L999 + cmpwi cr0, N, 0 + ble- L999 + + slwi LDA, LDA, 
BASE_SHIFT + slwi M16, M, 4 + BASE_SHIFT + + li T1, -16 + li T2, -8 + li PREA, -4 + li PREB, -2 + + and B8, N, T1 + and B4, N, T2 + and B2, N, PREA + and B1, N, PREB + + mullw B8, B8, M + mullw B4, B4, M + mullw B2, B2, M + mullw B1, B1, M + + slwi B8, B8, BASE_SHIFT + slwi B4, B4, BASE_SHIFT + slwi B2, B2, BASE_SHIFT + slwi B1, B1, BASE_SHIFT + + add B8, B8, B + add B4, B4, B + add B2, B2, B + add B1, B1, B + + li PREA, 768 + addi PREB, M16, 128 + + li o8, 8 + li o16, 16 + li o32, 32 + li o48, 48 + +#include "dgemm_tcopy_logic_16_power8.S" + +L999: + + li r3, 0 + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + addi SP, SP, STACKSIZE + + blr + EPILOGUE + + diff --git a/kernel/power/dgemm_tcopy_logic_16_power8.S b/kernel/power/dgemm_tcopy_logic_16_power8.S new file mode 100644 index 000000000..776cd3401 --- /dev/null +++ b/kernel/power/dgemm_tcopy_logic_16_power8.S @@ -0,0 +1,281 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/21 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + + srawi. I, M, 2 + ble DCOPYT_L2_BEGIN + + +DCOPYT_L4_BEGIN: + + mr A0, A + add A1, A0, LDA + add A2, A1, LDA + add A3, A2, LDA + add A, A3, LDA + mr B16, B + addi B, B, 64*SIZE + + sradi. J, N, 4 + ble DCOPYT_L4x8_BEGIN + + mr BO, B16 + + .align 5 + +DCOPYT_L4x16_LOOP: + + addi T1, PREB, 128 + addi T2, PREB, 256 + dcbt A0, PREA + dcbt A1, PREA + dcbt A2, PREA + dcbt A3, PREA + dcbtst BO, M16 + dcbtst BO, PREB + dcbtst BO, T1 + dcbtst BO, T2 + COPY_4x16 + + add BO, BO, M16 + + addic. J, J, -1 + bgt DCOPYT_L4x16_LOOP + +DCOPYT_L4x8_BEGIN: + + andi. 
T1, N, 8 + ble DCOPYT_L4x4_BEGIN + + mr BO, B8 + + COPY_4x8 + + + addi B8, B8, 32*SIZE + +DCOPYT_L4x4_BEGIN: + + andi. T1, N, 4 + ble DCOPYT_L4x2_BEGIN + + mr BO, B4 + + COPY_4x4 + + + addi B4, B4, 16*SIZE + +DCOPYT_L4x2_BEGIN: + + andi. T1, N, 2 + ble DCOPYT_L4x1_BEGIN + + mr BO, B2 + + COPY_4x2 + + + addi B2, B2, 8*SIZE + +DCOPYT_L4x1_BEGIN: + + andi. T1, N, 1 + ble DCOPYT_L4_END + + mr BO, B1 + + COPY_4x1 + + + addi B1, B1, 4*SIZE + +DCOPYT_L4_END: + + addic. I, I, -1 + bgt DCOPYT_L4_BEGIN + + + +DCOPYT_L2_BEGIN: + + andi. T1, M, 2 + ble DCOPYT_L1_BEGIN + + mr A0, A + add A1, A0, LDA + add A, A1, LDA + mr B16, B + addi B, B, 32*SIZE + + sradi. J, N, 4 + ble DCOPYT_L2x8_BEGIN + + mr BO, B16 + +DCOPYT_L2x16_LOOP: + + COPY_2x16 + + add BO, BO, M16 + + addic. J, J, -1 + bgt DCOPYT_L2x16_LOOP + +DCOPYT_L2x8_BEGIN: + + andi. T1, N, 8 + ble DCOPYT_L2x4_BEGIN + + mr BO, B8 + + COPY_2x8 + + + addi B8, B8, 16*SIZE + +DCOPYT_L2x4_BEGIN: + + andi. T1, N, 4 + ble DCOPYT_L2x2_BEGIN + + mr BO, B4 + + COPY_2x4 + + + addi B4, B4, 8*SIZE + +DCOPYT_L2x2_BEGIN: + + andi. T1, N, 2 + ble DCOPYT_L2x1_BEGIN + + mr BO, B2 + + COPY_2x2 + + + addi B2, B2, 4*SIZE + +DCOPYT_L2x1_BEGIN: + + andi. T1, N, 1 + ble DCOPYT_L2_END + + mr BO, B1 + + COPY_2x1 + + + addi B1, B1, 2*SIZE + +DCOPYT_L2_END: + + +DCOPYT_L1_BEGIN: + + andi. T1, M, 1 + ble L999 + + mr A0, A + add A, A0, LDA + mr B16, B + addi B, B, 16*SIZE + + sradi. J, N, 4 + ble DCOPYT_L1x8_BEGIN + + mr BO, B16 + +DCOPYT_L1x16_LOOP: + + COPY_1x16 + + add BO, BO, M16 + + addic. J, J, -1 + bgt DCOPYT_L1x16_LOOP + +DCOPYT_L1x8_BEGIN: + + andi. T1, N, 8 + ble DCOPYT_L1x4_BEGIN + + mr BO, B8 + + COPY_1x8 + + + addi B8, B8, 8*SIZE + +DCOPYT_L1x4_BEGIN: + + andi. T1, N, 4 + ble DCOPYT_L1x2_BEGIN + + mr BO, B4 + + COPY_1x4 + + + addi B4, B4, 4*SIZE + +DCOPYT_L1x2_BEGIN: + + andi. T1, N, 2 + ble DCOPYT_L1x1_BEGIN + + mr BO, B2 + + COPY_1x2 + + + addi B2, B2, 2*SIZE + +DCOPYT_L1x1_BEGIN: + + andi. 
T1, N, 1 + ble DCOPYT_L1_END + + mr BO, B1 + + COPY_1x1 + + + addi B1, B1, 1*SIZE + +DCOPYT_L1_END: + diff --git a/kernel/power/dgemm_tcopy_macros_16_power8.S b/kernel/power/dgemm_tcopy_macros_16_power8.S new file mode 100644 index 000000000..aef03d7cf --- /dev/null +++ b/kernel/power/dgemm_tcopy_macros_16_power8.S @@ -0,0 +1,612 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/04/21 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + +/********************************************************************************************** +* Macros for N=4 and M=16 +**********************************************************************************************/ + +.macro COPY_4x16 + + lxvd2x vs32, o0, A0 + lxvd2x vs33, o16, A0 + lxvd2x vs34, o32, A0 + lxvd2x vs35, o48, A0 + addi A0, A0, 64 + + lxvd2x vs36, o0, A0 + lxvd2x vs37, o16, A0 + lxvd2x vs38, o32, A0 + lxvd2x vs39, o48, A0 + addi A0, A0, 64 + + + lxvd2x vs40, o0, A1 + lxvd2x vs41, o16, A1 + lxvd2x vs42, o32, A1 + lxvd2x vs43, o48, A1 + addi A1, A1, 64 + + lxvd2x vs44, o0, A1 + lxvd2x vs45, o16, A1 + lxvd2x vs46, o32, A1 + lxvd2x vs47, o48, A1 + addi A1, A1, 64 + + + lxvd2x vs48, o0, A2 + lxvd2x vs49, o16, A2 + lxvd2x vs50, o32, A2 + lxvd2x vs51, o48, A2 + addi A2, A2, 64 + + lxvd2x vs52, o0, A2 + lxvd2x vs53, o16, A2 + lxvd2x vs54, o32, A2 + lxvd2x vs55, o48, A2 + addi A2, A2, 64 + + + lxvd2x vs56, o0, A3 + lxvd2x vs57, o16, A3 + lxvd2x vs58, o32, A3 + lxvd2x vs59, o48, A3 + addi A3, A3, 64 + + lxvd2x vs60, o0, A3 + lxvd2x vs61, o16, A3 + lxvd2x vs62, o32, A3 + lxvd2x vs63, o48, A3 + addi A3, A3, 64 + + + mr T1, BO + + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 + addi T1, T1, 64 + + stxvd2x vs36, o0, T1 + stxvd2x vs37, o16, T1 + stxvd2x vs38, o32, T1 + stxvd2x vs39, o48, T1 + + addi T1, T1, 64 + + stxvd2x vs40, o0, T1 + stxvd2x vs41, o16, T1 + stxvd2x vs42, o32, T1 + stxvd2x vs43, o48, T1 + addi T1, T1, 64 + + stxvd2x vs44, o0, T1 + stxvd2x vs45, o16, T1 + stxvd2x vs46, o32, T1 + stxvd2x vs47, o48, T1 + + addi T1, T1, 64 + + 
stxvd2x vs48, o0, T1 + stxvd2x vs49, o16, T1 + stxvd2x vs50, o32, T1 + stxvd2x vs51, o48, T1 + addi T1, T1, 64 + + stxvd2x vs52, o0, T1 + stxvd2x vs53, o16, T1 + stxvd2x vs54, o32, T1 + stxvd2x vs55, o48, T1 + + addi T1, T1, 64 + + stxvd2x vs56, o0, T1 + stxvd2x vs57, o16, T1 + stxvd2x vs58, o32, T1 + stxvd2x vs59, o48, T1 + addi T1, T1, 64 + + stxvd2x vs60, o0, T1 + stxvd2x vs61, o16, T1 + stxvd2x vs62, o32, T1 + stxvd2x vs63, o48, T1 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=8 +**********************************************************************************************/ + +.macro COPY_4x8 + + lxvd2x vs32, o0, A0 + lxvd2x vs33, o16, A0 + lxvd2x vs34, o32, A0 + lxvd2x vs35, o48, A0 + addi A0, A0, 64 + + + lxvd2x vs36, o0, A1 + lxvd2x vs37, o16, A1 + lxvd2x vs38, o32, A1 + lxvd2x vs39, o48, A1 + addi A1, A1, 64 + + + lxvd2x vs40, o0, A2 + lxvd2x vs41, o16, A2 + lxvd2x vs42, o32, A2 + lxvd2x vs43, o48, A2 + addi A2, A2, 64 + + + lxvd2x vs44, o0, A3 + lxvd2x vs45, o16, A3 + lxvd2x vs46, o32, A3 + lxvd2x vs47, o48, A3 + addi A3, A3, 64 + + + mr T1, BO + + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 + + addi T1, T1, 64 + + stxvd2x vs36, o0, T1 + stxvd2x vs37, o16, T1 + stxvd2x vs38, o32, T1 + stxvd2x vs39, o48, T1 + + addi T1, T1, 64 + + stxvd2x vs40, o0, T1 + stxvd2x vs41, o16, T1 + stxvd2x vs42, o32, T1 + stxvd2x vs43, o48, T1 + + addi T1, T1, 64 + + stxvd2x vs44, o0, T1 + stxvd2x vs45, o16, T1 + stxvd2x vs46, o32, T1 + stxvd2x vs47, o48, T1 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=4 +**********************************************************************************************/ + +.macro COPY_4x4 + + lxvd2x vs32, o0, A0 + lxvd2x vs33, o16, A0 + addi A0, A0, 32 + + + lxvd2x vs34, o0, A1 + lxvd2x vs35, o16, A1 + addi A1, A1, 32 + + + 
lxvd2x vs36, o0, A2 + lxvd2x vs37, o16, A2 + addi A2, A2, 32 + + + lxvd2x vs38, o0, A3 + lxvd2x vs39, o16, A3 + addi A3, A3, 32 + + + mr T1, BO + + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 + + addi T1, T1, 64 + + stxvd2x vs36, o0, T1 + stxvd2x vs37, o16, T1 + + stxvd2x vs38, o32, T1 + stxvd2x vs39, o48, T1 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=2 +**********************************************************************************************/ + +.macro COPY_4x2 + + lxvd2x vs32, o0, A0 + addi A0, A0, 16 + + + lxvd2x vs33, o0, A1 + addi A1, A1, 16 + + + lxvd2x vs34, o0, A2 + addi A2, A2, 16 + + + lxvd2x vs35, o0, A3 + addi A3, A3, 16 + + + mr T1, BO + + stxvd2x vs32, o0, T1 + + stxvd2x vs33, o16, T1 + + stxvd2x vs34, o32, T1 + + stxvd2x vs35, o48, T1 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=1 +**********************************************************************************************/ + +.macro COPY_4x1 + + lxsdx vs32, o0, A0 + addi A0, A0, 8 + + + lxsdx vs33, o0, A1 + addi A1, A1, 8 + + + lxsdx vs34, o0, A2 + addi A2, A2, 8 + + + lxsdx vs35, o0, A3 + addi A3, A3, 8 + + + mr T1, BO + + stxsdx vs32, o0, T1 + + stxsdx vs33, o8, T1 + + addi T1, T1, 16 + + stxsdx vs34, o0, T1 + + stxsdx vs35, o8, T1 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=16 +**********************************************************************************************/ + +.macro COPY_2x16 + + lxvd2x vs32, o0, A0 + lxvd2x vs33, o16, A0 + lxvd2x vs34, o32, A0 + lxvd2x vs35, o48, A0 + addi A0, A0, 64 + + lxvd2x vs36, o0, A0 + lxvd2x vs37, o16, A0 + lxvd2x vs38, o32, A0 + lxvd2x vs39, o48, A0 + addi A0, A0, 64 + + + lxvd2x vs40, o0, A1 + lxvd2x vs41, o16, A1 + lxvd2x vs42, 
o32, A1 + lxvd2x vs43, o48, A1 + addi A1, A1, 64 + + lxvd2x vs44, o0, A1 + lxvd2x vs45, o16, A1 + lxvd2x vs46, o32, A1 + lxvd2x vs47, o48, A1 + addi A1, A1, 64 + + + mr T1, BO + + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 + addi T1, T1, 64 + + stxvd2x vs36, o0, T1 + stxvd2x vs37, o16, T1 + stxvd2x vs38, o32, T1 + stxvd2x vs39, o48, T1 + + addi T1, T1, 64 + + stxvd2x vs40, o0, T1 + stxvd2x vs41, o16, T1 + stxvd2x vs42, o32, T1 + stxvd2x vs43, o48, T1 + addi T1, T1, 64 + + stxvd2x vs44, o0, T1 + stxvd2x vs45, o16, T1 + stxvd2x vs46, o32, T1 + stxvd2x vs47, o48, T1 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=8 +**********************************************************************************************/ + +.macro COPY_2x8 + + lxvd2x vs32, o0, A0 + lxvd2x vs33, o16, A0 + lxvd2x vs34, o32, A0 + lxvd2x vs35, o48, A0 + addi A0, A0, 64 + + + lxvd2x vs36, o0, A1 + lxvd2x vs37, o16, A1 + lxvd2x vs38, o32, A1 + lxvd2x vs39, o48, A1 + addi A1, A1, 64 + + + mr T1, BO + + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 + + addi T1, T1, 64 + + stxvd2x vs36, o0, T1 + stxvd2x vs37, o16, T1 + stxvd2x vs38, o32, T1 + stxvd2x vs39, o48, T1 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=4 +**********************************************************************************************/ + +.macro COPY_2x4 + + lxvd2x vs32, o0, A0 + lxvd2x vs33, o16, A0 + addi A0, A0, 32 + + + lxvd2x vs34, o0, A1 + lxvd2x vs35, o16, A1 + addi A1, A1, 32 + + + mr T1, BO + + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=2 
+**********************************************************************************************/ + +.macro COPY_2x2 + + lxvd2x vs32, o0, A0 + addi A0, A0, 16 + + + lxvd2x vs33, o0, A1 + addi A1, A1, 16 + + + mr T1, BO + + stxvd2x vs32, o0, T1 + + stxvd2x vs33, o16, T1 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=1 +**********************************************************************************************/ + +.macro COPY_2x1 + + lxsdx vs32, o0, A0 + addi A0, A0, 8 + + + lxsdx vs33, o0, A1 + addi A1, A1, 8 + + + mr T1, BO + + stxsdx vs32, o0, T1 + + stxsdx vs33, o8, T1 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=16 +**********************************************************************************************/ + +.macro COPY_1x16 + + lxvd2x vs32, o0, A0 + lxvd2x vs33, o16, A0 + lxvd2x vs34, o32, A0 + lxvd2x vs35, o48, A0 + addi A0, A0, 64 + + lxvd2x vs36, o0, A0 + lxvd2x vs37, o16, A0 + lxvd2x vs38, o32, A0 + lxvd2x vs39, o48, A0 + addi A0, A0, 64 + + + mr T1, BO + + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 + addi T1, T1, 64 + + stxvd2x vs36, o0, T1 + stxvd2x vs37, o16, T1 + stxvd2x vs38, o32, T1 + stxvd2x vs39, o48, T1 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=8 +**********************************************************************************************/ + +.macro COPY_1x8 + + lxvd2x vs32, o0, A0 + lxvd2x vs33, o16, A0 + lxvd2x vs34, o32, A0 + lxvd2x vs35, o48, A0 + addi A0, A0, 64 + + + mr T1, BO + + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=4 
+**********************************************************************************************/ + +.macro COPY_1x4 + + lxvd2x vs32, o0, A0 + lxvd2x vs33, o16, A0 + addi A0, A0, 32 + + + mr T1, BO + + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=2 +**********************************************************************************************/ + +.macro COPY_1x2 + + lxvd2x vs32, o0, A0 + addi A0, A0, 16 + + + mr T1, BO + + stxvd2x vs32, o0, T1 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=1 +**********************************************************************************************/ + +.macro COPY_1x1 + + lxsdx vs32, o0, A0 + addi A0, A0, 8 + + + mr T1, BO + + stxsdx vs32, o0, T1 + +.endm + diff --git a/kernel/power/sgemm_kernel_16x8_power8.S b/kernel/power/sgemm_kernel_16x8_power8.S index 77f3f7cfb..e169eb970 100644 --- a/kernel/power/sgemm_kernel_16x8_power8.S +++ b/kernel/power/sgemm_kernel_16x8_power8.S @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2016/04/02 Werner Saar (wernsaar@googlemail.com) +* 2016/04/21 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK diff --git a/kernel/power/sgemm_logic_16x8_power8.S b/kernel/power/sgemm_logic_16x8_power8.S index 6ba999024..8907fe6ad 100644 --- a/kernel/power/sgemm_logic_16x8_power8.S +++ b/kernel/power/sgemm_logic_16x8_power8.S @@ -1,3 +1,38 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/21 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + srawi. 
J, N, 3 ble SGEMM_L8_END diff --git a/kernel/power/sgemm_macros_16x8_power8.S b/kernel/power/sgemm_macros_16x8_power8.S index 94e7d7d02..98414857f 100644 --- a/kernel/power/sgemm_macros_16x8_power8.S +++ b/kernel/power/sgemm_macros_16x8_power8.S @@ -1,3 +1,38 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/04/21 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + /********************************************************************************************** * Macros for N=8 and M=16 diff --git a/kernel/power/sgemm_tcopy_16_power8.S b/kernel/power/sgemm_tcopy_16_power8.S index c31784d6f..764d5b187 100644 --- a/kernel/power/sgemm_tcopy_16_power8.S +++ b/kernel/power/sgemm_tcopy_16_power8.S @@ -1,3 +1,39 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/21 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ @@ -135,7 +171,7 @@ add B2, B2, B add B1, B1, B - li PREA, 384 + li PREA, 768 addi PREB, M16, 128 li o4, 4 diff --git a/kernel/power/sgemm_tcopy_logic_16_power8.S b/kernel/power/sgemm_tcopy_logic_16_power8.S index 5a715c8dd..7dfb6fa46 100644 --- a/kernel/power/sgemm_tcopy_logic_16_power8.S +++ b/kernel/power/sgemm_tcopy_logic_16_power8.S @@ -1,3 +1,39 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/21 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + srawi. I, M, 2 ble SCOPYT_L2_BEGIN diff --git a/kernel/power/sgemm_tcopy_macros_16_power8.S b/kernel/power/sgemm_tcopy_macros_16_power8.S index 36c4593b5..53f9c8b82 100644 --- a/kernel/power/sgemm_tcopy_macros_16_power8.S +++ b/kernel/power/sgemm_tcopy_macros_16_power8.S @@ -1,3 +1,38 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/04/21 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + /********************************************************************************************** * Macros for N=4 and M=16 From dd2b897795e1047ec099e9af66a25d243a55f5fa Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Thu, 21 Apr 2016 12:54:32 +0200 Subject: [PATCH 08/70] added bugfixes for some make files and smallscaling.c --- Makefile.power | 19 +++++++++++++++++++ Makefile.rule | 2 +- benchmark/Makefile | 4 ++-- benchmark/smallscaling.c | 1 + 4 files changed, 23 insertions(+), 3 deletions(-) diff --git a/Makefile.power b/Makefile.power index ff7d8edb1..48bcb77f8 100644 --- a/Makefile.power +++ b/Makefile.power @@ -1,7 +1,26 @@ + +ifdef USE_THREAD +ifeq ($(USE_THREAD), 0) +USE_OPENMP = 0 +else +USE_OPENMP = 1 +endif +else +USE_OPENMP = 1 +endif + + + ifeq ($(CORE), POWER8) +ifeq ($(USE_OPENMP), 1) +COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -DALLOC_SHM -DUSE_OPENMP -fno-fast-math -fopenmp +FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp +else COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -DALLOC_SHM -fno-fast-math FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fast-math endif +endif + FLAMEPATH = $(HOME)/flame/lib diff --git a/Makefile.rule b/Makefile.rule index 4b9d36bf6..2d27237de 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -52,7 +52,7 @@ VERSION = 0.2.19.dev # USE_THREAD = 0 # If you're going to use this library with OpenMP, please comment it in. -# always use this flag for POWER8 +# This flag is always set for POWER8. 
Don't modify the flag # USE_OPENMP = 1 # You can define maximum number of threads. Basically it should be diff --git a/benchmark/Makefile b/benchmark/Makefile index 0a1359254..38ccb8f44 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -2231,10 +2231,10 @@ zgemm3m.$(SUFFIX) : gemm3m.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ smallscaling: smallscaling.c ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(EXTRALIB) -fopenmp -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(EXTRALIB) -fopenmp -lm -lpthread clean :: - @rm -f *.goto *.mkl *.acml *.atlas *.veclib *.essl + @rm -f *.goto *.mkl *.acml *.atlas *.veclib *.essl smallscaling include $(TOPDIR)/Makefile.tail diff --git a/benchmark/smallscaling.c b/benchmark/smallscaling.c index 9068c61b1..c5dcc5881 100644 --- a/benchmark/smallscaling.c +++ b/benchmark/smallscaling.c @@ -5,6 +5,7 @@ #include #include #include +#include #define MIN_SIZE 5 #define MAX_SIZE 60 #define NB_SIZE 10 From 2c3dfe2bf307e595dd17f4f9f5fd260b8c298c1b Mon Sep 17 00:00:00 2001 From: Shivraj Patil Date: Fri, 22 Apr 2016 14:03:18 +0530 Subject: [PATCH 09/70] MIPS P5600(32 bit) and I6400(64 bit) cores support added. Separated mips and mips64 files. Configurations support for mips 32 bit.
Signed-off-by: Shivraj Patil --- Makefile.mips | 3 + Makefile.system | 18 +-- TargetList.txt | 14 ++- c_check | 8 +- common.h | 4 + common_mips.h | 109 ++++++++++++++++++ common_mips64.h | 10 +- cpuid_mips.c | 60 +++------- cpuid_mips64.c | 223 ++++++++++++++++++++++++++++++++++++ ctest.c | 2 +- getarch.c | 34 ++++++ kernel/mips/KERNEL | 46 ++++++++ kernel/mips/KERNEL.P5600 | 130 +++++++++++++++++++++ kernel/mips/Makefile | 2 + kernel/mips/amax.c | 66 +++++++++++ kernel/mips/amin.c | 66 +++++++++++ kernel/mips/asum.c | 57 +++++++++ kernel/mips/axpby.c | 95 +++++++++++++++ kernel/mips/axpy.c | 54 +++++++++ kernel/mips/copy.c | 50 ++++++++ kernel/mips/dot.c | 55 +++++++++ kernel/mips/gemv_n.c | 56 +++++++++ kernel/mips/gemv_t.c | 58 ++++++++++ kernel/mips/iamax.c | 68 +++++++++++ kernel/mips/iamin.c | 68 +++++++++++ kernel/mips/imax.c | 59 ++++++++++ kernel/mips/imin.c | 59 ++++++++++ kernel/mips/izamax.c | 72 ++++++++++++ kernel/mips/izamin.c | 72 ++++++++++++ kernel/mips/max.c | 65 +++++++++++ kernel/mips/min.c | 65 +++++++++++ kernel/mips/nrm2.c | 88 ++++++++++++++ kernel/mips/omatcopy_cn.c | 82 +++++++++++++ kernel/mips/omatcopy_ct.c | 81 +++++++++++++ kernel/mips/omatcopy_rn.c | 82 +++++++++++++ kernel/mips/omatcopy_rt.c | 54 +++++++++ kernel/mips/rot.c | 53 +++++++++ kernel/mips/scal.c | 50 ++++++++ kernel/mips/swap.c | 55 +++++++++ kernel/mips/symv_L.c | 70 +++++++++++ kernel/mips/symv_U.c | 71 ++++++++++++ kernel/mips/zamax.c | 70 +++++++++++ kernel/mips/zamin.c | 70 +++++++++++ kernel/mips/zasum.c | 62 ++++++++++ kernel/mips/zaxpby.c | 113 ++++++++++++++++++ kernel/mips/zaxpy.c | 64 +++++++++++ kernel/mips/zcopy.c | 56 +++++++++ kernel/mips/zdot.c | 75 ++++++++++++ kernel/mips/zgemv_n.c | 147 ++++++++++++++++++++++++ kernel/mips/zgemv_t.c | 130 +++++++++++++++++++++ kernel/mips/znrm2.c | 97 ++++++++++++++++ kernel/mips/zomatcopy_cn.c | 62 ++++++++++ kernel/mips/zomatcopy_cnc.c | 61 ++++++++++ kernel/mips/zomatcopy_ct.c | 63 ++++++++++ 
kernel/mips/zomatcopy_ctc.c | 63 ++++++++++ kernel/mips/zomatcopy_rn.c | 62 ++++++++++ kernel/mips/zomatcopy_rnc.c | 61 ++++++++++ kernel/mips/zomatcopy_rt.c | 64 +++++++++++ kernel/mips/zomatcopy_rtc.c | 64 +++++++++++ kernel/mips/zrot.c | 61 ++++++++++ kernel/mips/zscal.c | 75 ++++++++++++ kernel/mips/zswap.c | 63 ++++++++++ kernel/mips64/KERNEL.I6400 | 1 + lapack/laswp/mips/Makefile | 13 +++ param.h | 77 +++++++++++++ 65 files changed, 4036 insertions(+), 72 deletions(-) create mode 100644 Makefile.mips create mode 100644 common_mips.h create mode 100644 cpuid_mips64.c create mode 100644 kernel/mips/KERNEL create mode 100644 kernel/mips/KERNEL.P5600 create mode 100644 kernel/mips/Makefile create mode 100644 kernel/mips/amax.c create mode 100644 kernel/mips/amin.c create mode 100644 kernel/mips/asum.c create mode 100644 kernel/mips/axpby.c create mode 100644 kernel/mips/axpy.c create mode 100644 kernel/mips/copy.c create mode 100644 kernel/mips/dot.c create mode 100644 kernel/mips/gemv_n.c create mode 100644 kernel/mips/gemv_t.c create mode 100644 kernel/mips/iamax.c create mode 100644 kernel/mips/iamin.c create mode 100644 kernel/mips/imax.c create mode 100644 kernel/mips/imin.c create mode 100644 kernel/mips/izamax.c create mode 100644 kernel/mips/izamin.c create mode 100644 kernel/mips/max.c create mode 100644 kernel/mips/min.c create mode 100644 kernel/mips/nrm2.c create mode 100644 kernel/mips/omatcopy_cn.c create mode 100644 kernel/mips/omatcopy_ct.c create mode 100644 kernel/mips/omatcopy_rn.c create mode 100644 kernel/mips/omatcopy_rt.c create mode 100644 kernel/mips/rot.c create mode 100644 kernel/mips/scal.c create mode 100644 kernel/mips/swap.c create mode 100644 kernel/mips/symv_L.c create mode 100644 kernel/mips/symv_U.c create mode 100644 kernel/mips/zamax.c create mode 100644 kernel/mips/zamin.c create mode 100644 kernel/mips/zasum.c create mode 100644 kernel/mips/zaxpby.c create mode 100644 kernel/mips/zaxpy.c create mode 100644 
kernel/mips/zcopy.c create mode 100644 kernel/mips/zdot.c create mode 100644 kernel/mips/zgemv_n.c create mode 100644 kernel/mips/zgemv_t.c create mode 100644 kernel/mips/znrm2.c create mode 100644 kernel/mips/zomatcopy_cn.c create mode 100644 kernel/mips/zomatcopy_cnc.c create mode 100644 kernel/mips/zomatcopy_ct.c create mode 100644 kernel/mips/zomatcopy_ctc.c create mode 100644 kernel/mips/zomatcopy_rn.c create mode 100644 kernel/mips/zomatcopy_rnc.c create mode 100644 kernel/mips/zomatcopy_rt.c create mode 100644 kernel/mips/zomatcopy_rtc.c create mode 100644 kernel/mips/zrot.c create mode 100644 kernel/mips/zscal.c create mode 100644 kernel/mips/zswap.c create mode 100644 kernel/mips64/KERNEL.I6400 create mode 100644 lapack/laswp/mips/Makefile diff --git a/Makefile.mips b/Makefile.mips new file mode 100644 index 000000000..05ea9c679 --- /dev/null +++ b/Makefile.mips @@ -0,0 +1,3 @@ +ifdef BINARY64 +else +endif diff --git a/Makefile.system b/Makefile.system index b89f60e96..73361fed1 100644 --- a/Makefile.system +++ b/Makefile.system @@ -462,7 +462,7 @@ endif endif endif -ifeq ($(ARCH), mips64) +ifeq ($(ARCH), $(filter $(ARCH),mips64 mips)) NO_BINARY_MODE = 1 endif @@ -502,11 +502,11 @@ endif ifdef NO_BINARY_MODE -ifeq ($(ARCH), mips64) +ifeq ($(ARCH), $(filter $(ARCH),mips64 mips)) ifdef BINARY64 CCOMMON_OPT += -mabi=64 else -CCOMMON_OPT += -mabi=n32 +CCOMMON_OPT += -mabi=32 endif BINARY_DEFINED = 1 endif @@ -589,11 +589,11 @@ ifneq ($(NO_LAPACK), 1) EXTRALIB += -lgfortran endif ifdef NO_BINARY_MODE -ifeq ($(ARCH), mips64) +ifeq ($(ARCH), $(filter $(ARCH),mips64 mips)) ifdef BINARY64 FCOMMON_OPT += -mabi=64 else -FCOMMON_OPT += -mabi=n32 +FCOMMON_OPT += -mabi=32 endif endif else @@ -678,7 +678,7 @@ endif endif endif -ifneq ($(ARCH), mips64) +ifeq ($(filter $(ARCH),mips64 mips)) ifndef BINARY64 FCOMMON_OPT += -m32 else @@ -688,7 +688,7 @@ else ifdef BINARY64 FCOMMON_OPT += -mabi=64 else -FCOMMON_OPT += -mabi=n32 +FCOMMON_OPT += -mabi=32 endif endif @@ -707,7 
+707,7 @@ endif endif endif -ifeq ($(ARCH), mips64) +ifeq ($(ARCH), $(filter $(ARCH),mips64 mips)) ifndef BINARY64 FCOMMON_OPT += -n32 else @@ -737,7 +737,7 @@ endif ifeq ($(C_COMPILER), OPEN64) -ifeq ($(ARCH), mips64) +ifeq ($(ARCH), $(filter $(ARCH),mips64 mips)) ifndef BINARY64 CCOMMON_OPT += -n32 else diff --git a/TargetList.txt b/TargetList.txt index dc1e08722..248f643a7 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -53,26 +53,30 @@ PPC440 PPC440FP2 CELL -3.MIPS64 CPU: +3.MIPS CPU: +P5600 + +4.MIPS64 CPU: SICORTEX LOONGSON3A LOONGSON3B +I6400 -4.IA64 CPU: +5.IA64 CPU: ITANIUM2 -5.SPARC CPU: +6.SPARC CPU: SPARC SPARCV7 -6.ARM CPU: +7.ARM CPU: CORTEXA15 CORTEXA9 ARMV7 ARMV6 ARMV5 -7.ARM 64-bit CPU: +8.ARM 64-bit CPU: ARMV8 CORTEXA57 diff --git a/c_check b/c_check index bcf4c2cb3..d624472dc 100644 --- a/c_check +++ b/c_check @@ -63,7 +63,7 @@ $os = Android if ($data =~ /OS_ANDROID/); $architecture = x86 if ($data =~ /ARCH_X86/); $architecture = x86_64 if ($data =~ /ARCH_X86_64/); $architecture = power if ($data =~ /ARCH_POWER/); -$architecture = mips32 if ($data =~ /ARCH_MIPS32/); +$architecture = mips if ($data =~ /ARCH_MIPS/); $architecture = mips64 if ($data =~ /ARCH_MIPS64/); $architecture = alpha if ($data =~ /ARCH_ALPHA/); $architecture = sparc if ($data =~ /ARCH_SPARC/); @@ -79,8 +79,8 @@ if ($os eq "AIX") { $defined = 1; } -if (($architecture eq "mips32") || ($architecture eq "mips64")) { - $compiler_name .= " -mabi=n32" if ($binary eq "32"); +if (($architecture eq "mips") || ($architecture eq "mips64")) { + $compiler_name .= " -mabi=32" if ($binary eq "32"); $compiler_name .= " -mabi=64" if ($binary eq "64"); $defined = 1; } @@ -155,7 +155,7 @@ if ($?) 
{ $architecture = x86 if ($data =~ /ARCH_X86/); $architecture = x86_64 if ($data =~ /ARCH_X86_64/); $architecture = power if ($data =~ /ARCH_POWER/); -$architecture = mips32 if ($data =~ /ARCH_MIPS32/); +$architecture = mips if ($data =~ /ARCH_MIPS/); $architecture = mips64 if ($data =~ /ARCH_MIPS64/); $architecture = alpha if ($data =~ /ARCH_ALPHA/); $architecture = sparc if ($data =~ /ARCH_SPARC/); diff --git a/common.h b/common.h index e045e42b2..c6f7ea2fd 100644 --- a/common.h +++ b/common.h @@ -397,6 +397,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246 #include "common_sparc.h" #endif +#ifdef ARCH_MIPS +#include "common_mips.h" +#endif + #ifdef ARCH_MIPS64 #include "common_mips64.h" #endif diff --git a/common_mips.h b/common_mips.h new file mode 100644 index 000000000..ae126949a --- /dev/null +++ b/common_mips.h @@ -0,0 +1,109 @@ +/***************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + +#ifndef COMMON_MIPS +#define COMMON_MIPS + +#define MB +#define WMB + +#define INLINE inline + +#define RETURN_BY_COMPLEX + +#ifndef ASSEMBLER + +static void INLINE blas_lock(volatile unsigned long *address){ + +} +#define BLAS_LOCK_DEFINED + +static inline unsigned int rpcc(void){ + unsigned long ret; + + __asm__ __volatile__(".set push \n" + "rdhwr %0, $30 \n" + ".set pop" : "=r"(ret) : : "memory"); + + return ret; +} +#define RPCC_DEFINED + +static inline int blas_quickdivide(blasint x, blasint y){ + return x / y; +} + +#define GET_IMAGE(res) + +#define GET_IMAGE_CANCEL + +#endif + + +#ifndef F_INTERFACE +#define REALNAME ASMNAME +#else +#define REALNAME ASMFNAME +#endif + +#if defined(ASSEMBLER) && !defined(NEEDPARAM) + +#define PROLOGUE \ + .arm ;\ + .global REALNAME ;\ + .func REALNAME ;\ +REALNAME: + +#define EPILOGUE + +#define PROFCODE + +#endif + + +#define SEEK_ADDRESS + +#ifndef PAGESIZE +#define PAGESIZE ( 4 << 10) +#endif +#define HUGE_PAGESIZE ( 4 << 20) + +#define BUFFER_SIZE (16 << 20) + + +#define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER) + +#ifndef MAP_ANONYMOUS +#define MAP_ANONYMOUS MAP_ANON +#endif + +#endif diff --git a/common_mips64.h b/common_mips64.h index f5c0ec7cf..6078bf35b 100644 --- a/common_mips64.h +++ b/common_mips64.h @@ -102,7 +102,7 @@ static void INLINE blas_lock(volatile unsigned long *address){ static inline 
unsigned int rpcc(void){ unsigned long ret; -#if defined(LOONGSON3A) || defined(LOONGSON3B) + // unsigned long long tmp; //__asm__ __volatile__("dmfc0 %0, $25, 1": "=r"(tmp):: "memory"); //ret=tmp; @@ -111,17 +111,10 @@ static inline unsigned int rpcc(void){ "rdhwr %0, $2\n" ".set pop": "=r"(ret):: "memory"); -#else - __asm__ __volatile__(".set push \n" - ".set mips32r2\n" - "rdhwr %0, $30 \n" - ".set pop" : "=r"(ret) : : "memory"); -#endif return ret; } #define RPCC_DEFINED -#if defined(LOONGSON3A) || defined(LOONGSON3B) #ifndef NO_AFFINITY #define WHEREAMI static inline int WhereAmI(void){ @@ -134,7 +127,6 @@ static inline int WhereAmI(void){ } #endif -#endif static inline int blas_quickdivide(blasint x, blasint y){ return x / y; diff --git a/cpuid_mips.c b/cpuid_mips.c index 22beff7fc..15c58959e 100644 --- a/cpuid_mips.c +++ b/cpuid_mips.c @@ -71,15 +71,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /*********************************************************************/ #define CPU_UNKNOWN 0 -#define CPU_SICORTEX 1 -#define CPU_LOONGSON3A 2 -#define CPU_LOONGSON3B 3 +#define CPU_P5600 1 static char *cpuname[] = { "UNKOWN", - "SICORTEX", - "LOONGSON3A", - "LOONGSON3B" + "P5600" }; int detect(void){ @@ -120,7 +116,7 @@ int detect(void){ if (strstr(p, "loongson3a")) return CPU_LOONGSON3A; }else{ - return CPU_SICORTEX; + return CPU_UNKNOWN; } } //Check model name for Loongson3 @@ -149,64 +145,40 @@ char *get_corename(void){ } void get_architecture(void){ - printf("MIPS64"); + printf("MIPS"); } void get_subarchitecture(void){ - if(detect()==CPU_LOONGSON3A) { - printf("LOONGSON3A"); - }else if(detect()==CPU_LOONGSON3B){ - printf("LOONGSON3B"); + if(detect()==CPU_P5600){ + printf("P5600"); }else{ - printf("SICORTEX"); + printf("UNKNOWN"); } } void get_subdirname(void){ - printf("mips64"); + printf("mips"); } void get_cpuconfig(void){ - if(detect()==CPU_LOONGSON3A) { - printf("#define LOONGSON3A\n"); - printf("#define L1_DATA_SIZE 
65536\n"); - printf("#define L1_DATA_LINESIZE 32\n"); - printf("#define L2_SIZE 512488\n"); - printf("#define L2_LINESIZE 32\n"); - printf("#define DTB_DEFAULT_ENTRIES 64\n"); - printf("#define DTB_SIZE 4096\n"); - printf("#define L2_ASSOCIATIVE 4\n"); - }else if(detect()==CPU_LOONGSON3B){ - printf("#define LOONGSON3B\n"); + if(detect()==CPU_P5600){ + printf("#define P5600\n"); printf("#define L1_DATA_SIZE 65536\n"); printf("#define L1_DATA_LINESIZE 32\n"); - printf("#define L2_SIZE 512488\n"); + printf("#define L2_SIZE 1048576\n"); printf("#define L2_LINESIZE 32\n"); printf("#define DTB_DEFAULT_ENTRIES 64\n"); printf("#define DTB_SIZE 4096\n"); - printf("#define L2_ASSOCIATIVE 4\n"); - }else{ - printf("#define SICORTEX\n"); - printf("#define L1_DATA_SIZE 32768\n"); - printf("#define L1_DATA_LINESIZE 32\n"); - printf("#define L2_SIZE 512488\n"); - printf("#define L2_LINESIZE 32\n"); - printf("#define DTB_DEFAULT_ENTRIES 32\n"); - printf("#define DTB_SIZE 4096\n"); printf("#define L2_ASSOCIATIVE 8\n"); + }else{ + printf("#define UNKNOWN\n"); } } void get_libname(void){ - if(detect()==CPU_LOONGSON3A) { - printf("loongson3a\n"); - }else if(detect()==CPU_LOONGSON3B) { - printf("loongson3b\n"); + if(detect()==CPU_P5600) { + printf("p5600\n"); }else{ -#ifdef __mips64 - printf("mips64\n"); -#else - printf("mips32\n"); -#endif + printf("mips\n"); } } diff --git a/cpuid_mips64.c b/cpuid_mips64.c new file mode 100644 index 000000000..13f1517d5 --- /dev/null +++ b/cpuid_mips64.c @@ -0,0 +1,223 @@ +/***************************************************************************** +Copyright (c) 2011-2014, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. 
Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define CPU_UNKNOWN 0 +#define CPU_SICORTEX 1 +#define CPU_LOONGSON3A 2 +#define CPU_LOONGSON3B 3 +#define CPU_I6400 4 + +static char *cpuname[] = { + "UNKOWN", + "SICORTEX", + "LOONGSON3A", + "LOONGSON3B", + "I6400" +}; + +int detect(void){ + +#ifdef linux + FILE *infile; + char buffer[512], *p; + + p = (char *)NULL; + infile = fopen("/proc/cpuinfo", "r"); + while (fgets(buffer, sizeof(buffer), infile)){ + if (!strncmp("cpu", buffer, 3)){ + p = strchr(buffer, ':') + 2; +#if 0 + fprintf(stderr, "%s\n", p); +#endif + break; + } + } + + fclose(infile); + + if(p != NULL){ + if (strstr(p, "Loongson-3A")){ + return CPU_LOONGSON3A; + }else if(strstr(p, "Loongson-3B")){ + return CPU_LOONGSON3B; + }else if (strstr(p, "Loongson-3")){ + infile = fopen("/proc/cpuinfo", "r"); + p = (char *)NULL; + while (fgets(buffer, sizeof(buffer), infile)){ + if (!strncmp("system type", buffer, 11)){ + p = strchr(buffer, ':') + 2; + break; + } + } + fclose(infile); + if (strstr(p, "loongson3a")) + return CPU_LOONGSON3A; + }else{ + return CPU_SICORTEX; + } + } + //Check model name for Loongson3 + infile = fopen("/proc/cpuinfo", "r"); + p = (char *)NULL; + while (fgets(buffer, sizeof(buffer), infile)){ + if (!strncmp("model name", buffer, 10)){ + p = strchr(buffer, ':') + 2; + break; + } + } + fclose(infile); + if(p != NULL){ + if (strstr(p, "Loongson-3A")){ + return CPU_LOONGSON3A; + }else if(strstr(p, "Loongson-3B")){ + return CPU_LOONGSON3B; + } + } +#endif + return CPU_UNKNOWN; +} + +char *get_corename(void){ + return cpuname[detect()]; +} + +void get_architecture(void){ + printf("MIPS64"); +} + +void get_subarchitecture(void){ + if(detect()==CPU_LOONGSON3A) { + printf("LOONGSON3A"); + }else if(detect()==CPU_LOONGSON3B){ + printf("LOONGSON3B"); + }else if(detect()==CPU_I6400){ + printf("I6400"); + }else{ + printf("SICORTEX"); + } +} + +void get_subdirname(void){ + printf("mips64"); +} + +void 
get_cpuconfig(void){ + if(detect()==CPU_LOONGSON3A) { + printf("#define LOONGSON3A\n"); + printf("#define L1_DATA_SIZE 65536\n"); + printf("#define L1_DATA_LINESIZE 32\n"); + printf("#define L2_SIZE 512488\n"); + printf("#define L2_LINESIZE 32\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 4\n"); + }else if(detect()==CPU_LOONGSON3B){ + printf("#define LOONGSON3B\n"); + printf("#define L1_DATA_SIZE 65536\n"); + printf("#define L1_DATA_LINESIZE 32\n"); + printf("#define L2_SIZE 512488\n"); + printf("#define L2_LINESIZE 32\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 4\n"); + }else if(detect()==CPU_I6400){ + printf("#define I6400\n"); + printf("#define L1_DATA_SIZE 65536\n"); + printf("#define L1_DATA_LINESIZE 32\n"); + printf("#define L2_SIZE 1048576\n"); + printf("#define L2_LINESIZE 32\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 8\n"); + }else{ + printf("#define SICORTEX\n"); + printf("#define L1_DATA_SIZE 32768\n"); + printf("#define L1_DATA_LINESIZE 32\n"); + printf("#define L2_SIZE 512488\n"); + printf("#define L2_LINESIZE 32\n"); + printf("#define DTB_DEFAULT_ENTRIES 32\n"); + printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 8\n"); + } +} + +void get_libname(void){ + if(detect()==CPU_LOONGSON3A) { + printf("loongson3a\n"); + }else if(detect()==CPU_LOONGSON3B) { + printf("loongson3b\n"); + }else if(detect()==CPU_I6400) { + printf("i6400\n"); + }else{ + printf("mips64\n"); + } +} diff --git a/ctest.c b/ctest.c index b5c74f137..e0ef46e60 100644 --- a/ctest.c +++ b/ctest.c @@ -110,7 +110,7 @@ ARCH_MIPS64 #endif #if defined(__mips32) || defined(__mips) -ARCH_MIPS32 +ARCH_MIPS #endif #ifdef __alpha diff --git a/getarch.c b/getarch.c index 1e0b08675..2f5d18a01 100644 --- a/getarch.c +++ b/getarch.c @@ -131,6 +131,8 @@ USE OF 
THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* #define FORCE_SICORTEX */ /* #define FORCE_LOONGSON3A */ /* #define FORCE_LOONGSON3B */ +/* #define FORCE_I6400 */ +/* #define FORCE_P5600 */ /* #define FORCE_ITANIUM2 */ /* #define FORCE_SPARC */ /* #define FORCE_SPARCV7 */ @@ -699,6 +701,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #endif +#ifdef FORCE_I6400 +#define FORCE +#define ARCHITECTURE "MIPS" +#define SUBARCHITECTURE "I6400" +#define SUBDIRNAME "mips64" +#define ARCHCONFIG "-DI6400 " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " +#define LIBNAME "i6400" +#define CORENAME "I6400" +#else +#endif + +#ifdef FORCE_P5600 +#define FORCE +#define ARCHITECTURE "MIPS" +#define SUBARCHITECTURE "P5600" +#define SUBDIRNAME "mips" +#define ARCHCONFIG "-DP5600 " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " +#define LIBNAME "p5600" +#define CORENAME "P5600" +#else +#endif + #ifdef FORCE_ITANIUM2 #define FORCE #define ARCHITECTURE "IA64" @@ -888,7 +918,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #ifdef __mips__ +#ifdef __mips64 +#include "cpuid_mips64.c" +#else #include "cpuid_mips.c" +#endif #define OPENBLAS_SUPPORTED #endif diff --git a/kernel/mips/KERNEL b/kernel/mips/KERNEL new file mode 100644 index 000000000..aeccfbf4c --- /dev/null +++ b/kernel/mips/KERNEL @@ -0,0 +1,46 @@ +ifndef SNRM2KERNEL +SNRM2KERNEL = nrm2.c +endif + +ifndef DNRM2KERNEL +DNRM2KERNEL = nrm2.c +endif + +ifndef CNRM2KERNEL +CNRM2KERNEL = znrm2.c +endif + +ifndef ZNRM2KERNEL +ZNRM2KERNEL = znrm2.c +endif + +ifndef SCABS_KERNEL +SCABS_KERNEL = ../generic/cabs.c +endif + +ifndef DCABS_KERNEL +DCABS_KERNEL = ../generic/cabs.c +endif + +ifndef QCABS_KERNEL +QCABS_KERNEL = ../generic/cabs.c +endif + +ifndef LSAME_KERNEL +LSAME_KERNEL = ../generic/lsame.c +endif + +ifndef SGEMM_BETA +SGEMM_BETA = ../generic/gemm_beta.c +endif +ifndef DGEMM_BETA +DGEMM_BETA = ../generic/gemm_beta.c +endif +ifndef CGEMM_BETA +CGEMM_BETA = ../generic/zgemm_beta.c +endif +ifndef ZGEMM_BETA +ZGEMM_BETA = ../generic/zgemm_beta.c +endif + + diff --git a/kernel/mips/KERNEL.P5600 b/kernel/mips/KERNEL.P5600 new file mode 100644 index 000000000..09064fe48 --- /dev/null +++ b/kernel/mips/KERNEL.P5600 @@ -0,0 +1,130 @@ +SAMAXKERNEL = ../mips/amax.c +DAMAXKERNEL = ../mips/amax.c +CAMAXKERNEL = ../mips/zamax.c +ZAMAXKERNEL = ../mips/zamax.c + +SAMINKERNEL = ../mips/amin.c +DAMINKERNEL = ../mips/amin.c +CAMINKERNEL = ../mips/zamin.c +ZAMINKERNEL = ../mips/zamin.c + +SMAXKERNEL = ../mips/max.c +DMAXKERNEL = ../mips/max.c + +SMINKERNEL = ../mips/min.c +DMINKERNEL = ../mips/min.c + +ISAMAXKERNEL = ../mips/iamax.c +IDAMAXKERNEL = ../mips/iamax.c +ICAMAXKERNEL = ../mips/izamax.c +IZAMAXKERNEL = ../mips/izamax.c + +ISAMINKERNEL = ../mips/iamin.c +IDAMINKERNEL = ../mips/iamin.c +ICAMINKERNEL = ../mips/izamin.c +IZAMINKERNEL = ../mips/izamin.c + +ISMAXKERNEL = ../mips/imax.c +IDMAXKERNEL = ../mips/imax.c + +ISMINKERNEL = ../mips/imin.c +IDMINKERNEL = ../mips/imin.c + +SASUMKERNEL = ../mips/asum.c +DASUMKERNEL = 
../mips/asum.c +CASUMKERNEL = ../mips/zasum.c +ZASUMKERNEL = ../mips/zasum.c + +SAXPYKERNEL = ../mips/axpy.c +DAXPYKERNEL = ../mips/axpy.c +CAXPYKERNEL = ../mips/zaxpy.c +ZAXPYKERNEL = ../mips/zaxpy.c + +SCOPYKERNEL = ../mips/copy.c +DCOPYKERNEL = ../mips/copy.c +CCOPYKERNEL = ../mips/zcopy.c +ZCOPYKERNEL = ../mips/zcopy.c + +SDOTKERNEL = ../mips/dot.c +DDOTKERNEL = ../mips/dot.c +CDOTKERNEL = ../mips/zdot.c +ZDOTKERNEL = ../mips/zdot.c + +SNRM2KERNEL = ../mips/nrm2.c +DNRM2KERNEL = ../mips/nrm2.c +CNRM2KERNEL = ../mips/znrm2.c +ZNRM2KERNEL = ../mips/znrm2.c + +SROTKERNEL = ../mips/rot.c +DROTKERNEL = ../mips/rot.c +CROTKERNEL = ../mips/zrot.c +ZROTKERNEL = ../mips/zrot.c + +SSCALKERNEL = ../mips/scal.c +DSCALKERNEL = ../mips/scal.c +CSCALKERNEL = ../mips/zscal.c +ZSCALKERNEL = ../mips/zscal.c + +SSWAPKERNEL = ../mips/swap.c +DSWAPKERNEL = ../mips/swap.c +CSWAPKERNEL = ../mips/zswap.c +ZSWAPKERNEL = ../mips/zswap.c + +SGEMVNKERNEL = ../mips/gemv_n.c +DGEMVNKERNEL = ../mips/gemv_n.c +CGEMVNKERNEL = ../mips/zgemv_n.c +ZGEMVNKERNEL = ../mips/zgemv_n.c + +SGEMVTKERNEL = ../mips/gemv_t.c +DGEMVTKERNEL = ../mips/gemv_t.c +CGEMVTKERNEL = ../mips/zgemv_t.c +ZGEMVTKERNEL = ../mips/zgemv_t.c + +STRMMKERNEL = ../generic/trmmkernel_2x2.c +DTRMMKERNEL = ../generic/trmmkernel_2x2.c +CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c + +SGEMMKERNEL = ../generic/gemmkernel_2x2.c +SGEMMONCOPY = ../generic/gemm_ncopy_2.c +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +DGEMMKERNEL = ../generic/gemmkernel_2x2.c +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMKERNEL = 
../generic/zgemmkernel_2x2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c diff --git a/kernel/mips/Makefile b/kernel/mips/Makefile new file mode 100644 index 000000000..efae70d7b --- /dev/null +++ b/kernel/mips/Makefile @@ -0,0 +1,2 @@ +clean :: + diff --git a/kernel/mips/amax.c b/kernel/mips/amax.c new file mode 100644 index 000000000..ad14081f5 --- /dev/null +++ b/kernel/mips/amax.c @@ -0,0 +1,66 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include <math.h> + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +/* Largest absolute value among the n elements x[0], x[inc_x], x[2*inc_x], ...; returns 0.0 when n or inc_x is not positive. */ +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + + if (n <= 0 || inc_x <= 0) return(maxf); + + maxf=ABS(x[0]); + ix += inc_x; + i++; + + while(i < n) + { + if( ABS(x[ix]) > maxf ) + { + maxf = ABS(x[ix]); + } + ix += inc_x; + i++; + } + return(maxf); +} + + diff --git a/kernel/mips/amin.c b/kernel/mips/amin.c new file mode 100644 index 000000000..8079450ff --- /dev/null +++ b/kernel/mips/amin.c @@ -0,0 +1,66 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1.
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include <math.h> + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +/* Smallest absolute value among the n elements x[0], x[inc_x], x[2*inc_x], ...; returns 0.0 when n or inc_x is not positive. */ +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf=0.0; + + if (n <= 0 || inc_x <= 0) return(minf); + + minf=ABS(x[0]); + ix += inc_x; + i++; + + while(i < n) + { + if( ABS(x[ix]) < minf ) + { + minf = ABS(x[ix]); + } + ix += inc_x; + i++; + } + return(minf); +} + + diff --git a/kernel/mips/asum.c b/kernel/mips/asum.c new file mode 100644 index 000000000..d221464de --- /dev/null +++ b/kernel/mips/asum.c @@ -0,0 +1,57 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED.
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include <math.h> + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +/* Sum of the absolute values of the n elements x[0], x[inc_x], x[2*inc_x], ...; returns 0.0 when n or inc_x is not positive. */ +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT sumf = 0.0; + if (n <= 0 || inc_x <= 0) return(sumf); + /* from here on n is the exclusive upper bound of the element offset i, not a count */ + n *= inc_x; + while(i < n) + { + sumf += ABS(x[i]); + i += inc_x; + } + return(sumf); +} + + diff --git a/kernel/mips/axpby.c b/kernel/mips/axpby.c new file mode 100644 index 000000000..af4fccde2 --- /dev/null +++ b/kernel/mips/axpby.c @@ -0,0 +1,95 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +/* Computes y[k*inc_y] = alpha*x[k*inc_x] + beta*y[k*inc_y] for k in [0,n); always returns 0. */ +int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix,iy; + + if ( n < 0 ) return(0); + + ix = 0; + iy = 0; + + if ( beta == 0.0 ) + { + /* beta is zero: y is overwritten and its old contents are never read */ + if ( alpha == 0.0 ) + { + while(i < n) + { + y[iy] = 0.0 ; + iy += inc_y ; + i++ ; + } + } + else + { + while(i < n) + { + y[iy] = alpha * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + } + + + } + + } + else + { + /* beta is nonzero: the old y values are scaled by beta */ + if ( alpha == 0.0 ) + { + while(i < n) + { + y[iy] = beta * y[iy] ; + iy += inc_y ; + i++ ; + } + } + else + { + while(i < n) + { + y[iy] = alpha * x[ix] + beta * y[iy] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + } + } + + } + + return(0); + +} + + diff --git a/kernel/mips/axpy.c b/kernel/mips/axpy.c new file mode 100644 index 000000000..42f181ee1 --- /dev/null +++ b/kernel/mips/axpy.c @@ -0,0 +1,54 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved.
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +/* Adds da*x to y over n strided elements (y[k*inc_y] += da*x[k*inc_x]); dummy0, dummy1, dummy and dummy2 are not used; always returns 0. */ +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix,iy; + + if ( n < 0 ) return(0); + if ( da == 0.0 ) return(0); + + ix = 0; + iy = 0; + + while(i < n) + { + + y[iy] += da * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/mips/copy.c b/kernel/mips/copy.c new file mode 100644 index 000000000..9f488ddb3 --- /dev/null +++ b/kernel/mips/copy.c @@ -0,0 +1,50 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED.
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +/* Copies n strided elements from x to y (y[k*inc_y] = x[k*inc_x] for k in [0,n)); always returns 0. */ +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + if ( n < 0 ) return(0); + + while(i < n) + { + + y[iy] = x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/mips/dot.c b/kernel/mips/dot.c new file mode 100644 index 000000000..de7f7167f --- /dev/null +++ b/kernel/mips/dot.c @@ -0,0 +1,55 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +/* Dot product of n strided elements of x and y, accumulated in a double; the return type is double when DSDOT is defined and FLOAT otherwise. */ +#if defined(DSDOT) +double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +#else +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +#endif +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + double dot = 0.0 ; + + if ( n < 0 ) return(dot); + + while(i < n) + { + + dot += y[iy] * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(dot); + +} + + diff --git a/kernel/mips/gemv_n.c b/kernel/mips/gemv_n.c new file mode 100644 index 000000000..4cc177209 --- /dev/null +++ b/kernel/mips/gemv_n.c @@ -0,0 +1,56 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2.
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i; + BLASLONG ix,iy; + BLASLONG j; + FLOAT *a_ptr; + FLOAT temp; + + ix = 0; + a_ptr = a; + + for (j=0; j +/* NOTE(review): the text between the loop header above and this point appears to have been lost in extraction (presumably the rest of gemv_n.c, gemv_t.c and the head of iamax.c) - restore it from the upstream patch before applying. */ +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +/* 1-based position of the first element with the largest absolute value among x[0], x[inc_x], ...; returns 0 when n or inc_x is not positive. */ +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + BLASLONG max=0; + + if (n <= 0 || inc_x <= 0) return(max); + + maxf=ABS(x[0]); + ix += inc_x; + i++; + + while(i < n) + { + if( ABS(x[ix]) > maxf ) + { + max = i; + maxf = ABS(x[ix]); + } + ix += inc_x; + i++; + } + return(max+1); +} + + diff --git a/kernel/mips/iamin.c b/kernel/mips/iamin.c new file mode 100644 index 000000000..7f1c4d905 --- /dev/null +++ b/kernel/mips/iamin.c @@ -0,0 +1,68 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include <math.h> + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +/* 1-based position of the first element with the smallest absolute value among x[0], x[inc_x], ...; returns 0 when n or inc_x is not positive. */ +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf=0.0; + BLASLONG min=0; + + if (n <= 0 || inc_x <= 0) return(min); + + minf=ABS(x[0]); + ix += inc_x; + i++; + + while(i < n) + { + if( ABS(x[ix]) < minf ) + { + min = i; + minf = ABS(x[ix]); + } + ix += inc_x; + i++; + } + return(min+1); +} + + diff --git a/kernel/mips/imax.c b/kernel/mips/imax.c new file mode 100644 index 000000000..744bfc0d9 --- /dev/null +++ b/kernel/mips/imax.c @@ -0,0 +1,59 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2.
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include <math.h> + + +/* 1-based position of the first largest element (signed comparison) among x[0], x[inc_x], ...; returns 0 when n or inc_x is not positive. */ +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + BLASLONG max=0; + + if (n <= 0 || inc_x <= 0) return(max); + + maxf=x[0]; + ix += inc_x; + i++; + + while(i < n) + { + if( x[ix] > maxf ) + { + max = i; + maxf = x[ix]; + } + ix += inc_x; + i++; + } + return(max+1); +} + + diff --git a/kernel/mips/imin.c b/kernel/mips/imin.c new file mode 100644 index 000000000..d9b283d2d --- /dev/null +++ b/kernel/mips/imin.c @@ -0,0 +1,59 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved.
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include <math.h> + + +/* 1-based position of the first smallest element (signed comparison) among x[0], x[inc_x], ...; returns 0 when n or inc_x is not positive. */ +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf=0.0; + BLASLONG min=0; + + if (n <= 0 || inc_x <= 0) return(min); + + minf=x[0]; + ix += inc_x; + i++; + + while(i < n) + { + if( x[ix] < minf ) + { + min = i; + minf = x[ix]; + } + ix += inc_x; + i++; + } + return(min+1); +} + + diff --git a/kernel/mips/izamax.c b/kernel/mips/izamax.c new file mode 100644 index 000000000..708ee921d --- /dev/null +++ b/kernel/mips/izamax.c @@ -0,0 +1,72 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED.
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include <math.h> + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +#define CABS1(x,i) (ABS((x)[(i)])+ABS((x)[(i)+1])) +/* 1-based position of the first pair x[k], x[k+1] (k advancing by 2*inc_x, n pairs in total) with the largest ABS(x[k])+ABS(x[k+1]); returns 0 when n or inc_x is not positive. */ +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf; + BLASLONG max=0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(max); + + inc_x2 = 2 * inc_x; + + maxf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) > maxf ) + { + max = i; + maxf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return(max+1); +} + + diff --git a/kernel/mips/izamin.c b/kernel/mips/izamin.c new file mode 100644 index 000000000..523605ef4 --- /dev/null +++ b/kernel/mips/izamin.c @@ -0,0 +1,72 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3.
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf; + BLASLONG min=0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(min); + + inc_x2 = 2 * inc_x; + + minf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) < minf ) + { + min = i; + minf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return(min+1); +} + + diff --git a/kernel/mips/max.c b/kernel/mips/max.c new file mode 100644 index 000000000..2ad956bc0 --- /dev/null +++ b/kernel/mips/max.c @@ -0,0 +1,65 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : NoTest +* BLASTEST double : NoTest +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + + if (n <= 0 || inc_x <= 0) return(maxf); + + maxf=x[0]; + ix += inc_x; + i++; + + while(i < n) + { + if( x[ix] > maxf ) + { + maxf = x[ix]; + } + ix += inc_x; + i++; + } + return(maxf); +} + + diff --git a/kernel/mips/min.c b/kernel/mips/min.c new file mode 100644 index 000000000..2812fe397 --- /dev/null +++ b/kernel/mips/min.c @@ -0,0 +1,65 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : NoTest +* BLASTEST double : NoTest +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf=0.0; + + if (n <= 0 || inc_x <= 0) return(minf); + + minf=x[0]; + ix += inc_x; + i++; + + while(i < n) + { + if( x[ix] < minf ) + { + minf = x[ix]; + } + ix += inc_x; + i++; + } + return(minf); +} + + diff --git a/kernel/mips/nrm2.c b/kernel/mips/nrm2.c new file mode 100644 index 000000000..fcff09337 --- /dev/null +++ b/kernel/mips/nrm2.c @@ -0,0 +1,88 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/09/13 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT scale = 0.0; + FLOAT ssq = 1.0; + FLOAT absxi = 0.0; + + + if (n <= 0 || inc_x <= 0) return(0.0); + if ( n == 1 ) return( ABS(x[0]) ); + + n *= inc_x; + while(i < n) + { + + if ( x[i] != 0.0 ) + { + absxi = ABS( x[i] ); + if ( scale < absxi ) + { + ssq = 1 + ssq * ( scale / absxi ) * ( scale / absxi ); + scale = absxi ; + } + else + { + ssq += ( absxi/scale ) * ( absxi/scale ); + } + + } + i += inc_x; + } + scale = scale * sqrt( ssq ); + return(scale); + +} + + diff --git a/kernel/mips/omatcopy_cn.c b/kernel/mips/omatcopy_cn.c new file mode 100644 index 000000000..11357ec93 --- /dev/null +++ b/kernel/mips/omatcopy_cn.c @@ -0,0 +1,82 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb) +{ + BLASLONG i,j; + FLOAT *aptr,*bptr; + + if ( rows <= 0 ) return(0); + if ( cols <= 0 ) return(0); + + aptr = a; + bptr = b; + + if ( alpha == 0.0 ) + { + for ( i=0; i + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp; + + if ( n < 0 ) return(0); + + while(i < n) + { + + temp = x[ix] ; + x[ix] = y[iy] ; + y[iy] = temp ; + + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/mips/symv_L.c b/kernel/mips/symv_L.c new file mode 100644 index 000000000..6a83d73f9 --- /dev/null +++ b/kernel/mips/symv_L.c @@ -0,0 +1,70 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS 
Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i; + BLASLONG ix,iy; + BLASLONG jx,jy; + BLASLONG j; + FLOAT temp1; + FLOAT temp2; + +#if 0 + if ( m != offset ) + printf("Symv_L: m=%d offset=%d\n",m,offset); +#endif + + jx = 0; + jy = 0; + + for (j=0; j + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(0.0); + + inc_x2 = 2 * inc_x; + + maxf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) > maxf ) + { + maxf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return(maxf); +} + + diff --git a/kernel/mips/zamin.c b/kernel/mips/zamin.c new file mode 100644 index 000000000..97c07da81 --- /dev/null +++ b/kernel/mips/zamin.c @@ -0,0 +1,70 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(0.0); + + inc_x2 = 2 * inc_x; + + minf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) < minf ) + { + minf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return(minf); +} + + diff --git a/kernel/mips/zasum.c b/kernel/mips/zasum.c new file mode 100644 index 000000000..77a2ed685 --- /dev/null +++ b/kernel/mips/zasum.c @@ -0,0 +1,62 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT sumf = 0.0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(sumf); + + inc_x2 = 2 * inc_x; + + n *= inc_x2; + while(i < n) + { + sumf += CABS1(x,i); + i += inc_x2; + } + return(sumf); +} + + diff --git a/kernel/mips/zaxpby.c b/kernel/mips/zaxpby.c new file mode 100644 index 000000000..97452e942 --- /dev/null +++ b/kernel/mips/zaxpby.c @@ -0,0 +1,113 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FLOAT beta_r, FLOAT beta_i,FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix,iy; + FLOAT temp; + BLASLONG inc_x2, inc_y2; + + if ( n <= 0 ) return(0); + + ix = 0; + iy = 0; + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + if ( beta_r == 0.0 && beta_i == 0.0) + { + if ( alpha_r == 0.0 && alpha_i == 0.0 ) + { + + while(i < n) + { + y[iy] = 0.0 ; + y[iy+1] = 0.0 ; + iy += inc_y2 ; + i++ ; + } + + } + else + { + + while(i < n) + { + y[iy] = ( alpha_r * x[ix] - alpha_i * x[ix+1] ) ; + y[iy+1] = ( alpha_r * x[ix+1] + alpha_i * x[ix] ) ; + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + } + + + } + + } + else + { + if ( alpha_r == 0.0 && alpha_i == 0.0 ) + { + + while(i < n) + { + temp = ( beta_r * y[iy] - beta_i * y[iy+1] ) ; + y[iy+1] = ( beta_r * y[iy+1] + beta_i * y[iy] ) ; + y[iy] = temp; + iy += inc_y2 ; + i++ ; + } + + } + else + { + + while(i < n) + { + temp = ( alpha_r * x[ix] - alpha_i * x[ix+1] ) + ( beta_r * y[iy] - beta_i * y[iy+1] ) ; + y[iy+1] = ( alpha_r * x[ix+1] + alpha_i * x[ix] ) + ( beta_r * y[iy+1] + beta_i * y[iy] ) ; + y[iy] = temp; + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + } + + + } + + + + } + return(0); + +} + + diff --git a/kernel/mips/zaxpy.c b/kernel/mips/zaxpy.c new file mode 100644 index 000000000..f0fbab4a2 --- /dev/null +++ b/kernel/mips/zaxpy.c @@ -0,0 +1,64 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix,iy; + BLASLONG inc_x2; + BLASLONG inc_y2; + + if ( n < 0 ) return(0); + if ( da_r == 0.0 && da_i == 0.0 ) return(0); + + ix = 0; + iy = 0; + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + while(i < n) + { +#if !defined(CONJ) + y[iy] += ( da_r * x[ix] - da_i * x[ix+1] ) ; + y[iy+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; +#else + y[iy] += ( da_r * x[ix] + da_i * x[ix+1] ) ; + y[iy+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; +#endif + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/mips/zcopy.c b/kernel/mips/zcopy.c new file mode 100644 index 000000000..6bb6e33b6 --- /dev/null +++ b/kernel/mips/zcopy.c @@ -0,0 +1,56 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + BLASLONG inc_x2; + BLASLONG inc_y2; + + if ( n < 0 ) return(0); + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + while(i < n) + { + + y[iy] = x[ix] ; + y[iy+1] = x[ix+1] ; + ix += inc_x2; + iy += inc_y2; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/mips/zdot.c b/kernel/mips/zdot.c new file mode 100644 index 000000000..da9ec7076 --- /dev/null +++ b/kernel/mips/zdot.c @@ -0,0 +1,75 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#ifndef _MSC_VER +#include +FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +#else +OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +#endif +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT dot[2]; + OPENBLAS_COMPLEX_FLOAT result; + BLASLONG inc_x2; + BLASLONG inc_y2; + + dot[0]=0.0; + dot[1]=0.0; + + CREAL(result) = 0.0 ; + CIMAG(result) = 0.0 ; + + if ( n < 1 ) return(result); + + inc_x2 = 2 * inc_x ; + inc_y2 = 2 * inc_y ; + + while(i < n) + { +#if !defined(CONJ) + dot[0] += ( x[ix] * y[iy] - x[ix+1] * y[iy+1] ) ; + dot[1] += ( x[ix+1] * y[iy] + x[ix] * y[iy+1] ) ; +#else + dot[0] += ( x[ix] * y[iy] + x[ix+1] * y[iy+1] ) ; + dot[1] -= ( x[ix+1] * y[iy] - x[ix] * y[iy+1] ) ; +#endif + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + CREAL(result) = dot[0]; + CIMAG(result) = dot[1]; + return(result); + +} + + diff --git a/kernel/mips/zgemv_n.c b/kernel/mips/zgemv_n.c new file mode 100644 index 000000000..9bf1f6b42 --- /dev/null +++ b/kernel/mips/zgemv_n.c @@ -0,0 +1,147 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i; + BLASLONG ix,iy; + BLASLONG j; + FLOAT *a_ptr; + FLOAT temp_r,temp_i; + BLASLONG inc_x2,inc_y2; + BLASLONG lda2; + BLASLONG i2; + + lda2 = 2*lda; + + ix = 0; + a_ptr = a; + + if ( inc_x == 1 && inc_y == 1 ) + { + + for (j=0; j + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT scale = 0.0; + FLOAT ssq = 1.0; + BLASLONG inc_x2; + FLOAT temp; + + if (n <= 0 || inc_x <= 0) return(0.0); + + inc_x2 = 2 * inc_x; + + n *= inc_x2; + while(i < n) + { + + if ( x[i] != 0.0 ) + { + temp = ABS( x[i] ); + if ( scale < temp ) + { + ssq = 1 + ssq * ( scale / temp ) * ( scale / temp ); + scale = temp ; + } + else + { + ssq += ( temp / scale ) * ( temp / scale ); + } + + } + + if ( x[i+1] != 0.0 ) + { + temp = ABS( x[i+1] ); + if ( scale < temp ) + { + ssq = 1 + ssq * ( scale / temp ) * ( scale / temp ); + scale = temp ; + } + 
else + { + ssq += ( temp / scale ) * ( temp / scale ); + } + + } + + + i += inc_x2; + } + scale = scale * sqrt( ssq ); + return(scale); + +} + + diff --git a/kernel/mips/zomatcopy_cn.c b/kernel/mips/zomatcopy_cn.c new file mode 100644 index 000000000..bf6d3c70d --- /dev/null +++ b/kernel/mips/zomatcopy_cn.c @@ -0,0 +1,62 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb) +{ + BLASLONG i,j,ia; + FLOAT *aptr,*bptr; + + if ( rows <= 0 ) return(0); + if ( cols <= 0 ) return(0); + + aptr = a; + bptr = b; + + lda *= 2; + ldb *= 2; + + for ( i=0; i + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp[2]; + BLASLONG inc_x2; + BLASLONG inc_y2; + + if ( n < 0 ) return(0); + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + while(i < n) + { + + temp[0] = x[ix] ; + temp[1] = x[ix+1] ; + x[ix] = y[iy] ; + x[ix+1] = y[iy+1] ; + y[iy] = temp[0] ; + y[iy+1] = temp[1] ; + + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/mips64/KERNEL.I6400 b/kernel/mips64/KERNEL.I6400 new file mode 100644 index 000000000..abf44814a --- /dev/null +++ b/kernel/mips64/KERNEL.I6400 @@ -0,0 +1 @@ +include $(KERNELDIR)/../mips/KERNEL.P5600 diff --git a/lapack/laswp/mips/Makefile b/lapack/laswp/mips/Makefile new file mode 100644 index 000000000..75411deb5 --- /dev/null +++ b/lapack/laswp/mips/Makefile @@ -0,0 +1,13 @@ +TOPDIR = ../../.. +include ../../../Makefile.system + +ifndef LASWP +LASWP = ../generic/laswp_k.c +endif + +ifndef ZLASWP +ZLASWP = ../generic/zlaswp_k.c +endif + +include ../generic/Makefile + diff --git a/param.h b/param.h index 8ecc812dc..93b1220d6 100644 --- a/param.h +++ b/param.h @@ -2174,6 +2174,83 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define SYMV_P 16 #endif +#if defined(I6400) +#define SNUMOPT 2 +#define DNUMOPT 2 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 2 +#define SGEMM_DEFAULT_UNROLL_N 2 + +#define DGEMM_DEFAULT_UNROLL_M 2 +#define DGEMM_DEFAULT_UNROLL_N 2 + +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 2 + +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_P 96 +#define ZGEMM_DEFAULT_P 64 + +#define SGEMM_DEFAULT_Q 240 +#define DGEMM_DEFAULT_Q 120 +#define CGEMM_DEFAULT_Q 120 +#define ZGEMM_DEFAULT_Q 120 + +#define SGEMM_DEFAULT_R 12288 +#define DGEMM_DEFAULT_R 8192 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + + +#define SYMV_P 16 +#endif + +#if defined(P5600) +#define SNUMOPT 2 +#define DNUMOPT 2 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 2 +#define SGEMM_DEFAULT_UNROLL_N 2 + +#define DGEMM_DEFAULT_UNROLL_M 2 +#define DGEMM_DEFAULT_UNROLL_N 2 + +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 2 + +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_P 96 +#define ZGEMM_DEFAULT_P 64 + +#define SGEMM_DEFAULT_Q 240 +#define DGEMM_DEFAULT_Q 120 +#define CGEMM_DEFAULT_Q 120 +#define ZGEMM_DEFAULT_Q 120 + +#define SGEMM_DEFAULT_R 12288 +#define DGEMM_DEFAULT_R 8192 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + + +#define SYMV_P 16 +#endif #ifdef ARMV7 #define SNUMOPT 2 From 879a51165f952830fc9c27df326bcad70c4c7cb6 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Fri, 22 Apr 2016 13:07:12 +0200 Subject: [PATCH 10/70] Optimized zgemm and tested zgemm again --- kernel/power/KERNEL.POWER8 | 2 +- kernel/power/zgemm_kernel_8x2_power8.S | 72 ++- 
kernel/power/zgemm_logic_8x2_power8.S | 69 ++- kernel/power/zgemm_macros_8x2_power8.S | 108 +++++ kernel/power/zgemm_tcopy_8_power8.S | 205 ++++++++ kernel/power/zgemm_tcopy_logic_8_power8.S | 246 ++++++++++ kernel/power/zgemm_tcopy_macros_8_power8.S | 535 +++++++++++++++++++++ param.h | 2 +- 8 files changed, 1227 insertions(+), 12 deletions(-) create mode 100644 kernel/power/zgemm_tcopy_8_power8.S create mode 100644 kernel/power/zgemm_tcopy_logic_8_power8.S create mode 100644 kernel/power/zgemm_tcopy_macros_8_power8.S diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index e1b89cc97..c7df0e039 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -42,7 +42,7 @@ ZGEMMKERNEL = zgemm_kernel_8x2_power8.S ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c -ZGEMMITCOPY = ../generic/zgemm_tcopy_8.c +ZGEMMITCOPY = zgemm_tcopy_8_power8.S ZGEMMONCOPYOBJ = zgemm_oncopy.o ZGEMMOTCOPYOBJ = zgemm_otcopy.o ZGEMMINCOPYOBJ = zgemm_incopy.o diff --git a/kernel/power/zgemm_kernel_8x2_power8.S b/kernel/power/zgemm_kernel_8x2_power8.S index 336b13b1f..02c94a88a 100644 --- a/kernel/power/zgemm_kernel_8x2_power8.S +++ b/kernel/power/zgemm_kernel_8x2_power8.S @@ -1,3 +1,73 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/22 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/22 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ @@ -250,7 +320,7 @@ ble L999 slwi LDC, LDC, ZBASE_SHIFT - li PRE, 384 + li PRE, 512 li o8 , 8 li o16 , 16 li o24 , 24 diff --git a/kernel/power/zgemm_logic_8x2_power8.S b/kernel/power/zgemm_logic_8x2_power8.S index 96612da82..0cd784cc0 100644 --- a/kernel/power/zgemm_logic_8x2_power8.S +++ b/kernel/power/zgemm_logic_8x2_power8.S @@ -1,3 +1,39 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/22 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + srawi. J, N, 1 ble ZGEMM_L2_END @@ -5,20 +41,34 @@ ZGEMM_L2_BEGIN: mr BO, B mr BBO, BBUFFER - slwi T1, K, 1 + srawi. 
T1, K, 2 + ble ZGEMM_L2_COPYB1 -ZGEMM_L2_COPYB: +ZGEMM_L2_COPYB8: - lxvdsx vs4, o0, BO // b0_r - lxvdsx vs5, o8, BO // b0_i - addi BO, BO, 16 - stxvd2x vs4, o0, BBO - stxvd2x vs5, o16, BBO + addi T2, PRE, 128 + dcbt BO, PRE + dcbtst BBO, PRE + dcbtst BBO, T2 + ZCOPYB_8x1 addic. T1, T1, -1 - addi BBO, BBO, 32 - bge ZGEMM_L2_COPYB + bgt ZGEMM_L2_COPYB8 + +ZGEMM_L2_COPYB1: + + andi. T1, K, 3 + ble ZGEMM_L2_COPYB_END + +ZGEMM_L2_COPYB_LOOP: + + ZCOPYB_1x1 + ZCOPYB_1x1 + addic. T1, T1, -1 + + bgt ZGEMM_L2_COPYB_LOOP +ZGEMM_L2_COPYB_END: mr CO, C mr AO, A @@ -493,6 +543,7 @@ ZGEMM_L1_BEGIN: slwi T1, K, 0 ZGEMM_L1_COPYB: + dcbtst BBO, PRE lxvdsx vs4, o0, BO // b0_r lxvdsx vs5, o8, BO // b0_i diff --git a/kernel/power/zgemm_macros_8x2_power8.S b/kernel/power/zgemm_macros_8x2_power8.S index a0fbb2e11..c43a115b2 100644 --- a/kernel/power/zgemm_macros_8x2_power8.S +++ b/kernel/power/zgemm_macros_8x2_power8.S @@ -1,3 +1,38 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/22 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define XSFADD_R1 xsadddp @@ -3055,3 +3090,76 @@ .endm + + +.macro ZCOPYB_1x1 + + lxvdsx vs4, o0, BO // b0_r + lxvdsx vs5, o8, BO // b0_i + addi BO, BO, 16 + stxvd2x vs4, o0, BBO + stxvd2x vs5, o16, BBO + addi BBO, BBO, 32 + +.endm + + +.macro ZCOPYB_8x1 + + lxvd2x vs32, o0, BO + lxvd2x vs33, o16, BO + lxvd2x vs34, o32, BO + lxvd2x vs35, o48, BO + addi BO, BO, 64 + + lxvd2x vs36, o0, BO + lxvd2x vs37, o16, BO + lxvd2x vs38, o32, BO + lxvd2x vs39, o48, BO + addi BO, BO, 64 + + xxspltd vs40, vs32, 0 + xxspltd vs41, vs32, 1 + xxspltd vs42, vs33, 0 + xxspltd vs43, vs33, 1 + xxspltd vs44, vs34, 0 + xxspltd vs45, vs34, 1 + xxspltd vs46, vs35, 0 + xxspltd vs47, vs35, 1 + + xxspltd vs48, vs36, 0 + xxspltd vs49, vs36, 1 + xxspltd vs50, vs37, 0 + xxspltd vs51, vs37, 1 + xxspltd vs52, vs38, 0 + xxspltd vs53, vs38, 1 + xxspltd vs54, vs39, 0 + xxspltd vs55, vs39, 1 + + stxvd2x vs40, o0, BBO + stxvd2x vs41, o16, BBO + stxvd2x vs42, o32, BBO + stxvd2x vs43, o48, BBO + addi BBO, BBO, 64 + + stxvd2x vs44, o0, BBO + stxvd2x vs45, o16, BBO + 
stxvd2x vs46, o32, BBO + stxvd2x vs47, o48, BBO + addi BBO, BBO, 64 + + stxvd2x vs48, o0, BBO + stxvd2x vs49, o16, BBO + stxvd2x vs50, o32, BBO + stxvd2x vs51, o48, BBO + addi BBO, BBO, 64 + + stxvd2x vs52, o0, BBO + stxvd2x vs53, o16, BBO + stxvd2x vs54, o32, BBO + stxvd2x vs55, o48, BBO + addi BBO, BBO, 64 + +.endm + + diff --git a/kernel/power/zgemm_tcopy_8_power8.S b/kernel/power/zgemm_tcopy_8_power8.S new file mode 100644 index 000000000..1f3f35419 --- /dev/null +++ b/kernel/power/zgemm_tcopy_8_power8.S @@ -0,0 +1,205 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/22 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#define M r3 +#define N r4 +#define A r5 +#define LDA r6 +#define B r7 + +#define A0 r8 +#define A1 r9 +#define A2 r10 +#define A3 r11 + +#define J r12 + +#define PREA r14 +#define PREB r15 +#define BO r16 +#define B8 r17 +#define B4 r18 +#define B2 r19 +#define B1 r20 +#define NOTUS1 r21 +#define T2 r22 +#define I r23 +#define o16 r24 +#define o32 r25 +#define o48 r26 +#define NOTUS2 r27 +#define M8 r30 +#define T1 r31 + +#define o0 0 + +#include "zgemm_tcopy_macros_8_power8.S" + +#define STACKSIZE 384 + + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + cmpwi cr0, M, 0 + ble- L999 + cmpwi cr0, N, 0 + ble- L999 + + slwi LDA, LDA, 
ZBASE_SHIFT + slwi M8, M, 3 + ZBASE_SHIFT + + li T2, -8 + li PREA, -4 + li PREB, -2 + + and B4, N, T2 + and B2, N, PREA + and B1, N, PREB + + mullw B4, B4, M + mullw B2, B2, M + mullw B1, B1, M + + slwi B4, B4, ZBASE_SHIFT + slwi B2, B2, ZBASE_SHIFT + slwi B1, B1, ZBASE_SHIFT + + add B4, B4, B + add B2, B2, B + add B1, B1, B + + li PREA, 384 + addi PREB, M8, 128 + + li o16, 16 + li o32, 32 + li o48, 48 + +#include "zgemm_tcopy_logic_8_power8.S" + +L999: + + li r3, 0 + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + addi SP, SP, STACKSIZE + + blr + EPILOGUE + + diff --git a/kernel/power/zgemm_tcopy_logic_8_power8.S b/kernel/power/zgemm_tcopy_logic_8_power8.S new file mode 100644 index 000000000..34fd307bd --- /dev/null +++ b/kernel/power/zgemm_tcopy_logic_8_power8.S @@ -0,0 +1,246 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/22 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + + srawi. I, M, 2 + ble ZCOPYT_L2_BEGIN + + +ZCOPYT_L4_BEGIN: + + mr A0, A + add A1, A0, LDA + add A2, A1, LDA + add A3, A2, LDA + add A, A3, LDA + mr B8, B + addi B, B, 64*SIZE + + sradi. J, N, 3 + ble ZCOPYT_L4x4_BEGIN + + mr BO, B8 + + .align 5 + +ZCOPYT_L4x8_LOOP: + + addi T1, PREB, 128 + addi T2, PREB, 256 + dcbt A0, PREA + dcbt A1, PREA + dcbt A2, PREA + dcbt A3, PREA + dcbtst BO, M8 + dcbtst BO, PREB + dcbtst BO, T1 + dcbtst BO, T2 + + COPY_4x8 + + add BO, BO, M8 + + addic. J, J, -1 + bgt ZCOPYT_L4x8_LOOP + +ZCOPYT_L4x4_BEGIN: + + andi. T1, N, 4 + ble ZCOPYT_L4x2_BEGIN + + mr BO, B4 + + COPY_4x4 + + + addi B4, B4, 32*SIZE + +ZCOPYT_L4x2_BEGIN: + + andi. T1, N, 2 + ble ZCOPYT_L4x1_BEGIN + + mr BO, B2 + + COPY_4x2 + + + addi B2, B2, 16*SIZE + +ZCOPYT_L4x1_BEGIN: + + andi. 
T1, N, 1 + ble ZCOPYT_L4_END + + mr BO, B1 + + COPY_4x1 + + + addi B1, B1, 8*SIZE + +ZCOPYT_L4_END: + + addic. I, I, -1 + bgt ZCOPYT_L4_BEGIN + + + +ZCOPYT_L2_BEGIN: + + andi. T1, M, 2 + ble ZCOPYT_L1_BEGIN + + mr A0, A + add A1, A0, LDA + add A, A1, LDA + mr B8, B + addi B, B, 32*SIZE + + sradi. J, N, 3 + ble ZCOPYT_L2x4_BEGIN + + mr BO, B8 + +ZCOPYT_L2x8_LOOP: + + COPY_2x8 + + add BO, BO, M8 + + addic. J, J, -1 + bgt ZCOPYT_L2x8_LOOP + +ZCOPYT_L2x4_BEGIN: + + andi. T1, N, 4 + ble ZCOPYT_L2x2_BEGIN + + mr BO, B4 + + COPY_2x4 + + + addi B4, B4, 16*SIZE + +ZCOPYT_L2x2_BEGIN: + + andi. T1, N, 2 + ble ZCOPYT_L2x1_BEGIN + + mr BO, B2 + + COPY_2x2 + + + addi B2, B2, 8*SIZE + +ZCOPYT_L2x1_BEGIN: + + andi. T1, N, 1 + ble ZCOPYT_L2_END + + mr BO, B1 + + COPY_2x1 + + + addi B1, B1, 4*SIZE + +ZCOPYT_L2_END: + + +ZCOPYT_L1_BEGIN: + + andi. T1, M, 1 + ble L999 + + mr A0, A + add A, A0, LDA + mr B8, B + addi B, B, 16*SIZE + + sradi. J, N, 3 + ble ZCOPYT_L1x4_BEGIN + + mr BO, B8 + +ZCOPYT_L1x8_LOOP: + + COPY_1x8 + + add BO, BO, M8 + + addic. J, J, -1 + bgt ZCOPYT_L1x8_LOOP + +ZCOPYT_L1x4_BEGIN: + + andi. T1, N, 4 + ble ZCOPYT_L1x2_BEGIN + + mr BO, B4 + + COPY_1x4 + + + addi B4, B4, 8*SIZE + +ZCOPYT_L1x2_BEGIN: + + andi. T1, N, 2 + ble ZCOPYT_L1x1_BEGIN + + mr BO, B2 + + COPY_1x2 + + + addi B2, B2, 4*SIZE + +ZCOPYT_L1x1_BEGIN: + + andi. T1, N, 1 + ble ZCOPYT_L1_END + + mr BO, B1 + + COPY_1x1 + + + addi B1, B1, 2*SIZE + +ZCOPYT_L1_END: + diff --git a/kernel/power/zgemm_tcopy_macros_8_power8.S b/kernel/power/zgemm_tcopy_macros_8_power8.S new file mode 100644 index 000000000..e8c2f0baa --- /dev/null +++ b/kernel/power/zgemm_tcopy_macros_8_power8.S @@ -0,0 +1,535 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/04/22 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + +/********************************************************************************************** +* COPY_4x8: copy 128 bytes from each of A0..A3 to 512 consecutive bytes at BO (A0..A3 advance) +**********************************************************************************************/ + +.macro COPY_4x8 + + lxvd2x vs32, o0, A0 + lxvd2x vs33, o16, A0 + lxvd2x vs34, o32, A0 + lxvd2x vs35, o48, A0 + addi A0, A0, 64 + + lxvd2x vs36, o0, A0 + lxvd2x vs37, o16, A0 + lxvd2x vs38, o32, A0 + lxvd2x vs39, o48, A0 + addi A0, A0, 64 + + + lxvd2x vs40, o0, A1 + lxvd2x vs41, o16, A1 + lxvd2x vs42, o32, A1 + lxvd2x vs43, o48, A1 + addi A1, A1, 64 + + lxvd2x vs44, o0, A1 + lxvd2x vs45, o16, A1 + lxvd2x vs46, o32, A1 + lxvd2x vs47, o48, A1 + addi A1, A1, 64 + + + lxvd2x vs48, o0, A2 + lxvd2x vs49, o16, A2 + lxvd2x vs50, o32, A2 + lxvd2x vs51, o48, A2 + addi A2, A2, 64 + + lxvd2x vs52, o0, A2 + lxvd2x vs53, o16, A2 + lxvd2x vs54, o32, A2 + lxvd2x vs55, o48, A2 + addi A2, A2, 64 + + + lxvd2x vs56, o0, A3 + lxvd2x vs57, o16, A3 + lxvd2x vs58, o32, A3 + lxvd2x vs59, o48, A3 + addi A3, A3, 64 + + lxvd2x vs60, o0, A3 + lxvd2x vs61, o16, A3 + lxvd2x vs62, o32, A3 + lxvd2x vs63, o48, A3 + addi A3, A3, 64 + + + mr T1, BO + + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 + addi T1, T1, 64 + + stxvd2x vs36, o0, T1 + stxvd2x vs37, o16, T1 + stxvd2x vs38, o32, T1 + stxvd2x vs39, o48, T1 + + addi T1, T1, 64 + + stxvd2x vs40, o0, T1 + stxvd2x vs41, o16, T1 + stxvd2x vs42, o32, T1 + stxvd2x vs43, o48, T1 + addi T1, T1, 64 + + stxvd2x vs44, o0, T1 + stxvd2x vs45, o16, T1 + stxvd2x vs46, o32, T1 + stxvd2x vs47, o48, T1 + + addi T1, T1, 64 + + 
stxvd2x vs48, o0, T1 + stxvd2x vs49, o16, T1 + stxvd2x vs50, o32, T1 + stxvd2x vs51, o48, T1 + addi T1, T1, 64 + + stxvd2x vs52, o0, T1 + stxvd2x vs53, o16, T1 + stxvd2x vs54, o32, T1 + stxvd2x vs55, o48, T1 + + addi T1, T1, 64 + + stxvd2x vs56, o0, T1 + stxvd2x vs57, o16, T1 + stxvd2x vs58, o32, T1 + stxvd2x vs59, o48, T1 + addi T1, T1, 64 + + stxvd2x vs60, o0, T1 + stxvd2x vs61, o16, T1 + stxvd2x vs62, o32, T1 + stxvd2x vs63, o48, T1 + +.endm + + +/********************************************************************************************** +* COPY_4x4: copy 64 bytes from each of A0..A3 to 256 consecutive bytes at BO (A0..A3 advance) +**********************************************************************************************/ + +.macro COPY_4x4 + + lxvd2x vs32, o0, A0 + lxvd2x vs33, o16, A0 + lxvd2x vs34, o32, A0 + lxvd2x vs35, o48, A0 + addi A0, A0, 64 + + + lxvd2x vs36, o0, A1 + lxvd2x vs37, o16, A1 + lxvd2x vs38, o32, A1 + lxvd2x vs39, o48, A1 + addi A1, A1, 64 + + + lxvd2x vs40, o0, A2 + lxvd2x vs41, o16, A2 + lxvd2x vs42, o32, A2 + lxvd2x vs43, o48, A2 + addi A2, A2, 64 + + + lxvd2x vs44, o0, A3 + lxvd2x vs45, o16, A3 + lxvd2x vs46, o32, A3 + lxvd2x vs47, o48, A3 + addi A3, A3, 64 + + + mr T1, BO + + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 + + addi T1, T1, 64 + + stxvd2x vs36, o0, T1 + stxvd2x vs37, o16, T1 + stxvd2x vs38, o32, T1 + stxvd2x vs39, o48, T1 + + addi T1, T1, 64 + + stxvd2x vs40, o0, T1 + stxvd2x vs41, o16, T1 + stxvd2x vs42, o32, T1 + stxvd2x vs43, o48, T1 + + addi T1, T1, 64 + + stxvd2x vs44, o0, T1 + stxvd2x vs45, o16, T1 + stxvd2x vs46, o32, T1 + stxvd2x vs47, o48, T1 + +.endm + + +/********************************************************************************************** +* COPY_4x2: copy 32 bytes from each of A0..A3 to 128 consecutive bytes at BO (A0..A3 advance) +**********************************************************************************************/ + +.macro COPY_4x2 + + lxvd2x vs32, o0, A0 + lxvd2x vs33, o16, A0 + addi A0, A0, 32 + + + lxvd2x vs34, o0, A1 + lxvd2x vs35, o16, A1 + addi A1, A1, 32 + + + 
lxvd2x vs36, o0, A2 + lxvd2x vs37, o16, A2 + addi A2, A2, 32 + + + lxvd2x vs38, o0, A3 + lxvd2x vs39, o16, A3 + addi A3, A3, 32 + + + mr T1, BO + + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 + + addi T1, T1, 64 + + stxvd2x vs36, o0, T1 + stxvd2x vs37, o16, T1 + + stxvd2x vs38, o32, T1 + stxvd2x vs39, o48, T1 + +.endm + + +/********************************************************************************************** +* COPY_4x1: copy 16 bytes from each of A0..A3 to 64 consecutive bytes at BO (A0..A3 advance) +**********************************************************************************************/ + +.macro COPY_4x1 + + lxvd2x vs32, o0, A0 + addi A0, A0, 16 + + + lxvd2x vs33, o0, A1 + addi A1, A1, 16 + + + lxvd2x vs34, o0, A2 + addi A2, A2, 16 + + + lxvd2x vs35, o0, A3 + addi A3, A3, 16 + + + mr T1, BO + + stxvd2x vs32, o0, T1 + + stxvd2x vs33, o16, T1 + + stxvd2x vs34, o32, T1 + + stxvd2x vs35, o48, T1 + +.endm + + +/********************************************************************************************** +* COPY_2x8: copy 128 bytes from each of A0, A1 to 256 consecutive bytes at BO (A0, A1 advance) +**********************************************************************************************/ + +.macro COPY_2x8 + + lxvd2x vs32, o0, A0 + lxvd2x vs33, o16, A0 + lxvd2x vs34, o32, A0 + lxvd2x vs35, o48, A0 + addi A0, A0, 64 + + lxvd2x vs36, o0, A0 + lxvd2x vs37, o16, A0 + lxvd2x vs38, o32, A0 + lxvd2x vs39, o48, A0 + addi A0, A0, 64 + + + lxvd2x vs40, o0, A1 + lxvd2x vs41, o16, A1 + lxvd2x vs42, o32, A1 + lxvd2x vs43, o48, A1 + addi A1, A1, 64 + + lxvd2x vs44, o0, A1 + lxvd2x vs45, o16, A1 + lxvd2x vs46, o32, A1 + lxvd2x vs47, o48, A1 + addi A1, A1, 64 + + + mr T1, BO + + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 + addi T1, T1, 64 + + stxvd2x vs36, o0, T1 + stxvd2x vs37, o16, T1 + stxvd2x vs38, o32, T1 + stxvd2x vs39, o48, T1 + + addi T1, T1, 64 + + stxvd2x vs40, o0, T1 + stxvd2x vs41, o16, T1 + stxvd2x vs42, o32, T1 + stxvd2x vs43, o48, T1 + addi T1, T1, 64 + + stxvd2x vs44, o0, T1 + 
stxvd2x vs45, o16, T1 + stxvd2x vs46, o32, T1 + stxvd2x vs47, o48, T1 + +.endm + + +/********************************************************************************************** +* COPY_2x4: copy 64 bytes from each of A0, A1 to 128 consecutive bytes at BO (A0, A1 advance) +**********************************************************************************************/ + +.macro COPY_2x4 + + lxvd2x vs32, o0, A0 + lxvd2x vs33, o16, A0 + lxvd2x vs34, o32, A0 + lxvd2x vs35, o48, A0 + addi A0, A0, 64 + + + lxvd2x vs36, o0, A1 + lxvd2x vs37, o16, A1 + lxvd2x vs38, o32, A1 + lxvd2x vs39, o48, A1 + addi A1, A1, 64 + + + mr T1, BO + + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 + + addi T1, T1, 64 + + stxvd2x vs36, o0, T1 + stxvd2x vs37, o16, T1 + stxvd2x vs38, o32, T1 + stxvd2x vs39, o48, T1 + +.endm + + +/********************************************************************************************** +* COPY_2x2: copy 32 bytes from each of A0, A1 to 64 consecutive bytes at BO (A0, A1 advance) +**********************************************************************************************/ + +.macro COPY_2x2 + + lxvd2x vs32, o0, A0 + lxvd2x vs33, o16, A0 + addi A0, A0, 32 + + + lxvd2x vs34, o0, A1 + lxvd2x vs35, o16, A1 + addi A1, A1, 32 + + + mr T1, BO + + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 + +.endm + + +/********************************************************************************************** +* COPY_2x1: copy 16 bytes from each of A0, A1 to 32 consecutive bytes at BO (A0, A1 advance) +**********************************************************************************************/ + +.macro COPY_2x1 + + lxvd2x vs32, o0, A0 + addi A0, A0, 16 + + + lxvd2x vs33, o0, A1 + addi A1, A1, 16 + + + mr T1, BO + + stxvd2x vs32, o0, T1 + + stxvd2x vs33, o16, T1 + +.endm + + +/********************************************************************************************** +* COPY_1x8: copy 128 bytes from A0 to BO (A0 advances, BO is left unchanged) +**********************************************************************************************/ + +.macro COPY_1x8 + + lxvd2x vs32, o0, A0 + lxvd2x vs33, o16, A0 + lxvd2x vs34, o32, 
A0 + lxvd2x vs35, o48, A0 + addi A0, A0, 64 + + lxvd2x vs36, o0, A0 + lxvd2x vs37, o16, A0 + lxvd2x vs38, o32, A0 + lxvd2x vs39, o48, A0 + addi A0, A0, 64 + + + mr T1, BO + + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 + addi T1, T1, 64 + + stxvd2x vs36, o0, T1 + stxvd2x vs37, o16, T1 + stxvd2x vs38, o32, T1 + stxvd2x vs39, o48, T1 + +.endm + + +/********************************************************************************************** +* COPY_1x4: copy 64 bytes from A0 to BO (A0 advances, BO is left unchanged) +**********************************************************************************************/ + +.macro COPY_1x4 + + lxvd2x vs32, o0, A0 + lxvd2x vs33, o16, A0 + lxvd2x vs34, o32, A0 + lxvd2x vs35, o48, A0 + addi A0, A0, 64 + + + mr T1, BO + + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 + +.endm + + +/********************************************************************************************** +* COPY_1x2: copy 32 bytes from A0 to BO (A0 advances, BO is left unchanged) +**********************************************************************************************/ + +.macro COPY_1x2 + + lxvd2x vs32, o0, A0 + lxvd2x vs33, o16, A0 + addi A0, A0, 32 + + + mr T1, BO + + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + +.endm + + +/********************************************************************************************** +* COPY_1x1: copy 16 bytes from A0 to BO (A0 advances, BO is left unchanged) +**********************************************************************************************/ + +.macro COPY_1x1 + + lxvd2x vs32, o0, A0 + addi A0, A0, 16 + + + mr T1, BO + + stxvd2x vs32, o0, T1 + +.endm + diff --git a/param.h b/param.h index 8ecc812dc..0a9f02fde 100644 --- a/param.h +++ b/param.h @@ -1985,7 +1985,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define SGEMM_DEFAULT_Q 720 #define DGEMM_DEFAULT_Q 720 #define CGEMM_DEFAULT_Q 720 -#define ZGEMM_DEFAULT_Q 720 +#define ZGEMM_DEFAULT_Q 360 #define SYMV_P 8 From d46f07bb4e8281c4b213d2cf15163c5d0765ee9b Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Sat, 23 Apr 2016 07:37:18 +0200 Subject: [PATCH 11/70] added cgemm_tcopy_8_power8.S --- kernel/power/KERNEL.POWER8 | 2 +- kernel/power/cgemm_tcopy_8_power8.S | 206 +++++++++++ kernel/power/cgemm_tcopy_logic_8_power8.S | 247 +++++++++++++ kernel/power/cgemm_tcopy_macros_8_power8.S | 385 +++++++++++++++++++++ 4 files changed, 839 insertions(+), 1 deletion(-) create mode 100644 kernel/power/cgemm_tcopy_8_power8.S create mode 100644 kernel/power/cgemm_tcopy_logic_8_power8.S create mode 100644 kernel/power/cgemm_tcopy_macros_8_power8.S diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index c7df0e039..9406e7793 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -30,7 +30,7 @@ DGEMMOTCOPYOBJ = dgemm_otcopy.o CGEMMKERNEL = cgemm_kernel_8x4_power8.S CGEMMINCOPY = ../generic/zgemm_ncopy_8.c -CGEMMITCOPY = ../generic/zgemm_tcopy_8.c +CGEMMITCOPY = cgemm_tcopy_8_power8.S CGEMMONCOPY = ../generic/zgemm_ncopy_4.c CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c CGEMMONCOPYOBJ = cgemm_oncopy.o diff --git a/kernel/power/cgemm_tcopy_8_power8.S b/kernel/power/cgemm_tcopy_8_power8.S new file mode 100644 index 000000000..b1a7d2b27 --- /dev/null +++ b/kernel/power/cgemm_tcopy_8_power8.S @@ -0,0 +1,206 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/23 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#define M r3 +#define N r4 +#define A r5 +#define LDA r6 +#define B r7 + +#define A0 r8 +#define A1 r9 +#define A2 r10 +#define A3 r11 + +#define J r12 + +#define PREA r14 +#define PREB r15 +#define BO r16 +#define B8 r17 +#define B4 r18 +#define B2 r19 +#define B1 r20 +#define o4 r21 +#define T2 r22 +#define I r23 +#define o16 r24 +#define o32 r25 +#define o48 r26 +#define NOTUS2 r27 +#define M8 r30 +#define T1 r31 + +#define o0 0 + +#include "cgemm_tcopy_macros_8_power8.S" + +#define STACKSIZE 384 + + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE /* reserve the frame; r14..r31 are saved below and restored at L999 */ + li r0, 0 + + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + cmpdi cr0, M, 0 /* doubleword compare: use the full 64-bit register, as sradi. on N in the logic file does */ + ble- L999 + cmpdi cr0, N, 0 + ble- L999 + + sldi LDA, LDA, ZBASE_SHIFT /* LDA scaled by 1 << ZBASE_SHIFT; sldi keeps the upper 32 bits that slwi would clear */ + sldi M8, M, 3 + ZBASE_SHIFT /* M8 = M << (3 + ZBASE_SHIFT) */ + + li T2, -8 + li PREA, -4 + li PREB, -2 + + and B4, N, T2 /* N rounded down to a multiple of 8 */ + and B2, N, PREA /* N rounded down to a multiple of 4 */ + and B1, N, PREB /* N rounded down to a multiple of 2 */ + + mulld B4, B4, M /* 64-bit products so the offsets cannot wrap */ + mulld B2, B2, M + mulld B1, B1, M + + sldi B4, B4, ZBASE_SHIFT + sldi B2, B2, ZBASE_SHIFT + sldi B1, B1, ZBASE_SHIFT + + add B4, B4, B /* B4/B2/B1 = B + scaled offset: destinations used by the N&4, N&2, N&1 tails */ + add B2, B2, B + add B1, B1, B + + li PREA, 384 /* T2-free reuse: PREA/PREB now hold prefetch offsets for dcbt/dcbtst */ + addi PREB, M8, 128 + + li o4, 4 + li o16, 16 + li o32, 32 + li o48, 48 + +#include "cgemm_tcopy_logic_8_power8.S" + +L999: + + li r3, 0 /* return value 0 */ + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + addi SP, SP, STACKSIZE + + blr + EPILOGUE + + 
diff --git a/kernel/power/cgemm_tcopy_logic_8_power8.S b/kernel/power/cgemm_tcopy_logic_8_power8.S new file mode 100644 index 000000000..9418908b7 --- /dev/null +++ b/kernel/power/cgemm_tcopy_logic_8_power8.S @@ -0,0 +1,247 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/04/23 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + + sradi. I, M, 2 /* I = M / 4 groups of four source rows; doubleword shift like the N shifts below */ + ble CCOPYT_L2_BEGIN + + +CCOPYT_L4_BEGIN: + + mr A0, A /* A0..A3 = four consecutive rows, LDA bytes apart */ + add A1, A0, LDA + add A2, A1, LDA + add A3, A2, LDA + add A, A3, LDA + mr B8, B + addi B, B, 64*SIZE /* the next 4-row group starts 64*SIZE bytes further in B */ + + sradi. J, N, 3 /* J = N / 8 full 8-wide blocks */ + ble CCOPYT_L4x4_BEGIN + + mr BO, B8 + +CCOPYT_L4x8_LOOP: + + dcbt A0, PREA + dcbt A1, PREA + dcbt A2, PREA + dcbt A3, PREA + dcbtst BO, M8 + dcbtst BO, PREB + COPY_4x8 + + add BO, BO, M8 /* consecutive 8-wide blocks are M8 bytes apart in the destination */ + + addic. J, J, -1 + ble CCOPYT_L4x4_BEGIN + + + COPY_4x8 + + add BO, BO, M8 + + addic. J, J, -1 + bgt CCOPYT_L4x8_LOOP + +CCOPYT_L4x4_BEGIN: + + andi. T1, N, 4 /* andi. result is never negative, so ble branches exactly when the bit is clear */ + ble CCOPYT_L4x2_BEGIN + + mr BO, B4 + + COPY_4x4 + + + addi B4, B4, 32*SIZE + +CCOPYT_L4x2_BEGIN: + + andi. T1, N, 2 + ble CCOPYT_L4x1_BEGIN + + mr BO, B2 + + COPY_4x2 + + + addi B2, B2, 16*SIZE + +CCOPYT_L4x1_BEGIN: + + andi. T1, N, 1 + ble CCOPYT_L4_END + + mr BO, B1 + + COPY_4x1 + + + addi B1, B1, 8*SIZE + +CCOPYT_L4_END: + + addic. I, I, -1 + bgt CCOPYT_L4_BEGIN + + + +CCOPYT_L2_BEGIN: + + andi. T1, M, 2 /* two leftover rows? */ + ble CCOPYT_L1_BEGIN + + mr A0, A + add A1, A0, LDA + add A, A1, LDA + mr B8, B + addi B, B, 32*SIZE + + sradi. J, N, 3 + ble CCOPYT_L2x4_BEGIN + + mr BO, B8 + +CCOPYT_L2x8_LOOP: + + COPY_2x8 + + add BO, BO, M8 + + addic. J, J, -1 + bgt CCOPYT_L2x8_LOOP + +CCOPYT_L2x4_BEGIN: + + andi. T1, N, 4 + ble CCOPYT_L2x2_BEGIN + + mr BO, B4 + + COPY_2x4 + + + addi B4, B4, 16*SIZE + +CCOPYT_L2x2_BEGIN: + + andi. T1, N, 2 + ble CCOPYT_L2x1_BEGIN + + mr BO, B2 + + COPY_2x2 + + + addi B2, B2, 8*SIZE + +CCOPYT_L2x1_BEGIN: + + andi. T1, N, 1 + ble CCOPYT_L2_END + + mr BO, B1 + + COPY_2x1 + + + addi B1, B1, 4*SIZE + +CCOPYT_L2_END: + + +CCOPYT_L1_BEGIN: + + andi.
T1, M, 1 /* one leftover row? */ + ble L999 + + mr A0, A + add A, A0, LDA + mr B8, B + addi B, B, 16*SIZE + + sradi. J, N, 3 + ble CCOPYT_L1x4_BEGIN + + mr BO, B8 + +CCOPYT_L1x8_LOOP: + + COPY_1x8 + + add BO, BO, M8 + + addic. J, J, -1 + bgt CCOPYT_L1x8_LOOP + +CCOPYT_L1x4_BEGIN: + + andi. T1, N, 4 + ble CCOPYT_L1x2_BEGIN + + mr BO, B4 + + COPY_1x4 + + + addi B4, B4, 8*SIZE + +CCOPYT_L1x2_BEGIN: + + andi. T1, N, 2 + ble CCOPYT_L1x1_BEGIN + + mr BO, B2 + + COPY_1x2 + + + addi B2, B2, 4*SIZE + +CCOPYT_L1x1_BEGIN: + + andi. T1, N, 1 + ble CCOPYT_L1_END + + mr BO, B1 + + COPY_1x1 + + + addi B1, B1, 2*SIZE + +CCOPYT_L1_END: + diff --git a/kernel/power/cgemm_tcopy_macros_8_power8.S b/kernel/power/cgemm_tcopy_macros_8_power8.S new file mode 100644 index 000000000..03fda2766 --- /dev/null +++ b/kernel/power/cgemm_tcopy_macros_8_power8.S @@ -0,0 +1,385 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED.
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/23 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + +/********************************************************************************************** +* COPY_4x8: copy 64 bytes from each of A0..A3 to 256 consecutive bytes at BO (A0..A3 advance) +**********************************************************************************************/ + +.macro COPY_4x8 + + lxvw4x vs32, o0, A0 + lxvw4x vs33, o16, A0 + lxvw4x vs34, o32, A0 + lxvw4x vs35, o48, A0 + + lxvw4x vs36, o0, A1 + lxvw4x vs37, o16, A1 + lxvw4x vs38, o32, A1 + lxvw4x vs39, o48, A1 + + addi A0, A0, 64 + addi A1, A1, 64 + + lxvw4x vs40, o0, A2 + lxvw4x vs41, o16, A2 + lxvw4x vs42, o32, A2 + lxvw4x vs43, o48, A2 + + lxvw4x vs44, o0, A3 + lxvw4x vs45, o16, A3 + lxvw4x vs46, o32, A3 + lxvw4x vs47, o48, A3 + + mr T1, BO + addi A2, A2, 64 + addi A3, A3, 64 + + stxvw4x vs32, o0, T1 + stxvw4x vs33, o16, T1 + stxvw4x vs34, o32, T1 + stxvw4x vs35, o48, T1 + + addi T1, T1, 64 + + stxvw4x vs36, o0, T1 + stxvw4x vs37, o16, T1 + stxvw4x vs38, o32, T1 + stxvw4x vs39, o48, T1 + + addi T1, T1, 64 + + stxvw4x vs40, o0, T1 + stxvw4x vs41, o16, T1 + stxvw4x vs42, o32, T1 + stxvw4x vs43, o48, T1 + + addi T1, T1, 64 + + stxvw4x vs44, o0, T1 + stxvw4x vs45, o16, T1
+ stxvw4x vs46, o32, T1 + stxvw4x vs47, o48, T1 + +.endm + +/********************************************************************************************** +* COPY_4x4: copy 32 bytes from each of A0..A3 to 128 consecutive bytes at BO (A0..A3 advance) +**********************************************************************************************/ + +.macro COPY_4x4 + + lxvw4x vs32, o0, A0 + lxvw4x vs33, o16, A0 + addi A0, A0, 32 + + lxvw4x vs34, o0, A1 + lxvw4x vs35, o16, A1 + addi A1, A1, 32 + + lxvw4x vs36, o0, A2 + lxvw4x vs37, o16, A2 + addi A2, A2, 32 + + lxvw4x vs38, o0, A3 + lxvw4x vs39, o16, A3 + addi A3, A3, 32 + + mr T1, BO + + stxvw4x vs32, o0, T1 + stxvw4x vs33, o16, T1 + + stxvw4x vs34, o32, T1 + stxvw4x vs35, o48, T1 + + addi T1, T1, 64 + + stxvw4x vs36, o0, T1 + stxvw4x vs37, o16, T1 + + stxvw4x vs38, o32, T1 + stxvw4x vs39, o48, T1 + +.endm + +/********************************************************************************************** +* COPY_4x2: copy 16 bytes from each of A0..A3 to 64 consecutive bytes at BO (A0..A3 advance) +**********************************************************************************************/ + +.macro COPY_4x2 + + lxvw4x vs32, o0, A0 + addi A0, A0, 16 + + lxvw4x vs33, o0, A1 + addi A1, A1, 16 + + lxvw4x vs34, o0, A2 + addi A2, A2, 16 + + lxvw4x vs35, o0, A3 + addi A3, A3, 16 + + mr T1, BO + + stxvw4x vs32, o0, T1 + + stxvw4x vs33, o16, T1 + + stxvw4x vs34, o32, T1 + + stxvw4x vs35, o48, T1 + +.endm + +/********************************************************************************************** +* COPY_4x1: copy two 4-byte scalars from each of A0..A3 to 32 consecutive bytes at BO +**********************************************************************************************/ + +.macro COPY_4x1 + + lxsspx vs32, o0, A0 + lxsspx vs33, o4, A0 + addi A0, A0, 8 + + lxsspx vs34, o0, A1 + lxsspx vs35, o4, A1 + addi A1, A1, 8 + + lxsspx vs36, o0, A2 + lxsspx vs37, o4, A2 + addi A2, A2, 8 + + lxsspx vs38, o0, A3 + lxsspx vs39, o4, A3 + addi A3, A3, 8 + + mr T1, BO + + stxsspx vs32, o0, T1 + stxsspx vs33, o4, T1 + + addi T1, T1, 8 + + stxsspx vs34, o0, T1 + stxsspx vs35, o4, T1 + + addi T1, T1, 8 + + stxsspx vs36, o0, T1
+ stxsspx vs37, o4, T1 + + addi T1, T1, 8 + + stxsspx vs38, o0, T1 + stxsspx vs39, o4, T1 + +.endm + +/********************************************************************************************** +* COPY_2x8: copy 64 bytes from each of A0, A1 to 128 consecutive bytes at BO (A0, A1 advance) +**********************************************************************************************/ + +.macro COPY_2x8 + + lxvw4x vs32, o0, A0 + lxvw4x vs33, o16, A0 + lxvw4x vs34, o32, A0 + lxvw4x vs35, o48, A0 + addi A0, A0, 64 + + lxvw4x vs36, o0, A1 + lxvw4x vs37, o16, A1 + lxvw4x vs38, o32, A1 + lxvw4x vs39, o48, A1 + addi A1, A1, 64 + + mr T1, BO + + stxvw4x vs32, o0, T1 + stxvw4x vs33, o16, T1 + stxvw4x vs34, o32, T1 + stxvw4x vs35, o48, T1 + + addi T1, T1, 64 + + stxvw4x vs36, o0, T1 + stxvw4x vs37, o16, T1 + stxvw4x vs38, o32, T1 + stxvw4x vs39, o48, T1 + +.endm + +/********************************************************************************************** +* COPY_2x4: copy 32 bytes from each of A0, A1 to 64 consecutive bytes at BO (A0, A1 advance) +**********************************************************************************************/ + +.macro COPY_2x4 + + lxvw4x vs32, o0, A0 + lxvw4x vs33, o16, A0 + addi A0, A0, 32 + + lxvw4x vs34, o0, A1 + lxvw4x vs35, o16, A1 + addi A1, A1, 32 + + mr T1, BO + + stxvw4x vs32, o0, T1 + stxvw4x vs33, o16, T1 + + stxvw4x vs34, o32, T1 + stxvw4x vs35, o48, T1 + +.endm + +/********************************************************************************************** +* COPY_2x2: copy 16 bytes from each of A0, A1 to 32 consecutive bytes at BO (A0, A1 advance) +**********************************************************************************************/ + +.macro COPY_2x2 + + lxvw4x vs32, o0, A0 + addi A0, A0, 16 + + lxvw4x vs33, o0, A1 + addi A1, A1, 16 + + mr T1, BO + + stxvw4x vs32, o0, T1 + + stxvw4x vs33, o16, T1 + +.endm + +/********************************************************************************************** +* COPY_2x1: copy two 4-byte scalars from each of A0, A1 to 16 consecutive bytes at BO +**********************************************************************************************/ + +.macro COPY_2x1 + + lxsspx vs32, o0, A0 + lxsspx vs33, o4, A0 + addi A0, A0, 8 + +
lxsspx vs34, o0, A1 + lxsspx vs35, o4, A1 + addi A1, A1, 8 + + mr T1, BO + + stxsspx vs32, o0, T1 + stxsspx vs33, o4, T1 + + addi T1, T1, 8 + + stxsspx vs34, o0, T1 + stxsspx vs35, o4, T1 + +.endm + +/********************************************************************************************** +* COPY_1x8: copy 64 bytes from A0 to BO (A0 advances, BO is left unchanged) +**********************************************************************************************/ + +.macro COPY_1x8 + + lxvw4x vs32, o0, A0 + lxvw4x vs33, o16, A0 + lxvw4x vs34, o32, A0 + lxvw4x vs35, o48, A0 + addi A0, A0, 64 + + mr T1, BO + + stxvw4x vs32, o0, T1 + stxvw4x vs33, o16, T1 + stxvw4x vs34, o32, T1 + stxvw4x vs35, o48, T1 + +.endm + +/********************************************************************************************** +* COPY_1x4: copy 32 bytes from A0 to BO (A0 advances, BO is left unchanged) +**********************************************************************************************/ + +.macro COPY_1x4 + + lxvw4x vs32, o0, A0 + lxvw4x vs33, o16, A0 + addi A0, A0, 32 + + mr T1, BO + + stxvw4x vs32, o0, T1 + stxvw4x vs33, o16, T1 + +.endm + +/********************************************************************************************** +* COPY_1x2: copy 16 bytes from A0 to BO (A0 advances, BO is left unchanged) +**********************************************************************************************/ + +.macro COPY_1x2 + + lxvw4x vs32, o0, A0 + addi A0, A0, 16 + + mr T1, BO + + stxvw4x vs32, o0, T1 + +.endm + +/********************************************************************************************** +* COPY_1x1: copy two 4-byte scalars (8 bytes) from A0 to BO (A0 advances) +**********************************************************************************************/ + +.macro COPY_1x1 + + lxsspx vs32, o0, A0 + lxsspx vs33, o4, A0 + addi A0, A0, 8 + + mr T1, BO + + stxsspx vs32, o0, T1 + stxsspx vs33, o4, T1 + +.endm + From a3da10662ffa8697e33151c2ab5665c1e6954da2 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Sat, 23 Apr 2016 10:04:41 +0200 Subject: [PATCH 12/70] added sgemm_tcopy_8_power8.S --- kernel/power/KERNEL.POWER8 | 2 +-
kernel/power/sgemm_tcopy_8_power8.S | 207 ++++++++++++++ kernel/power/sgemm_tcopy_logic_8_power8.S | 299 ++++++++++++++++++++ kernel/power/sgemm_tcopy_macros_8_power8.S | 308 +++++++++++++++++++++ 4 files changed, 815 insertions(+), 1 deletion(-) create mode 100644 kernel/power/sgemm_tcopy_8_power8.S create mode 100644 kernel/power/sgemm_tcopy_logic_8_power8.S create mode 100644 kernel/power/sgemm_tcopy_macros_8_power8.S diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index 9406e7793..0b6a7f3b8 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -12,7 +12,7 @@ SGEMMKERNEL = sgemm_kernel_16x8_power8.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = sgemm_tcopy_16_power8.S SGEMMONCOPY = ../generic/gemm_ncopy_8.c -SGEMMOTCOPY = ../generic/gemm_tcopy_8.c +SGEMMOTCOPY = sgemm_tcopy_8_power8.S SGEMMINCOPYOBJ = sgemm_incopy.o SGEMMITCOPYOBJ = sgemm_itcopy.o SGEMMONCOPYOBJ = sgemm_oncopy.o diff --git a/kernel/power/sgemm_tcopy_8_power8.S b/kernel/power/sgemm_tcopy_8_power8.S new file mode 100644 index 000000000..2bbd6e696 --- /dev/null +++ b/kernel/power/sgemm_tcopy_8_power8.S @@ -0,0 +1,207 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/23 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#define M r3 +#define N r4 +#define A r5 +#define LDA r6 +#define B r7 + +#define A0 r8 +#define A1 r9 +#define A2 r10 +#define A3 r11 + +#define J r12 + +#define PREA r14 +#define PREB r15 +#define BO r16 +#define B8 r17 +#define B4 r18 +#define B2 r19 +#define B1 r20 +#define o4 r21 +#define T2 r22 +#define I r23 +#define o16 r24 +#define o32 r25 +#define o48 r26 +#define NOTU1 r29 +#define M8 r30 +#define T1 r31 + +#define o0 0 + +#include "sgemm_tcopy_macros_8_power8.S" + +#define STACKSIZE 384 + + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + cmpwi cr0, M, 0 + ble- L999 + cmpwi cr0, N, 0 + ble- L999 + + slwi LDA, LDA, BASE_SHIFT + slwi M8, M, 3 + BASE_SHIFT + + li T2, -8 + li PREA, -4 + li PREB, -2 + + and B4, N, T2 + and B2, N, PREA + and B1, N, PREB + + mullw B4, B4, M + mullw B2, B2, M + mullw B1, B1, M + + slwi B4, B4, BASE_SHIFT + slwi B2, B2, BASE_SHIFT + slwi B1, B1, BASE_SHIFT + + add B4, B4, B + add B2, B2, B + add B1, B1, B + + li PREA, 384 + addi PREB, M8, 128 + + li o4, 4 + li o16, 16 + li o32, 32 + li o48, 48 + +#include "sgemm_tcopy_logic_8_power8.S" + +L999: + + li r3, 0 + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + addi SP, SP, STACKSIZE + + blr + EPILOGUE + + diff 
--git a/kernel/power/sgemm_tcopy_logic_8_power8.S b/kernel/power/sgemm_tcopy_logic_8_power8.S new file mode 100644 index 000000000..4cf74baa3 --- /dev/null +++ b/kernel/power/sgemm_tcopy_logic_8_power8.S @@ -0,0 +1,299 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/04/23 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + + srawi. I, M, 2 + ble SCOPYOT_L2_BEGIN + + +SCOPYOT_L4_BEGIN: + + mr A0, A + add A1, A0, LDA + add A2, A1, LDA + add A3, A2, LDA + add A, A3, LDA + mr B8, B + addi B, B, 32*SIZE + + sradi. J, N, 3 + ble SCOPYOT_L4x4_BEGIN + + mr BO, B8 + .align 5 + +SCOPYOT_L4x8_LOOP: + + dcbt A0, PREA + dcbt A1, PREA + dcbt A2, PREA + dcbt A3, PREA + COPY_4x8 + + addi A0, A0, 8*SIZE + addi A1, A1, 8*SIZE + addi A2, A2, 8*SIZE + addi A3, A3, 8*SIZE + add BO, BO, M8 + + addic. J, J, -1 + ble SCOPYOT_L4x4_BEGIN + + COPY_4x8 + + addi A0, A0, 8*SIZE + addi A1, A1, 8*SIZE + addi A2, A2, 8*SIZE + addi A3, A3, 8*SIZE + add BO, BO, M8 + + addic. J, J, -1 + ble SCOPYOT_L4x4_BEGIN + + COPY_4x8 + + addi A0, A0, 8*SIZE + addi A1, A1, 8*SIZE + addi A2, A2, 8*SIZE + addi A3, A3, 8*SIZE + add BO, BO, M8 + + addic. J, J, -1 + ble SCOPYOT_L4x4_BEGIN + + COPY_4x8 + + addi A0, A0, 8*SIZE + addi A1, A1, 8*SIZE + addi A2, A2, 8*SIZE + addi A3, A3, 8*SIZE + add BO, BO, M8 + + addic. J, J, -1 + bgt SCOPYOT_L4x8_LOOP + +SCOPYOT_L4x4_BEGIN: + + andi. T1, N, 4 + ble SCOPYOT_L4x2_BEGIN + + mr BO, B4 + + COPY_4x4 + + addi A0, A0, 4*SIZE + addi A1, A1, 4*SIZE + addi A2, A2, 4*SIZE + addi A3, A3, 4*SIZE + + addi B4, B4, 16*SIZE + +SCOPYOT_L4x2_BEGIN: + + andi. T1, N, 2 + ble SCOPYOT_L4x1_BEGIN + + mr BO, B2 + + COPY_4x2 + + addi A0, A0, 2*SIZE + addi A1, A1, 2*SIZE + addi A2, A2, 2*SIZE + addi A3, A3, 2*SIZE + + addi B2, B2, 8*SIZE + +SCOPYOT_L4x1_BEGIN: + + andi. 
T1, N, 1 + ble SCOPYOT_L4_END + + mr BO, B1 + + COPY_4x1 + + addi A0, A0, 1*SIZE + addi A1, A1, 1*SIZE + addi A2, A2, 1*SIZE + addi A3, A3, 1*SIZE + + addi B1, B1, 4*SIZE + +SCOPYOT_L4_END: + + addic. I, I, -1 + bgt SCOPYOT_L4_BEGIN + + + +SCOPYOT_L2_BEGIN: + + andi. T1, M, 2 + ble SCOPYOT_L1_BEGIN + + mr A0, A + add A1, A0, LDA + add A, A1, LDA + mr B8, B + addi B, B, 16*SIZE + + sradi. J, N, 3 + ble SCOPYOT_L2x4_BEGIN + + mr BO, B8 + +SCOPYOT_L2x8_LOOP: + + COPY_2x8 + + addi A0, A0, 8*SIZE + addi A1, A1, 8*SIZE + add BO, BO, M8 + + addic. J, J, -1 + bgt SCOPYOT_L2x8_LOOP + +SCOPYOT_L2x4_BEGIN: + + andi. T1, N, 4 + ble SCOPYOT_L2x2_BEGIN + + mr BO, B4 + + COPY_2x4 + + addi A0, A0, 4*SIZE + addi A1, A1, 4*SIZE + + addi B4, B4, 8*SIZE + +SCOPYOT_L2x2_BEGIN: + + andi. T1, N, 2 + ble SCOPYOT_L2x1_BEGIN + + mr BO, B2 + + COPY_2x2 + + addi A0, A0, 2*SIZE + addi A1, A1, 2*SIZE + + addi B2, B2, 4*SIZE + +SCOPYOT_L2x1_BEGIN: + + andi. T1, N, 1 + ble SCOPYOT_L2_END + + mr BO, B1 + + COPY_2x1 + + addi A0, A0, 1*SIZE + addi A1, A1, 1*SIZE + + addi B1, B1, 2*SIZE + +SCOPYOT_L2_END: + + +SCOPYOT_L1_BEGIN: + + andi. T1, M, 1 + ble L999 + + mr A0, A + add A, A0, LDA + mr B8, B + addi B, B, 8*SIZE + + sradi. J, N, 3 + ble SCOPYOT_L1x4_BEGIN + + mr BO, B8 + +SCOPYOT_L1x8_LOOP: + + COPY_1x8 + + addi A0, A0, 8*SIZE + add BO, BO, M8 + + addic. J, J, -1 + bgt SCOPYOT_L1x8_LOOP + +SCOPYOT_L1x4_BEGIN: + + andi. T1, N, 4 + ble SCOPYOT_L1x2_BEGIN + + mr BO, B4 + + COPY_1x4 + + addi A0, A0, 4*SIZE + + addi B4, B4, 4*SIZE + +SCOPYOT_L1x2_BEGIN: + + andi. T1, N, 2 + ble SCOPYOT_L1x1_BEGIN + + mr BO, B2 + + COPY_1x2 + + addi A0, A0, 2*SIZE + + addi B2, B2, 2*SIZE + +SCOPYOT_L1x1_BEGIN: + + andi. 
T1, N, 1 + ble SCOPYOT_L1_END + + mr BO, B1 + + COPY_1x1 + + addi A0, A0, 1*SIZE + + addi B1, B1, 1*SIZE + +SCOPYOT_L1_END: + diff --git a/kernel/power/sgemm_tcopy_macros_8_power8.S b/kernel/power/sgemm_tcopy_macros_8_power8.S new file mode 100644 index 000000000..1b71d5bb3 --- /dev/null +++ b/kernel/power/sgemm_tcopy_macros_8_power8.S @@ -0,0 +1,308 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/04/23 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + +/********************************************************************************************** +* Macros for N=4 and M=8 +**********************************************************************************************/ + +.macro COPY_4x8 + + lxvw4x vs32, o0, A0 + lxvw4x vs33, o16, A0 + + lxvw4x vs34, o0, A1 + lxvw4x vs35, o16, A1 + + lxvw4x vs36, o0, A2 + lxvw4x vs37, o16, A2 + + lxvw4x vs38, o0, A3 + lxvw4x vs39, o16, A3 + + mr T1, BO + + stxvw4x vs32, o0, T1 + stxvw4x vs33, o16, T1 + + stxvw4x vs34, o32, T1 + stxvw4x vs35, o48, T1 + + addi T1, T1, 64 + + stxvw4x vs36, o0, T1 + stxvw4x vs37, o16, T1 + + stxvw4x vs38, o32, T1 + stxvw4x vs39, o48, T1 + +.endm + +/********************************************************************************************** +* Macros for N=4 and M=4 +**********************************************************************************************/ + +.macro COPY_4x4 + + lxvw4x vs32, o0, A0 + + lxvw4x vs33, o0, A1 + + lxvw4x vs34, o0, A2 + + lxvw4x vs35, o0, A3 + + mr T1, BO + + stxvw4x vs32, o0, T1 + + stxvw4x vs33, o16, T1 + + stxvw4x vs34, o32, T1 + + stxvw4x vs35, o48, T1 + +.endm + +/********************************************************************************************** +* Macros for N=4 and M=2 +**********************************************************************************************/ + +.macro COPY_4x2 + + lxsspx vs32, o0, A0 + lxsspx vs33, o4, A0 + + lxsspx vs34, o0, A1 + lxsspx vs35, o4, A1 + + lxsspx vs36, o0, A2 + lxsspx vs37, o4, A2 + + lxsspx vs38, o0, A3 + lxsspx vs39, o4, A3 + + mr T1, BO + + stxsspx vs32, o0, T1 + stxsspx vs33, o4, T1 + 
+ addi T1, T1, 8 + + stxsspx vs34, o0, T1 + stxsspx vs35, o4, T1 + + addi T1, T1, 8 + + stxsspx vs36, o0, T1 + stxsspx vs37, o4, T1 + + addi T1, T1, 8 + + stxsspx vs38, o0, T1 + stxsspx vs39, o4, T1 + +.endm + +/********************************************************************************************** +* Macros for N=4 and M=1 +**********************************************************************************************/ + +.macro COPY_4x1 + + lxsspx vs32, o0, A0 + + lxsspx vs33, o0, A1 + + lxsspx vs34, o0, A2 + + lxsspx vs35, o0, A3 + + mr T1, BO + + stxsspx vs32, o0, T1 + + stxsspx vs33, o4, T1 + + addi T1, T1, 8 + + stxsspx vs34, o0, T1 + + stxsspx vs35, o4, T1 + +.endm + +/********************************************************************************************** +* Macros for N=2 and M=8 +**********************************************************************************************/ + +.macro COPY_2x8 + + lxvw4x vs32, o0, A0 + lxvw4x vs33, o16, A0 + + lxvw4x vs34, o0, A1 + lxvw4x vs35, o16, A1 + + mr T1, BO + + stxvw4x vs32, o0, T1 + stxvw4x vs33, o16, T1 + + stxvw4x vs34, o32, T1 + stxvw4x vs35, o48, T1 + +.endm + +/********************************************************************************************** +* Macros for N=2 and M=4 +**********************************************************************************************/ + +.macro COPY_2x4 + + lxvw4x vs32, o0, A0 + + lxvw4x vs33, o0, A1 + + mr T1, BO + + stxvw4x vs32, o0, T1 + + stxvw4x vs33, o16, T1 + +.endm + +/********************************************************************************************** +* Macros for N=2 and M=2 +**********************************************************************************************/ + +.macro COPY_2x2 + + lxsspx vs32, o0, A0 + lxsspx vs33, o4, A0 + + lxsspx vs34, o0, A1 + lxsspx vs35, o4, A1 + + mr T1, BO + + stxsspx vs32, o0, T1 + stxsspx vs33, o4, T1 + + addi T1, T1, 8 + + stxsspx vs34, o0, T1 + stxsspx vs35, o4, T1 + +.endm + 
+/********************************************************************************************** +* Macros for N=2 and M=1 +**********************************************************************************************/ + +.macro COPY_2x1 + + lxsspx vs32, o0, A0 + + lxsspx vs33, o0, A1 + + mr T1, BO + + stxsspx vs32, o0, T1 + + stxsspx vs33, o4, T1 + +.endm + +/********************************************************************************************** +* Macros for N=1 and M=8 +**********************************************************************************************/ + +.macro COPY_1x8 + + lxvw4x vs32, o0, A0 + lxvw4x vs33, o16, A0 + + mr T1, BO + + stxvw4x vs32, o0, T1 + stxvw4x vs33, o16, T1 + +.endm + +/********************************************************************************************** +* Macros for N=1 and M=4 +**********************************************************************************************/ + +.macro COPY_1x4 + + lxvw4x vs32, o0, A0 + + mr T1, BO + + stxvw4x vs32, o0, T1 + +.endm + +/********************************************************************************************** +* Macros for N=1 and M=2 +**********************************************************************************************/ + +.macro COPY_1x2 + + lxsspx vs32, o0, A0 + lxsspx vs33, o4, A0 + + mr T1, BO + + stxsspx vs32, o0, T1 + stxsspx vs33, o4, T1 + +.endm + +/********************************************************************************************** +* Macros for N=1 and M=1 +**********************************************************************************************/ + +.macro COPY_1x1 + + lxsspx vs32, o0, A0 + + mr T1, BO + + stxsspx vs32, o0, T1 + +.endm + From 089aad57f7c040539a0f5ce29cebe759498a5f54 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Sat, 23 Apr 2016 14:26:24 +0200 Subject: [PATCH 13/70] updated param.h for POWER8 --- param.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/param.h b/param.h 
index 0a9f02fde..b1bce23a0 100644 --- a/param.h +++ b/param.h @@ -1977,15 +1977,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM_DEFAULT_UNROLL_M 8 #define ZGEMM_DEFAULT_UNROLL_N 2 -#define SGEMM_DEFAULT_P 960 -#define DGEMM_DEFAULT_P 480 -#define CGEMM_DEFAULT_P 720 -#define ZGEMM_DEFAULT_P 480 - -#define SGEMM_DEFAULT_Q 720 -#define DGEMM_DEFAULT_Q 720 -#define CGEMM_DEFAULT_Q 720 -#define ZGEMM_DEFAULT_Q 360 +#define SGEMM_DEFAULT_P 1280 +#define DGEMM_DEFAULT_P 640 +#define CGEMM_DEFAULT_P 640 +#define ZGEMM_DEFAULT_P 320 + +#define SGEMM_DEFAULT_Q 640 +#define DGEMM_DEFAULT_Q 640 +#define CGEMM_DEFAULT_Q 640 +#define ZGEMM_DEFAULT_Q 640 #define SYMV_P 8 From 2b967590a0523c00b1bcef3153ba9d8615003e9e Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Mon, 25 Apr 2016 09:08:38 +0200 Subject: [PATCH 14/70] bugfix in dynamic.c --- driver/others/dynamic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 2fde07fcc..9e8cce438 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -439,7 +439,7 @@ static gotoblas_t *force_coretype(char *coretype){ char message[128]; //char mname[20]; - for ( i=1 ; i <= 21; i++) + for ( i=1 ; i <= 22; i++) { if (!strncasecmp(coretype,corename[i],20)) { From 78b05f647667346037c3191d9d23dc7bd171e0b4 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Mon, 25 Apr 2016 10:13:30 +0200 Subject: [PATCH 15/70] bugfix for EXCAVATOR and DYNAMIC_ARCH --- driver/others/parameter.c | 4 ++-- kernel/setparam-ref.c | 17 +++++++++++++++++ 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/driver/others/parameter.c b/driver/others/parameter.c index f4b1a80ad..f22c6b69a 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -167,7 +167,7 @@ int get_L2_size(void){ #if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \ defined(CORE_PRESCOTT) || 
defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \ - defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) + defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) cpuid(0x80000006, &eax, &ebx, &ecx, &edx); @@ -251,7 +251,7 @@ int get_L2_size(void){ void blas_set_parameter(void){ int factor; -#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) +#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) int size = 16; #else int size = get_L2_size(); diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index a4d1486fc..ba44b8f61 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -933,6 +933,23 @@ static void init_parameter(void) { #endif #endif +#ifdef EXCAVATOR + +#ifdef DEBUG + fprintf(stderr, "Excavator\n"); +#endif + + TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; + TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; + TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; + TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; +#ifdef EXPRECISION + TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; + TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; +#endif +#endif + + #ifdef PILEDRIVER #ifdef DEBUG From 298b13bba4c1b3d7729cecc1d08a3d3033a8f86d Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Mon, 25 Apr 2016 10:36:23 +0200 Subject: [PATCH 16/70] updated some kernel files for EXCAVATOR --- kernel/x86_64/KERNEL.EXCAVATOR | 36 ++++++++++++++++++---------------- kernel/x86_64/caxpy.c | 2 +- kernel/x86_64/cdot.c | 2 +- kernel/x86_64/cgemv_n_4.c | 2 +- kernel/x86_64/cgemv_t_4.c | 2 +- kernel/x86_64/cscal.c | 2 +- kernel/x86_64/daxpy.c | 2 +- kernel/x86_64/ddot.c | 2 +- kernel/x86_64/dgemv_n_4.c | 2 +- kernel/x86_64/dgemv_t_4.c | 2 +- kernel/x86_64/dscal.c | 2 +- kernel/x86_64/dsymv_L.c | 2 +- 
kernel/x86_64/dsymv_U.c | 2 +- kernel/x86_64/saxpy.c | 2 +- kernel/x86_64/sdot.c | 2 +- kernel/x86_64/sgemv_n_4.c | 4 ++-- kernel/x86_64/sgemv_t_4.c | 4 ++-- kernel/x86_64/ssymv_L.c | 2 +- kernel/x86_64/ssymv_U.c | 2 +- kernel/x86_64/zaxpy.c | 2 +- kernel/x86_64/zdot.c | 2 +- kernel/x86_64/zgemv_n_4.c | 2 +- kernel/x86_64/zgemv_t_4.c | 2 +- kernel/x86_64/zscal.c | 2 +- 24 files changed, 44 insertions(+), 42 deletions(-) diff --git a/kernel/x86_64/KERNEL.EXCAVATOR b/kernel/x86_64/KERNEL.EXCAVATOR index dbdd1fe9b..4ec748284 100644 --- a/kernel/x86_64/KERNEL.EXCAVATOR +++ b/kernel/x86_64/KERNEL.EXCAVATOR @@ -1,3 +1,7 @@ +DSCALKERNEL = dscal.c +CSCALKERNEL = cscal.c +ZSCALKERNEL = zscal.c + SAXPYKERNEL = saxpy.c DAXPYKERNEL = daxpy.c CAXPYKERNEL = caxpy.c @@ -20,7 +24,7 @@ SGEMVTKERNEL = sgemv_t_4.c DGEMVNKERNEL = dgemv_n_4.c DGEMVTKERNEL = dgemv_t_4.c -ZGEMVNKERNEL = zgemv_n_dup.S +ZGEMVNKERNEL = zgemv_n_4.c ZGEMVTKERNEL = zgemv_t_4.c DCOPYKERNEL = dcopy_bulldozer.S @@ -68,25 +72,23 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - +STRSMKERNEL_LN = strsm_kernel_LN_bulldozer.c +STRSMKERNEL_LT = strsm_kernel_LT_bulldozer.c +STRSMKERNEL_RN = strsm_kernel_RN_bulldozer.c +STRSMKERNEL_RT = strsm_kernel_RT_bulldozer.c -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LN = dtrsm_kernel_LN_bulldozer.c DTRSMKERNEL_LT = dtrsm_kernel_LT_8x2_bulldozer.S DTRSMKERNEL_RN = dtrsm_kernel_RN_8x2_bulldozer.S -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +DTRSMKERNEL_RT = dtrsm_kernel_RT_bulldozer.c -ZTRSMKERNEL_LN = 
../generic/trsm_kernel_LN.c -ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +CTRSMKERNEL_LN = ctrsm_kernel_LN_bulldozer.c +CTRSMKERNEL_LT = ctrsm_kernel_LT_bulldozer.c +CTRSMKERNEL_RN = ctrsm_kernel_RN_bulldozer.c +CTRSMKERNEL_RT = ctrsm_kernel_RT_bulldozer.c +ZTRSMKERNEL_LN = ztrsm_kernel_LN_bulldozer.c +ZTRSMKERNEL_LT = ztrsm_kernel_LT_bulldozer.c +ZTRSMKERNEL_RN = ztrsm_kernel_RN_bulldozer.c +ZTRSMKERNEL_RT = ztrsm_kernel_RT_bulldozer.c diff --git a/kernel/x86_64/caxpy.c b/kernel/x86_64/caxpy.c index 1ee0499a7..5af9b8fcc 100644 --- a/kernel/x86_64/caxpy.c +++ b/kernel/x86_64/caxpy.c @@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(PILEDRIVER) || defined(STEAMROLLER) +#if defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "caxpy_microk_steamroller-2.c" #elif defined(BULLDOZER) #include "caxpy_microk_bulldozer-2.c" diff --git a/kernel/x86_64/cdot.c b/kernel/x86_64/cdot.c index 2b2c4ff7a..9bba72ba2 100644 --- a/kernel/x86_64/cdot.c +++ b/kernel/x86_64/cdot.c @@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(BULLDOZER) #include "cdot_microk_bulldozer-2.c" -#elif defined(STEAMROLLER) || defined(PILEDRIVER) +#elif defined(STEAMROLLER) || defined(PILEDRIVER) || defined(EXCAVATOR) #include "cdot_microk_steamroller-2.c" #elif defined(HASWELL) #include "cdot_microk_haswell-2.c" diff --git a/kernel/x86_64/cgemv_n_4.c b/kernel/x86_64/cgemv_n_4.c index d60e4475d..235510534 100644 --- a/kernel/x86_64/cgemv_n_4.c +++ b/kernel/x86_64/cgemv_n_4.c @@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(HASWELL) #include "cgemv_n_microk_haswell-4.c" -#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) +#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "cgemv_n_microk_bulldozer-4.c" #endif diff --git a/kernel/x86_64/cgemv_t_4.c b/kernel/x86_64/cgemv_t_4.c index b558164ff..1a714f61f 100644 --- a/kernel/x86_64/cgemv_t_4.c +++ b/kernel/x86_64/cgemv_t_4.c @@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(HASWELL) #include "cgemv_t_microk_haswell-4.c" -#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) +#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "cgemv_t_microk_bulldozer-4.c" #endif diff --git a/kernel/x86_64/cscal.c b/kernel/x86_64/cscal.c index 5d86b1929..c44d12e3d 100644 --- a/kernel/x86_64/cscal.c +++ b/kernel/x86_64/cscal.c @@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cscal_microk_haswell-2.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) #include "cscal_microk_bulldozer-2.c" -#elif defined(STEAMROLLER) +#elif defined(STEAMROLLER) || defined(EXCAVATOR) #include "cscal_microk_steamroller-2.c" #elif defined(SANDYBRIDGE) #include "cscal_microk_bulldozer-2.c" diff --git a/kernel/x86_64/daxpy.c b/kernel/x86_64/daxpy.c index 56d323cbe..18569e6e4 100644 --- a/kernel/x86_64/daxpy.c +++ b/kernel/x86_64/daxpy.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "daxpy_microk_nehalem-2.c" #elif defined(BULLDOZER) #include "daxpy_microk_bulldozer-2.c" -#elif defined(STEAMROLLER) +#elif defined(STEAMROLLER) || defined(EXCAVATOR) #include "daxpy_microk_steamroller-2.c" #elif defined(PILEDRIVER) #include "daxpy_microk_piledriver-2.c" diff --git a/kernel/x86_64/ddot.c b/kernel/x86_64/ddot.c index 4bf8082c9..a45dd7f3b 100644 --- a/kernel/x86_64/ddot.c +++ b/kernel/x86_64/ddot.c @@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(BULLDOZER) #include "ddot_microk_bulldozer-2.c" -#elif defined(STEAMROLLER) +#elif defined(STEAMROLLER) || defined(EXCAVATOR) #include "ddot_microk_steamroller-2.c" #elif defined(PILEDRIVER) #include "ddot_microk_piledriver-2.c" diff --git a/kernel/x86_64/dgemv_n_4.c b/kernel/x86_64/dgemv_n_4.c index 485b234b0..4200b8acd 100644 --- a/kernel/x86_64/dgemv_n_4.c +++ b/kernel/x86_64/dgemv_n_4.c @@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(NEHALEM) #include "dgemv_n_microk_nehalem-4.c" -#elif defined(HASWELL) || defined(STEAMROLLER) +#elif defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "dgemv_n_microk_haswell-4.c" #endif diff --git a/kernel/x86_64/dgemv_t_4.c b/kernel/x86_64/dgemv_t_4.c index 8ed821dd0..42f11f39a 100644 --- a/kernel/x86_64/dgemv_t_4.c +++ b/kernel/x86_64/dgemv_t_4.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(HASWELL) || defined(STEAMROLLER) +#if defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "dgemv_t_microk_haswell-4.c" #endif diff --git a/kernel/x86_64/dscal.c b/kernel/x86_64/dscal.c index b7110e6ac..bbc1c9660 100644 --- a/kernel/x86_64/dscal.c +++ b/kernel/x86_64/dscal.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) +#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "dscal_microk_bulldozer-2.c" #elif defined(SANDYBRIDGE) #include "dscal_microk_sandy-2.c" diff --git a/kernel/x86_64/dsymv_L.c b/kernel/x86_64/dsymv_L.c index 3f5e77e5f..e10784ad7 100644 --- a/kernel/x86_64/dsymv_L.c +++ b/kernel/x86_64/dsymv_L.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) +#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "dsymv_L_microk_bulldozer-2.c" #elif defined(HASWELL) #include "dsymv_L_microk_haswell-2.c" diff --git a/kernel/x86_64/dsymv_U.c b/kernel/x86_64/dsymv_U.c index 9f5ae3015..bd07ce2c3 100644 --- a/kernel/x86_64/dsymv_U.c +++ b/kernel/x86_64/dsymv_U.c @@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) +#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "dsymv_U_microk_bulldozer-2.c" #elif defined(HASWELL) #include "dsymv_U_microk_haswell-2.c" diff --git a/kernel/x86_64/saxpy.c b/kernel/x86_64/saxpy.c index 0b76c42f7..b9e5d5784 100644 --- a/kernel/x86_64/saxpy.c +++ b/kernel/x86_64/saxpy.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "saxpy_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "saxpy_microk_sandy-2.c" -#elif defined(PILEDRIVER) || defined(STEAMROLLER) +#elif defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "saxpy_microk_piledriver-2.c" #endif diff --git a/kernel/x86_64/sdot.c b/kernel/x86_64/sdot.c index a3d20d276..d9fc417a0 100644 --- a/kernel/x86_64/sdot.c +++ b/kernel/x86_64/sdot.c @@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(BULLDOZER) #include "sdot_microk_bulldozer-2.c" -#elif defined(STEAMROLLER) || defined(PILEDRIVER) +#elif defined(STEAMROLLER) || defined(PILEDRIVER) || defined(EXCAVATOR) #include "sdot_microk_steamroller-2.c" #elif defined(NEHALEM) #include "sdot_microk_nehalem-2.c" diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index c7b4516c3..bdf68dd07 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) +#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "sgemv_n_microk_bulldozer-4.c" #elif defined(NEHALEM) #include "sgemv_n_microk_nehalem-4.c" @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "sgemv_n_microk_haswell-4.c" #endif -#if defined(STEAMROLLER) +#if defined(STEAMROLLER) || defined(EXCAVATOR) #define NBMAX 2048 #else #define NBMAX 4096 diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index 5c7d1a53b..62550e65c 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(NEHALEM) #include "sgemv_t_microk_nehalem-4.c" -#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) +#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "sgemv_t_microk_bulldozer-4.c" #elif defined(SANDYBRIDGE) #include "sgemv_t_microk_sandy-4.c" @@ -38,7 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "sgemv_t_microk_haswell-4.c" #endif -#if defined(STEAMROLLER) +#if defined(STEAMROLLER) || defined(EXCAVATOR) #define NBMAX 2048 #else #define NBMAX 4096 diff --git a/kernel/x86_64/ssymv_L.c b/kernel/x86_64/ssymv_L.c index 0997f108d..3813981ed 100644 --- a/kernel/x86_64/ssymv_L.c +++ b/kernel/x86_64/ssymv_L.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) +#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "ssymv_L_microk_bulldozer-2.c" #elif defined(NEHALEM) #include "ssymv_L_microk_nehalem-2.c" diff --git a/kernel/x86_64/ssymv_U.c b/kernel/x86_64/ssymv_U.c index ed1e8236c..e4d3c9b30 100644 --- a/kernel/x86_64/ssymv_U.c +++ b/kernel/x86_64/ssymv_U.c @@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) +#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "ssymv_U_microk_bulldozer-2.c" #elif defined(NEHALEM) #include "ssymv_U_microk_nehalem-2.c" diff --git a/kernel/x86_64/zaxpy.c b/kernel/x86_64/zaxpy.c index 560acc7f9..0cd555a68 100644 --- a/kernel/x86_64/zaxpy.c +++ b/kernel/x86_64/zaxpy.c @@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(BULLDOZER) #include "zaxpy_microk_bulldozer-2.c" -#elif defined(PILEDRIVER) || defined(STEAMROLLER) +#elif defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "zaxpy_microk_steamroller-2.c" #elif defined(HASWELL) #include "zaxpy_microk_haswell-2.c" diff --git a/kernel/x86_64/zdot.c b/kernel/x86_64/zdot.c index eee00fd9f..4533d4e88 100644 --- a/kernel/x86_64/zdot.c +++ b/kernel/x86_64/zdot.c @@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(BULLDOZER) #include "zdot_microk_bulldozer-2.c" -#elif defined(STEAMROLLER) || defined(PILEDRIVER) +#elif defined(STEAMROLLER) || defined(PILEDRIVER) || defined(EXCAVATOR) #include "zdot_microk_steamroller-2.c" #elif defined(HASWELL) #include "zdot_microk_haswell-2.c" diff --git a/kernel/x86_64/zgemv_n_4.c b/kernel/x86_64/zgemv_n_4.c index 63e49f2af..4171fc99f 100644 --- a/kernel/x86_64/zgemv_n_4.c +++ b/kernel/x86_64/zgemv_n_4.c @@ -34,7 +34,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "zgemv_n_microk_haswell-4.c" #elif defined(SANDYBRIDGE) #include "zgemv_n_microk_sandy-4.c" -#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) +#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "zgemv_n_microk_bulldozer-4.c" #endif diff --git a/kernel/x86_64/zgemv_t_4.c b/kernel/x86_64/zgemv_t_4.c index 4abb2d5ad..0524c71f7 100644 --- a/kernel/x86_64/zgemv_t_4.c +++ b/kernel/x86_64/zgemv_t_4.c @@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) +#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "zgemv_t_microk_bulldozer-4.c" #elif defined(HASWELL) #include "zgemv_t_microk_haswell-4.c" diff --git a/kernel/x86_64/zscal.c b/kernel/x86_64/zscal.c index a96766032..7ca8774b7 100644 --- a/kernel/x86_64/zscal.c +++ b/kernel/x86_64/zscal.c @@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "zscal_microk_haswell-2.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) #include "zscal_microk_bulldozer-2.c" -#elif defined(STEAMROLLER) +#elif defined(STEAMROLLER) || defined(EXCAVATOR) #include "zscal_microk_steamroller-2.c" #endif From 40ac64ae4f76f31dc03e26a57f2bf03ac098b087 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Mon, 25 Apr 2016 10:40:04 +0200 Subject: [PATCH 17/70] updated param.h for EXCAVATOR --- param.h | 95 +-------------------------------------------------------- 1 file changed, 1 insertion(+), 94 deletions(-) diff --git a/param.h b/param.h index b1bce23a0..aa09f6d61 100644 --- a/param.h +++ b/param.h @@ -410,100 +410,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif -#ifdef STEAMROLLER -#define SNUMOPT 8 -#define DNUMOPT 4 - -#define GEMM_DEFAULT_OFFSET_A 64 -#define GEMM_DEFAULT_OFFSET_B 832 -#define GEMM_DEFAULT_ALIGN 0x0fffUL - - - -#define QGEMM_DEFAULT_UNROLL_N 2 -#define CGEMM_DEFAULT_UNROLL_N 2 -#define ZGEMM_DEFAULT_UNROLL_N 2 -#define XGEMM_DEFAULT_UNROLL_N 1 - -#ifdef ARCH_X86 -#define SGEMM_DEFAULT_UNROLL_N 4 -#define DGEMM_DEFAULT_UNROLL_N 4 -#define SGEMM_DEFAULT_UNROLL_M 4 -#define DGEMM_DEFAULT_UNROLL_M 2 -#define QGEMM_DEFAULT_UNROLL_M 2 -#define CGEMM_DEFAULT_UNROLL_M 2 -#define ZGEMM_DEFAULT_UNROLL_M 1 -#define XGEMM_DEFAULT_UNROLL_M 1 -#else -#define SGEMM_DEFAULT_UNROLL_N 2 -#define DGEMM_DEFAULT_UNROLL_N 2 -#define SGEMM_DEFAULT_UNROLL_M 16 -#define DGEMM_DEFAULT_UNROLL_M 8 -#define QGEMM_DEFAULT_UNROLL_M 2 -#define CGEMM_DEFAULT_UNROLL_M 4 -#define ZGEMM_DEFAULT_UNROLL_M 2 -#define XGEMM_DEFAULT_UNROLL_M 1 -#define CGEMM3M_DEFAULT_UNROLL_N 4 -#define CGEMM3M_DEFAULT_UNROLL_M 8 -#define ZGEMM3M_DEFAULT_UNROLL_N 4 -#define ZGEMM3M_DEFAULT_UNROLL_M 4 -#define GEMV_UNROLL 8 -#endif - -#if defined(ARCH_X86_64) -#define SGEMM_DEFAULT_P 768 -#define DGEMM_DEFAULT_P 576 -#define ZGEMM_DEFAULT_P 288 -#define CGEMM_DEFAULT_P 576 -#else -#define SGEMM_DEFAULT_P 448 -#define DGEMM_DEFAULT_P 480 -#define ZGEMM_DEFAULT_P 112 -#define CGEMM_DEFAULT_P 224 -#endif -#define QGEMM_DEFAULT_P 112 -#define XGEMM_DEFAULT_P 56 - -#if defined(ARCH_X86_64) -#define SGEMM_DEFAULT_Q 192 -#define DGEMM_DEFAULT_Q 160 -#define ZGEMM_DEFAULT_Q 160 -#define CGEMM_DEFAULT_Q 160 -#else -#define SGEMM_DEFAULT_Q 224 -#define DGEMM_DEFAULT_Q 224 -#define ZGEMM_DEFAULT_Q 224 -#define CGEMM_DEFAULT_Q 224 -#endif -#define QGEMM_DEFAULT_Q 224 -#define XGEMM_DEFAULT_Q 224 - -#define CGEMM3M_DEFAULT_P 448 -#define ZGEMM3M_DEFAULT_P 224 -#define XGEMM3M_DEFAULT_P 112 -#define CGEMM3M_DEFAULT_Q 224 -#define ZGEMM3M_DEFAULT_Q 224 -#define XGEMM3M_DEFAULT_Q 224 -#define CGEMM3M_DEFAULT_R 12288 -#define ZGEMM3M_DEFAULT_R 12288 -#define 
XGEMM3M_DEFAULT_R 12288 - -#define SGEMM_DEFAULT_R 12288 -#define QGEMM_DEFAULT_R qgemm_r -#define DGEMM_DEFAULT_R 12288 -#define CGEMM_DEFAULT_R cgemm_r -#define ZGEMM_DEFAULT_R zgemm_r -#define XGEMM_DEFAULT_R xgemm_r - -#define SYMV_P 16 -#define HAVE_EXCLUSIVE_CACHE - -#define GEMM_THREAD gemm_thread_mn - -#endif - - -#ifdef EXCAVATOR +#if defined(STEAMROLLER) || defined(EXCAVATOR) #define SNUMOPT 8 #define DNUMOPT 4 From 3d50ccdc0de9b0756d4bdc63e196403332d68570 Mon Sep 17 00:00:00 2001 From: Aleksey Kuleshov Date: Fri, 22 Apr 2016 18:21:18 +0300 Subject: [PATCH 18/70] allow building tests when CROSS compiling but don't run them --- Makefile | 4 ---- ctest/Makefile | 6 ++++++ test/Makefile | 8 ++++++++ utest/Makefile | 2 ++ 4 files changed, 16 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 9ba2bffb3..2ae004798 100644 --- a/Makefile +++ b/Makefile @@ -108,8 +108,6 @@ endif tests : ifndef NOFORTRAN -ifndef TARGET -ifndef CROSS touch $(LIBNAME) ifndef NO_FBLAS $(MAKE) -C test all @@ -119,8 +117,6 @@ ifndef NO_CBLAS $(MAKE) -C ctest all endif endif -endif -endif libs : ifeq ($(CORE), UNKOWN) diff --git a/ctest/Makefile b/ctest/Makefile index 7a5d236aa..6eda43863 100644 --- a/ctest/Makefile +++ b/ctest/Makefile @@ -42,6 +42,7 @@ ztestl3o_3m = c_zblas3_3m.o c_z3chke_3m.o auxiliary.o c_xerbla.o constant.o all :: all1 all2 all3 all1: xscblat1 xdcblat1 xccblat1 xzcblat1 +ifndef CROSS ifeq ($(USE_OPENMP), 1) OMP_NUM_THREADS=2 ./xscblat1 OMP_NUM_THREADS=2 ./xdcblat1 @@ -53,8 +54,10 @@ else OPENBLAS_NUM_THREADS=2 ./xccblat1 OPENBLAS_NUM_THREADS=2 ./xzcblat1 endif +endif all2: xscblat2 xdcblat2 xccblat2 xzcblat2 +ifndef CROSS ifeq ($(USE_OPENMP), 1) OMP_NUM_THREADS=2 ./xscblat2 < sin2 OMP_NUM_THREADS=2 ./xdcblat2 < din2 @@ -66,8 +69,10 @@ else OPENBLAS_NUM_THREADS=2 ./xccblat2 < cin2 OPENBLAS_NUM_THREADS=2 ./xzcblat2 < zin2 endif +endif all3: xscblat3 xdcblat3 xccblat3 xzcblat3 +ifndef CROSS ifeq ($(USE_OPENMP), 1) OMP_NUM_THREADS=2 ./xscblat3 < sin3 
OMP_NUM_THREADS=2 ./xdcblat3 < din3 @@ -88,6 +93,7 @@ else OPENBLAS_NUM_THREADS=2 ./xccblat3_3m < cin3_3m OPENBLAS_NUM_THREADS=2 ./xzcblat3_3m < zin3_3m endif +endif diff --git a/test/Makefile b/test/Makefile index 75ea6de60..65fb6f438 100644 --- a/test/Makefile +++ b/test/Makefile @@ -4,6 +4,7 @@ include ../Makefile.system all :: level1 level2 level3 level1 : sblat1 dblat1 cblat1 zblat1 +ifndef CROSS OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat1 OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./dblat1 OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./cblat1 @@ -21,8 +22,10 @@ else OPENBLAS_NUM_THREADS=2 ./zblat1 endif endif +endif level2 : sblat2 dblat2 cblat2 zblat2 +ifndef CROSS rm -f ?BLAT2.SUMM OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat2 < ./sblat2.dat @$(GREP) -q FATAL SBLAT2.SUMM && cat SBLAT2.SUMM || exit 0 @@ -54,8 +57,10 @@ else @$(GREP) -q FATAL ZBLAT2.SUMM && cat ZBLAT2.SUMM || exit 0 endif endif +endif level3 : sblat3 dblat3 cblat3 zblat3 +ifndef CROSS rm -f ?BLAT3.SUMM OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat3 < ./sblat3.dat @$(GREP) -q FATAL SBLAT3.SUMM && cat SBLAT3.SUMM || exit 0 @@ -87,9 +92,11 @@ else @$(GREP) -q FATAL ZBLAT3.SUMM && cat ZBLAT3.SUMM || exit 0 endif endif +endif level3_3m : zblat3_3m cblat3_3m +ifndef CROSS rm -f ?BLAT3_3M.SUMM OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./cblat3_3m < ./cblat3_3m.dat @$(GREP) -q FATAL CBLAT3_3M.SUMM && cat CBLAT3_3M.SUMM || exit 0 @@ -109,6 +116,7 @@ else @$(GREP) -q FATAL ZBLAT3_3M.SUMM && cat ZBLAT3_3M.SUMM || exit 0 endif endif +endif diff --git a/utest/Makefile b/utest/Makefile index 9f9808920..3ccc0a041 100644 --- a/utest/Makefile +++ b/utest/Makefile @@ -21,7 +21,9 @@ $(UTESTBIN): $(OBJS) $(CC) $(CFLAGS) -o $@ $^ ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB) run_test: $(UTESTBIN) +ifndef CROSS ./$(UTESTBIN) +endif clean: -rm -f *.o $(UTESTBIN) From b5e98e4dda80367ba25bcea9b3283bde4719cd11 Mon Sep 17 00:00:00 2001 From: buffer51 Date: Tue, 26 Apr 2016 03:14:03 -0700 Subject: [PATCH 19/70] Added 
Android as a community-supported OS --- CONTRIBUTORS.md | 4 ++++ README.md | 1 + 2 files changed, 5 insertions(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index ebe52ea8a..e9d97b300 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -150,3 +150,7 @@ In chronological order: * theoractice * [2016-03-20] Fix compiler error in VisualStudio with CMake * [2016-03-22] Fix access violation on Windows while static linking + +* Paul Mustière + * [2016-02-04] Fix Android build on ARMV7 + * [2016-04-26] Android build with LAPACK for ARMV7 & ARMV8 diff --git a/README.md b/README.md index 32a861081..8ac88840a 100644 --- a/README.md +++ b/README.md @@ -82,6 +82,7 @@ Please read GotoBLAS_01Readme.txt - **MingWin or Visual Studio(CMake)/Windows**: Please read . - **Darwin/Mac OS X**: Experimental. Although GotoBLAS2 supports Darwin, we are the beginner on Mac OS X. - **FreeBSD**: Supported by community. We didn't test the library on this OS. +- **Android**: Supported by community. Please read . ## Usages Link with libopenblas.a or -lopenblas for shared library. 
From 0d0c6f7d7d4b918e9dd833e2c39eb7668ff7010c Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Wed, 27 Apr 2016 14:01:08 +0200 Subject: [PATCH 20/70] optimized dgemm for POWER8 --- kernel/power/KERNEL.POWER8 | 12 +- kernel/power/dgemm_kernel_16x4_power8.S | 5 +- kernel/power/dgemm_logic_16x4_power8.S | 762 +++-- kernel/power/dgemm_macros_16x4_power8.S | 171 +- kernel/power/dgemm_tcopy_16_power8.S | 2 +- kernel/power/dgemm_tcopy_logic_16_power8.S | 4 + kernel/power/dtrmm_kernel_16x4_power8.S | 2 +- kernel/power/dtrmm_macros_16x4_power8.S | 3431 ++++++++++++++++++++ param.h | 99 +- 9 files changed, 3996 insertions(+), 492 deletions(-) create mode 100644 kernel/power/dtrmm_macros_16x4_power8.S diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index 0b6a7f3b8..fb07ccffd 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -21,12 +21,12 @@ SGEMMOTCOPYOBJ = sgemm_otcopy.o DGEMMKERNEL = dgemm_kernel_16x4_power8.S DGEMMINCOPY = ../generic/gemm_ncopy_16.c DGEMMITCOPY = dgemm_tcopy_16_power8.S -DGEMMONCOPY = gemm_ncopy_4.S -DGEMMOTCOPY = gemm_tcopy_4.S -DGEMMINCOPYOBJ = dgemm_incopy.o -DGEMMITCOPYOBJ = dgemm_itcopy.o -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o +DGEMMONCOPY = ../generic/gemm_ncopy_4.c +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMINCOPYOBJ = dgemm_incopy.o +DGEMMITCOPYOBJ = dgemm_itcopy.o +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o CGEMMKERNEL = cgemm_kernel_8x4_power8.S CGEMMINCOPY = ../generic/zgemm_ncopy_8.c diff --git a/kernel/power/dgemm_kernel_16x4_power8.S b/kernel/power/dgemm_kernel_16x4_power8.S index 4c14b0c6f..bcc6ce328 100644 --- a/kernel/power/dgemm_kernel_16x4_power8.S +++ b/kernel/power/dgemm_kernel_16x4_power8.S @@ -131,6 +131,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define o0 0 +#define T4 r12 +#define T3 r11 + #define o8 r15 #define o24 r16 #define ALPHA r17 @@ -265,7 +268,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi ALPHA, SP, 224 #endif - li PRE, 256 + li PRE, 384 li o8 , 8 li o16, 16 li o24, 24 diff --git a/kernel/power/dgemm_logic_16x4_power8.S b/kernel/power/dgemm_logic_16x4_power8.S index 49c438f61..4ad3387e8 100644 --- a/kernel/power/dgemm_logic_16x4_power8.S +++ b/kernel/power/dgemm_logic_16x4_power8.S @@ -35,160 +35,154 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. srawi. J, N, 2 - ble .LDGEMM_L4_END + ble LDGEMM_L4_END -.LDGEMM_L4_BEGIN: +LDGEMM_L4_BEGIN: mr CO, C mr AO, A slwi T1, LDC , 2 add C, C, T1 srawi. I, M, 4 - ble .LDGEMM_L4x16_END + ble LDGEMM_L4x16_END -.LDGEMM_L4x16_BEGIN: + .align 5 +LDGEMM_L4x16_BEGIN: + + li T4, -128 + + and T1, CO, T4 + add T2, T1, LDC + add T3, T2, LDC + add T4, T3, LDC + + dcbt T1, r0 + dcbt T2, r0 + dcbt T3, r0 + dcbt T4, r0 + andi. cr0, CO, 127 + ble LDGEMM_L4x16_BEGIN_NOPRE + + addi T1, T1, 128 + addi T2, T2, 128 + addi T3, T3, 128 + addi T4, T4, 128 + + dcbt T1, r0 + dcbt T2, r0 + dcbt T3, r0 + dcbt T4, r0 + + +LDGEMM_L4x16_BEGIN_NOPRE: mr BO, B - srawi. L, K, 3 - ble .LDGEMM_L4x16_SUB0 + srawi. L, K, 2 + ble LDGEMM_L4x16_SUB0 cmpwi cr0, L, 1 - ble .LDGEMM_L4x16_SUB4 + ble LDGEMM_L4x16_SUB4 -.LDGEMM_L4x16_LOOP_START: + .align 5 +LDGEMM_L4x16_LOOP_START: - dcbt AO, PRE + dcbt AO, PRE LOAD4x16_1 - dcbt AO, PRE + dcbt AO, PRE KERNEL4x16_I1 - dcbt AO, PRE + dcbt AO, PRE KERNEL4x16_2 - dcbt AO, PRE + dcbt AO, PRE KERNEL4x16_1 - dcbt AO, PRE - KERNEL4x16_2 - - dcbt AO, PRE - KERNEL4x16_1 - dcbt AO, PRE - KERNEL4x16_2 - dcbt AO, PRE - KERNEL4x16_1 - dcbt AO, PRE + dcbt AO, PRE KERNEL4x16_2 addic. 
L, L, -2 - ble .LDGEMM_L4x16_LOOP_END + ble LDGEMM_L4x16_LOOP_END - .align 5 + .align 7 -.LDGEMM_L4x16_LOOP: - - dcbt AO, PRE - KERNEL4x16_1 - dcbt AO, PRE - KERNEL4x16_2 - dcbt AO, PRE - KERNEL4x16_1 - dcbt AO, PRE - KERNEL4x16_2 +LDGEMM_L4x16_LOOP: - dcbt AO, PRE + dcbt AO, PRE KERNEL4x16_1 - dcbt AO, PRE + dcbt AO, PRE KERNEL4x16_2 - dcbt AO, PRE + dcbt AO, PRE KERNEL4x16_1 - dcbt AO, PRE + dcbt AO, PRE KERNEL4x16_2 addic. L, L, -1 - bgt .LDGEMM_L4x16_LOOP + bgt LDGEMM_L4x16_LOOP -.LDGEMM_L4x16_LOOP_END: + .align 5 +LDGEMM_L4x16_LOOP_END: - dcbt AO, PRE + dcbt AO, PRE KERNEL4x16_1 - dcbt AO, PRE - KERNEL4x16_2 - dcbt AO, PRE - KERNEL4x16_1 - dcbt AO, PRE + dcbt AO, PRE KERNEL4x16_2 - - dcbt AO, PRE - KERNEL4x16_1 - dcbt AO, PRE - KERNEL4x16_2 - dcbt AO, PRE KERNEL4x16_1 KERNEL4x16_E2 - b .LDGEMM_L4x16_SUB1 + b LDGEMM_L4x16_SUB1 -.LDGEMM_L4x16_SUB4: +LDGEMM_L4x16_SUB4: - dcbt AO, PRE KERNEL4x16_SUBI1 - dcbt AO, PRE KERNEL4x16_SUB1 - dcbt AO, PRE KERNEL4x16_SUB1 - dcbt AO, PRE KERNEL4x16_SUB1 - KERNEL4x16_SUB1 - KERNEL4x16_SUB1 - KERNEL4x16_SUB1 - KERNEL4x16_SUB1 + b LDGEMM_L4x16_SUB1 - b .LDGEMM_L4x16_SUB1 +LDGEMM_L4x16_SUB0: -.LDGEMM_L4x16_SUB0: - - andi. L, K, 7 + andi. L, K, 3 KERNEL4x16_SUBI1 addic. L, L, -1 - ble .LDGEMM_L4x16_SAVE - b .LDGEMM_L4x16_SUB2 + ble LDGEMM_L4x16_SAVE + b LDGEMM_L4x16_SUB2 -.LDGEMM_L4x16_SUB1: +LDGEMM_L4x16_SUB1: - andi. L, K, 7 - ble .LDGEMM_L4x16_SAVE + andi. L, K, 3 + ble LDGEMM_L4x16_SAVE -.LDGEMM_L4x16_SUB2: +LDGEMM_L4x16_SUB2: KERNEL4x16_SUB1 addic. L, L, -1 - bgt .LDGEMM_L4x16_SUB2 + bgt LDGEMM_L4x16_SUB2 -.LDGEMM_L4x16_SAVE: + .align 5 +LDGEMM_L4x16_SAVE: SAVE4x16 addic. I, I, -1 - bgt .LDGEMM_L4x16_BEGIN + bgt LDGEMM_L4x16_BEGIN -.LDGEMM_L4x16_END: +LDGEMM_L4x16_END: -.LDGEMM_L4x8_BEGIN: +LDGEMM_L4x8_BEGIN: andi. T2, M, 15 - ble .LDGEMM_L4x1_END + ble LDGEMM_L4x1_END andi. T1, M, 8 - ble .LDGEMM_L4x8_END + ble LDGEMM_L4x8_END mr BO, B srawi. 
L, K, 3 - ble .LDGEMM_L4x8_SUB0 + ble LDGEMM_L4x8_SUB0 cmpwi cr0, L, 1 - ble .LDGEMM_L4x8_SUB4 + ble LDGEMM_L4x8_SUB4 -.LDGEMM_L4x8_LOOP_START: +LDGEMM_L4x8_LOOP_START: LOAD4x8_1 KERNEL4x8_I1 @@ -202,11 +196,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x8_2 addic. L, L, -2 - ble .LDGEMM_L4x8_LOOP_END + ble LDGEMM_L4x8_LOOP_END .align 5 -.LDGEMM_L4x8_LOOP: +LDGEMM_L4x8_LOOP: KERNEL4x8_1 KERNEL4x8_2 @@ -219,9 +213,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x8_2 addic. L, L, -1 - bgt .LDGEMM_L4x8_LOOP + bgt LDGEMM_L4x8_LOOP -.LDGEMM_L4x8_LOOP_END: +LDGEMM_L4x8_LOOP_END: KERNEL4x8_1 KERNEL4x8_2 @@ -233,9 +227,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x8_1 KERNEL4x8_E2 - b .LDGEMM_L4x8_SUB1 + b LDGEMM_L4x8_SUB1 -.LDGEMM_L4x8_SUB4: +LDGEMM_L4x8_SUB4: KERNEL4x8_SUBI1 KERNEL4x8_SUB1 @@ -247,48 +241,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x8_SUB1 KERNEL4x8_SUB1 - b .LDGEMM_L4x8_SUB1 + b LDGEMM_L4x8_SUB1 -.LDGEMM_L4x8_SUB0: +LDGEMM_L4x8_SUB0: andi. L, K, 7 KERNEL4x8_SUBI1 addic. L, L, -1 - ble .LDGEMM_L4x8_SAVE - b .LDGEMM_L4x8_SUB2 + ble LDGEMM_L4x8_SAVE + b LDGEMM_L4x8_SUB2 -.LDGEMM_L4x8_SUB1: +LDGEMM_L4x8_SUB1: andi. L, K, 7 - ble .LDGEMM_L4x8_SAVE + ble LDGEMM_L4x8_SAVE -.LDGEMM_L4x8_SUB2: +LDGEMM_L4x8_SUB2: KERNEL4x8_SUB1 addic. L, L, -1 - bgt .LDGEMM_L4x8_SUB2 + bgt LDGEMM_L4x8_SUB2 -.LDGEMM_L4x8_SAVE: +LDGEMM_L4x8_SAVE: SAVE4x8 -.LDGEMM_L4x8_END: +LDGEMM_L4x8_END: -.LDGEMM_L4x4_BEGIN: +LDGEMM_L4x4_BEGIN: andi. T1, M, 4 - ble .LDGEMM_L4x4_END + ble LDGEMM_L4x4_END mr BO, B srawi. L, K, 3 - ble .LDGEMM_L4x4_SUB0 + ble LDGEMM_L4x4_SUB0 cmpwi cr0, L, 1 - ble .LDGEMM_L4x4_SUB4 + ble LDGEMM_L4x4_SUB4 -.LDGEMM_L4x4_LOOP_START: +LDGEMM_L4x4_LOOP_START: LOAD4x4_1 KERNEL4x4_I1 @@ -302,11 +296,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x4_2 addic. 
L, L, -2 - ble .LDGEMM_L4x4_LOOP_END + ble LDGEMM_L4x4_LOOP_END .align 5 -.LDGEMM_L4x4_LOOP: +LDGEMM_L4x4_LOOP: KERNEL4x4_1 KERNEL4x4_2 @@ -319,9 +313,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x4_2 addic. L, L, -1 - bgt .LDGEMM_L4x4_LOOP + bgt LDGEMM_L4x4_LOOP -.LDGEMM_L4x4_LOOP_END: +LDGEMM_L4x4_LOOP_END: KERNEL4x4_1 KERNEL4x4_2 @@ -333,9 +327,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x4_1 KERNEL4x4_E2 - b .LDGEMM_L4x4_SUB1 + b LDGEMM_L4x4_SUB1 -.LDGEMM_L4x4_SUB4: +LDGEMM_L4x4_SUB4: KERNEL4x4_SUBI1 KERNEL4x4_SUB1 @@ -347,48 +341,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x4_SUB1 KERNEL4x4_SUB1 - b .LDGEMM_L4x4_SUB1 + b LDGEMM_L4x4_SUB1 -.LDGEMM_L4x4_SUB0: +LDGEMM_L4x4_SUB0: andi. L, K, 7 KERNEL4x4_SUBI1 addic. L, L, -1 - ble .LDGEMM_L4x4_SAVE - b .LDGEMM_L4x4_SUB2 + ble LDGEMM_L4x4_SAVE + b LDGEMM_L4x4_SUB2 -.LDGEMM_L4x4_SUB1: +LDGEMM_L4x4_SUB1: andi. L, K, 7 - ble .LDGEMM_L4x4_SAVE + ble LDGEMM_L4x4_SAVE -.LDGEMM_L4x4_SUB2: +LDGEMM_L4x4_SUB2: KERNEL4x4_SUB1 addic. L, L, -1 - bgt .LDGEMM_L4x4_SUB2 + bgt LDGEMM_L4x4_SUB2 -.LDGEMM_L4x4_SAVE: +LDGEMM_L4x4_SAVE: SAVE4x4 -.LDGEMM_L4x4_END: +LDGEMM_L4x4_END: -.LDGEMM_L4x2_BEGIN: +LDGEMM_L4x2_BEGIN: andi. T1, M, 2 - ble .LDGEMM_L4x2_END + ble LDGEMM_L4x2_END mr BO, B srawi. L, K, 3 - ble .LDGEMM_L4x2_SUB0 + ble LDGEMM_L4x2_SUB0 cmpwi cr0, L, 1 - ble .LDGEMM_L4x2_SUB4 + ble LDGEMM_L4x2_SUB4 -.LDGEMM_L4x2_LOOP_START: +LDGEMM_L4x2_LOOP_START: LOAD4x2_1 KERNEL4x2_I1 @@ -402,11 +396,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x2_2 addic. L, L, -2 - ble .LDGEMM_L4x2_LOOP_END + ble LDGEMM_L4x2_LOOP_END .align 5 -.LDGEMM_L4x2_LOOP: +LDGEMM_L4x2_LOOP: KERNEL4x2_1 KERNEL4x2_2 @@ -419,9 +413,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x2_2 addic. 
L, L, -1 - bgt .LDGEMM_L4x2_LOOP + bgt LDGEMM_L4x2_LOOP -.LDGEMM_L4x2_LOOP_END: +LDGEMM_L4x2_LOOP_END: KERNEL4x2_1 KERNEL4x2_2 @@ -433,9 +427,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x2_1 KERNEL4x2_E2 - b .LDGEMM_L4x2_SUB1 + b LDGEMM_L4x2_SUB1 -.LDGEMM_L4x2_SUB4: +LDGEMM_L4x2_SUB4: KERNEL4x2_SUBI1 KERNEL4x2_SUB1 @@ -447,48 +441,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x2_SUB1 KERNEL4x2_SUB1 - b .LDGEMM_L4x2_SUB1 + b LDGEMM_L4x2_SUB1 -.LDGEMM_L4x2_SUB0: +LDGEMM_L4x2_SUB0: andi. L, K, 7 KERNEL4x2_SUBI1 addic. L, L, -1 - ble .LDGEMM_L4x2_SAVE - b .LDGEMM_L4x2_SUB2 + ble LDGEMM_L4x2_SAVE + b LDGEMM_L4x2_SUB2 -.LDGEMM_L4x2_SUB1: +LDGEMM_L4x2_SUB1: andi. L, K, 7 - ble .LDGEMM_L4x2_SAVE + ble LDGEMM_L4x2_SAVE -.LDGEMM_L4x2_SUB2: +LDGEMM_L4x2_SUB2: KERNEL4x2_SUB1 addic. L, L, -1 - bgt .LDGEMM_L4x2_SUB2 + bgt LDGEMM_L4x2_SUB2 -.LDGEMM_L4x2_SAVE: +LDGEMM_L4x2_SAVE: SAVE4x2 -.LDGEMM_L4x2_END: +LDGEMM_L4x2_END: -.LDGEMM_L4x1_BEGIN: +LDGEMM_L4x1_BEGIN: andi. T1, M, 1 - ble .LDGEMM_L4x1_END + ble LDGEMM_L4x1_END mr BO, B srawi. L, K, 3 - ble .LDGEMM_L4x1_SUB0 + ble LDGEMM_L4x1_SUB0 cmpwi cr0, L, 1 - ble .LDGEMM_L4x1_SUB4 + ble LDGEMM_L4x1_SUB4 -.LDGEMM_L4x1_LOOP_START: +LDGEMM_L4x1_LOOP_START: LOAD4x1_1 KERNEL4x1_I1 @@ -502,11 +496,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x1_2 addic. L, L, -2 - ble .LDGEMM_L4x1_LOOP_END + ble LDGEMM_L4x1_LOOP_END .align 5 -.LDGEMM_L4x1_LOOP: +LDGEMM_L4x1_LOOP: KERNEL4x1_1 KERNEL4x1_2 @@ -519,9 +513,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x1_2 addic. L, L, -1 - bgt .LDGEMM_L4x1_LOOP + bgt LDGEMM_L4x1_LOOP -.LDGEMM_L4x1_LOOP_END: +LDGEMM_L4x1_LOOP_END: KERNEL4x1_1 KERNEL4x1_2 @@ -533,9 +527,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
KERNEL4x1_1 KERNEL4x1_E2 - b .LDGEMM_L4x1_SUB1 + b LDGEMM_L4x1_SUB1 -.LDGEMM_L4x1_SUB4: +LDGEMM_L4x1_SUB4: KERNEL4x1_SUBI1 KERNEL4x1_SUB1 @@ -547,74 +541,74 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x1_SUB1 KERNEL4x1_SUB1 - b .LDGEMM_L4x1_SUB1 + b LDGEMM_L4x1_SUB1 -.LDGEMM_L4x1_SUB0: +LDGEMM_L4x1_SUB0: andi. L, K, 7 KERNEL4x1_SUBI1 addic. L, L, -1 - ble .LDGEMM_L4x1_SAVE - b .LDGEMM_L4x1_SUB2 + ble LDGEMM_L4x1_SAVE + b LDGEMM_L4x1_SUB2 -.LDGEMM_L4x1_SUB1: +LDGEMM_L4x1_SUB1: andi. L, K, 7 - ble .LDGEMM_L4x1_SAVE + ble LDGEMM_L4x1_SAVE -.LDGEMM_L4x1_SUB2: +LDGEMM_L4x1_SUB2: KERNEL4x1_SUB1 addic. L, L, -1 - bgt .LDGEMM_L4x1_SUB2 + bgt LDGEMM_L4x1_SUB2 -.LDGEMM_L4x1_SAVE: +LDGEMM_L4x1_SAVE: SAVE4x1 -.LDGEMM_L4x1_END: +LDGEMM_L4x1_END: slwi T1, K, 5 add B, B, T1 addic. J, J, -1 - bgt .LDGEMM_L4_BEGIN + bgt LDGEMM_L4_BEGIN andi. T2, N, 3 ble .L999 -.LDGEMM_L4_END: +LDGEMM_L4_END: - b .LDGEMM_L2_BEGIN + b LDGEMM_L2_BEGIN .L999_H1: b .L999 -.LDGEMM_L2_BEGIN: +LDGEMM_L2_BEGIN: andi. T1, N, 2 - ble .LDGEMM_L2_END + ble LDGEMM_L2_END mr CO, C mr AO, A slwi T1, LDC , 1 add C, C, T1 srawi. I, M, 4 - ble .LDGEMM_L2x16_END + ble LDGEMM_L2x16_END -.LDGEMM_L2x16_BEGIN: +LDGEMM_L2x16_BEGIN: mr BO, B srawi. L, K, 3 - ble .LDGEMM_L2x16_SUB0 + ble LDGEMM_L2x16_SUB0 cmpwi cr0, L, 1 - ble .LDGEMM_L2x16_SUB4 + ble LDGEMM_L2x16_SUB4 -.LDGEMM_L2x16_LOOP_START: +LDGEMM_L2x16_LOOP_START: dcbt AO, PRE LOAD2x16_1 @@ -637,11 +631,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x16_2 addic. L, L, -2 - ble .LDGEMM_L2x16_LOOP_END + ble LDGEMM_L2x16_LOOP_END .align 5 -.LDGEMM_L2x16_LOOP: +LDGEMM_L2x16_LOOP: dcbt AO, PRE KERNEL2x16_1 @@ -662,9 +656,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x16_2 addic. 
L, L, -1 - bgt .LDGEMM_L2x16_LOOP + bgt LDGEMM_L2x16_LOOP -.LDGEMM_L2x16_LOOP_END: +LDGEMM_L2x16_LOOP_END: dcbt AO, PRE KERNEL2x16_1 @@ -683,9 +677,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x16_1 KERNEL2x16_E2 - b .LDGEMM_L2x16_SUB1 + b LDGEMM_L2x16_SUB1 -.LDGEMM_L2x16_SUB4: +LDGEMM_L2x16_SUB4: dcbt AO, PRE KERNEL2x16_SUBI1 @@ -701,53 +695,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x16_SUB1 KERNEL2x16_SUB1 - b .LDGEMM_L2x16_SUB1 + b LDGEMM_L2x16_SUB1 -.LDGEMM_L2x16_SUB0: +LDGEMM_L2x16_SUB0: andi. L, K, 7 KERNEL2x16_SUBI1 addic. L, L, -1 - ble .LDGEMM_L2x16_SAVE - b .LDGEMM_L2x16_SUB2 + ble LDGEMM_L2x16_SAVE + b LDGEMM_L2x16_SUB2 -.LDGEMM_L2x16_SUB1: +LDGEMM_L2x16_SUB1: andi. L, K, 7 - ble .LDGEMM_L2x16_SAVE + ble LDGEMM_L2x16_SAVE -.LDGEMM_L2x16_SUB2: +LDGEMM_L2x16_SUB2: KERNEL2x16_SUB1 addic. L, L, -1 - bgt .LDGEMM_L2x16_SUB2 + bgt LDGEMM_L2x16_SUB2 -.LDGEMM_L2x16_SAVE: +LDGEMM_L2x16_SAVE: SAVE2x16 addic. I, I, -1 - bgt .LDGEMM_L2x16_BEGIN + bgt LDGEMM_L2x16_BEGIN -.LDGEMM_L2x16_END: +LDGEMM_L2x16_END: -.LDGEMM_L2x8_BEGIN: +LDGEMM_L2x8_BEGIN: andi. T2, M, 15 - ble .LDGEMM_L2x1_END + ble LDGEMM_L2x1_END andi. T1, M, 8 - ble .LDGEMM_L2x8_END + ble LDGEMM_L2x8_END mr BO, B srawi. L, K, 3 - ble .LDGEMM_L2x8_SUB0 + ble LDGEMM_L2x8_SUB0 cmpwi cr0, L, 1 - ble .LDGEMM_L2x8_SUB4 + ble LDGEMM_L2x8_SUB4 -.LDGEMM_L2x8_LOOP_START: +LDGEMM_L2x8_LOOP_START: LOAD2x8_1 KERNEL2x8_I1 @@ -761,11 +755,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x8_2 addic. L, L, -2 - ble .LDGEMM_L2x8_LOOP_END + ble LDGEMM_L2x8_LOOP_END .align 5 -.LDGEMM_L2x8_LOOP: +LDGEMM_L2x8_LOOP: KERNEL2x8_1 KERNEL2x8_2 @@ -778,9 +772,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x8_2 addic. 
L, L, -1 - bgt .LDGEMM_L2x8_LOOP + bgt LDGEMM_L2x8_LOOP -.LDGEMM_L2x8_LOOP_END: +LDGEMM_L2x8_LOOP_END: KERNEL2x8_1 KERNEL2x8_2 @@ -792,9 +786,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x8_1 KERNEL2x8_E2 - b .LDGEMM_L2x8_SUB1 + b LDGEMM_L2x8_SUB1 -.LDGEMM_L2x8_SUB4: +LDGEMM_L2x8_SUB4: KERNEL2x8_SUBI1 KERNEL2x8_SUB1 @@ -806,48 +800,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x8_SUB1 KERNEL2x8_SUB1 - b .LDGEMM_L2x8_SUB1 + b LDGEMM_L2x8_SUB1 -.LDGEMM_L2x8_SUB0: +LDGEMM_L2x8_SUB0: andi. L, K, 7 KERNEL2x8_SUBI1 addic. L, L, -1 - ble .LDGEMM_L2x8_SAVE - b .LDGEMM_L2x8_SUB2 + ble LDGEMM_L2x8_SAVE + b LDGEMM_L2x8_SUB2 -.LDGEMM_L2x8_SUB1: +LDGEMM_L2x8_SUB1: andi. L, K, 7 - ble .LDGEMM_L2x8_SAVE + ble LDGEMM_L2x8_SAVE -.LDGEMM_L2x8_SUB2: +LDGEMM_L2x8_SUB2: KERNEL2x8_SUB1 addic. L, L, -1 - bgt .LDGEMM_L2x8_SUB2 + bgt LDGEMM_L2x8_SUB2 -.LDGEMM_L2x8_SAVE: +LDGEMM_L2x8_SAVE: SAVE2x8 -.LDGEMM_L2x8_END: +LDGEMM_L2x8_END: -.LDGEMM_L2x4_BEGIN: +LDGEMM_L2x4_BEGIN: andi. T1, M, 4 - ble .LDGEMM_L2x4_END + ble LDGEMM_L2x4_END mr BO, B srawi. L, K, 3 - ble .LDGEMM_L2x4_SUB0 + ble LDGEMM_L2x4_SUB0 cmpwi cr0, L, 1 - ble .LDGEMM_L2x4_SUB4 + ble LDGEMM_L2x4_SUB4 -.LDGEMM_L2x4_LOOP_START: +LDGEMM_L2x4_LOOP_START: LOAD2x4_1 KERNEL2x4_I1 @@ -861,11 +855,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x4_2 addic. L, L, -2 - ble .LDGEMM_L2x4_LOOP_END + ble LDGEMM_L2x4_LOOP_END .align 5 -.LDGEMM_L2x4_LOOP: +LDGEMM_L2x4_LOOP: KERNEL2x4_1 KERNEL2x4_2 @@ -878,9 +872,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x4_2 addic. L, L, -1 - bgt .LDGEMM_L2x4_LOOP + bgt LDGEMM_L2x4_LOOP -.LDGEMM_L2x4_LOOP_END: +LDGEMM_L2x4_LOOP_END: KERNEL2x4_1 KERNEL2x4_2 @@ -892,9 +886,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
KERNEL2x4_1 KERNEL2x4_E2 - b .LDGEMM_L2x4_SUB1 + b LDGEMM_L2x4_SUB1 -.LDGEMM_L2x4_SUB4: +LDGEMM_L2x4_SUB4: KERNEL2x4_SUBI1 KERNEL2x4_SUB1 @@ -906,48 +900,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x4_SUB1 KERNEL2x4_SUB1 - b .LDGEMM_L2x4_SUB1 + b LDGEMM_L2x4_SUB1 -.LDGEMM_L2x4_SUB0: +LDGEMM_L2x4_SUB0: andi. L, K, 7 KERNEL2x4_SUBI1 addic. L, L, -1 - ble .LDGEMM_L2x4_SAVE - b .LDGEMM_L2x4_SUB2 + ble LDGEMM_L2x4_SAVE + b LDGEMM_L2x4_SUB2 -.LDGEMM_L2x4_SUB1: +LDGEMM_L2x4_SUB1: andi. L, K, 7 - ble .LDGEMM_L2x4_SAVE + ble LDGEMM_L2x4_SAVE -.LDGEMM_L2x4_SUB2: +LDGEMM_L2x4_SUB2: KERNEL2x4_SUB1 addic. L, L, -1 - bgt .LDGEMM_L2x4_SUB2 + bgt LDGEMM_L2x4_SUB2 -.LDGEMM_L2x4_SAVE: +LDGEMM_L2x4_SAVE: SAVE2x4 -.LDGEMM_L2x4_END: +LDGEMM_L2x4_END: -.LDGEMM_L2x2_BEGIN: +LDGEMM_L2x2_BEGIN: andi. T1, M, 2 - ble .LDGEMM_L2x2_END + ble LDGEMM_L2x2_END mr BO, B srawi. L, K, 3 - ble .LDGEMM_L2x2_SUB0 + ble LDGEMM_L2x2_SUB0 cmpwi cr0, L, 1 - ble .LDGEMM_L2x2_SUB4 + ble LDGEMM_L2x2_SUB4 -.LDGEMM_L2x2_LOOP_START: +LDGEMM_L2x2_LOOP_START: LOAD2x2_1 KERNEL2x2_I1 @@ -961,11 +955,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x2_2 addic. L, L, -2 - ble .LDGEMM_L2x2_LOOP_END + ble LDGEMM_L2x2_LOOP_END .align 5 -.LDGEMM_L2x2_LOOP: +LDGEMM_L2x2_LOOP: KERNEL2x2_1 KERNEL2x2_2 @@ -978,9 +972,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x2_2 addic. L, L, -1 - bgt .LDGEMM_L2x2_LOOP + bgt LDGEMM_L2x2_LOOP -.LDGEMM_L2x2_LOOP_END: +LDGEMM_L2x2_LOOP_END: KERNEL2x2_1 KERNEL2x2_2 @@ -992,9 +986,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x2_1 KERNEL2x2_E2 - b .LDGEMM_L2x2_SUB1 + b LDGEMM_L2x2_SUB1 -.LDGEMM_L2x2_SUB4: +LDGEMM_L2x2_SUB4: KERNEL2x2_SUBI1 KERNEL2x2_SUB1 @@ -1006,48 +1000,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
KERNEL2x2_SUB1 KERNEL2x2_SUB1 - b .LDGEMM_L2x2_SUB1 + b LDGEMM_L2x2_SUB1 -.LDGEMM_L2x2_SUB0: +LDGEMM_L2x2_SUB0: andi. L, K, 7 KERNEL2x2_SUBI1 addic. L, L, -1 - ble .LDGEMM_L2x2_SAVE - b .LDGEMM_L2x2_SUB2 + ble LDGEMM_L2x2_SAVE + b LDGEMM_L2x2_SUB2 -.LDGEMM_L2x2_SUB1: +LDGEMM_L2x2_SUB1: andi. L, K, 7 - ble .LDGEMM_L2x2_SAVE + ble LDGEMM_L2x2_SAVE -.LDGEMM_L2x2_SUB2: +LDGEMM_L2x2_SUB2: KERNEL2x2_SUB1 addic. L, L, -1 - bgt .LDGEMM_L2x2_SUB2 + bgt LDGEMM_L2x2_SUB2 -.LDGEMM_L2x2_SAVE: +LDGEMM_L2x2_SAVE: SAVE2x2 -.LDGEMM_L2x2_END: +LDGEMM_L2x2_END: -.LDGEMM_L2x1_BEGIN: +LDGEMM_L2x1_BEGIN: andi. T1, M, 1 - ble .LDGEMM_L2x1_END + ble LDGEMM_L2x1_END mr BO, B srawi. L, K, 3 - ble .LDGEMM_L2x1_SUB0 + ble LDGEMM_L2x1_SUB0 cmpwi cr0, L, 1 - ble .LDGEMM_L2x1_SUB4 + ble LDGEMM_L2x1_SUB4 -.LDGEMM_L2x1_LOOP_START: +LDGEMM_L2x1_LOOP_START: LOAD2x1_1 KERNEL2x1_I1 @@ -1061,11 +1055,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x1_2 addic. L, L, -2 - ble .LDGEMM_L2x1_LOOP_END + ble LDGEMM_L2x1_LOOP_END .align 5 -.LDGEMM_L2x1_LOOP: +LDGEMM_L2x1_LOOP: KERNEL2x1_1 KERNEL2x1_2 @@ -1078,9 +1072,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x1_2 addic. L, L, -1 - bgt .LDGEMM_L2x1_LOOP + bgt LDGEMM_L2x1_LOOP -.LDGEMM_L2x1_LOOP_END: +LDGEMM_L2x1_LOOP_END: KERNEL2x1_1 KERNEL2x1_2 @@ -1092,9 +1086,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x1_1 KERNEL2x1_E2 - b .LDGEMM_L2x1_SUB1 + b LDGEMM_L2x1_SUB1 -.LDGEMM_L2x1_SUB4: +LDGEMM_L2x1_SUB4: KERNEL2x1_SUBI1 KERNEL2x1_SUB1 @@ -1106,59 +1100,59 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x1_SUB1 KERNEL2x1_SUB1 - b .LDGEMM_L2x1_SUB1 + b LDGEMM_L2x1_SUB1 -.LDGEMM_L2x1_SUB0: +LDGEMM_L2x1_SUB0: andi. L, K, 7 KERNEL2x1_SUBI1 addic. L, L, -1 - ble .LDGEMM_L2x1_SAVE - b .LDGEMM_L2x1_SUB2 + ble LDGEMM_L2x1_SAVE + b LDGEMM_L2x1_SUB2 -.LDGEMM_L2x1_SUB1: +LDGEMM_L2x1_SUB1: andi. 
L, K, 7 - ble .LDGEMM_L2x1_SAVE + ble LDGEMM_L2x1_SAVE -.LDGEMM_L2x1_SUB2: +LDGEMM_L2x1_SUB2: KERNEL2x1_SUB1 addic. L, L, -1 - bgt .LDGEMM_L2x1_SUB2 + bgt LDGEMM_L2x1_SUB2 -.LDGEMM_L2x1_SAVE: +LDGEMM_L2x1_SAVE: SAVE2x1 -.LDGEMM_L2x1_END: +LDGEMM_L2x1_END: slwi T1, K, 4 add B, B, T1 -.LDGEMM_L2_END: -.LDGEMM_L1_BEGIN: +LDGEMM_L2_END: +LDGEMM_L1_BEGIN: andi. T1, N, 1 - ble .LDGEMM_L1_END + ble LDGEMM_L1_END mr CO, C mr AO, A srawi. I, M, 4 - ble .LDGEMM_L1x16_END + ble LDGEMM_L1x16_END -.LDGEMM_L1x16_BEGIN: +LDGEMM_L1x16_BEGIN: mr BO, B srawi. L, K, 3 - ble .LDGEMM_L1x16_SUB0 + ble LDGEMM_L1x16_SUB0 cmpwi cr0, L, 1 - ble .LDGEMM_L1x16_SUB4 + ble LDGEMM_L1x16_SUB4 -.LDGEMM_L1x16_LOOP_START: +LDGEMM_L1x16_LOOP_START: dcbt AO, PRE LOAD1x16_1 @@ -1181,11 +1175,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x16_2 addic. L, L, -2 - ble .LDGEMM_L1x16_LOOP_END + ble LDGEMM_L1x16_LOOP_END .align 5 -.LDGEMM_L1x16_LOOP: +LDGEMM_L1x16_LOOP: dcbt AO, PRE KERNEL1x16_1 @@ -1206,9 +1200,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x16_2 addic. L, L, -1 - bgt .LDGEMM_L1x16_LOOP + bgt LDGEMM_L1x16_LOOP -.LDGEMM_L1x16_LOOP_END: +LDGEMM_L1x16_LOOP_END: dcbt AO, PRE KERNEL1x16_1 @@ -1227,9 +1221,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x16_1 KERNEL1x16_E2 - b .LDGEMM_L1x16_SUB1 + b LDGEMM_L1x16_SUB1 -.LDGEMM_L1x16_SUB4: +LDGEMM_L1x16_SUB4: dcbt AO, PRE KERNEL1x16_SUBI1 @@ -1245,53 +1239,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x16_SUB1 KERNEL1x16_SUB1 - b .LDGEMM_L1x16_SUB1 + b LDGEMM_L1x16_SUB1 -.LDGEMM_L1x16_SUB0: +LDGEMM_L1x16_SUB0: andi. L, K, 7 KERNEL1x16_SUBI1 addic. L, L, -1 - ble .LDGEMM_L1x16_SAVE - b .LDGEMM_L1x16_SUB2 + ble LDGEMM_L1x16_SAVE + b LDGEMM_L1x16_SUB2 -.LDGEMM_L1x16_SUB1: +LDGEMM_L1x16_SUB1: andi. 
L, K, 7 - ble .LDGEMM_L1x16_SAVE + ble LDGEMM_L1x16_SAVE -.LDGEMM_L1x16_SUB2: +LDGEMM_L1x16_SUB2: KERNEL1x16_SUB1 addic. L, L, -1 - bgt .LDGEMM_L1x16_SUB2 + bgt LDGEMM_L1x16_SUB2 -.LDGEMM_L1x16_SAVE: +LDGEMM_L1x16_SAVE: SAVE1x16 addic. I, I, -1 - bgt .LDGEMM_L1x16_BEGIN + bgt LDGEMM_L1x16_BEGIN -.LDGEMM_L1x16_END: +LDGEMM_L1x16_END: -.LDGEMM_L1x8_BEGIN: +LDGEMM_L1x8_BEGIN: andi. T2, M, 15 - ble .LDGEMM_L1x1_END + ble LDGEMM_L1x1_END andi. T1, M, 8 - ble .LDGEMM_L1x8_END + ble LDGEMM_L1x8_END mr BO, B srawi. L, K, 3 - ble .LDGEMM_L1x8_SUB0 + ble LDGEMM_L1x8_SUB0 cmpwi cr0, L, 1 - ble .LDGEMM_L1x8_SUB4 + ble LDGEMM_L1x8_SUB4 -.LDGEMM_L1x8_LOOP_START: +LDGEMM_L1x8_LOOP_START: LOAD1x8_1 KERNEL1x8_I1 @@ -1305,11 +1299,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x8_2 addic. L, L, -2 - ble .LDGEMM_L1x8_LOOP_END + ble LDGEMM_L1x8_LOOP_END .align 5 -.LDGEMM_L1x8_LOOP: +LDGEMM_L1x8_LOOP: KERNEL1x8_1 KERNEL1x8_2 @@ -1322,9 +1316,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x8_2 addic. L, L, -1 - bgt .LDGEMM_L1x8_LOOP + bgt LDGEMM_L1x8_LOOP -.LDGEMM_L1x8_LOOP_END: +LDGEMM_L1x8_LOOP_END: KERNEL1x8_1 KERNEL1x8_2 @@ -1336,9 +1330,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x8_1 KERNEL1x8_E2 - b .LDGEMM_L1x8_SUB1 + b LDGEMM_L1x8_SUB1 -.LDGEMM_L1x8_SUB4: +LDGEMM_L1x8_SUB4: KERNEL1x8_SUBI1 KERNEL1x8_SUB1 @@ -1350,48 +1344,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x8_SUB1 KERNEL1x8_SUB1 - b .LDGEMM_L1x8_SUB1 + b LDGEMM_L1x8_SUB1 -.LDGEMM_L1x8_SUB0: +LDGEMM_L1x8_SUB0: andi. L, K, 7 KERNEL1x8_SUBI1 addic. L, L, -1 - ble .LDGEMM_L1x8_SAVE - b .LDGEMM_L1x8_SUB2 + ble LDGEMM_L1x8_SAVE + b LDGEMM_L1x8_SUB2 -.LDGEMM_L1x8_SUB1: +LDGEMM_L1x8_SUB1: andi. L, K, 7 - ble .LDGEMM_L1x8_SAVE + ble LDGEMM_L1x8_SAVE -.LDGEMM_L1x8_SUB2: +LDGEMM_L1x8_SUB2: KERNEL1x8_SUB1 addic. 
L, L, -1 - bgt .LDGEMM_L1x8_SUB2 + bgt LDGEMM_L1x8_SUB2 -.LDGEMM_L1x8_SAVE: +LDGEMM_L1x8_SAVE: SAVE1x8 -.LDGEMM_L1x8_END: +LDGEMM_L1x8_END: -.LDGEMM_L1x4_BEGIN: +LDGEMM_L1x4_BEGIN: andi. T1, M, 4 - ble .LDGEMM_L1x4_END + ble LDGEMM_L1x4_END mr BO, B srawi. L, K, 3 - ble .LDGEMM_L1x4_SUB0 + ble LDGEMM_L1x4_SUB0 cmpwi cr0, L, 1 - ble .LDGEMM_L1x4_SUB4 + ble LDGEMM_L1x4_SUB4 -.LDGEMM_L1x4_LOOP_START: +LDGEMM_L1x4_LOOP_START: LOAD1x4_1 KERNEL1x4_I1 @@ -1405,11 +1399,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x4_2 addic. L, L, -2 - ble .LDGEMM_L1x4_LOOP_END + ble LDGEMM_L1x4_LOOP_END .align 5 -.LDGEMM_L1x4_LOOP: +LDGEMM_L1x4_LOOP: KERNEL1x4_1 KERNEL1x4_2 @@ -1422,9 +1416,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x4_2 addic. L, L, -1 - bgt .LDGEMM_L1x4_LOOP + bgt LDGEMM_L1x4_LOOP -.LDGEMM_L1x4_LOOP_END: +LDGEMM_L1x4_LOOP_END: KERNEL1x4_1 KERNEL1x4_2 @@ -1436,9 +1430,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x4_1 KERNEL1x4_E2 - b .LDGEMM_L1x4_SUB1 + b LDGEMM_L1x4_SUB1 -.LDGEMM_L1x4_SUB4: +LDGEMM_L1x4_SUB4: KERNEL1x4_SUBI1 KERNEL1x4_SUB1 @@ -1450,48 +1444,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x4_SUB1 KERNEL1x4_SUB1 - b .LDGEMM_L1x4_SUB1 + b LDGEMM_L1x4_SUB1 -.LDGEMM_L1x4_SUB0: +LDGEMM_L1x4_SUB0: andi. L, K, 7 KERNEL1x4_SUBI1 addic. L, L, -1 - ble .LDGEMM_L1x4_SAVE - b .LDGEMM_L1x4_SUB2 + ble LDGEMM_L1x4_SAVE + b LDGEMM_L1x4_SUB2 -.LDGEMM_L1x4_SUB1: +LDGEMM_L1x4_SUB1: andi. L, K, 7 - ble .LDGEMM_L1x4_SAVE + ble LDGEMM_L1x4_SAVE -.LDGEMM_L1x4_SUB2: +LDGEMM_L1x4_SUB2: KERNEL1x4_SUB1 addic. L, L, -1 - bgt .LDGEMM_L1x4_SUB2 + bgt LDGEMM_L1x4_SUB2 -.LDGEMM_L1x4_SAVE: +LDGEMM_L1x4_SAVE: SAVE1x4 -.LDGEMM_L1x4_END: +LDGEMM_L1x4_END: -.LDGEMM_L1x2_BEGIN: +LDGEMM_L1x2_BEGIN: andi. T1, M, 2 - ble .LDGEMM_L1x2_END + ble LDGEMM_L1x2_END mr BO, B srawi. 
L, K, 3 - ble .LDGEMM_L1x2_SUB0 + ble LDGEMM_L1x2_SUB0 cmpwi cr0, L, 1 - ble .LDGEMM_L1x2_SUB4 + ble LDGEMM_L1x2_SUB4 -.LDGEMM_L1x2_LOOP_START: +LDGEMM_L1x2_LOOP_START: LOAD1x2_1 KERNEL1x2_I1 @@ -1505,11 +1499,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x2_2 addic. L, L, -2 - ble .LDGEMM_L1x2_LOOP_END + ble LDGEMM_L1x2_LOOP_END .align 5 -.LDGEMM_L1x2_LOOP: +LDGEMM_L1x2_LOOP: KERNEL1x2_1 KERNEL1x2_2 @@ -1522,9 +1516,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x2_2 addic. L, L, -1 - bgt .LDGEMM_L1x2_LOOP + bgt LDGEMM_L1x2_LOOP -.LDGEMM_L1x2_LOOP_END: +LDGEMM_L1x2_LOOP_END: KERNEL1x2_1 KERNEL1x2_2 @@ -1536,9 +1530,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x2_1 KERNEL1x2_E2 - b .LDGEMM_L1x2_SUB1 + b LDGEMM_L1x2_SUB1 -.LDGEMM_L1x2_SUB4: +LDGEMM_L1x2_SUB4: KERNEL1x2_SUBI1 KERNEL1x2_SUB1 @@ -1550,48 +1544,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x2_SUB1 KERNEL1x2_SUB1 - b .LDGEMM_L1x2_SUB1 + b LDGEMM_L1x2_SUB1 -.LDGEMM_L1x2_SUB0: +LDGEMM_L1x2_SUB0: andi. L, K, 7 KERNEL1x2_SUBI1 addic. L, L, -1 - ble .LDGEMM_L1x2_SAVE - b .LDGEMM_L1x2_SUB2 + ble LDGEMM_L1x2_SAVE + b LDGEMM_L1x2_SUB2 -.LDGEMM_L1x2_SUB1: +LDGEMM_L1x2_SUB1: andi. L, K, 7 - ble .LDGEMM_L1x2_SAVE + ble LDGEMM_L1x2_SAVE -.LDGEMM_L1x2_SUB2: +LDGEMM_L1x2_SUB2: KERNEL1x2_SUB1 addic. L, L, -1 - bgt .LDGEMM_L1x2_SUB2 + bgt LDGEMM_L1x2_SUB2 -.LDGEMM_L1x2_SAVE: +LDGEMM_L1x2_SAVE: SAVE1x2 -.LDGEMM_L1x2_END: +LDGEMM_L1x2_END: -.LDGEMM_L1x1_BEGIN: +LDGEMM_L1x1_BEGIN: andi. T1, M, 1 - ble .LDGEMM_L1x1_END + ble LDGEMM_L1x1_END mr BO, B srawi. L, K, 3 - ble .LDGEMM_L1x1_SUB0 + ble LDGEMM_L1x1_SUB0 cmpwi cr0, L, 1 - ble .LDGEMM_L1x1_SUB4 + ble LDGEMM_L1x1_SUB4 -.LDGEMM_L1x1_LOOP_START: +LDGEMM_L1x1_LOOP_START: LOAD1x1_1 KERNEL1x1_I1 @@ -1605,11 +1599,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x1_2 addic. 
L, L, -2 - ble .LDGEMM_L1x1_LOOP_END + ble LDGEMM_L1x1_LOOP_END .align 5 -.LDGEMM_L1x1_LOOP: +LDGEMM_L1x1_LOOP: KERNEL1x1_1 KERNEL1x1_2 @@ -1622,9 +1616,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x1_2 addic. L, L, -1 - bgt .LDGEMM_L1x1_LOOP + bgt LDGEMM_L1x1_LOOP -.LDGEMM_L1x1_LOOP_END: +LDGEMM_L1x1_LOOP_END: KERNEL1x1_1 KERNEL1x1_2 @@ -1636,9 +1630,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x1_1 KERNEL1x1_E2 - b .LDGEMM_L1x1_SUB1 + b LDGEMM_L1x1_SUB1 -.LDGEMM_L1x1_SUB4: +LDGEMM_L1x1_SUB4: KERNEL1x1_SUBI1 KERNEL1x1_SUB1 @@ -1650,34 +1644,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x1_SUB1 KERNEL1x1_SUB1 - b .LDGEMM_L1x1_SUB1 + b LDGEMM_L1x1_SUB1 -.LDGEMM_L1x1_SUB0: +LDGEMM_L1x1_SUB0: andi. L, K, 7 KERNEL1x1_SUBI1 addic. L, L, -1 - ble .LDGEMM_L1x1_SAVE - b .LDGEMM_L1x1_SUB2 + ble LDGEMM_L1x1_SAVE + b LDGEMM_L1x1_SUB2 -.LDGEMM_L1x1_SUB1: +LDGEMM_L1x1_SUB1: andi. L, K, 7 - ble .LDGEMM_L1x1_SAVE + ble LDGEMM_L1x1_SAVE -.LDGEMM_L1x1_SUB2: +LDGEMM_L1x1_SUB2: KERNEL1x1_SUB1 addic. L, L, -1 - bgt .LDGEMM_L1x1_SUB2 + bgt LDGEMM_L1x1_SUB2 -.LDGEMM_L1x1_SAVE: +LDGEMM_L1x1_SAVE: SAVE1x1 -.LDGEMM_L1x1_END: +LDGEMM_L1x1_END: -.LDGEMM_L1_END: +LDGEMM_L1_END: diff --git a/kernel/power/dgemm_macros_16x4_power8.S b/kernel/power/dgemm_macros_16x4_power8.S index 27c05e08e..36531fbe9 100644 --- a/kernel/power/dgemm_macros_16x4_power8.S +++ b/kernel/power/dgemm_macros_16x4_power8.S @@ -431,6 +431,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr T1, CO addi T2, T1, 64 + add T3, T1, LDC + addi T4, T3, 64 #ifndef TRMMKERNEL lxvd2x vs0, 0, T1 @@ -442,6 +444,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
lxvd2x vs5, o16, T2 lxvd2x vs6, o32, T2 lxvd2x vs7, o48, T2 + + lxvd2x vs8, 0, T3 + lxvd2x vs9, o16, T3 + lxvd2x vs10, o32, T3 + lxvd2x vs11, o48, T3 + + lxvd2x vs12, 0, T4 + lxvd2x vs13, o16, T4 + lxvd2x vs14, o32, T4 + lxvd2x vs15, o48, T4 #endif #ifndef TRMMKERNEL @@ -453,6 +465,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs5, vs37, alpha_r xvmaddadp vs6, vs38, alpha_r xvmaddadp vs7, vs39, alpha_r + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r + xvmaddadp vs10, vs42, alpha_r + xvmaddadp vs11, vs43, alpha_r + xvmaddadp vs12, vs44, alpha_r + xvmaddadp vs13, vs45, alpha_r + xvmaddadp vs14, vs46, alpha_r + xvmaddadp vs15, vs47, alpha_r #else xvmuldp vs0, vs32, alpha_r xvmuldp vs1, vs33, alpha_r @@ -462,6 +482,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs5, vs37, alpha_r xvmuldp vs6, vs38, alpha_r xvmuldp vs7, vs39, alpha_r + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r + xvmuldp vs10, vs42, alpha_r + xvmuldp vs11, vs43, alpha_r + xvmuldp vs12, vs44, alpha_r + xvmuldp vs13, vs45, alpha_r + xvmuldp vs14, vs46, alpha_r + xvmuldp vs15, vs47, alpha_r #endif stxvd2x vs0, 0, T1 @@ -469,62 +497,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
stxvd2x vs2, o32, T1 stxvd2x vs3, o48, T1 - dcbt T1, PRE - stxvd2x vs4, 0, T2 stxvd2x vs5, o16, T2 stxvd2x vs6, o32, T2 stxvd2x vs7, o48, T2 - add T1, T1, LDC - add T2, T2, LDC - -#ifndef TRMMKERNEL - lxvd2x vs8, 0, T1 - lxvd2x vs9, o16, T1 - lxvd2x vs10, o32, T1 - lxvd2x vs11, o48, T1 + stxvd2x vs8, 0, T3 + stxvd2x vs9, o16, T3 + stxvd2x vs10, o32, T3 + stxvd2x vs11, o48, T3 - lxvd2x vs12, 0, T2 - lxvd2x vs13, o16, T2 - lxvd2x vs14, o32, T2 - lxvd2x vs15, o48, T2 -#endif + stxvd2x vs12, 0, T4 + stxvd2x vs13, o16, T4 + stxvd2x vs14, o32, T4 + stxvd2x vs15, o48, T4 -#ifndef TRMMKERNEL - xvmaddadp vs8, vs40, alpha_r - xvmaddadp vs9, vs41, alpha_r - xvmaddadp vs10, vs42, alpha_r - xvmaddadp vs11, vs43, alpha_r - xvmaddadp vs12, vs44, alpha_r - xvmaddadp vs13, vs45, alpha_r - xvmaddadp vs14, vs46, alpha_r - xvmaddadp vs15, vs47, alpha_r -#else - xvmuldp vs8, vs40, alpha_r - xvmuldp vs9, vs41, alpha_r - xvmuldp vs10, vs42, alpha_r - xvmuldp vs11, vs43, alpha_r - xvmuldp vs12, vs44, alpha_r - xvmuldp vs13, vs45, alpha_r - xvmuldp vs14, vs46, alpha_r - xvmuldp vs15, vs47, alpha_r -#endif - - stxvd2x vs8, 0, T1 - stxvd2x vs9, o16, T1 - stxvd2x vs10, o32, T1 - stxvd2x vs11, o48, T1 - - dcbt T1, PRE - - stxvd2x vs12, 0, T2 - stxvd2x vs13, o16, T2 - stxvd2x vs14, o32, T2 - stxvd2x vs15, o48, T2 - - add T1, T1, LDC - add T2, T2, LDC + slwi T4, LDC, 1 + add T1, T1, T4 + add T3, T3, T4 + addi T2, T1, 64 + addi T4, T3, 64 #ifndef TRMMKERNEL lxvd2x vs0, 0, T1 @@ -536,6 +528,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvd2x vs5, o16, T2 lxvd2x vs6, o32, T2 lxvd2x vs7, o48, T2 + + lxvd2x vs8, 0, T3 + lxvd2x vs9, o16, T3 + lxvd2x vs10, o32, T3 + lxvd2x vs11, o48, T3 + + lxvd2x vs12, 0, T4 + lxvd2x vs13, o16, T4 + lxvd2x vs14, o32, T4 + lxvd2x vs15, o48, T4 #endif #ifndef TRMMKERNEL @@ -547,6 +549,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddadp vs5, vs53, alpha_r xvmaddadp vs6, vs54, alpha_r xvmaddadp vs7, vs55, alpha_r + xvmaddadp vs8, vs56, alpha_r + xvmaddadp vs9, vs57, alpha_r + xvmaddadp vs10, vs58, alpha_r + xvmaddadp vs11, vs59, alpha_r + xvmaddadp vs12, vs60, alpha_r + xvmaddadp vs13, vs61, alpha_r + xvmaddadp vs14, vs62, alpha_r + xvmaddadp vs15, vs63, alpha_r #else xvmuldp vs0, vs48, alpha_r xvmuldp vs1, vs49, alpha_r @@ -556,6 +566,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs5, vs53, alpha_r xvmuldp vs6, vs54, alpha_r xvmuldp vs7, vs55, alpha_r + xvmuldp vs8, vs56, alpha_r + xvmuldp vs9, vs57, alpha_r + xvmuldp vs10, vs58, alpha_r + xvmuldp vs11, vs59, alpha_r + xvmuldp vs12, vs60, alpha_r + xvmuldp vs13, vs61, alpha_r + xvmuldp vs14, vs62, alpha_r + xvmuldp vs15, vs63, alpha_r #endif stxvd2x vs0, 0, T1 @@ -563,59 +581,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs2, o32, T1 stxvd2x vs3, o48, T1 - dcbt T1, PRE - stxvd2x vs4, 0, T2 stxvd2x vs5, o16, T2 stxvd2x vs6, o32, T2 stxvd2x vs7, o48, T2 - add T1, T1, LDC - add T2, T2, LDC - -#ifndef TRMMKERNEL - lxvd2x vs8, 0, T1 - lxvd2x vs9, o16, T1 - lxvd2x vs10, o32, T1 - lxvd2x vs11, o48, T1 - - lxvd2x vs12, 0, T2 - lxvd2x vs13, o16, T2 - lxvd2x vs14, o32, T2 - lxvd2x vs15, o48, T2 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs8, vs56, alpha_r - xvmaddadp vs9, vs57, alpha_r - xvmaddadp vs10, vs58, alpha_r - xvmaddadp vs11, vs59, alpha_r - xvmaddadp vs12, vs60, alpha_r - xvmaddadp vs13, vs61, alpha_r - xvmaddadp vs14, vs62, alpha_r - xvmaddadp vs15, vs63, alpha_r -#else - xvmuldp vs8, vs56, alpha_r - xvmuldp vs9, vs57, alpha_r - xvmuldp vs10, vs58, alpha_r - xvmuldp vs11, vs59, alpha_r - xvmuldp vs12, vs60, alpha_r - xvmuldp vs13, vs61, alpha_r - xvmuldp vs14, vs62, alpha_r - xvmuldp vs15, vs63, alpha_r -#endif - - stxvd2x vs8, 0, T1 - stxvd2x vs9, o16, T1 - stxvd2x vs10, o32, T1 - stxvd2x vs11, o48, T1 - - dcbt T1, PRE + stxvd2x vs8, 0, T3 + stxvd2x vs9, 
o16, T3 + stxvd2x vs10, o32, T3 + stxvd2x vs11, o48, T3 - stxvd2x vs12, 0, T2 - stxvd2x vs13, o16, T2 - stxvd2x vs14, o32, T2 - stxvd2x vs15, o48, T2 + stxvd2x vs12, 0, T4 + stxvd2x vs13, o16, T4 + stxvd2x vs14, o32, T4 + stxvd2x vs15, o48, T4 addi CO, CO, 128 diff --git a/kernel/power/dgemm_tcopy_16_power8.S b/kernel/power/dgemm_tcopy_16_power8.S index f87af535d..eca78bac4 100644 --- a/kernel/power/dgemm_tcopy_16_power8.S +++ b/kernel/power/dgemm_tcopy_16_power8.S @@ -170,7 +170,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add B2, B2, B add B1, B1, B - li PREA, 768 + li PREA, 256 addi PREB, M16, 128 li o8, 8 diff --git a/kernel/power/dgemm_tcopy_logic_16_power8.S b/kernel/power/dgemm_tcopy_logic_16_power8.S index 776cd3401..28fc74793 100644 --- a/kernel/power/dgemm_tcopy_logic_16_power8.S +++ b/kernel/power/dgemm_tcopy_logic_16_power8.S @@ -57,16 +57,20 @@ DCOPYT_L4_BEGIN: DCOPYT_L4x16_LOOP: +/* addi T1, PREB, 128 addi T2, PREB, 256 +*/ dcbt A0, PREA dcbt A1, PREA dcbt A2, PREA dcbt A3, PREA +/* dcbtst BO, M16 dcbtst BO, PREB dcbtst BO, T1 dcbtst BO, T2 +*/ COPY_4x16 add BO, BO, M16 diff --git a/kernel/power/dtrmm_kernel_16x4_power8.S b/kernel/power/dtrmm_kernel_16x4_power8.S index 2294128a2..e9dbd991e 100644 --- a/kernel/power/dtrmm_kernel_16x4_power8.S +++ b/kernel/power/dtrmm_kernel_16x4_power8.S @@ -152,7 +152,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define PRE r30 #define T2 r31 -#include "dgemm_macros_16x4_power8.S" +#include "dtrmm_macros_16x4_power8.S" #ifndef NEEDPARAM diff --git a/kernel/power/dtrmm_macros_16x4_power8.S b/kernel/power/dtrmm_macros_16x4_power8.S new file mode 100644 index 000000000..079144a90 --- /dev/null +++ b/kernel/power/dtrmm_macros_16x4_power8.S @@ -0,0 +1,3431 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/05 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +/********************************************************************* +* Macros for N=4, M=16 * +*********************************************************************/ + +.macro LOAD4x16_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x16_I1 + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + xvmuldp vs44, vs4, vs25 + xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 + + addi AO, AO, 64 + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + + xvmuldp vs52, vs4, vs26 + xvmuldp vs53, vs5, vs26 + xvmuldp vs54, vs6, vs26 + xvmuldp vs55, vs7, vs26 + + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 + + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + xvmuldp vs60, vs4, vs27 + 
xvmuldp vs61, vs5, vs27 + xvmuldp vs62, vs6, vs27 + xvmuldp vs63, vs7, vs27 + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x16_1 + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + + addi AO, AO, 64 + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + + xvmaddadp vs52, vs4, vs26 + xvmaddadp vs53, vs5, vs26 + xvmaddadp vs54, vs6, vs26 + xvmaddadp vs55, vs7, vs26 + + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + + + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + xvmaddadp vs60, vs4, vs27 + xvmaddadp vs61, vs5, vs27 + xvmaddadp vs62, vs6, vs27 + xvmaddadp vs63, vs7, vs27 + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x16_2 + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + xvmaddadp vs44, vs12, vs29 + xvmaddadp vs45, vs13, vs29 + xvmaddadp vs46, vs14, 
vs29 + xvmaddadp vs47, vs15, vs29 + + addi AO, AO, 64 + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + xvmaddadp vs50, vs10, vs30 + xvmaddadp vs51, vs11, vs30 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + + xvmaddadp vs52, vs12, vs30 + xvmaddadp vs53, vs13, vs30 + xvmaddadp vs54, vs14, vs30 + xvmaddadp vs55, vs15, vs30 + + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + xvmaddadp vs58, vs10, vs31 + xvmaddadp vs59, vs11, vs31 + + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + xvmaddadp vs60, vs12, vs31 + xvmaddadp vs61, vs13, vs31 + xvmaddadp vs62, vs14, vs31 + xvmaddadp vs63, vs15, vs31 + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x16_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + xvmaddadp vs44, vs12, vs29 + xvmaddadp vs45, vs13, vs29 + xvmaddadp vs46, vs14, vs29 + xvmaddadp vs47, vs15, vs29 + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + xvmaddadp vs50, vs10, vs30 + xvmaddadp vs51, vs11, vs30 + xvmaddadp vs52, vs12, vs30 + xvmaddadp vs53, vs13, vs30 + xvmaddadp vs54, vs14, vs30 + xvmaddadp vs55, vs15, vs30 + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + xvmaddadp vs58, vs10, vs31 + xvmaddadp vs59, vs11, vs31 + xvmaddadp vs60, vs12, vs31 + xvmaddadp vs61, vs13, vs31 + xvmaddadp vs62, vs14, vs31 + xvmaddadp vs63, vs15, vs31 + +.endm + +.macro KERNEL4x16_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 64 + addi BO, BO, 32 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, 
AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + xvmuldp vs44, vs4, vs25 + xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 + xvmuldp vs52, vs4, vs26 + xvmuldp vs53, vs5, vs26 + xvmuldp vs54, vs6, vs26 + xvmuldp vs55, vs7, vs26 + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 + xvmuldp vs60, vs4, vs27 + xvmuldp vs61, vs5, vs27 + xvmuldp vs62, vs6, vs27 + xvmuldp vs63, vs7, vs27 + +.endm + +.macro KERNEL4x16_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 64 + addi BO, BO, 32 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + xvmaddadp vs52, vs4, vs26 + xvmaddadp vs53, vs5, vs26 + xvmaddadp vs54, vs6, vs26 + xvmaddadp vs55, vs7, vs26 + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + 
xvmaddadp vs59, vs3, vs27 + xvmaddadp vs60, vs4, vs27 + xvmaddadp vs61, vs5, vs27 + xvmaddadp vs62, vs6, vs27 + xvmaddadp vs63, vs7, vs27 + +.endm + +.macro SAVE4x16 + + mr T1, CO + addi T2, T1, 64 + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 + + lxvd2x vs4, 0, T2 + lxvd2x vs5, o16, T2 + lxvd2x vs6, o32, T2 + lxvd2x vs7, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r + xvmaddadp vs4, vs36, alpha_r + xvmaddadp vs5, vs37, alpha_r + xvmaddadp vs6, vs38, alpha_r + xvmaddadp vs7, vs39, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r + xvmuldp vs4, vs36, alpha_r + xvmuldp vs5, vs37, alpha_r + xvmuldp vs6, vs38, alpha_r + xvmuldp vs7, vs39, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + + stxvd2x vs4, 0, T2 + stxvd2x vs5, o16, T2 + stxvd2x vs6, o32, T2 + stxvd2x vs7, o48, T2 + + add T1, T1, LDC + add T2, T2, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 + lxvd2x vs10, o32, T1 + lxvd2x vs11, o48, T1 + + lxvd2x vs12, 0, T2 + lxvd2x vs13, o16, T2 + lxvd2x vs14, o32, T2 + lxvd2x vs15, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r + xvmaddadp vs10, vs42, alpha_r + xvmaddadp vs11, vs43, alpha_r + xvmaddadp vs12, vs44, alpha_r + xvmaddadp vs13, vs45, alpha_r + xvmaddadp vs14, vs46, alpha_r + xvmaddadp vs15, vs47, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r + xvmuldp vs10, vs42, alpha_r + xvmuldp vs11, vs43, alpha_r + xvmuldp vs12, vs44, alpha_r + xvmuldp vs13, vs45, alpha_r + xvmuldp vs14, vs46, alpha_r + xvmuldp vs15, vs47, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + + stxvd2x vs12, 0, 
T2 + stxvd2x vs13, o16, T2 + stxvd2x vs14, o32, T2 + stxvd2x vs15, o48, T2 + + add T1, T1, LDC + add T2, T2, LDC + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 + + lxvd2x vs4, 0, T2 + lxvd2x vs5, o16, T2 + lxvd2x vs6, o32, T2 + lxvd2x vs7, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs48, alpha_r + xvmaddadp vs1, vs49, alpha_r + xvmaddadp vs2, vs50, alpha_r + xvmaddadp vs3, vs51, alpha_r + xvmaddadp vs4, vs52, alpha_r + xvmaddadp vs5, vs53, alpha_r + xvmaddadp vs6, vs54, alpha_r + xvmaddadp vs7, vs55, alpha_r +#else + xvmuldp vs0, vs48, alpha_r + xvmuldp vs1, vs49, alpha_r + xvmuldp vs2, vs50, alpha_r + xvmuldp vs3, vs51, alpha_r + xvmuldp vs4, vs52, alpha_r + xvmuldp vs5, vs53, alpha_r + xvmuldp vs6, vs54, alpha_r + xvmuldp vs7, vs55, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + + stxvd2x vs4, 0, T2 + stxvd2x vs5, o16, T2 + stxvd2x vs6, o32, T2 + stxvd2x vs7, o48, T2 + + add T1, T1, LDC + add T2, T2, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 + lxvd2x vs10, o32, T1 + lxvd2x vs11, o48, T1 + + lxvd2x vs12, 0, T2 + lxvd2x vs13, o16, T2 + lxvd2x vs14, o32, T2 + lxvd2x vs15, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs56, alpha_r + xvmaddadp vs9, vs57, alpha_r + xvmaddadp vs10, vs58, alpha_r + xvmaddadp vs11, vs59, alpha_r + xvmaddadp vs12, vs60, alpha_r + xvmaddadp vs13, vs61, alpha_r + xvmaddadp vs14, vs62, alpha_r + xvmaddadp vs15, vs63, alpha_r +#else + xvmuldp vs8, vs56, alpha_r + xvmuldp vs9, vs57, alpha_r + xvmuldp vs10, vs58, alpha_r + xvmuldp vs11, vs59, alpha_r + xvmuldp vs12, vs60, alpha_r + xvmuldp vs13, vs61, alpha_r + xvmuldp vs14, vs62, alpha_r + xvmuldp vs15, vs63, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + + stxvd2x vs12, 0, T2 + stxvd2x vs13, o16, T2 + stxvd2x vs14, o32, T2 + stxvd2x vs15, o48, T2 + + addi 
CO, CO, 128 + +.endm + +/********************************************************************* +* Macros for N=4, M=8 * +*********************************************************************/ + +.macro LOAD4x8_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x8_I1 + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 + + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x8_1 + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x8_2 + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + + lxvd2x vs0, 0, AO + 
lxvd2x vs1, o16, AO + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + xvmaddadp vs50, vs10, vs30 + xvmaddadp vs51, vs11, vs30 + + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + xvmaddadp vs58, vs10, vs31 + xvmaddadp vs59, vs11, vs31 + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x8_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + xvmaddadp vs50, vs10, vs30 + xvmaddadp vs51, vs11, vs30 + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + xvmaddadp vs58, vs10, vs31 + xvmaddadp vs59, vs11, vs31 + +.endm + +.macro KERNEL4x8_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 64 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 + +.endm + +.macro KERNEL4x8_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 64 + addi BO, BO, 32 + + + xvmaddadp vs32, 
vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + +.endm + +.macro SAVE4x8 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 + lxvd2x vs10, o32, T1 + lxvd2x vs11, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r + xvmaddadp vs10, vs42, alpha_r + xvmaddadp vs11, vs43, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r + xvmuldp vs10, vs42, alpha_r + xvmuldp vs11, vs43, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs48, alpha_r + xvmaddadp vs1, vs49, alpha_r + xvmaddadp vs2, vs50, alpha_r + xvmaddadp vs3, vs51, alpha_r +#else + xvmuldp vs0, vs48, alpha_r + xvmuldp vs1, vs49, alpha_r + xvmuldp vs2, vs50, alpha_r + xvmuldp vs3, vs51, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + add T1, T1, LDC 
+ +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 + lxvd2x vs10, o32, T1 + lxvd2x vs11, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs56, alpha_r + xvmaddadp vs9, vs57, alpha_r + xvmaddadp vs10, vs58, alpha_r + xvmaddadp vs11, vs59, alpha_r +#else + xvmuldp vs8, vs56, alpha_r + xvmuldp vs9, vs57, alpha_r + xvmuldp vs10, vs58, alpha_r + xvmuldp vs11, vs59, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + addi CO, CO, 64 + +.endm + +/********************************************************************* +* Macros for N=4, M=4 * +*********************************************************************/ + +.macro LOAD4x4_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x4_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + +.endm + +.macro KERNEL4x4_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + +.endm + +.macro KERNEL4x4_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp 
vs33, vs9, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + +.endm + +.macro KERNEL4x4_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + +.endm + +.macro KERNEL4x4_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + +.endm + +.macro KERNEL4x4_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + +.endm + +.macro SAVE4x4 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + 
lxvd2x vs1, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs48, alpha_r + xvmaddadp vs1, vs49, alpha_r +#else + xvmuldp vs0, vs48, alpha_r + xvmuldp vs1, vs49, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs56, alpha_r + xvmaddadp vs9, vs57, alpha_r +#else + xvmuldp vs8, vs56, alpha_r + xvmuldp vs9, vs57, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + + addi CO, CO, 32 + +.endm + +/********************************************************************* +* Macros for N=4, M=2 * +*********************************************************************/ + +.macro LOAD4x2_1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x2_I1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + + xvmuldp vs40, vs0, vs25 + + xvmuldp vs48, vs0, vs26 + + xvmuldp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x2_1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + + xvmaddadp vs40, vs0, vs25 + + xvmaddadp vs48, vs0, vs26 + + xvmaddadp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x2_2 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmaddadp vs32, vs8, vs28 + + xvmaddadp vs40, vs8, vs29 + + xvmaddadp vs48, vs8, vs30 + + xvmaddadp vs56, vs8, vs31 + +.endm + +.macro KERNEL4x2_E2 + + + xvmaddadp vs32, vs8, vs28 + + xvmaddadp vs40, vs8, vs29 + + xvmaddadp vs48, vs8, vs30 + + xvmaddadp vs56, vs8, vs31 + +.endm + +.macro KERNEL4x2_SUBI1 + + 
lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + + xvmuldp vs40, vs0, vs25 + + xvmuldp vs48, vs0, vs26 + + xvmuldp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x2_SUB1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + + xvmaddadp vs40, vs0, vs25 + + xvmaddadp vs48, vs0, vs26 + + xvmaddadp vs56, vs0, vs27 + +.endm + +.macro SAVE4x2 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r +#else + xvmuldp vs0, vs32, alpha_r +#endif + + stxvd2x vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r +#else + xvmuldp vs8, vs40, alpha_r +#endif + + stxvd2x vs8, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs48, alpha_r +#else + xvmuldp vs0, vs48, alpha_r +#endif + + stxvd2x vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs56, alpha_r +#else + xvmuldp vs8, vs56, alpha_r +#endif + + stxvd2x vs8, 0, T1 + + addi CO, CO, 16 + +.endm + +/********************************************************************* +* Macros for N=4, M=1 * +*********************************************************************/ + +.macro LOAD4x1_1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + lxsdx vs26, o16, BO + lxsdx vs27, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x1_I1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + lxsdx vs29, o8, BO + lxsdx vs30, o16, BO + lxsdx vs31, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmuldp vs32, vs0, vs24 + + xsmuldp vs40, vs0, vs25 + + xsmuldp vs48, vs0, vs26 + + xsmuldp vs56, vs0, 
vs27 + +.endm + +.macro KERNEL4x1_1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + lxsdx vs29, o8, BO + lxsdx vs30, o16, BO + lxsdx vs31, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs24 + + xsmaddadp vs40, vs0, vs25 + + xsmaddadp vs48, vs0, vs26 + + xsmaddadp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x1_2 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + lxsdx vs26, o16, BO + lxsdx vs27, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmaddadp vs32, vs8, vs28 + + xsmaddadp vs40, vs8, vs29 + + xsmaddadp vs48, vs8, vs30 + + xsmaddadp vs56, vs8, vs31 + +.endm + +.macro KERNEL4x1_E2 + + + xsmaddadp vs32, vs8, vs28 + + xsmaddadp vs40, vs8, vs29 + + xsmaddadp vs48, vs8, vs30 + + xsmaddadp vs56, vs8, vs31 + +.endm + +.macro KERNEL4x1_SUBI1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + lxsdx vs26, o16, BO + lxsdx vs27, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmuldp vs32, vs0, vs24 + + xsmuldp vs40, vs0, vs25 + + xsmuldp vs48, vs0, vs26 + + xsmuldp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x1_SUB1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + lxsdx vs26, o16, BO + lxsdx vs27, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs24 + + xsmaddadp vs40, vs0, vs25 + + xsmaddadp vs48, vs0, vs26 + + xsmaddadp vs56, vs0, vs27 + +.endm + +.macro SAVE4x1 + + mr T1, CO + +#ifndef TRMMKERNEL + lxsdx vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs0, vs32, alpha_r +#else + xsmuldp vs0, vs32, alpha_r +#endif + + stxsdx vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxsdx vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs8, vs40, alpha_r +#else + xsmuldp vs8, vs40, alpha_r +#endif + + stxsdx vs8, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxsdx vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs0, vs48, alpha_r +#else + xsmuldp vs0, vs48, alpha_r +#endif + + stxsdx vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxsdx vs8, 0, T1 
+#endif + +#ifndef TRMMKERNEL + xsmaddadp vs8, vs56, alpha_r +#else + xsmuldp vs8, vs56, alpha_r +#endif + + stxsdx vs8, 0, T1 + + addi CO, CO, 8 + +.endm + +/********************************************************************* +* Macros for N=2, M=16 * +*********************************************************************/ + +.macro LOAD2x16_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + +.endm + +.macro KERNEL2x16_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + xvmuldp vs44, vs4, vs25 + xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 + +.endm + +.macro KERNEL2x16_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + 
xvmaddadp vs43, vs3, vs25 + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + +.endm + +.macro KERNEL2x16_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + xvmaddadp vs44, vs12, vs29 + xvmaddadp vs45, vs13, vs29 + xvmaddadp vs46, vs14, vs29 + xvmaddadp vs47, vs15, vs29 + +.endm + +.macro KERNEL2x16_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + xvmaddadp vs44, vs12, vs29 + xvmaddadp vs45, vs13, vs29 + xvmaddadp vs46, vs14, vs29 + xvmaddadp vs47, vs15, vs29 + +.endm + +.macro KERNEL2x16_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + 
xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + xvmuldp vs44, vs4, vs25 + xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 + +.endm + +.macro KERNEL2x16_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + +.endm + +.macro SAVE2x16 + + mr T1, CO + addi T2, T1, 64 + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 + + lxvd2x vs4, 0, T2 + lxvd2x vs5, o16, T2 + lxvd2x vs6, o32, T2 + lxvd2x vs7, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r + xvmaddadp vs4, vs36, alpha_r + xvmaddadp vs5, vs37, alpha_r + xvmaddadp vs6, vs38, alpha_r + xvmaddadp vs7, vs39, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r + xvmuldp vs4, vs36, alpha_r + xvmuldp vs5, vs37, alpha_r + xvmuldp vs6, vs38, alpha_r + xvmuldp vs7, vs39, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + stxvd2x vs4, 0, T2 + stxvd2x vs5, o16, T2 + stxvd2x vs6, o32, T2 + stxvd2x vs7, o48, T2 + + add T1, T1, LDC + add T2, T2, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 + lxvd2x 
vs10, o32, T1 + lxvd2x vs11, o48, T1 + + lxvd2x vs12, 0, T2 + lxvd2x vs13, o16, T2 + lxvd2x vs14, o32, T2 + lxvd2x vs15, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r + xvmaddadp vs10, vs42, alpha_r + xvmaddadp vs11, vs43, alpha_r + xvmaddadp vs12, vs44, alpha_r + xvmaddadp vs13, vs45, alpha_r + xvmaddadp vs14, vs46, alpha_r + xvmaddadp vs15, vs47, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r + xvmuldp vs10, vs42, alpha_r + xvmuldp vs11, vs43, alpha_r + xvmuldp vs12, vs44, alpha_r + xvmuldp vs13, vs45, alpha_r + xvmuldp vs14, vs46, alpha_r + xvmuldp vs15, vs47, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + stxvd2x vs12, 0, T2 + stxvd2x vs13, o16, T2 + stxvd2x vs14, o32, T2 + stxvd2x vs15, o48, T2 + + addi CO, CO, 128 + +.endm + +/********************************************************************* +* Macros for N=2, M=8 * +*********************************************************************/ + +.macro LOAD2x8_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + +.endm + +.macro KERNEL2x8_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + +.endm + +.macro KERNEL2x8_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + + 
xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + +.endm + +.macro KERNEL2x8_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + +.endm + +.macro KERNEL2x8_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + +.endm + +.macro KERNEL2x8_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + +.endm + +.macro KERNEL2x8_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + +.endm + +.macro SAVE2x8 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r +#else + xvmuldp vs0, vs32, 
alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 + lxvd2x vs10, o32, T1 + lxvd2x vs11, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r + xvmaddadp vs10, vs42, alpha_r + xvmaddadp vs11, vs43, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r + xvmuldp vs10, vs42, alpha_r + xvmuldp vs11, vs43, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + addi CO, CO, 64 + +.endm + +/********************************************************************* +* Macros for N=2, M=4 * +*********************************************************************/ + +.macro LOAD2x4_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + +.endm + +.macro KERNEL2x4_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + +.endm + +.macro KERNEL2x4_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + +.endm + +.macro KERNEL2x4_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + +.endm + +.macro KERNEL2x4_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + + xvmaddadp vs40, vs8, vs29 + 
xvmaddadp vs41, vs9, vs29 + +.endm + +.macro KERNEL2x4_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + +.endm + +.macro KERNEL2x4_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + +.endm + +.macro SAVE2x4 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + + addi CO, CO, 32 + +.endm + +/********************************************************************* +* Macros for N=2, M=2 * +*********************************************************************/ + +.macro LOAD2x2_1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + +.endm + +.macro KERNEL2x2_I1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + + xvmuldp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x2_1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + + xvmaddadp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x2_2 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, 
o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmaddadp vs32, vs8, vs28 + + xvmaddadp vs40, vs8, vs29 + +.endm + +.macro KERNEL2x2_E2 + + + xvmaddadp vs32, vs8, vs28 + + xvmaddadp vs40, vs8, vs29 + +.endm + +.macro KERNEL2x2_SUBI1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + + xvmuldp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x2_SUB1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + + xvmaddadp vs40, vs0, vs25 + +.endm + +.macro SAVE2x2 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r +#else + xvmuldp vs0, vs32, alpha_r +#endif + + stxvd2x vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r +#else + xvmuldp vs8, vs40, alpha_r +#endif + + stxvd2x vs8, 0, T1 + + addi CO, CO, 16 + +.endm + +/********************************************************************* +* Macros for N=2, M=1 * +*********************************************************************/ + +.macro LOAD2x1_1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + +.endm + +.macro KERNEL2x1_I1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + lxsdx vs29, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmuldp vs32, vs0, vs24 + + xsmuldp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x1_1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + lxsdx vs29, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmaddadp vs32, vs0, vs24 + + xsmaddadp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x1_2 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmaddadp vs32, vs8, vs28 + + xsmaddadp vs40, vs8, vs29 + +.endm + +.macro KERNEL2x1_E2 + + + xsmaddadp vs32, vs8, vs28 + + xsmaddadp vs40, vs8, vs29 + 
+.endm + +.macro KERNEL2x1_SUBI1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmuldp vs32, vs0, vs24 + + xsmuldp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x1_SUB1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmaddadp vs32, vs0, vs24 + + xsmaddadp vs40, vs0, vs25 + +.endm + +.macro SAVE2x1 + + mr T1, CO + +#ifndef TRMMKERNEL + lxsdx vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs0, vs32, alpha_r +#else + xsmuldp vs0, vs32, alpha_r +#endif + + stxsdx vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxsdx vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs8, vs40, alpha_r +#else + xsmuldp vs8, vs40, alpha_r +#endif + + stxsdx vs8, 0, T1 + + addi CO, CO, 8 + +.endm + +/********************************************************************* +* Macros for N=1, M=16 * +*********************************************************************/ + +.macro LOAD1x16_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + +.endm + +.macro KERNEL1x16_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + +.endm + +.macro KERNEL1x16_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + 
lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + +.endm + +.macro KERNEL1x16_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + +.endm + +.macro KERNEL1x16_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + +.endm + +.macro KERNEL1x16_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + +.endm + +.macro KERNEL1x16_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + 
xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + +.endm + +.macro SAVE1x16 + + mr T1, CO + addi T2, T1, 64 + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 + + lxvd2x vs4, 0, T2 + lxvd2x vs5, o16, T2 + lxvd2x vs6, o32, T2 + lxvd2x vs7, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r + xvmaddadp vs4, vs36, alpha_r + xvmaddadp vs5, vs37, alpha_r + xvmaddadp vs6, vs38, alpha_r + xvmaddadp vs7, vs39, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r + xvmuldp vs4, vs36, alpha_r + xvmuldp vs5, vs37, alpha_r + xvmuldp vs6, vs38, alpha_r + xvmuldp vs7, vs39, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + stxvd2x vs4, 0, T2 + stxvd2x vs5, o16, T2 + stxvd2x vs6, o32, T2 + stxvd2x vs7, o48, T2 + + addi CO, CO, 128 + +.endm + +/********************************************************************* +* Macros for N=1, M=8 * +*********************************************************************/ + +.macro LOAD1x8_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + +.endm + +.macro KERNEL1x8_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + +.endm + +.macro KERNEL1x8_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, 
vs24 + xvmaddadp vs35, vs3, vs24 + +.endm + +.macro KERNEL1x8_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + +.endm + +.macro KERNEL1x8_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + +.endm + +.macro KERNEL1x8_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + +.endm + +.macro KERNEL1x8_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + +.endm + +.macro SAVE1x8 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + addi CO, CO, 64 + +.endm + +/********************************************************************* +* Macros for N=1, M=4 * +*********************************************************************/ + +.macro LOAD1x4_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + +.endm + +.macro KERNEL1x4_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + 
lxvdsx vs28, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + +.endm + +.macro KERNEL1x4_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + +.endm + +.macro KERNEL1x4_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + +.endm + +.macro KERNEL1x4_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + +.endm + +.macro KERNEL1x4_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + +.endm + +.macro KERNEL1x4_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + +.endm + +.macro SAVE1x4 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + + addi CO, CO, 32 + +.endm + +/********************************************************************* +* Macros for N=1, M=2 * +*********************************************************************/ + +.macro LOAD1x2_1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + +.endm + +.macro KERNEL1x2_I1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x2_1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x2_2 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 16 + addi 
BO, BO, 8 + + + xvmaddadp vs32, vs8, vs28 + +.endm + +.macro KERNEL1x2_E2 + + + xvmaddadp vs32, vs8, vs28 + +.endm + +.macro KERNEL1x2_SUBI1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x2_SUB1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + +.endm + +.macro SAVE1x2 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r +#else + xvmuldp vs0, vs32, alpha_r +#endif + + stxvd2x vs0, 0, T1 + + addi CO, CO, 16 + +.endm + +/********************************************************************* +* Macros for N=1, M=1 * +*********************************************************************/ + +.macro LOAD1x1_1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + +.endm + +.macro KERNEL1x1_I1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmuldp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x1_1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmaddadp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x1_2 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmaddadp vs32, vs8, vs28 + +.endm + +.macro KERNEL1x1_E2 + + + xsmaddadp vs32, vs8, vs28 + +.endm + +.macro KERNEL1x1_SUBI1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmuldp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x1_SUB1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmaddadp vs32, vs0, vs24 + +.endm + +.macro SAVE1x1 + + mr T1, CO + +#ifndef TRMMKERNEL + lxsdx vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs0, vs32, alpha_r +#else + xsmuldp vs0, vs32, alpha_r +#endif + + stxsdx vs0, 0, T1 + + addi CO, CO, 8 + +.endm + diff --git a/param.h b/param.h index aa09f6d61..e693755fa 100644 --- 
a/param.h +++ b/param.h @@ -410,7 +410,100 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(STEAMROLLER) || defined(EXCAVATOR) +#ifdef STEAMROLLER +#define SNUMOPT 8 +#define DNUMOPT 4 + +#define GEMM_DEFAULT_OFFSET_A 64 +#define GEMM_DEFAULT_OFFSET_B 832 +#define GEMM_DEFAULT_ALIGN 0x0fffUL + + + +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#ifdef ARCH_X86 +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define SGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_M 1 +#else +#define SGEMM_DEFAULT_UNROLL_N 2 +#define DGEMM_DEFAULT_UNROLL_N 2 +#define SGEMM_DEFAULT_UNROLL_M 16 +#define DGEMM_DEFAULT_UNROLL_M 8 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define XGEMM_DEFAULT_UNROLL_M 1 +#define CGEMM3M_DEFAULT_UNROLL_N 4 +#define CGEMM3M_DEFAULT_UNROLL_M 8 +#define ZGEMM3M_DEFAULT_UNROLL_N 4 +#define ZGEMM3M_DEFAULT_UNROLL_M 4 +#define GEMV_UNROLL 8 +#endif + +#if defined(ARCH_X86_64) +#define SGEMM_DEFAULT_P 768 +#define DGEMM_DEFAULT_P 576 +#define ZGEMM_DEFAULT_P 288 +#define CGEMM_DEFAULT_P 576 +#else +#define SGEMM_DEFAULT_P 448 +#define DGEMM_DEFAULT_P 480 +#define ZGEMM_DEFAULT_P 112 +#define CGEMM_DEFAULT_P 224 +#endif +#define QGEMM_DEFAULT_P 112 +#define XGEMM_DEFAULT_P 56 + +#if defined(ARCH_X86_64) +#define SGEMM_DEFAULT_Q 192 +#define DGEMM_DEFAULT_Q 160 +#define ZGEMM_DEFAULT_Q 160 +#define CGEMM_DEFAULT_Q 160 +#else +#define SGEMM_DEFAULT_Q 224 +#define DGEMM_DEFAULT_Q 224 +#define ZGEMM_DEFAULT_Q 224 +#define CGEMM_DEFAULT_Q 224 +#endif +#define QGEMM_DEFAULT_Q 224 +#define XGEMM_DEFAULT_Q 224 + +#define CGEMM3M_DEFAULT_P 448 +#define ZGEMM3M_DEFAULT_P 224 +#define XGEMM3M_DEFAULT_P 112 +#define 
CGEMM3M_DEFAULT_Q 224 +#define ZGEMM3M_DEFAULT_Q 224 +#define XGEMM3M_DEFAULT_Q 224 +#define CGEMM3M_DEFAULT_R 12288 +#define ZGEMM3M_DEFAULT_R 12288 +#define XGEMM3M_DEFAULT_R 12288 + +#define SGEMM_DEFAULT_R 12288 +#define QGEMM_DEFAULT_R qgemm_r +#define DGEMM_DEFAULT_R 12288 +#define CGEMM_DEFAULT_R cgemm_r +#define ZGEMM_DEFAULT_R zgemm_r +#define XGEMM_DEFAULT_R xgemm_r + +#define SYMV_P 16 +#define HAVE_EXCLUSIVE_CACHE + +#define GEMM_THREAD gemm_thread_mn + +#endif + + +#ifdef EXCAVATOR #define SNUMOPT 8 #define DNUMOPT 4 @@ -1885,12 +1978,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM_DEFAULT_UNROLL_N 2 #define SGEMM_DEFAULT_P 1280 -#define DGEMM_DEFAULT_P 640 +#define DGEMM_DEFAULT_P 768 #define CGEMM_DEFAULT_P 640 #define ZGEMM_DEFAULT_P 320 #define SGEMM_DEFAULT_Q 640 -#define DGEMM_DEFAULT_Q 640 +#define DGEMM_DEFAULT_Q 768 #define CGEMM_DEFAULT_Q 640 #define ZGEMM_DEFAULT_Q 640 From 782f75ba94974bdf92e85ae9c978b21f94d6d25f Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Wed, 27 Apr 2016 15:48:09 +0200 Subject: [PATCH 21/70] optimized param.h for POWER8 --- param.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/param.h b/param.h index e693755fa..b1bce23a0 100644 --- a/param.h +++ b/param.h @@ -1978,12 +1978,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define ZGEMM_DEFAULT_UNROLL_N 2 #define SGEMM_DEFAULT_P 1280 -#define DGEMM_DEFAULT_P 768 +#define DGEMM_DEFAULT_P 640 #define CGEMM_DEFAULT_P 640 #define ZGEMM_DEFAULT_P 320 #define SGEMM_DEFAULT_Q 640 -#define DGEMM_DEFAULT_Q 768 +#define DGEMM_DEFAULT_Q 640 #define CGEMM_DEFAULT_Q 640 #define ZGEMM_DEFAULT_Q 640 From 20b0ed1da598a98b2b5036bc0decbd4aae79aa20 Mon Sep 17 00:00:00 2001 From: buffer51 Date: Wed, 27 Apr 2016 12:09:44 -0700 Subject: [PATCH 22/70] Fixed cross-suffix detection for path that contains dashes when the compiler itself doesn't --- c_check | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/c_check b/c_check index bcf4c2cb3..5242e33a9 100644 --- a/c_check +++ b/c_check @@ -1,5 +1,7 @@ #!/usr/bin/perl +use File::Basename; + # Checking cross compile $hostos = `uname -s | sed -e s/\-.*//`; chop($hostos); $hostarch = `uname -m | sed -e s/i.86/x86/`;chop($hostarch); @@ -26,14 +28,12 @@ if ($?) { $cross_suffix = ""; -if ($ARGV[0] =~ /(.*)(-[.\d]+)/) { - if ($1 =~ /(.*-)(.*)/) { - $cross_suffix = $1; - } -} else { - if ($ARGV[0] =~ /([^\/]*-)([^\/]*$)/) { - $cross_suffix = $1; - } +if (dirname($compiler_name) ne ".") { + $cross_suffix .= dirname($compiler_name) . 
"/"; +} + +if (basename($compiler_name) =~ /(.*-)(.*)/) { + $cross_suffix .= $1; } $compiler = ""; From 708dec5bb7fc2b1244934e585fd5e57ccbd66722 Mon Sep 17 00:00:00 2001 From: buffer51 Date: Wed, 27 Apr 2016 22:23:02 -0700 Subject: [PATCH 23/70] Use CROSS_SUFFIX only if CROSS is set --- c_check | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/c_check b/c_check index 5242e33a9..3a5601b1d 100644 --- a/c_check +++ b/c_check @@ -243,7 +243,7 @@ print MAKEFILE "BINARY64=\n" if $binformat ne bin64; print MAKEFILE "BINARY32=1\n" if $binformat eq bin32; print MAKEFILE "BINARY64=1\n" if $binformat eq bin64; print MAKEFILE "FU=$need_fu\n" if $need_fu ne ""; -print MAKEFILE "CROSS_SUFFIX=$cross_suffix\n" if $cross_suffix ne ""; +print MAKEFILE "CROSS_SUFFIX=$cross_suffix\n" if $cross != 0 && $cross_suffix ne ""; print MAKEFILE "CROSS=1\n" if $cross != 0; print MAKEFILE "CEXTRALIB=$linker_L $linker_l $linker_a\n"; From 0fb380c966c071e2af71031b7d82b8bf2e3246a1 Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Fri, 29 Apr 2016 11:58:15 +0530 Subject: [PATCH 24/70] Update NUMA CPU binding When the number of process can all be accommodated within the current node, then use cores from the current node only. 
--- driver/others/init.c | 109 ++++++++++++++++++++++++++++++++----------- 1 file changed, 83 insertions(+), 26 deletions(-) diff --git a/driver/others/init.c b/driver/others/init.c index f134f85f7..801f93991 100644 --- a/driver/others/init.c +++ b/driver/others/init.c @@ -361,6 +361,9 @@ static void numa_mapping(void) { unsigned long work, bit; int count = 0; int bitmask_idx = 0; + int current_cpu; + int current_node = 0; + int cpu_count = 0; for (node = 0; node < common -> num_nodes; node ++) { core = 0; @@ -382,33 +385,84 @@ static void numa_mapping(void) { fprintf(stderr, "CPU (%2d) : %08lx\n", cpu, common -> cpu_info[cpu]); #endif - h = 1; - - while (h < count) h = 2 * h + 1; - - while (h > 1) { - h /= 2; - for (i = h; i < count; i++) { - work = common -> cpu_info[i]; - bit = CPU_ISSET(i, &cpu_orig_mask[0]); - j = i - h; - while (work < common -> cpu_info[j]) { - common -> cpu_info[j + h] = common -> cpu_info[j]; - if (CPU_ISSET(j, &cpu_orig_mask[0])) { - CPU_SET(j + h, &cpu_orig_mask[0]); - } else { - CPU_CLR(j + h, &cpu_orig_mask[0]); - } - j -= h; - if (j < 0) break; - } - common -> cpu_info[j + h] = work; - if (bit) { - CPU_SET(j + h, &cpu_orig_mask[0]); - } else { - CPU_CLR(j + h, &cpu_orig_mask[0]); + current_cpu = sched_getcpu(); + for (cpu = 0; cpu < count; cpu++) { + if (READ_CPU(common -> cpu_info[cpu]) == current_cpu) { + current_node = READ_NODE(common -> cpu_info[cpu]); + break; + } + } + for (i = 0; i < MAX_BITMASK_LEN; i++) + cpu_count += popcount(common -> node_info[current_node][i] & common -> avail[i]); + + /* + * If all the processes can be accommodated in the + * current node itself, then bind to cores + * from the current node only + */ + if (numprocs <= cpu_count) { + /* + * First sort all the cores in order from the current node. + * Then take remaining nodes one by one in order, + * and sort their cores in order.
+ */ + for (i = 0; i < count; i++) { + for (j = 0; j < count - 1; j++) { + int node_1, node_2; + int core_1, core_2; + int swap = 0; + + node_1 = READ_NODE(common -> cpu_info[j]); + node_2 = READ_NODE(common -> cpu_info[j + 1]); + core_1 = READ_CORE(common -> cpu_info[j]); + core_2 = READ_CORE(common -> cpu_info[j + 1]); + + if (node_1 == node_2) { + if (core_1 > core_2) + swap = 1; + } else { + if ((node_2 == current_node) || + ((node_1 != current_node) && (node_1 > node_2))) + swap = 1; + } + if (swap) { + unsigned long temp; + + temp = common->cpu_info[j]; + common->cpu_info[j] = common->cpu_info[j + 1]; + common->cpu_info[j + 1] = temp; + } } + } + } else { + h = 1; + + while (h < count) h = 2 * h + 1; + + while (h > 1) { + h /= 2; + for (i = h; i < count; i++) { + work = common -> cpu_info[i]; + bit = CPU_ISSET(i, &cpu_orig_mask[0]); + j = i - h; + while (work < common -> cpu_info[j]) { + common -> cpu_info[j + h] = common -> cpu_info[j]; + if (CPU_ISSET(j, &cpu_orig_mask[0])) { + CPU_SET(j + h, &cpu_orig_mask[0]); + } else { + CPU_CLR(j + h, &cpu_orig_mask[0]); + } + j -= h; + if (j < 0) break; + } + common -> cpu_info[j + h] = work; + if (bit) { + CPU_SET(j + h, &cpu_orig_mask[0]); + } else { + CPU_CLR(j + h, &cpu_orig_mask[0]); + } + } } } @@ -416,7 +470,10 @@ static void numa_mapping(void) { fprintf(stderr, "\nSorting ...\n\n"); for (cpu = 0; cpu < count; cpu++) - fprintf(stderr, "CPU (%2d) : %08lx\n", cpu, common -> cpu_info[cpu]); + fprintf(stderr, "CPUINFO (%2d) : %08lx (CPU=%3lu CORE=%3lu NODE=%3lu)\n", cpu, common -> cpu_info[cpu], + READ_CPU(common -> cpu_info[cpu]), + READ_CORE(common -> cpu_info[cpu]), + READ_NODE(common -> cpu_info[cpu])); #endif } From 56948dbf0fcbc016f45995301b8c4e2cb673860c Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Fri, 29 Apr 2016 12:52:47 +0200 Subject: [PATCH 25/70] optimized dgemm for POWER8 --- kernel/power/KERNEL.POWER8 | 2 +- kernel/power/dgemm_kernel_16x4_power8.S | 24 +- kernel/power/dgemm_logic_16x4_power8.S | 
93 +-- kernel/power/dgemm_macros_16x4_power8.S | 514 ++++++++------- kernel/power/dgemm_ncopy_4_power8.S | 228 +++++++ kernel/power/dgemm_ncopy_logic_4_power8.S | 237 +++++++ kernel/power/dgemm_ncopy_macros_4_power8.S | 691 +++++++++++++++++++++ 7 files changed, 1532 insertions(+), 257 deletions(-) create mode 100644 kernel/power/dgemm_ncopy_4_power8.S create mode 100644 kernel/power/dgemm_ncopy_logic_4_power8.S create mode 100644 kernel/power/dgemm_ncopy_macros_4_power8.S diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index fb07ccffd..8e3d084aa 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -21,7 +21,7 @@ SGEMMOTCOPYOBJ = sgemm_otcopy.o DGEMMKERNEL = dgemm_kernel_16x4_power8.S DGEMMINCOPY = ../generic/gemm_ncopy_16.c DGEMMITCOPY = dgemm_tcopy_16_power8.S -DGEMMONCOPY = ../generic/gemm_ncopy_4.c +DGEMMONCOPY = dgemm_ncopy_4_power8.S DGEMMOTCOPY = ../generic/gemm_tcopy_4.c DGEMMINCOPYOBJ = dgemm_incopy.o DGEMMITCOPYOBJ = dgemm_itcopy.o diff --git a/kernel/power/dgemm_kernel_16x4_power8.S b/kernel/power/dgemm_kernel_16x4_power8.S index bcc6ce328..8af7fe389 100644 --- a/kernel/power/dgemm_kernel_16x4_power8.S +++ b/kernel/power/dgemm_kernel_16x4_power8.S @@ -134,13 +134,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define T4 r12 #define T3 r11 +#define o40 r12 +#define o56 r11 + +#define o112 r14 #define o8 r15 #define o24 r16 -#define ALPHA r17 +#define o64 r17 #define L r18 #define T1 r19 -#define KK r20 -#define BB r21 +#define o80 r20 +#define o96 r21 #define I r22 #define J r23 #define AO r24 @@ -205,6 +209,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. std r17, 256(SP) std r16, 264(SP) std r15, 272(SP) + std r14, 280(SP) #else stw r31, 144(SP) stw r30, 148(SP) @@ -223,6 +228,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
stw r17, 200(SP) stw r16, 204(SP) stw r15, 208(SP) + stw r14, 212(SP) #endif stfd f1, ALPHA_SP @@ -263,9 +269,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ble .L999_H1 #ifdef __64BIT__ - addi ALPHA, SP, 296 + addi T1, SP, 296 #else - addi ALPHA, SP, 224 + addi T1, SP, 224 #endif li PRE, 384 @@ -274,8 +280,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. li o24, 24 li o32, 32 li o48, 48 + li o64, 64 + li o80, 80 + li o96, 96 + li o112, 112 - lxvdsx alpha_r, 0, ALPHA + lxvdsx alpha_r, 0, T1 #include "dgemm_logic_16x4_power8.S" @@ -323,6 +333,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld r17, 256(SP) ld r16, 264(SP) ld r15, 272(SP) + ld r14, 280(SP) #else lwz r31, 144(SP) lwz r30, 148(SP) @@ -341,6 +352,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lwz r17, 200(SP) lwz r16, 204(SP) lwz r15, 208(SP) + lwz r14, 212(SP) #endif addi SP, SP, STACKSIZE diff --git a/kernel/power/dgemm_logic_16x4_power8.S b/kernel/power/dgemm_logic_16x4_power8.S index 4ad3387e8..718f80bdd 100644 --- a/kernel/power/dgemm_logic_16x4_power8.S +++ b/kernel/power/dgemm_logic_16x4_power8.S @@ -46,23 +46,28 @@ LDGEMM_L4_BEGIN: srawi. I, M, 4 ble LDGEMM_L4x16_END - .align 5 + .align 4 LDGEMM_L4x16_BEGIN: - li T4, -128 + li L, -128 - and T1, CO, T4 + mr T1, CO add T2, T1, LDC add T3, T2, LDC add T4, T3, LDC + and T1, T1, L + and T2, T2, L + and T3, T3, L + and T4, T4, L + dcbt T1, r0 dcbt T2, r0 dcbt T3, r0 dcbt T4, r0 - andi. cr0, CO, 127 - ble LDGEMM_L4x16_BEGIN_NOPRE + mr BO, B + srawi. L, K, 1 addi T1, T1, 128 addi T2, T2, 128 @@ -74,55 +79,43 @@ LDGEMM_L4x16_BEGIN: dcbt T3, r0 dcbt T4, r0 - -LDGEMM_L4x16_BEGIN_NOPRE: - - mr BO, B - srawi. 
L, K, 2 ble LDGEMM_L4x16_SUB0 cmpwi cr0, L, 1 ble LDGEMM_L4x16_SUB4 - .align 5 + .align 4 LDGEMM_L4x16_LOOP_START: + li o40, 40 + li o56, 56 + dcbt AO, PRE LOAD4x16_1 dcbt AO, PRE KERNEL4x16_I1 dcbt AO, PRE - KERNEL4x16_2 - dcbt AO, PRE - KERNEL4x16_1 - dcbt AO, PRE - KERNEL4x16_2 - addic. L, L, -2 + KERNEL4x16_L2 + ble LDGEMM_L4x16_LOOP_END - .align 7 + .align 4 LDGEMM_L4x16_LOOP: + dcbt AO, PRE - KERNEL4x16_1 - dcbt AO, PRE - KERNEL4x16_2 - dcbt AO, PRE - KERNEL4x16_1 + KERNEL4x16_L1 dcbt AO, PRE - KERNEL4x16_2 - addic. L, L, -1 + KERNEL4x16_L2 + bgt LDGEMM_L4x16_LOOP - .align 5 + .align 4 + LDGEMM_L4x16_LOOP_END: - dcbt AO, PRE - KERNEL4x16_1 - dcbt AO, PRE - KERNEL4x16_2 KERNEL4x16_1 KERNEL4x16_E2 @@ -132,14 +125,12 @@ LDGEMM_L4x16_SUB4: KERNEL4x16_SUBI1 KERNEL4x16_SUB1 - KERNEL4x16_SUB1 - KERNEL4x16_SUB1 b LDGEMM_L4x16_SUB1 LDGEMM_L4x16_SUB0: - andi. L, K, 3 + andi. L, K, 1 KERNEL4x16_SUBI1 @@ -149,7 +140,7 @@ LDGEMM_L4x16_SUB0: LDGEMM_L4x16_SUB1: - andi. L, K, 3 + andi. L, K, 1 ble LDGEMM_L4x16_SAVE LDGEMM_L4x16_SUB2: @@ -159,7 +150,7 @@ LDGEMM_L4x16_SUB2: addic. L, L, -1 bgt LDGEMM_L4x16_SUB2 - .align 5 + .align 4 LDGEMM_L4x16_SAVE: SAVE4x16 @@ -184,15 +175,20 @@ LDGEMM_L4x8_BEGIN: LDGEMM_L4x8_LOOP_START: + dcbt AO, PRE LOAD4x8_1 KERNEL4x8_I1 + dcbt AO, PRE KERNEL4x8_2 KERNEL4x8_1 + dcbt AO, PRE KERNEL4x8_2 KERNEL4x8_1 + dcbt AO, PRE KERNEL4x8_2 KERNEL4x8_1 + dcbt AO, PRE KERNEL4x8_2 addic. L, L, -2 @@ -203,13 +199,17 @@ LDGEMM_L4x8_LOOP_START: LDGEMM_L4x8_LOOP: KERNEL4x8_1 + dcbt AO, PRE KERNEL4x8_2 KERNEL4x8_1 + dcbt AO, PRE KERNEL4x8_2 KERNEL4x8_1 + dcbt AO, PRE KERNEL4x8_2 KERNEL4x8_1 + dcbt AO, PRE KERNEL4x8_2 addic. L, L, -1 @@ -284,15 +284,18 @@ LDGEMM_L4x4_BEGIN: LDGEMM_L4x4_LOOP_START: + dcbt AO, PRE LOAD4x4_1 KERNEL4x4_I1 KERNEL4x4_2 KERNEL4x4_1 + dcbt AO, PRE KERNEL4x4_2 KERNEL4x4_1 KERNEL4x4_2 KERNEL4x4_1 + dcbt AO, PRE KERNEL4x4_2 addic. 
L, L, -2 @@ -305,11 +308,13 @@ LDGEMM_L4x4_LOOP: KERNEL4x4_1 KERNEL4x4_2 KERNEL4x4_1 + dcbt AO, PRE KERNEL4x4_2 KERNEL4x4_1 KERNEL4x4_2 KERNEL4x4_1 + dcbt AO, PRE KERNEL4x4_2 addic. L, L, -1 @@ -743,15 +748,20 @@ LDGEMM_L2x8_BEGIN: LDGEMM_L2x8_LOOP_START: + dcbt AO, PRE LOAD2x8_1 KERNEL2x8_I1 + dcbt AO, PRE KERNEL2x8_2 KERNEL2x8_1 + dcbt AO, PRE KERNEL2x8_2 KERNEL2x8_1 + dcbt AO, PRE KERNEL2x8_2 KERNEL2x8_1 + dcbt AO, PRE KERNEL2x8_2 addic. L, L, -2 @@ -762,13 +772,17 @@ LDGEMM_L2x8_LOOP_START: LDGEMM_L2x8_LOOP: KERNEL2x8_1 + dcbt AO, PRE KERNEL2x8_2 KERNEL2x8_1 + dcbt AO, PRE KERNEL2x8_2 KERNEL2x8_1 + dcbt AO, PRE KERNEL2x8_2 KERNEL2x8_1 + dcbt AO, PRE KERNEL2x8_2 addic. L, L, -1 @@ -1287,15 +1301,20 @@ LDGEMM_L1x8_BEGIN: LDGEMM_L1x8_LOOP_START: + dcbt AO, PRE LOAD1x8_1 KERNEL1x8_I1 + dcbt AO, PRE KERNEL1x8_2 KERNEL1x8_1 + dcbt AO, PRE KERNEL1x8_2 KERNEL1x8_1 + dcbt AO, PRE KERNEL1x8_2 KERNEL1x8_1 + dcbt AO, PRE KERNEL1x8_2 addic. L, L, -2 @@ -1306,13 +1325,17 @@ LDGEMM_L1x8_LOOP_START: LDGEMM_L1x8_LOOP: KERNEL1x8_1 + dcbt AO, PRE KERNEL1x8_2 KERNEL1x8_1 + dcbt AO, PRE KERNEL1x8_2 KERNEL1x8_1 + dcbt AO, PRE KERNEL1x8_2 KERNEL1x8_1 + dcbt AO, PRE KERNEL1x8_2 addic. L, L, -1 diff --git a/kernel/power/dgemm_macros_16x4_power8.S b/kernel/power/dgemm_macros_16x4_power8.S index 36531fbe9..2c7851207 100644 --- a/kernel/power/dgemm_macros_16x4_power8.S +++ b/kernel/power/dgemm_macros_16x4_power8.S @@ -47,88 +47,88 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO - addi AO, AO, 64 - - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO - lxvd2x vs6, o32, AO - lxvd2x vs7, o48, AO + lxvd2x vs4, o64, AO + lxvd2x vs5, o80, AO + lxvd2x vs6, o96, AO + lxvd2x vs7, o112, AO lxvdsx vs26, o16, BO lxvdsx vs27, o24, BO - addi AO, AO, 64 + addi AO, AO, 128 addi BO, BO, 32 .endm + .macro KERNEL4x16_I1 - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 - lxvd2x vs8, 0, AO + lxvd2x vs8, o0, AO lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO - xvmuldp vs36, vs4, vs24 - xvmuldp vs37, vs5, vs24 - xvmuldp vs38, vs6, vs24 - xvmuldp vs39, vs7, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 lxvdsx vs28, 0, BO lxvdsx vs29, o8, BO - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - xvmuldp vs42, vs2, vs25 - xvmuldp vs43, vs3, vs25 + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 - lxvd2x vs10, o32, AO - lxvd2x vs11, o48, AO - xvmuldp vs44, vs4, vs25 - xvmuldp vs45, vs5, vs25 - xvmuldp vs46, vs6, vs25 - xvmuldp vs47, vs7, vs25 + xvmuldp vs44, vs4, vs25 + xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 - addi AO, AO, 64 - xvmuldp vs48, vs0, vs26 - xvmuldp vs49, vs1, vs26 - xvmuldp vs50, vs2, vs26 - xvmuldp vs51, vs3, vs26 + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 - lxvd2x vs12, 0, AO - lxvd2x vs13, o16, AO + lxvd2x vs12, o64, AO + lxvd2x vs13, o80, AO - xvmuldp vs52, vs4, vs26 - xvmuldp vs53, vs5, vs26 - xvmuldp vs54, vs6, vs26 - xvmuldp vs55, vs7, vs26 + xvmuldp vs52, vs4, vs26 + xvmuldp vs53, vs5, vs26 + xvmuldp vs54, vs6, vs26 + xvmuldp vs55, vs7, vs26 - lxvd2x vs14, o32, AO - lxvd2x vs15, o48, AO + lxvd2x vs14, o96, AO + lxvd2x vs15, o112, AO 
+ + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 - xvmuldp vs56, vs0, vs27 - xvmuldp vs57, vs1, vs27 - xvmuldp vs58, vs2, vs27 - xvmuldp vs59, vs3, vs27 lxvdsx vs30, o16, BO lxvdsx vs31, o24, BO - xvmuldp vs60, vs4, vs27 - xvmuldp vs61, vs5, vs27 - xvmuldp vs62, vs6, vs27 - xvmuldp vs63, vs7, vs27 + xvmuldp vs60, vs4, vs27 + xvmuldp vs61, vs5, vs27 + xvmuldp vs62, vs6, vs27 + xvmuldp vs63, vs7, vs27 - addi AO, AO, 64 - addi BO, BO, 32 + addi AO, AO, 128 .endm + + .macro KERNEL4x16_1 xvmaddadp vs32, vs0, vs24 @@ -136,8 +136,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs34, vs2, vs24 xvmaddadp vs35, vs3, vs24 - lxvd2x vs8, 0, AO + lxvd2x vs8, o0, AO lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO xvmaddadp vs36, vs4, vs24 xvmaddadp vs37, vs5, vs24 @@ -152,31 +154,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs42, vs2, vs25 xvmaddadp vs43, vs3, vs25 - lxvd2x vs10, o32, AO - lxvd2x vs11, o48, AO xvmaddadp vs44, vs4, vs25 xvmaddadp vs45, vs5, vs25 xvmaddadp vs46, vs6, vs25 xvmaddadp vs47, vs7, vs25 - addi AO, AO, 64 xvmaddadp vs48, vs0, vs26 xvmaddadp vs49, vs1, vs26 xvmaddadp vs50, vs2, vs26 xvmaddadp vs51, vs3, vs26 - lxvd2x vs12, 0, AO - lxvd2x vs13, o16, AO + lxvd2x vs12, o64, AO + lxvd2x vs13, o80, AO xvmaddadp vs52, vs4, vs26 xvmaddadp vs53, vs5, vs26 xvmaddadp vs54, vs6, vs26 xvmaddadp vs55, vs7, vs26 - lxvd2x vs14, o32, AO - lxvd2x vs15, o48, AO + lxvd2x vs14, o96, AO + lxvd2x vs15, o112, AO xvmaddadp vs56, vs0, vs27 xvmaddadp vs57, vs1, vs27 @@ -192,7 +191,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs62, vs6, vs27 xvmaddadp vs63, vs7, vs27 - addi AO, AO, 64 + addi AO, AO, 128 addi BO, BO, 32 .endm @@ -228,23 +227,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddadp vs46, vs14, vs29 xvmaddadp vs47, vs15, vs29 - addi AO, AO, 64 xvmaddadp vs48, vs8, vs30 xvmaddadp vs49, vs9, vs30 xvmaddadp vs50, vs10, vs30 xvmaddadp vs51, vs11, vs30 - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO + lxvd2x vs4, o64, AO + lxvd2x vs5, o80, AO xvmaddadp vs52, vs12, vs30 xvmaddadp vs53, vs13, vs30 xvmaddadp vs54, vs14, vs30 xvmaddadp vs55, vs15, vs30 - lxvd2x vs6, o32, AO - lxvd2x vs7, o48, AO + lxvd2x vs6, o96, AO + lxvd2x vs7, o112, AO xvmaddadp vs56, vs8, vs31 xvmaddadp vs57, vs9, vs31 @@ -259,11 +257,144 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs62, vs14, vs31 xvmaddadp vs63, vs15, vs31 - addi AO, AO, 64 + addi AO, AO, 128 addi BO, BO, 32 .endm +.macro KERNEL4x16_L1 + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + + lxvd2x vs8, o0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + + + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + + lxvd2x vs12, o64, AO + lxvd2x vs13, o80, AO + + xvmaddadp vs52, vs4, vs26 + xvmaddadp vs53, vs5, vs26 + xvmaddadp vs54, vs6, vs26 + xvmaddadp vs55, vs7, vs26 + + lxvd2x vs14, o96, AO + lxvd2x vs15, o112, AO + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + + + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + xvmaddadp vs60, vs4, vs27 + xvmaddadp vs61, vs5, vs27 + xvmaddadp vs62, vs6, vs27 + xvmaddadp vs63, vs7, vs27 + + addi AO, AO, 128 + +.endm + +.macro KERNEL4x16_L2 + 
+ xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + + lxvdsx vs24, o32, BO + lxvdsx vs25, o40, BO + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + xvmaddadp vs44, vs12, vs29 + xvmaddadp vs45, vs13, vs29 + xvmaddadp vs46, vs14, vs29 + xvmaddadp vs47, vs15, vs29 + + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + xvmaddadp vs50, vs10, vs30 + xvmaddadp vs51, vs11, vs30 + + lxvd2x vs4, o64, AO + lxvd2x vs5, o80, AO + + xvmaddadp vs52, vs12, vs30 + xvmaddadp vs53, vs13, vs30 + xvmaddadp vs54, vs14, vs30 + xvmaddadp vs55, vs15, vs30 + + lxvd2x vs6, o96, AO + lxvd2x vs7, o112, AO + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + xvmaddadp vs58, vs10, vs31 + xvmaddadp vs59, vs11, vs31 + + lxvdsx vs26, o48, BO + lxvdsx vs27, o56, BO + + xvmaddadp vs60, vs12, vs31 + addi AO, AO, 128 + xvmaddadp vs61, vs13, vs31 + xvmaddadp vs62, vs14, vs31 + addi BO, BO, 64 + xvmaddadp vs63, vs15, vs31 + + +.endm + + .macro KERNEL4x16_E2 @@ -378,15 +509,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvdsx vs26, o16, BO lxvdsx vs27, o24, BO - addi AO, AO, 64 - addi BO, BO, 32 - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO - lxvd2x vs6, o32, AO - lxvd2x vs7, o48, AO + lxvd2x vs4, o64, AO + lxvd2x vs5, o80, AO + lxvd2x vs6, o96, AO + lxvd2x vs7, o112, AO - addi AO, AO, 64 xvmaddadp vs32, vs0, vs24 @@ -402,6 +530,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddadp vs41, vs1, vs25 xvmaddadp vs42, vs2, vs25 xvmaddadp vs43, vs3, vs25 + addi BO, BO, 32 xvmaddadp vs44, vs4, vs25 xvmaddadp vs45, vs5, vs25 xvmaddadp vs46, vs6, vs25 @@ -411,6 +540,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs49, vs1, vs26 xvmaddadp vs50, vs2, vs26 xvmaddadp vs51, vs3, vs26 + addi AO, AO, 128 xvmaddadp vs52, vs4, vs26 xvmaddadp vs53, vs5, vs26 xvmaddadp vs54, vs6, vs26 @@ -430,33 +560,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE4x16 mr T1, CO - addi T2, T1, 64 - add T3, T1, LDC - addi T4, T3, 64 - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 - lxvd2x vs1, o16, T1 - lxvd2x vs2, o32, T1 - lxvd2x vs3, o48, T1 - - lxvd2x vs4, 0, T2 - lxvd2x vs5, o16, T2 - lxvd2x vs6, o32, T2 - lxvd2x vs7, o48, T2 - - lxvd2x vs8, 0, T3 - lxvd2x vs9, o16, T3 - lxvd2x vs10, o32, T3 - lxvd2x vs11, o48, T3 - - lxvd2x vs12, 0, T4 - lxvd2x vs13, o16, T4 - lxvd2x vs14, o32, T4 - lxvd2x vs15, o48, T4 -#endif + add T2, T1, LDC + add T3, T2, LDC + add T4, T3, LDC + + lxvd2x vs0, 0, CO + lxvd2x vs1, o16, CO + lxvd2x vs2, o32, CO + lxvd2x vs3, o48, CO + lxvd2x vs4, o64, CO + lxvd2x vs5, o80, CO + lxvd2x vs6, o96, CO + lxvd2x vs7, o112, CO + + lxvd2x vs8, 0, T2 + lxvd2x vs9, o16, T2 + lxvd2x vs10, o32, T2 + lxvd2x vs11, o48, T2 + lxvd2x vs12, o64, T2 + lxvd2x vs13, o80, T2 + lxvd2x vs14, o96, T2 + lxvd2x vs15, o112, T2 + + lxvd2x vs24, 0, T3 + lxvd2x vs25, o16, T3 + lxvd2x vs26, o32, T3 + lxvd2x vs27, o48, T3 + lxvd2x vs28, o64, T3 + lxvd2x vs29, o80, T3 + lxvd2x vs30, o96, T3 + lxvd2x vs31, o112, T3 -#ifndef TRMMKERNEL xvmaddadp vs0, vs32, alpha_r xvmaddadp vs1, vs33, alpha_r xvmaddadp vs2, vs34, alpha_r @@ -465,138 +599,88 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddadp vs5, vs37, alpha_r xvmaddadp vs6, vs38, alpha_r xvmaddadp vs7, vs39, alpha_r + + lxvd2x vs32, 0, T4 + lxvd2x vs33, o16, T4 + lxvd2x vs34, o32, T4 + lxvd2x vs35, o48, T4 + lxvd2x vs36, o64, T4 + lxvd2x vs37, o80, T4 + lxvd2x vs38, o96, T4 + lxvd2x vs39, o112, T4 + xvmaddadp vs8, vs40, alpha_r xvmaddadp vs9, vs41, alpha_r xvmaddadp vs10, vs42, alpha_r xvmaddadp vs11, vs43, alpha_r - xvmaddadp vs12, vs44, alpha_r - xvmaddadp vs13, vs45, alpha_r - xvmaddadp vs14, vs46, alpha_r - xvmaddadp vs15, vs47, alpha_r -#else - xvmuldp vs0, vs32, alpha_r - xvmuldp vs1, vs33, alpha_r - xvmuldp vs2, vs34, alpha_r - xvmuldp vs3, vs35, alpha_r - xvmuldp vs4, vs36, alpha_r - xvmuldp vs5, vs37, alpha_r - xvmuldp vs6, vs38, alpha_r - xvmuldp vs7, vs39, alpha_r - xvmuldp vs8, vs40, alpha_r - xvmuldp vs9, vs41, alpha_r - xvmuldp vs10, vs42, alpha_r - xvmuldp vs11, vs43, alpha_r - xvmuldp vs12, vs44, alpha_r - xvmuldp vs13, vs45, alpha_r - xvmuldp vs14, vs46, alpha_r - xvmuldp vs15, vs47, alpha_r -#endif stxvd2x vs0, 0, T1 stxvd2x vs1, o16, T1 stxvd2x vs2, o32, T1 stxvd2x vs3, o48, T1 - stxvd2x vs4, 0, T2 - stxvd2x vs5, o16, T2 - stxvd2x vs6, o32, T2 - stxvd2x vs7, o48, T2 - - stxvd2x vs8, 0, T3 - stxvd2x vs9, o16, T3 - stxvd2x vs10, o32, T3 - stxvd2x vs11, o48, T3 - - stxvd2x vs12, 0, T4 - stxvd2x vs13, o16, T4 - stxvd2x vs14, o32, T4 - stxvd2x vs15, o48, T4 - - slwi T4, LDC, 1 - add T1, T1, T4 - add T3, T3, T4 - addi T2, T1, 64 - addi T4, T3, 64 - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 - lxvd2x vs1, o16, T1 - lxvd2x vs2, o32, T1 - lxvd2x vs3, o48, T1 - - lxvd2x vs4, 0, T2 - lxvd2x vs5, o16, T2 - lxvd2x vs6, o32, T2 - lxvd2x vs7, o48, T2 - - lxvd2x vs8, 0, T3 - lxvd2x vs9, o16, T3 - lxvd2x vs10, o32, T3 - lxvd2x vs11, o48, T3 - - lxvd2x vs12, 0, T4 - lxvd2x vs13, o16, T4 - lxvd2x vs14, o32, T4 - lxvd2x vs15, o48, T4 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs48, alpha_r - xvmaddadp vs1, vs49, alpha_r - xvmaddadp vs2, vs50, alpha_r - xvmaddadp vs3, vs51, alpha_r - 
xvmaddadp vs4, vs52, alpha_r - xvmaddadp vs5, vs53, alpha_r - xvmaddadp vs6, vs54, alpha_r - xvmaddadp vs7, vs55, alpha_r - xvmaddadp vs8, vs56, alpha_r - xvmaddadp vs9, vs57, alpha_r - xvmaddadp vs10, vs58, alpha_r - xvmaddadp vs11, vs59, alpha_r - xvmaddadp vs12, vs60, alpha_r - xvmaddadp vs13, vs61, alpha_r - xvmaddadp vs14, vs62, alpha_r - xvmaddadp vs15, vs63, alpha_r -#else - xvmuldp vs0, vs48, alpha_r - xvmuldp vs1, vs49, alpha_r - xvmuldp vs2, vs50, alpha_r - xvmuldp vs3, vs51, alpha_r - xvmuldp vs4, vs52, alpha_r - xvmuldp vs5, vs53, alpha_r - xvmuldp vs6, vs54, alpha_r - xvmuldp vs7, vs55, alpha_r - xvmuldp vs8, vs56, alpha_r - xvmuldp vs9, vs57, alpha_r - xvmuldp vs10, vs58, alpha_r - xvmuldp vs11, vs59, alpha_r - xvmuldp vs12, vs60, alpha_r - xvmuldp vs13, vs61, alpha_r - xvmuldp vs14, vs62, alpha_r - xvmuldp vs15, vs63, alpha_r -#endif - - stxvd2x vs0, 0, T1 - stxvd2x vs1, o16, T1 - stxvd2x vs2, o32, T1 - stxvd2x vs3, o48, T1 + xvmaddadp vs12, vs44, alpha_r + xvmaddadp vs13, vs45, alpha_r + xvmaddadp vs14, vs46, alpha_r + xvmaddadp vs15, vs47, alpha_r - stxvd2x vs4, 0, T2 - stxvd2x vs5, o16, T2 - stxvd2x vs6, o32, T2 - stxvd2x vs7, o48, T2 + stxvd2x vs4, o64, T1 + stxvd2x vs5, o80, T1 + stxvd2x vs6, o96, T1 + stxvd2x vs7, o112, T1 + + xvmaddadp vs24, vs48, alpha_r + xvmaddadp vs25, vs49, alpha_r + xvmaddadp vs26, vs50, alpha_r + xvmaddadp vs27, vs51, alpha_r + + stxvd2x vs8, o0, T2 + stxvd2x vs9, o16, T2 + stxvd2x vs10, o32, T2 + stxvd2x vs11, o48, T2 + + xvmaddadp vs28, vs52, alpha_r + xvmaddadp vs29, vs53, alpha_r + xvmaddadp vs30, vs54, alpha_r + xvmaddadp vs31, vs55, alpha_r + + stxvd2x vs12, o64, T2 + stxvd2x vs13, o80, T2 + stxvd2x vs14, o96, T2 + stxvd2x vs15, o112, T2 + + xvmaddadp vs32, vs56, alpha_r + xvmaddadp vs33, vs57, alpha_r + xvmaddadp vs34, vs58, alpha_r + xvmaddadp vs35, vs59, alpha_r + + stxvd2x vs24, 0, T3 + stxvd2x vs25, o16, T3 + stxvd2x vs26, o32, T3 + stxvd2x vs27, o48, T3 + + xvmaddadp vs36, vs60, alpha_r + xvmaddadp vs37, 
vs61, alpha_r + xvmaddadp vs38, vs62, alpha_r + xvmaddadp vs39, vs63, alpha_r + + stxvd2x vs28, o64, T3 + stxvd2x vs29, o80, T3 + stxvd2x vs30, o96, T3 + stxvd2x vs31, o112, T3 + + stxvd2x vs32, o0, T4 + stxvd2x vs33, o16, T4 + stxvd2x vs34, o32, T4 + stxvd2x vs35, o48, T4 - stxvd2x vs8, 0, T3 - stxvd2x vs9, o16, T3 - stxvd2x vs10, o32, T3 - stxvd2x vs11, o48, T3 + addi CO, CO, 128 - stxvd2x vs12, 0, T4 - stxvd2x vs13, o16, T4 - stxvd2x vs14, o32, T4 - stxvd2x vs15, o48, T4 + stxvd2x vs36, o64, T4 + stxvd2x vs37, o80, T4 + stxvd2x vs38, o96, T4 + stxvd2x vs39, o112, T4 - addi CO, CO, 128 .endm diff --git a/kernel/power/dgemm_ncopy_4_power8.S b/kernel/power/dgemm_ncopy_4_power8.S new file mode 100644 index 000000000..31966047f --- /dev/null +++ b/kernel/power/dgemm_ncopy_4_power8.S @@ -0,0 +1,228 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/28 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#define M r3 +#define N r4 +#define A r5 +#define LDA r6 +#define B r7 + +#define A0 r8 +#define A1 r9 +#define A2 r10 +#define A3 r11 + +#define J r12 + +#define PREA r14 +#define PREB r15 +#define BO r16 +#define o64 r17 +#define o80 r18 +#define o96 r19 +#define o112 r20 +#define o8 r21 +#define T2 r22 +#define I r23 +#define o16 r24 +#define o32 r25 +#define o48 r26 +#define NOTU1 r27 +#define NOTU2 r30 +#define T1 r31 + +#define o0 0 + +#include "dgemm_ncopy_macros_4_power8.S" + +#define STACKSIZE 384 + + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + stfd f30, 128(SP) + stfd f31, 136(SP) + + + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) 
+ std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + cmpwi cr0, M, 0 + ble- L999 + cmpwi cr0, N, 0 + ble- L999 + + slwi LDA, LDA, BASE_SHIFT + + li PREA, 384 + li PREB, 384 + + li o8, 8 + li o16, 16 + li o32, 32 + li o48, 48 + li o64, 64 + li o80, 80 + li o96, 96 + li o112, 112 + +#include "dgemm_ncopy_logic_4_power8.S" + +L999: + + li r3, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + lfd f30, 128(SP) + lfd f31, 136(SP) + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + addi SP, SP, STACKSIZE + + blr + EPILOGUE + + diff --git a/kernel/power/dgemm_ncopy_logic_4_power8.S b/kernel/power/dgemm_ncopy_logic_4_power8.S new file mode 100644 index 000000000..6944a7818 --- /dev/null +++ b/kernel/power/dgemm_ncopy_logic_4_power8.S @@ -0,0 +1,237 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/28 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + + mr BO, B + srawi. I, N, 2 + ble DCOPYN_L2_BEGIN + + +DCOPYN_L4_BEGIN: + + +DCOPYN_L4_LOOP: + + mr A0, A + add A1, A0, LDA + add A2, A1, LDA + add A3, A2, LDA + add A, A3, LDA + +DCOPYN_L4x16_BEGIN: + + srawi. J, M, 4 + ble DCOPYN_L4x16_END + +DCOPYN_L4x16_LOOP: + + dcbt A0, PREA + dcbt A1, PREA + dcbt A2, PREA + dcbt A3, PREA + COPY_4x16 + addic. J, J, -1 + bgt DCOPYN_L4x16_LOOP + +DCOPYN_L4x16_END: + + +DCOPYN_L4x8_BEGIN: + + andi. 
J, M, 8 + ble DCOPYN_L4x8_END + COPY_4x8 + +DCOPYN_L4x8_END: + + +DCOPYN_L4x4_BEGIN: + + andi. J, M, 4 + ble DCOPYN_L4x4_END + COPY_4x4 + +DCOPYN_L4x4_END: + + +DCOPYN_L4x2_BEGIN: + + andi. J, M, 2 + ble DCOPYN_L4x2_END + COPY_4x2 + +DCOPYN_L4x2_END: + + +DCOPYN_L4x1_BEGIN: + + andi. J, M, 1 + ble DCOPYN_L4x1_END + COPY_4x1 + +DCOPYN_L4x1_END: + + +DCOPYN_L4_END: + + addic. I, I, -1 + bgt DCOPYN_L4_LOOP + +DCOPYN_L2_BEGIN: + + andi. T1, N, 2 /* CR0 <- N & 2; skip the 2-wide pass when the bit is clear */ + ble DCOPYN_L2_END + +DCOPYN_L2_LOOP: + + mr A0, A + add A1, A0, LDA + add A, A1, LDA + +DCOPYN_L2x16_BEGIN: + + srawi. J, M, 4 + ble DCOPYN_L2x16_END + +DCOPYN_L2x16_LOOP: + + COPY_2x16 + addic. J, J, -1 + bgt DCOPYN_L2x16_LOOP + +DCOPYN_L2x16_END: + + +DCOPYN_L2x8_BEGIN: + + andi. J, M, 8 + ble DCOPYN_L2x8_END + COPY_2x8 + +DCOPYN_L2x8_END: + + +DCOPYN_L2x4_BEGIN: + + andi. J, M, 4 + ble DCOPYN_L2x4_END + COPY_2x4 + +DCOPYN_L2x4_END: + + +DCOPYN_L2x2_BEGIN: + + andi. J, M, 2 + ble DCOPYN_L2x2_END + COPY_2x2 + +DCOPYN_L2x2_END: + + +DCOPYN_L2x1_BEGIN: + + andi. J, M, 1 + ble DCOPYN_L2x1_END + COPY_2x1 + +DCOPYN_L2x1_END: + + +DCOPYN_L2_END: + + +DCOPYN_L1_BEGIN: + + andi. T1, N, 1 /* CR0 <- N & 1; skip the 1-wide pass when the bit is clear */ + ble DCOPYN_L1_END + +DCOPYN_L1_LOOP: + + mr A0, A + add A, A0, LDA + +DCOPYN_L1x16_BEGIN: + + srawi. J, M, 4 + ble DCOPYN_L1x16_END + +DCOPYN_L1x16_LOOP: + + COPY_1x16 + addic. J, J, -1 + bgt DCOPYN_L1x16_LOOP + +DCOPYN_L1x16_END: + + +DCOPYN_L1x8_BEGIN: + + andi. J, M, 8 + ble DCOPYN_L1x8_END + COPY_1x8 + +DCOPYN_L1x8_END: + + +DCOPYN_L1x4_BEGIN: + + andi. J, M, 4 + ble DCOPYN_L1x4_END + COPY_1x4 + +DCOPYN_L1x4_END: + + +DCOPYN_L1x2_BEGIN: + + andi. J, M, 2 + ble DCOPYN_L1x2_END + COPY_1x2 + +DCOPYN_L1x2_END: + + +DCOPYN_L1x1_BEGIN: + + andi.
J, M, 1 + ble DCOPYN_L1x1_END + COPY_1x1 + +DCOPYN_L1x1_END: + + +DCOPYN_L1_END: + diff --git a/kernel/power/dgemm_ncopy_macros_4_power8.S b/kernel/power/dgemm_ncopy_macros_4_power8.S new file mode 100644 index 000000000..9b07d73f5 --- /dev/null +++ b/kernel/power/dgemm_ncopy_macros_4_power8.S @@ -0,0 +1,691 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/04/28 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + +/********************************************************************************************** +* Macros for N=4 and M=16 +**********************************************************************************************/ + +.macro COPY_4x16 + + lxvd2x vs0, o0, A0 + lxvd2x vs8, o0, A1 + lxvd2x vs24, o0, A3 + lxvd2x vs16, o0, A2 + + lxvd2x vs1, o16, A0 + lxvd2x vs9, o16, A1 + lxvd2x vs17, o16, A2 + lxvd2x vs25, o16, A3 + + lxvd2x vs2, o32, A0 + lxvd2x vs10, o32, A1 + lxvd2x vs18, o32, A2 + lxvd2x vs26, o32, A3 + + lxvd2x vs3, o48, A0 + lxvd2x vs11, o48, A1 + lxvd2x vs19, o48, A2 + lxvd2x vs27, o48, A3 + + lxvd2x vs4, o64, A0 + lxvd2x vs12, o64, A1 + lxvd2x vs20, o64, A2 + lxvd2x vs28, o64, A3 + + lxvd2x vs5, o80, A0 + lxvd2x vs13, o80, A1 + lxvd2x vs21, o80, A2 + lxvd2x vs29, o80, A3 + + lxvd2x vs6, o96, A0 + lxvd2x vs14, o96, A1 + lxvd2x vs22, o96, A2 + lxvd2x vs30, o96, A3 + + lxvd2x vs7, o112, A0 + lxvd2x vs15, o112, A1 + lxvd2x vs23, o112, A2 + lxvd2x vs31, o112, A3 + + + xxpermdi vs32, vs0, vs8, 0 + xxpermdi vs33, vs16, vs24, 0 + xxpermdi vs34, vs0, vs8, 3 + xxpermdi vs35, vs16, vs24, 3 + + xxpermdi vs36, vs1, vs9, 0 + xxpermdi vs37, vs17, vs25, 0 + xxpermdi vs38, vs1, vs9, 3 + xxpermdi vs39, vs17, vs25, 3 + + xxpermdi vs40, vs2, vs10, 0 + xxpermdi vs41, vs18, vs26, 0 + xxpermdi vs42, vs2, vs10, 3 + xxpermdi vs43, vs18, vs26, 3 + + xxpermdi vs44, vs3, vs11, 0 + xxpermdi vs45, vs19, vs27, 0 + xxpermdi vs46, vs3, vs11, 3 + xxpermdi vs47, vs19, vs27, 3 + + xxpermdi vs48, vs4, vs12, 0 + xxpermdi vs49, vs20, vs28, 0 + xxpermdi vs50, vs4, vs12, 3 + xxpermdi vs51, vs20, vs28, 3 + + xxpermdi vs52, 
vs5, vs13, 0 + xxpermdi vs53, vs21, vs29, 0 + xxpermdi vs54, vs5, vs13, 3 + xxpermdi vs55, vs21, vs29, 3 + + addi A0, A0, 128 + addi A1, A1, 128 + + xxpermdi vs56, vs6, vs14, 0 + xxpermdi vs57, vs22, vs30, 0 + xxpermdi vs58, vs6, vs14, 3 + xxpermdi vs59, vs22, vs30, 3 + + addi A3, A3, 128 + addi A2, A2, 128 + + xxpermdi vs60, vs7, vs15, 0 + xxpermdi vs61, vs23, vs31, 0 + xxpermdi vs62, vs7, vs15, 3 + xxpermdi vs63, vs23, vs31, 3 + + + stxvd2x vs32, o0, BO + stxvd2x vs33, o16, BO + stxvd2x vs34, o32, BO + stxvd2x vs35, o48, BO + stxvd2x vs36, o64, BO + stxvd2x vs37, o80, BO + stxvd2x vs38, o96, BO + stxvd2x vs39, o112, BO + addi BO, BO, 128 + + stxvd2x vs40, o0, BO + stxvd2x vs41, o16, BO + stxvd2x vs42, o32, BO + stxvd2x vs43, o48, BO + stxvd2x vs44, o64, BO + stxvd2x vs45, o80, BO + stxvd2x vs46, o96, BO + stxvd2x vs47, o112, BO + addi BO, BO, 128 + + stxvd2x vs48, o0, BO + stxvd2x vs49, o16, BO + stxvd2x vs50, o32, BO + stxvd2x vs51, o48, BO + stxvd2x vs52, o64, BO + stxvd2x vs53, o80, BO + stxvd2x vs54, o96, BO + stxvd2x vs55, o112, BO + addi BO, BO, 128 + + stxvd2x vs56, o0, BO + stxvd2x vs57, o16, BO + stxvd2x vs58, o32, BO + stxvd2x vs59, o48, BO + stxvd2x vs60, o64, BO + stxvd2x vs61, o80, BO + stxvd2x vs62, o96, BO + stxvd2x vs63, o112, BO + addi BO, BO, 128 + + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=8 +**********************************************************************************************/ + +.macro COPY_4x8 + + lxvd2x vs0, o0, A0 + lxvd2x vs1, o16, A0 + lxvd2x vs2, o32, A0 + lxvd2x vs3, o48, A0 + addi A0, A0, 64 + + + lxvd2x vs8, o0, A1 + lxvd2x vs9, o16, A1 + lxvd2x vs10, o32, A1 + lxvd2x vs11, o48, A1 + addi A1, A1, 64 + + + lxvd2x vs16, o0, A2 + lxvd2x vs17, o16, A2 + lxvd2x vs18, o32, A2 + lxvd2x vs19, o48, A2 + addi A2, A2, 64 + + + lxvd2x vs24, o0, A3 + lxvd2x vs25, o16, A3 + lxvd2x vs26, o32, A3 + lxvd2x vs27, o48, A3 + addi A3, A3, 64 + + + xxpermdi 
vs32, vs0, vs8, 0 + xxpermdi vs33, vs16, vs24, 0 + xxpermdi vs34, vs0, vs8, 3 + xxpermdi vs35, vs16, vs24, 3 + + xxpermdi vs36, vs1, vs9, 0 + xxpermdi vs37, vs17, vs25, 0 + xxpermdi vs38, vs1, vs9, 3 + xxpermdi vs39, vs17, vs25, 3 + + xxpermdi vs40, vs2, vs10, 0 + xxpermdi vs41, vs18, vs26, 0 + xxpermdi vs42, vs2, vs10, 3 + xxpermdi vs43, vs18, vs26, 3 + + xxpermdi vs44, vs3, vs11, 0 + xxpermdi vs45, vs19, vs27, 0 + xxpermdi vs46, vs3, vs11, 3 + xxpermdi vs47, vs19, vs27, 3 + + + stxvd2x vs32, o0, BO + stxvd2x vs33, o16, BO + stxvd2x vs34, o32, BO + stxvd2x vs35, o48, BO + stxvd2x vs36, o64, BO + stxvd2x vs37, o80, BO + stxvd2x vs38, o96, BO + stxvd2x vs39, o112, BO + addi BO, BO, 128 + + stxvd2x vs40, o0, BO + stxvd2x vs41, o16, BO + stxvd2x vs42, o32, BO + stxvd2x vs43, o48, BO + stxvd2x vs44, o64, BO + stxvd2x vs45, o80, BO + stxvd2x vs46, o96, BO + stxvd2x vs47, o112, BO + addi BO, BO, 128 + + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=4 +**********************************************************************************************/ + +.macro COPY_4x4 + + lxvd2x vs0, o0, A0 + lxvd2x vs1, o16, A0 + addi A0, A0, 32 + + + lxvd2x vs8, o0, A1 + lxvd2x vs9, o16, A1 + addi A1, A1, 32 + + + lxvd2x vs16, o0, A2 + lxvd2x vs17, o16, A2 + addi A2, A2, 32 + + + lxvd2x vs24, o0, A3 + lxvd2x vs25, o16, A3 + addi A3, A3, 32 + + + xxpermdi vs32, vs0, vs8, 0 + xxpermdi vs33, vs16, vs24, 0 + xxpermdi vs34, vs0, vs8, 3 + xxpermdi vs35, vs16, vs24, 3 + + xxpermdi vs36, vs1, vs9, 0 + xxpermdi vs37, vs17, vs25, 0 + xxpermdi vs38, vs1, vs9, 3 + xxpermdi vs39, vs17, vs25, 3 + + + stxvd2x vs32, o0, BO + stxvd2x vs33, o16, BO + stxvd2x vs34, o32, BO + stxvd2x vs35, o48, BO + stxvd2x vs36, o64, BO + stxvd2x vs37, o80, BO + stxvd2x vs38, o96, BO + stxvd2x vs39, o112, BO + addi BO, BO, 128 + + +.endm + + +/********************************************************************************************** +* 
Macros for N=4 and M=2 +**********************************************************************************************/ + +.macro COPY_4x2 + + lxvd2x vs0, o0, A0 + addi A0, A0, 16 + + + lxvd2x vs8, o0, A1 + addi A1, A1, 16 + + + lxvd2x vs16, o0, A2 + addi A2, A2, 16 + + + lxvd2x vs24, o0, A3 + addi A3, A3, 16 + + + xxpermdi vs32, vs0, vs8, 0 + xxpermdi vs33, vs16, vs24, 0 + xxpermdi vs34, vs0, vs8, 3 + xxpermdi vs35, vs16, vs24, 3 + + + stxvd2x vs32, o0, BO + stxvd2x vs33, o16, BO + stxvd2x vs34, o32, BO + stxvd2x vs35, o48, BO + addi BO, BO, 64 + + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=1 +**********************************************************************************************/ + +.macro COPY_4x1 + + lxsdx vs0, o0, A0 + addi A0, A0, 8 + + + lxsdx vs8, o0, A1 + addi A1, A1, 8 + + + lxsdx vs16, o0, A2 + addi A2, A2, 8 + + + lxsdx vs24, o0, A3 + addi A3, A3, 8 + + + xxpermdi vs32, vs0, vs8, 0 + xxpermdi vs33, vs16, vs24, 0 + + + stxvd2x vs32, o0, BO + stxvd2x vs33, o16, BO + addi BO, BO, 32 + + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=16 +**********************************************************************************************/ + +.macro COPY_2x16 + + lxvd2x vs0, o0, A0 + lxvd2x vs1, o16, A0 + lxvd2x vs2, o32, A0 + lxvd2x vs3, o48, A0 + lxvd2x vs4, o64, A0 + lxvd2x vs5, o80, A0 + lxvd2x vs6, o96, A0 + lxvd2x vs7, o112, A0 + addi A0, A0, 128 + + + lxvd2x vs8, o0, A1 + lxvd2x vs9, o16, A1 + lxvd2x vs10, o32, A1 + lxvd2x vs11, o48, A1 + lxvd2x vs12, o64, A1 + lxvd2x vs13, o80, A1 + lxvd2x vs14, o96, A1 + lxvd2x vs15, o112, A1 + addi A1, A1, 128 + + + xxpermdi vs32, vs0, vs8, 0 + xxpermdi vs33, vs0, vs8, 3 + + xxpermdi vs34, vs1, vs9, 0 + xxpermdi vs35, vs1, vs9, 3 + + xxpermdi vs36, vs2, vs10, 0 + xxpermdi vs37, vs2, vs10, 3 + + xxpermdi vs38, vs3, vs11, 0 + xxpermdi vs39, vs3, 
vs11, 3 + + xxpermdi vs40, vs4, vs12, 0 + xxpermdi vs41, vs4, vs12, 3 + + xxpermdi vs42, vs5, vs13, 0 + xxpermdi vs43, vs5, vs13, 3 + + xxpermdi vs44, vs6, vs14, 0 + xxpermdi vs45, vs6, vs14, 3 + + xxpermdi vs46, vs7, vs15, 0 + xxpermdi vs47, vs7, vs15, 3 + + + stxvd2x vs32, o0, BO + stxvd2x vs33, o16, BO + stxvd2x vs34, o32, BO + stxvd2x vs35, o48, BO + stxvd2x vs36, o64, BO + stxvd2x vs37, o80, BO + stxvd2x vs38, o96, BO + stxvd2x vs39, o112, BO + addi BO, BO, 128 + + stxvd2x vs40, o0, BO + stxvd2x vs41, o16, BO + stxvd2x vs42, o32, BO + stxvd2x vs43, o48, BO + stxvd2x vs44, o64, BO + stxvd2x vs45, o80, BO + stxvd2x vs46, o96, BO + stxvd2x vs47, o112, BO + addi BO, BO, 128 + + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=8 +**********************************************************************************************/ + +.macro COPY_2x8 + + lxvd2x vs0, o0, A0 + lxvd2x vs1, o16, A0 + lxvd2x vs2, o32, A0 + lxvd2x vs3, o48, A0 + addi A0, A0, 64 + + + lxvd2x vs8, o0, A1 + lxvd2x vs9, o16, A1 + lxvd2x vs10, o32, A1 + lxvd2x vs11, o48, A1 + addi A1, A1, 64 + + + xxpermdi vs32, vs0, vs8, 0 + xxpermdi vs33, vs0, vs8, 3 + + xxpermdi vs34, vs1, vs9, 0 + xxpermdi vs35, vs1, vs9, 3 + + xxpermdi vs36, vs2, vs10, 0 + xxpermdi vs37, vs2, vs10, 3 + + xxpermdi vs38, vs3, vs11, 0 + xxpermdi vs39, vs3, vs11, 3 + + + stxvd2x vs32, o0, BO + stxvd2x vs33, o16, BO + stxvd2x vs34, o32, BO + stxvd2x vs35, o48, BO + stxvd2x vs36, o64, BO + stxvd2x vs37, o80, BO + stxvd2x vs38, o96, BO + stxvd2x vs39, o112, BO + addi BO, BO, 128 + + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=4 +**********************************************************************************************/ + +.macro COPY_2x4 + + lxvd2x vs0, o0, A0 + lxvd2x vs1, o16, A0 + addi A0, A0, 32 + + + lxvd2x vs8, o0, A1 + lxvd2x vs9, o16, A1 + addi A1, A1, 32 
+ + + xxpermdi vs32, vs0, vs8, 0 + xxpermdi vs33, vs0, vs8, 3 + + xxpermdi vs34, vs1, vs9, 0 + xxpermdi vs35, vs1, vs9, 3 + + + stxvd2x vs32, o0, BO + stxvd2x vs33, o16, BO + stxvd2x vs34, o32, BO + stxvd2x vs35, o48, BO + addi BO, BO, 64 + + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=2 +**********************************************************************************************/ + +.macro COPY_2x2 + + lxvd2x vs0, o0, A0 + addi A0, A0, 16 + + + lxvd2x vs8, o0, A1 + addi A1, A1, 16 + + + xxpermdi vs32, vs0, vs8, 0 + xxpermdi vs33, vs0, vs8, 3 + + + stxvd2x vs32, o0, BO + stxvd2x vs33, o16, BO + addi BO, BO, 32 + + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=1 +**********************************************************************************************/ + +.macro COPY_2x1 + + lxsdx vs0, o0, A0 + addi A0, A0, 8 + + + lxsdx vs8, o0, A1 + addi A1, A1, 8 + + + xxpermdi vs32, vs0, vs8, 0 + + + stxvd2x vs32, o0, BO + addi BO, BO, 16 + + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=16 +**********************************************************************************************/ + +.macro COPY_1x16 + + lxvd2x vs0, o0, A0 + lxvd2x vs1, o16, A0 + lxvd2x vs2, o32, A0 + lxvd2x vs3, o48, A0 + lxvd2x vs4, o64, A0 + lxvd2x vs5, o80, A0 + lxvd2x vs6, o96, A0 + lxvd2x vs7, o112, A0 + addi A0, A0, 128 + + + stxvd2x vs0, o0, BO + stxvd2x vs1, o16, BO + stxvd2x vs2, o32, BO + stxvd2x vs3, o48, BO + addi BO, BO, 64 + + stxvd2x vs4, o0, BO + stxvd2x vs5, o16, BO + stxvd2x vs6, o32, BO + stxvd2x vs7, o48, BO + addi BO, BO, 64 + + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=8 
+**********************************************************************************************/ + +.macro COPY_1x8 + + lxvd2x vs0, o0, A0 + lxvd2x vs1, o16, A0 + lxvd2x vs2, o32, A0 + lxvd2x vs3, o48, A0 + addi A0, A0, 64 + + + stxvd2x vs0, o0, BO + stxvd2x vs1, o16, BO + stxvd2x vs2, o32, BO + stxvd2x vs3, o48, BO + addi BO, BO, 64 + + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=4 +**********************************************************************************************/ + +.macro COPY_1x4 + + lxvd2x vs0, o0, A0 + lxvd2x vs1, o16, A0 + addi A0, A0, 32 + + + stxvd2x vs0, o0, BO + stxvd2x vs1, o16, BO + addi BO, BO, 32 + + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=2 +**********************************************************************************************/ + +.macro COPY_1x2 + + lxvd2x vs0, o0, A0 + addi A0, A0, 16 + + + stxvd2x vs0, o0, BO + addi BO, BO, 16 + + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=1 +**********************************************************************************************/ + +.macro COPY_1x1 + + lxsdx vs0, o0, A0 + addi A0, A0, 8 + + + stxsdx vs0, o0, BO + addi BO, BO, 8 + + +.endm + From b7b3d8ec8ee178edd980e97a7138e2d03f7a55b7 Mon Sep 17 00:00:00 2001 From: Shivraj Patil Date: Tue, 3 May 2016 14:42:26 +0530 Subject: [PATCH 26/70] DGEMM optimization for MIPS P5600 and I6400 using MSA Signed-off-by: Shivraj Patil --- CONTRIBUTORS.md | 3 + kernel/mips/KERNEL.P5600 | 10 +- kernel/mips/dgemm_kernel_8x4_msa.c | 720 +++++++++++++++++++++++++++++ kernel/mips/dgemm_ncopy_4_msa.c | 135 ++++++ kernel/mips/dgemm_ncopy_8_msa.c | 228 +++++++++ kernel/mips/dgemm_tcopy_4_msa.c | 162 +++++++ kernel/mips/dgemm_tcopy_8_msa.c | 317 +++++++++++++ kernel/mips/macros_msa.h | 79 ++++ 
param.h | 8 +- 9 files changed, 1655 insertions(+), 7 deletions(-) create mode 100644 kernel/mips/dgemm_kernel_8x4_msa.c create mode 100644 kernel/mips/dgemm_ncopy_4_msa.c create mode 100644 kernel/mips/dgemm_ncopy_8_msa.c create mode 100644 kernel/mips/dgemm_tcopy_4_msa.c create mode 100644 kernel/mips/dgemm_tcopy_8_msa.c create mode 100644 kernel/mips/macros_msa.h diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index ebe52ea8a..4431103bd 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -150,3 +150,6 @@ In chronological order: * theoractice * [2016-03-20] Fix compiler error in VisualStudio with CMake * [2016-03-22] Fix access violation on Windows while static linking + +* Shivraj Patil + * [2016-05-03] DGEMM optimization for MIPS P5600 and I6400 using MSA diff --git a/kernel/mips/KERNEL.P5600 b/kernel/mips/KERNEL.P5600 index 09064fe48..d21575251 100644 --- a/kernel/mips/KERNEL.P5600 +++ b/kernel/mips/KERNEL.P5600 @@ -91,9 +91,13 @@ SGEMMOTCOPY = ../generic/gemm_tcopy_2.c SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o -DGEMMKERNEL = ../generic/gemmkernel_2x2.c -DGEMMONCOPY = ../generic/gemm_ncopy_2.c -DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMKERNEL = ../mips/dgemm_kernel_8x4_msa.c +DGEMMINCOPY = ../mips/dgemm_ncopy_8_msa.c +DGEMMITCOPY = ../mips/dgemm_tcopy_8_msa.c +DGEMMONCOPY = ../mips/dgemm_ncopy_4_msa.c +DGEMMOTCOPY = ../mips/dgemm_tcopy_4_msa.c +DGEMMINCOPYOBJ = dgemm_incopy.o +DGEMMITCOPYOBJ = dgemm_itcopy.o DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o diff --git a/kernel/mips/dgemm_kernel_8x4_msa.c b/kernel/mips/dgemm_kernel_8x4_msa.c new file mode 100644 index 000000000..8d9e3455e --- /dev/null +++ b/kernel/mips/dgemm_kernel_8x4_msa.c @@ -0,0 +1,720 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + /* DGEMM micro-kernel: adds alpha * (A * B) into C. Columns of C are handled in groups of 4, then 2, then 1 (n / 4, n & 2, n & 1); inside each group the rows are handled in tiles of 8, 4, 2 and 1. pa0 walks A continuously across the row tiles of a column group and is reset to A for the next group; pb0 restarts at B for every row tile. The first k step initialises the accumulators and the remaining k - 1 steps add to them. NOTE(review): this assumes k >= 1 - with k == 0 the first step still reads A and B and the l = k - 1 countdown starts from -1; confirm callers never pass k == 0. NOTE(review): LD_DPn / ST_DPn come from macros_msa.h (not shown here); presumably they load / store n v2f64 vectors spaced by the given element stride. The TRMMKERNEL-only offset argument is not referenced in this body. Returns 0. */ +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, + FLOAT *C, BLASLONG ldc +#ifdef TRMMKERNEL + , BLASLONG offset +#endif + ) +{ + BLASLONG i, j, l; + FLOAT *pc0, *pc1, *pc2, *pc3; + FLOAT *pa0, *pb0; + FLOAT tmp0, tmp1, tmp2, tmp3; + FLOAT a0; + FLOAT b0, b1, b2, b3; + v2f64 v_alpha = {alpha, alpha}; + v2f64 src_a0, src_a1, src_a2, src_a3, src_b, src_b0, src_b1; + v2f64 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v2f64 res0, res1, res2, res3, res4, res5, res6, res7; + v2f64 res8, res9, res10, res11, res12, res13, res14, res15; + + for (j = (n / 4); j--;) /* four columns of C per pass */ + { + pc0 = C; + pc1 = pc0 + ldc; + pc2 = pc1 + ldc; + pc3 = pc2 + ldc; + + pa0 = A; + + for (i = (m / 8); i--;) /* 8 x 4 tile: sixteen v2f64 accumulators */ + { + pb0 = B; + + LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3); + LD_DP2(pb0, 2, src_b0, src_b1); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); /* both lanes = element 0 of src_b0 (per the MSA ilvr.d definition) */ + res0 = src_a0 * src_b; + res1 = src_a1 * src_b; + res2 = src_a2 * src_b; + res3 = src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); /* both lanes = element 1 of src_b0 (per the MSA ilvl.d definition) */ + res4 = src_a0 * src_b; + res5 = src_a1 * src_b; + res6 = src_a2 * src_b; + res7 = src_a3 * src_b; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); + res8 = src_a0 * src_b; + res9 = src_a1 * src_b; + res10 = src_a2 * src_b; + res11 = src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + res12 = src_a0 * src_b; + res13 = src_a1 * src_b; + res14 = src_a2 * src_b; + res15 = src_a3 * src_b; + + pa0 += 8; + pb0 += 4; + + for (l = (k - 1); l--;) /* remaining k - 1 steps accumulate */ + { + LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3); + LD_DP2(pb0, 2, src_b0, src_b1); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + res0 += src_a0 * src_b; + res1 += src_a1 * src_b; + res2 += src_a2 * src_b; + res3 += src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + res4 
+= src_a0 * src_b; + res5 += src_a1 * src_b; + res6 += src_a2 * src_b; + res7 += src_a3 * src_b; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); + res8 += src_a0 * src_b; + res9 += src_a1 * src_b; + res10 += src_a2 * src_b; + res11 += src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + res12 += src_a0 * src_b; + res13 += src_a1 * src_b; + res14 += src_a2 * src_b; + res15 += src_a3 * src_b; + + pa0 += 8; + pb0 += 4; + } + + LD_DP4(pc0, 2, dst0, dst1, dst2, dst3); /* read-modify-write of the C tile, two columns at a time */ + LD_DP4(pc1, 2, dst4, dst5, dst6, dst7); + + dst0 += res0 * v_alpha; + dst1 += res1 * v_alpha; + dst2 += res2 * v_alpha; + dst3 += res3 * v_alpha; + dst4 += res4 * v_alpha; + dst5 += res5 * v_alpha; + dst6 += res6 * v_alpha; + dst7 += res7 * v_alpha; + + ST_DP4(dst0, dst1, dst2, dst3, pc0, 2); + ST_DP4(dst4, dst5, dst6, dst7, pc1, 2); + + LD_DP4(pc2, 2, dst0, dst1, dst2, dst3); + LD_DP4(pc3, 2, dst4, dst5, dst6, dst7); + + dst0 += res8 * v_alpha; + dst1 += res9 * v_alpha; + dst2 += res10 * v_alpha; + dst3 += res11 * v_alpha; + dst4 += res12 * v_alpha; + dst5 += res13 * v_alpha; + dst6 += res14 * v_alpha; + dst7 += res15 * v_alpha; + + ST_DP4(dst0, dst1, dst2, dst3, pc2, 2); + ST_DP4(dst4, dst5, dst6, dst7, pc3, 2); + + pc0 += 8; + pc1 += 8; + pc2 += 8; + pc3 += 8; + } + + for (i = ((m & 4) / 4); i--;) /* runs at most once: 4 x 4 tile */ + { + pb0 = B; + + LD_DP2(pa0, 2, src_a0, src_a1); + LD_DP2(pb0, 2, src_b0, src_b1); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + res0 = src_a0 * src_b; + res1 = src_a1 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + res2 = src_a0 * src_b; + res3 = src_a1 * src_b; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); + res4 = src_a0 * src_b; + res5 = src_a1 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + res6 = src_a0 * src_b; + res7 = src_a1 * src_b; + + pa0 += 4; + pb0 += 4; + + for (l = (k - 1); l--;) + { + LD_DP2(pa0, 2, src_a0, src_a1); + LD_DP2(pb0, 2, src_b0, 
src_b1); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + res0 += src_a0 * src_b; + res1 += src_a1 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + res2 += src_a0 * src_b; + res3 += src_a1 * src_b; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); + res4 += src_a0 * src_b; + res5 += src_a1 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + res6 += src_a0 * src_b; + res7 += src_a1 * src_b; + + pa0 += 4; + pb0 += 4; + } + + LD_DP2(pc0, 2, dst0, dst1); + LD_DP2(pc1, 2, dst2, dst3); + LD_DP2(pc2, 2, dst4, dst5); + LD_DP2(pc3, 2, dst6, dst7); + + dst0 += res0 * v_alpha; + dst1 += res1 * v_alpha; + dst2 += res2 * v_alpha; + dst3 += res3 * v_alpha; + dst4 += res4 * v_alpha; + dst5 += res5 * v_alpha; + dst6 += res6 * v_alpha; + dst7 += res7 * v_alpha; + + ST_DP2(dst0, dst1, pc0, 2); + ST_DP2(dst2, dst3, pc1, 2); + ST_DP2(dst4, dst5, pc2, 2); + ST_DP2(dst6, dst7, pc3, 2); + + pc0 += 4; + pc1 += 4; + pc2 += 4; + pc3 += 4; + } + + for (i = ((m & 2) / 2); i--;) /* runs at most once: 2 x 4 tile */ + { + pb0 = B; + + src_a0 = LD_DP(pa0); + LD_DP2(pb0, 2, src_b0, src_b1); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + res0 = src_a0 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + res1 = src_a0 * src_b; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); + res2 = src_a0 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + res3 = src_a0 * src_b; + + pa0 += 2; + pb0 += 4; + + for (l = (k - 1); l--;) + { + src_a0 = LD_DP(pa0); + LD_DP2(pb0, 2, src_b0, src_b1); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + res0 += src_a0 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + res1 += src_a0 * src_b; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); + res2 += src_a0 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + res3 += src_a0 * src_b; + + pa0 += 2; + pb0 += 4; 
+ } + + dst0 = LD_DP(pc0); + dst1 = LD_DP(pc1); + dst2 = LD_DP(pc2); + dst3 = LD_DP(pc3); + + dst0 += res0 * v_alpha; + dst1 += res1 * v_alpha; + dst2 += res2 * v_alpha; + dst3 += res3 * v_alpha; + + ST_DP(dst0, pc0); + ST_DP(dst1, pc1); + ST_DP(dst2, pc2); + ST_DP(dst3, pc3); + + pc0 += 2; + pc1 += 2; + pc2 += 2; + pc3 += 2; + } + + for (i = (m & 1); i--;) /* runs at most once: last row, plain scalar arithmetic */ + { + pb0 = B; + + a0 = pa0[0]; + b0 = pb0[0]; + tmp0 = a0 * b0; + + b1 = pb0[1]; + tmp1 = a0 * b1; + + b2 = pb0[2]; + tmp2 = a0 * b2; + + b3 = pb0[3]; + tmp3 = a0 * b3; + + pa0 += 1; + pb0 += 4; + + for (l = (k - 1); l--;) + { + a0 = pa0[0]; + b0 = pb0[0]; + tmp0 += a0 * b0; + + b1 = pb0[1]; + tmp1 += a0 * b1; + + b2 = pb0[2]; + tmp2 += a0 * b2; + + b3 = pb0[3]; + tmp3 += a0 * b3; + + pa0 += 1; + pb0 += 4; + } + + tmp0 = alpha * tmp0; + tmp1 = alpha * tmp1; + tmp2 = alpha * tmp2; + tmp3 = alpha * tmp3; + + pc0[0] += tmp0; + pc1[0] += tmp1; + pc2[0] += tmp2; + pc3[0] += tmp3; + + pc0 += 1; + pc1 += 1; + pc2 += 1; + pc3 += 1; + } + + l = (k << 2); + B = B + l; /* step over the k * 4 elements of B used by this column group */ + i = (ldc << 2); + C = C + i; /* step over four columns of C (4 * ldc elements) */ + } + + for (j = ((n & 2) / 2); j--;) /* runs at most once: two leftover columns */ + { + pc0 = C; + pc1 = pc0 + ldc; + + pa0 = A; + + for (i = (m / 8); i--;) + { + pb0 = B; + + LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3); + src_b0 = LD_DP(pb0); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + res0 = src_a0 * src_b; + res1 = src_a1 * src_b; + res2 = src_a2 * src_b; + res3 = src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + res4 = src_a0 * src_b; + res5 = src_a1 * src_b; + res6 = src_a2 * src_b; + res7 = src_a3 * src_b; + + pa0 += 8; + pb0 += 2; + + for (l = (k - 1); l--;) + { + LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3); + src_b0 = LD_DP(pb0); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + res0 += src_a0 * src_b; + res1 += src_a1 * src_b; + res2 += src_a2 * src_b; + res3 += src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + res4 += src_a0 * src_b; + res5 += 
src_a1 * src_b; + res6 += src_a2 * src_b; + res7 += src_a3 * src_b; + + pa0 += 8; + pb0 += 2; + } + + LD_DP4(pc0, 2, dst0, dst1, dst2, dst3); + LD_DP4(pc1, 2, dst4, dst5, dst6, dst7); + + dst0 += res0 * v_alpha; + dst1 += res1 * v_alpha; + dst2 += res2 * v_alpha; + dst3 += res3 * v_alpha; + dst4 += res4 * v_alpha; + dst5 += res5 * v_alpha; + dst6 += res6 * v_alpha; + dst7 += res7 * v_alpha; + + ST_DP4(dst0, dst1, dst2, dst3, pc0, 2); + ST_DP4(dst4, dst5, dst6, dst7, pc1, 2); + + pc0 += 8; + pc1 += 8; + } + + for (i = ((m & 4) / 4); i--;) + { + pb0 = B; + + LD_DP2(pa0, 2, src_a0, src_a1); + src_b0 = LD_DP(pb0); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + res0 = src_a0 * src_b; + res1 = src_a1 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + res2 = src_a0 * src_b; + res3 = src_a1 * src_b; + + pa0 += 4; + pb0 += 2; + + for (l = (k - 1); l--;) + { + LD_DP2(pa0, 2, src_a0, src_a1); + src_b0 = LD_DP(pb0); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + res0 += src_a0 * src_b; + res1 += src_a1 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + res2 += src_a0 * src_b; + res3 += src_a1 * src_b; + + pa0 += 4; + pb0 += 2; + } + + LD_DP2(pc0, 2, dst0, dst1); + LD_DP2(pc1, 2, dst2, dst3); + + dst0 += res0 * v_alpha; + dst1 += res1 * v_alpha; + dst2 += res2 * v_alpha; + dst3 += res3 * v_alpha; + + ST_DP2(dst0, dst1, pc0, 2); + ST_DP2(dst2, dst3, pc1, 2); + + pc0 += 4; + pc1 += 4; + } + + for (i = ((m & 2) / 2); i--;) + { + pb0 = B; + + src_a0 = LD_DP(pa0); + src_b0 = LD_DP(pb0); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + res0 = src_a0 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + res1 = src_a0 * src_b; + + pa0 += 2; + pb0 += 2; + + for (l = (k - 1); l--;) + { + src_a0 = LD_DP(pa0); + src_b0 = LD_DP(pb0); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + res0 += src_a0 * src_b; + + src_b = (v2f64) 
__msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + res1 += src_a0 * src_b; + + pa0 += 2; + pb0 += 2; + } + + dst0 = LD_DP(pc0); + dst1 = LD_DP(pc1); + + dst0 += res0 * v_alpha; + dst1 += res1 * v_alpha; + + ST_DP(dst0, pc0); + ST_DP(dst1, pc1); + + pc0 += 2; + pc1 += 2; + } + + for (i = (m & 1); i--;) + { + pb0 = B; + + a0 = pa0[0]; + b0 = pb0[0]; + tmp0 = a0 * b0; + + b1 = pb0[1]; + tmp1 = a0 * b1; + + pa0 += 1; + pb0 += 2; + + for (l = (k - 1); l--;) + { + a0 = pa0[0]; + b0 = pb0[0]; + tmp0 += a0 * b0; + + b1 = pb0[1]; + tmp1 += a0 * b1; + + pa0 += 1; + pb0 += 2; + } + + tmp0 = alpha * tmp0; + tmp1 = alpha * tmp1; + + pc0[0] += tmp0; + pc1[0] += tmp1; + + pc0 += 1; + pc1 += 1; + } + + l = (k << 1); + B = B + l; + i = (ldc << 1); + C = C + i; + } + + for (j = (n & 1); j--;) /* runs at most once: last column */ + { + pc0 = C; + pa0 = A; + + for (i = (m / 8); i--;) + { + pb0 = B; + + LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3); + src_b[0] = pb0[0]; /* single B value written to both lanes */ + src_b[1] = pb0[0]; + + res0 = src_a0 * src_b; + res1 = src_a1 * src_b; + res2 = src_a2 * src_b; + res3 = src_a3 * src_b; + + pa0 += 8; + pb0 += 1; + + for (l = (k - 1); l--;) + { + LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3); + src_b[0] = pb0[0]; + src_b[1] = pb0[0]; + + res0 += src_a0 * src_b; + res1 += src_a1 * src_b; + res2 += src_a2 * src_b; + res3 += src_a3 * src_b; + + pa0 += 8; + pb0 += 1; + } + + LD_DP4(pc0, 2, dst0, dst1, dst2, dst3); + + dst0 += res0 * v_alpha; + dst1 += res1 * v_alpha; + dst2 += res2 * v_alpha; + dst3 += res3 * v_alpha; + + ST_DP4(dst0, dst1, dst2, dst3, pc0, 2); + + pc0 += 8; + } + + for (i = ((m & 4) / 4); i--;) + { + pb0 = B; + + LD_DP2(pa0, 2, src_a0, src_a1); + src_b[0] = pb0[0]; + src_b[1] = pb0[0]; + + res0 = src_a0 * src_b; + res1 = src_a1 * src_b; + + pa0 += 4; + pb0 += 1; + + for (l = (k - 1); l--;) + { + LD_DP2(pa0, 2, src_a0, src_a1); + src_b[0] = pb0[0]; + src_b[1] = pb0[0]; + + res0 += src_a0 * src_b; + res1 += src_a1 * src_b; + + pa0 += 4; + pb0 += 1; + } + + LD_DP2(pc0, 2, dst0, dst1); + + dst0 += res0 * 
v_alpha; + dst1 += res1 * v_alpha; + + ST_DP2(dst0, dst1, pc0, 2); + + pc0 += 4; + } + + for (i = ((m & 2) / 2); i--;) + { + pb0 = B; + + src_a0 = LD_DP(pa0); + src_b[0] = pb0[0]; + src_b[1] = pb0[0]; + + res0 = src_a0 * src_b; + + pa0 += 2; + pb0 += 1; + + for (l = (k - 1); l--;) + { + src_a0 = LD_DP(pa0); + src_b[0] = pb0[0]; + src_b[1] = pb0[0]; + + res0 += src_a0 * src_b; + + pa0 += 2; + pb0 += 1; + } + + dst0 = LD_DP(pc0); + + dst0 += res0 * v_alpha; + + ST_DP(dst0, pc0); + + pc0 += 2; + } + + for (i = (m & 1); i--;) + { + pb0 = B; + + a0 = pa0[0]; + b0 = pb0[0]; + tmp0 = a0 * b0; + + pa0 += 1; + pb0 += 1; + + for (l = (k - 1); l--;) + { + a0 = pa0[0]; + b0 = pb0[0]; + tmp0 += a0 * b0; + + pa0 += 1; + pb0 += 1; + } + + pc0[0] += alpha * tmp0; + + pc0 += 1; + } + + l = (k << 0); + B = B + l; + i = (ldc << 0); + C = C + i; + } + return 0; +} diff --git a/kernel/mips/dgemm_ncopy_4_msa.c b/kernel/mips/dgemm_ncopy_4_msa.c new file mode 100644 index 000000000..bbd76070f --- /dev/null +++ b/kernel/mips/dgemm_ncopy_4_msa.c @@ -0,0 +1,135 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + /* Packs src into dst in interleaved order. Source vectors start at src + c * lda (c = 0 .. n - 1) and each supplies m contiguous elements. They are taken four at a time (n >> 2), then two (n & 2), then one (n & 1); for a group of g vectors the output holds, for every i in 0 .. m - 1, the g values psrc1[i] .. psrcg[i] stored consecutively. The vector loops emit four values of i per iteration and the scalar loops finish the m & 3 leftovers in the same layout. NOTE(review): the vector-path layout relies on __msa_ilvr_d / __msa_ilvl_d pairing element 0 / element 1 of the second operand with that of the first, and on ST_DPn (macros_msa.h, not shown) storing its vectors back to back - confirm. Returns 0. */ +int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, + FLOAT * __restrict dst) +{ + BLASLONG i, j; + FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4; + FLOAT *pdst; + v2f64 src0, src1, src2, src3, src4, src5, src6, src7; + v2f64 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + + psrc0 = src; + pdst = dst; + + for (j = (n >> 2); j--;) /* groups of four source vectors */ + { + psrc1 = psrc0; + psrc2 = psrc1 + lda; + psrc3 = psrc2 + lda; + psrc4 = psrc3 + lda; + psrc0 += 4 * lda; + + for (i = (m >> 2); i--;) + { + LD_DP2(psrc1, 2, src0, src1); + LD_DP2(psrc2, 2, src2, src3); + LD_DP2(psrc3, 2, src4, src5); + LD_DP2(psrc4, 2, src6, src7); + + psrc1 += 4; + psrc2 += 4; + psrc3 += 4; + psrc4 += 4; + + dst0 = (v2f64) __msa_ilvr_d((v2i64) src2, (v2i64) src0); + dst1 = (v2f64) __msa_ilvr_d((v2i64) src6, (v2i64) src4); + dst2 = (v2f64) __msa_ilvr_d((v2i64) src3, (v2i64) src1); + dst3 = (v2f64) __msa_ilvr_d((v2i64) src7, (v2i64) src5); + + dst4 = (v2f64) __msa_ilvl_d((v2i64) src2, (v2i64) src0); + dst5 = (v2f64) __msa_ilvl_d((v2i64) src6, (v2i64) src4); + dst6 = (v2f64) __msa_ilvl_d((v2i64) src3, (v2i64) src1); + 
dst7 = (v2f64) __msa_ilvl_d((v2i64) src7, (v2i64) src5); + + ST_DP8(dst0, dst1, dst4, dst5, dst2, dst3, dst6, dst7, pdst, 2); /* argument order chosen so the output runs i = 0, 1, 2, 3 */ + pdst += 16; + } + + for (i = (m & 3); i--;) /* scalar tail, same interleaving */ + { + *pdst++ = *psrc1++; + *pdst++ = *psrc2++; + *pdst++ = *psrc3++; + *pdst++ = *psrc4++; + } + } + + if (n & 2) /* one group of two source vectors */ + { + psrc1 = psrc0; + psrc2 = psrc1 + lda; + psrc0 += 2 * lda; + + for (i = (m >> 2); i--;) + { + LD_DP2(psrc1, 2, src0, src1); + LD_DP2(psrc2, 2, src2, src3); + psrc1 += 4; + psrc2 += 4; + + dst0 = (v2f64) __msa_ilvr_d((v2i64) src2, (v2i64) src0); + dst1 = (v2f64) __msa_ilvr_d((v2i64) src3, (v2i64) src1); + dst4 = (v2f64) __msa_ilvl_d((v2i64) src2, (v2i64) src0); + dst5 = (v2f64) __msa_ilvl_d((v2i64) src3, (v2i64) src1); + + ST_DP4(dst0, dst4, dst1, dst5, pdst, 2); + pdst += 8; + } + + for (i = (m & 3); i--;) + { + *pdst++ = *psrc1++; + *pdst++ = *psrc2++; + } + } + + if (n & 1) /* last source vector is copied straight through */ + { + psrc1 = psrc0; + + for (i = (m >> 2); i--;) + { + LD_DP2(psrc1, 2, src0, src1); + psrc1 += 4; + + ST_DP2(src0, src1, pdst, 2); + pdst += 4; + } + + for (i = (m & 3); i--;) + { + *pdst++ = *psrc1++; + } + } + + return 0; +} diff --git a/kernel/mips/dgemm_ncopy_8_msa.c b/kernel/mips/dgemm_ncopy_8_msa.c new file mode 100644 index 000000000..43c977582 --- /dev/null +++ b/kernel/mips/dgemm_ncopy_8_msa.c @@ -0,0 +1,228 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + /* Packs src into dst in interleaved order. Source vectors start at src + c * lda (c = 0 .. n - 1) and each supplies m contiguous elements. They are taken eight at a time (n >> 3), then four (n & 4), two (n & 2) and one (n & 1); for a group of g vectors the output holds, for every i in 0 .. m - 1, the g values psrc1[i] .. psrcg[i] stored consecutively. Scalar tails cover the i values the vector loops leave over, in the same layout. NOTE(review): the vector-path layout relies on __msa_ilvr_d / __msa_ilvl_d pairing element 0 / element 1 of the second operand with that of the first, and on ST_DP8 (macros_msa.h, not shown) storing its vectors back to back - confirm. Returns 0. */ +int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, + FLOAT * __restrict dst) +{ + BLASLONG i, j; + FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4; + FLOAT *psrc5, *psrc6, *psrc7, *psrc8; + FLOAT *pdst; + v2f64 src0, src1, src2, src3, src4, src5, src6, src7; + v2f64 src8, src9, src10, src11, src12, src13, src14, src15; + v2f64 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + + psrc0 = src; + pdst = dst; + + for (j = (n >> 3); j--;) /* groups of eight source vectors */ + { + psrc1 = psrc0; + psrc2 = psrc1 + lda; + psrc3 = psrc2 + lda; + psrc4 = psrc3 + lda; + psrc5 = psrc4 + lda; + psrc6 = psrc5 + lda; + psrc7 = psrc6 + lda; + psrc8 = psrc7 + lda; + psrc0 += 8 * lda; + + for (i = (m >> 3); i--;) /* eight i values per pass, written as four 16-element stores */ + { + LD_DP2(psrc1, 2, src0, src1); + LD_DP2(psrc2, 2, src2, src3); + LD_DP2(psrc3, 2, src4, src5); + LD_DP2(psrc4, 2, src6, src7); + LD_DP2(psrc5, 2, src8, src9); + LD_DP2(psrc6, 2, src10, src11); + LD_DP2(psrc7, 2, src12, 
src13); + LD_DP2(psrc8, 2, src14, src15); + + dst0 = (v2f64) __msa_ilvr_d((v2i64) src2, (v2i64) src0); + dst1 = (v2f64) __msa_ilvr_d((v2i64) src6, (v2i64) src4); + dst2 = (v2f64) __msa_ilvr_d((v2i64) src10, (v2i64) src8); + dst3 = (v2f64) __msa_ilvr_d((v2i64) src14, (v2i64) src12); + dst4 = (v2f64) __msa_ilvl_d((v2i64) src2, (v2i64) src0); + dst5 = (v2f64) __msa_ilvl_d((v2i64) src6, (v2i64) src4); + dst6 = (v2f64) __msa_ilvl_d((v2i64) src10, (v2i64) src8); + dst7 = (v2f64) __msa_ilvl_d((v2i64) src14, (v2i64) src12); + + ST_DP8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 2); /* i = 0 and 1 */ + + dst0 = (v2f64) __msa_ilvr_d((v2i64) src3, (v2i64) src1); + dst1 = (v2f64) __msa_ilvr_d((v2i64) src7, (v2i64) src5); + dst2 = (v2f64) __msa_ilvr_d((v2i64) src11, (v2i64) src9); + dst3 = (v2f64) __msa_ilvr_d((v2i64) src15, (v2i64) src13); + dst4 = (v2f64) __msa_ilvl_d((v2i64) src3, (v2i64) src1); + dst5 = (v2f64) __msa_ilvl_d((v2i64) src7, (v2i64) src5); + dst6 = (v2f64) __msa_ilvl_d((v2i64) src11, (v2i64) src9); + dst7 = (v2f64) __msa_ilvl_d((v2i64) src15, (v2i64) src13); + + ST_DP8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst + 16, /* i = 2 and 3 */ + 2); + + LD_DP2(psrc1 + 4, 2, src0, src1); + LD_DP2(psrc2 + 4, 2, src2, src3); + LD_DP2(psrc3 + 4, 2, src4, src5); + LD_DP2(psrc4 + 4, 2, src6, src7); + LD_DP2(psrc5 + 4, 2, src8, src9); + LD_DP2(psrc6 + 4, 2, src10, src11); + LD_DP2(psrc7 + 4, 2, src12, src13); + LD_DP2(psrc8 + 4, 2, src14, src15); + + dst0 = (v2f64) __msa_ilvr_d((v2i64) src2, (v2i64) src0); + dst1 = (v2f64) __msa_ilvr_d((v2i64) src6, (v2i64) src4); + dst2 = (v2f64) __msa_ilvr_d((v2i64) src10, (v2i64) src8); + dst3 = (v2f64) __msa_ilvr_d((v2i64) src14, (v2i64) src12); + dst4 = (v2f64) __msa_ilvl_d((v2i64) src2, (v2i64) src0); + dst5 = (v2f64) __msa_ilvl_d((v2i64) src6, (v2i64) src4); + dst6 = (v2f64) __msa_ilvl_d((v2i64) src10, (v2i64) src8); + dst7 = (v2f64) __msa_ilvl_d((v2i64) src14, (v2i64) src12); + + ST_DP8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst + 32, /* i = 4 and 5 */ + 2); 
+ + dst0 = (v2f64) __msa_ilvr_d((v2i64) src3, (v2i64) src1); + dst1 = (v2f64) __msa_ilvr_d((v2i64) src7, (v2i64) src5); + dst2 = (v2f64) __msa_ilvr_d((v2i64) src11, (v2i64) src9); + dst3 = (v2f64) __msa_ilvr_d((v2i64) src15, (v2i64) src13); + dst4 = (v2f64) __msa_ilvl_d((v2i64) src3, (v2i64) src1); + dst5 = (v2f64) __msa_ilvl_d((v2i64) src7, (v2i64) src5); + dst6 = (v2f64) __msa_ilvl_d((v2i64) src11, (v2i64) src9); + dst7 = (v2f64) __msa_ilvl_d((v2i64) src15, (v2i64) src13); + + ST_DP8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst + 48, /* i = 6 and 7 */ + 2); + + psrc1 += 8; + psrc2 += 8; + psrc3 += 8; + psrc4 += 8; + psrc5 += 8; + psrc6 += 8; + psrc7 += 8; + psrc8 += 8; + pdst += 64; + } + + for (i = (m & 7); i--;) /* scalar tail, same interleaving */ + { + *pdst++ = *psrc1++; + *pdst++ = *psrc2++; + *pdst++ = *psrc3++; + *pdst++ = *psrc4++; + *pdst++ = *psrc5++; + *pdst++ = *psrc6++; + *pdst++ = *psrc7++; + *pdst++ = *psrc8++; + } + } + + if (n & 4) /* one group of four source vectors */ + { + psrc1 = psrc0; + psrc2 = psrc1 + lda; + psrc3 = psrc2 + lda; + psrc4 = psrc3 + lda; + psrc0 += 4 * lda; + + for (i = (m >> 2); i--;) + { + LD_DP2(psrc1, 2, src0, src1); + LD_DP2(psrc2, 2, src2, src3); + LD_DP2(psrc3, 2, src4, src5); + LD_DP2(psrc4, 2, src6, src7); + psrc1 += 4; + psrc2 += 4; + psrc3 += 4; + psrc4 += 4; + + dst0 = (v2f64) __msa_ilvr_d((v2i64) src2, (v2i64) src0); + dst1 = (v2f64) __msa_ilvr_d((v2i64) src6, (v2i64) src4); + dst2 = (v2f64) __msa_ilvr_d((v2i64) src3, (v2i64) src1); + dst3 = (v2f64) __msa_ilvr_d((v2i64) src7, (v2i64) src5); + + dst4 = (v2f64) __msa_ilvl_d((v2i64) src2, (v2i64) src0); + dst5 = (v2f64) __msa_ilvl_d((v2i64) src6, (v2i64) src4); + dst6 = (v2f64) __msa_ilvl_d((v2i64) src3, (v2i64) src1); + dst7 = (v2f64) __msa_ilvl_d((v2i64) src7, (v2i64) src5); + + ST_DP8(dst0, dst1, dst4, dst5, dst2, dst3, dst6, dst7, pdst, 2); + pdst += 16; + } + + for (i = (m & 3); i--;) + { + *pdst++ = *psrc1++; + *pdst++ = *psrc2++; + *pdst++ = *psrc3++; + *pdst++ = *psrc4++; + } + } + + if (n & 2) /* one group of two source vectors */ + { + psrc1 = psrc0; + psrc2 = psrc1 + lda; + 
psrc0 += 2 * lda; + + for (i = (m >> 1); i--;) + { + src0 = LD_DP(psrc1); + src1 = LD_DP(psrc2); + psrc1 += 2; + psrc2 += 2; + + dst0 = (v2f64) __msa_ilvr_d((v2i64) src1, (v2i64) src0); + dst1 = (v2f64) __msa_ilvl_d((v2i64) src1, (v2i64) src0); + + ST_DP2(dst0, dst1, pdst, 2); + pdst += 4; + } + + if (m & 1) + { + *pdst++ = *psrc1++; + *pdst++ = *psrc2++; + } + } + + if (n & 1) /* last source vector: plain element-by-element copy */ + { + psrc1 = psrc0; + + for (i = m; i--;) + { + *pdst++ = *psrc1++; + } + } + + return 0; +} diff --git a/kernel/mips/dgemm_tcopy_4_msa.c b/kernel/mips/dgemm_tcopy_4_msa.c new file mode 100644 index 000000000..f147d190e --- /dev/null +++ b/kernel/mips/dgemm_tcopy_4_msa.c @@ -0,0 +1,162 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + /* Copies src into dst in blocked order. Source vectors start at src + r * lda (r = 0 .. m - 1) and each supplies n contiguous elements; they are taken four at a time (m >> 2), then two (m & 2), then one (m & 1). For every full chunk of 4 elements, the chunks of the current vectors are stored back to back at pdst1, which then jumps ahead by 4 * m elements; pdst0, the start for the next group, moves by 4 elements per source vector. Leftover pairs (n & 2) are appended at pdst2 = dst + m * (n & ~3) and leftover single elements (n & 1) at pdst3 = dst + m * (n & ~1). NOTE(review): LD_DPn / ST_DPn come from macros_msa.h (not shown here); presumably they move n v2f64 vectors spaced by the given element stride. Returns 0. */ +int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, + FLOAT * __restrict dst) +{ + BLASLONG i, j; + FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4; + FLOAT *pdst0, *pdst1, *pdst2, *pdst3; + v2f64 src0, src1, src2, src3, src4, src5, src6, src7; + + psrc0 = src; + pdst0 = dst; + + pdst2 = dst + m * (n & ~3); /* region for the n & 2 leftovers */ + pdst3 = dst + m * (n & ~1); /* region for the n & 1 leftovers */ + + for (j = (m >> 2); j--;) /* groups of four source vectors */ + { + psrc1 = psrc0; + psrc2 = psrc1 + lda; + psrc3 = psrc2 + lda; + psrc4 = psrc3 + lda; + psrc0 += 4 * lda; + + pdst1 = pdst0; + pdst0 += 16; + + for (i = (n >> 2); i--;) + { + LD_DP2(psrc1, 2, src0, src1); + LD_DP2(psrc2, 2, src2, src3); + LD_DP2(psrc3, 2, src4, src5); + LD_DP2(psrc4, 2, src6, src7); + psrc1 += 4; + psrc2 += 4; + psrc3 += 4; + psrc4 += 4; + + ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2); + pdst1 += m * 4; /* next 4-element chunk lives 4 * m elements further on */ + } + + if (n & 2) + { + src0 = LD_DP(psrc1); + src1 = LD_DP(psrc2); + src2 = LD_DP(psrc3); + src3 = LD_DP(psrc4); + psrc1 += 2; + psrc2 += 2; + psrc3 += 2; + psrc4 += 2; + + ST_DP4(src0, src1, src2, src3, pdst2, 2); + pdst2 += 8; + } + + if (n & 1) + { + *pdst3++ = *psrc1++; + *pdst3++ = *psrc2++; + *pdst3++ = *psrc3++; + *pdst3++ = *psrc4++; + } + } + + if (m & 2) /* one group of two source vectors */ + { + psrc1 = psrc0; + psrc2 = psrc1 + lda; + psrc0 += 2 * lda; + + pdst1 = pdst0; + pdst0 += 8; + + for (i = (n >> 2); i--;) + { + 
LD_DP2(psrc1, 2, src0, src1); + LD_DP2(psrc2, 2, src2, src3); + psrc1 += 4; + psrc2 += 4; + + ST_DP4(src0, src1, src2, src3, pdst1, 2); + pdst1 += m * 4; + } + + if (n & 2) + { + src0 = LD_DP(psrc1); + src1 = LD_DP(psrc2); + psrc1 += 2; + psrc2 += 2; + + ST_DP2(src0, src1, pdst2, 2); + pdst2 += 4; + } + + if (n & 1) + { + *pdst3++ = *psrc1++; + *pdst3++ = *psrc2++; + } + } + + if (m & 1) /* last source vector; destination cursors are not advanced after the final leftovers */ + { + psrc1 = psrc0; + pdst1 = pdst0; + + for (i = (n >> 2); i--;) + { + LD_DP2(psrc1, 2, src0, src1); + psrc1 += 4; + + ST_DP2(src0, src1, pdst1, 2); + pdst1 += 4 * m; + } + + if (n & 2) + { + src0 = LD_DP(psrc1); + psrc1 += 2; + + ST_DP(src0, pdst2); + } + + if (n & 1) + { + *pdst3 = *psrc1; + } + } + + return 0; +} diff --git a/kernel/mips/dgemm_tcopy_8_msa.c b/kernel/mips/dgemm_tcopy_8_msa.c new file mode 100644 index 000000000..d1ac49b5a --- /dev/null +++ b/kernel/mips/dgemm_tcopy_8_msa.c @@ -0,0 +1,317 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+/* Packs m lines of n elements from 'src' (consecutive lines are 'lda'
+ * elements apart) into 'dst'.
+ *
+ * Lines are consumed in groups of 8, then 4, 2 and 1 (the bits of m).  Within
+ * a line group the elements are consumed 8 at a time, then 4, 2 and 1 (the
+ * bits of n).  Every 8-element step writes one contiguous tile at pdst1, and
+ * successive tiles of the same line group are 8 * m elements apart.  The
+ * leftover 4-, 2- and 1-element pieces are appended to the regions starting
+ * at dst + m * (n & ~7), dst + m * (n & ~3) and dst + m * (n & ~1).
+ *
+ * Always returns 0.
+ *
+ * NOTE(review): presumably the DGEMM "tcopy" packing kernel with
+ * FLOAT == double (each v2f64 access moves two elements) -- confirm against
+ * the KERNEL file that selects this source. */
+int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
+          FLOAT * __restrict dst)
+{
+    BLASLONG i, j;
+    FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4;
+    FLOAT *psrc5, *psrc6, *psrc7, *psrc8;
+    FLOAT *pdst0, *pdst1, *pdst2, *pdst3, *pdst4;
+    v2f64 src0, src1, src2, src3, src4, src5, src6, src7;
+    v2f64 src8, src9, src10, src11, src12, src13, src14, src15;
+
+    psrc0 = src;
+    pdst0 = dst;
+
+    pdst2 = dst + m * (n & ~7);  /* destination of the (n & 4) leftovers */
+    pdst3 = dst + m * (n & ~3);  /* destination of the (n & 2) leftovers */
+    pdst4 = dst + m * (n & ~1);  /* destination of the (n & 1) leftovers */
+
+    for (j = (m >> 3); j--;)
+    {
+        psrc1 = psrc0;
+        psrc2 = psrc1 + lda;
+        psrc3 = psrc2 + lda;
+        psrc4 = psrc3 + lda;
+        psrc5 = psrc4 + lda;
+        psrc6 = psrc5 + lda;
+        psrc7 = psrc6 + lda;
+        psrc8 = psrc7 + lda;
+        psrc0 += 8 * lda;
+
+        pdst1 = pdst0;
+        pdst0 += 64;
+
+        for (i = (n >> 3); i--;)
+        {
+            LD_DP4(psrc1, 2, src0, src1, src2, src3);
+            LD_DP4(psrc2, 2, src4, src5, src6, src7);
+            LD_DP4(psrc3, 2, src8, src9, src10, src11);
+            LD_DP4(psrc4, 2, src12, src13, src14, src15);
+            psrc1 += 8;
+            psrc2 += 8;
+            psrc3 += 8;
+            psrc4 += 8;
+
+            ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2);
+            ST_DP8(src8, src9, src10, src11, src12, src13, src14, src15,
+                   pdst1 + 16, 2);
+
+            LD_DP4(psrc5, 2, src0, src1, src2, src3);
+            LD_DP4(psrc6, 2, src4, src5, src6, src7);
+            LD_DP4(psrc7, 2, src8, src9, src10, src11);
+            LD_DP4(psrc8, 2, src12, src13, src14, src15);
+            psrc5 += 8;
+            psrc6 += 8;
+            psrc7 += 8;
+            psrc8 += 8;
+
+            ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1 + 32,
+                   2);
+            ST_DP8(src8, src9, src10, src11, src12, src13, src14, src15,
+                   pdst1 + 48, 2);
+            pdst1 += m * 8;
+        }
+
+        if (n & 4)
+        {
+            LD_DP2(psrc1, 2, src0, src1);
+            LD_DP2(psrc2, 2, src2, src3);
+            LD_DP2(psrc3, 2, src4, src5);
+            LD_DP2(psrc4, 2, src6, src7);
+            LD_DP2(psrc5, 2, src8, src9);
+            LD_DP2(psrc6, 2, src10, src11);
+            LD_DP2(psrc7, 2, src12, src13);
+            LD_DP2(psrc8, 2, src14, src15);
+            psrc1 += 4;
+            psrc2 += 4;
+            psrc3 += 4;
+            psrc4 += 4;
+            psrc5 += 4;
+            psrc6 += 4;
+            psrc7 += 4;
+            psrc8 += 4;
+
+            ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst2, 2);
+            ST_DP8(src8, src9, src10, src11, src12, src13, src14, src15,
+                   pdst2 + 16, 2);
+            pdst2 += 32;
+        }
+
+        if (n & 2)
+        {
+            src0 = LD_DP(psrc1);
+            src1 = LD_DP(psrc2);
+            src2 = LD_DP(psrc3);
+            src3 = LD_DP(psrc4);
+            src4 = LD_DP(psrc5);
+            src5 = LD_DP(psrc6);
+            src6 = LD_DP(psrc7);
+            src7 = LD_DP(psrc8);
+            psrc1 += 2;
+            psrc2 += 2;
+            psrc3 += 2;
+            psrc4 += 2;
+            psrc5 += 2;
+            psrc6 += 2;
+            psrc7 += 2;
+            psrc8 += 2;
+
+            ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst3, 2);
+            pdst3 += 16;
+        }
+
+        if (n & 1)
+        {
+            *pdst4++ = *psrc1++;
+            *pdst4++ = *psrc2++;
+            *pdst4++ = *psrc3++;
+            *pdst4++ = *psrc4++;
+            *pdst4++ = *psrc5++;
+            *pdst4++ = *psrc6++;
+            *pdst4++ = *psrc7++;
+            *pdst4++ = *psrc8++;
+        }
+    }
+
+    if (m & 4)
+    {
+        psrc1 = psrc0;
+        psrc2 = psrc1 + lda;
+        psrc3 = psrc2 + lda;
+        psrc4 = psrc3 + lda;
+        psrc0 += 4 * lda;
+
+        pdst1 = pdst0;
+        pdst0 += 32;
+
+        for (i = (n >> 3); i--;)
+        {
+            LD_DP4(psrc1, 2, src0, src1, src2, src3);
+            LD_DP4(psrc2, 2, src4, src5, src6, src7);
+            LD_DP4(psrc3, 2, src8, src9, src10, src11);
+            LD_DP4(psrc4, 2, src12, src13, src14, src15);
+            psrc1 += 8;
+            psrc2 += 8;
+            psrc3 += 8;
+            psrc4 += 8;
+            /* Only psrc1..psrc4 are live in this 4-line path.  psrc5..psrc8
+             * are never assigned when m < 8, so they are deliberately left
+             * untouched here (advancing them would read indeterminate
+             * pointer values). */
+            ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2);
+            ST_DP8(src8, src9, src10, src11, src12, src13, src14, src15,
+                   pdst1 + 16, 2);
+            pdst1 += 8 * m;
+        }
+
+        if (n & 4)
+        {
+            LD_DP2(psrc1, 2, src0, src1);
+            LD_DP2(psrc2, 2, src2, src3);
+            LD_DP2(psrc3, 2, src4, src5);
+            LD_DP2(psrc4, 2, src6, src7);
+            psrc1 += 4;
+            psrc2 += 4;
+            psrc3 += 4;
+            psrc4 += 4;
+
+            ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst2, 2);
+            pdst2 += 16;
+        }
+
+        if (n & 2)
+        {
+            src0 = LD_DP(psrc1);
+            src1 = LD_DP(psrc2);
+            src2 = LD_DP(psrc3);
+            src3 = LD_DP(psrc4);
+            psrc1 += 2;
+            psrc2 += 2;
+            psrc3 += 2;
+            psrc4 += 2;
+
+            ST_DP4(src0, src1, src2, src3, pdst3, 2);
+            pdst3 += 8;
+        }
+
+        if (n & 1)
+        {
+            *pdst4++ = *psrc1++;
+            *pdst4++ = *psrc2++;
+            *pdst4++ = *psrc3++;
+            *pdst4++ = *psrc4++;
+        }
+    }
+
+    if (m & 2)
+    {
+        psrc1 = psrc0;
+        psrc2 = psrc1 + lda;
+        psrc0 += 2 * lda;
+
+        pdst1 = pdst0;
+        pdst0 += 16;
+
+        for (i = (n >> 3); i--;)
+        {
+            LD_DP4(psrc1, 2, src0, src1, src2, src3);
+            LD_DP4(psrc2, 2, src4, src5, src6, src7);
+            psrc1 += 8;
+            psrc2 += 8;
+
+            ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2);
+            pdst1 += 8 * m;
+        }
+
+        if (n & 4)
+        {
+            LD_DP2(psrc1, 2, src0, src1);
+            LD_DP2(psrc2, 2, src2, src3);
+            psrc1 += 4;
+            psrc2 += 4;
+
+            ST_DP4(src0, src1, src2, src3, pdst2, 2);
+            pdst2 += 8;
+        }
+
+        if (n & 2)
+        {
+            src0 = LD_DP(psrc1);
+            src1 = LD_DP(psrc2);
+            psrc1 += 2;
+            psrc2 += 2;
+
+            ST_DP2(src0, src1, pdst3, 2);
+            pdst3 += 4;
+        }
+
+        if (n & 1)
+        {
+            *pdst4++ = *psrc1++;
+            *pdst4++ = *psrc2++;
+        }
+    }
+
+    if (m & 1)
+    {
+        psrc1 = psrc0;
+        psrc0 += lda;
+
+        pdst1 = pdst0;
+        pdst0 += 8;
+
+        for (i = (n >> 3); i--;)
+        {
+            LD_DP4(psrc1, 2, src0, src1, src2, src3);
+            psrc1 += 8;
+
+            ST_DP4(src0, src1, src2, src3, pdst1, 2);
+            pdst1 += 8 * m;
+        }
+
+        if (n & 4)
+        {
+            LD_DP2(psrc1, 2, src0, src1);
+            psrc1 += 4;
+
+            ST_DP2(src0, src1, pdst2, 2);
+            pdst2 += 4;
+        }
+
+        if (n & 2)
+        {
+            src0 = LD_DP(psrc1);
+            psrc1 += 2;
+
+            ST_DP(src0, pdst3);
+            pdst3 += 2;
+        }
+
+        if (n & 1)
+        {
+            *pdst4++ = *psrc1++;
+        }
+    }
+
+    return 0;
+}
diff --git a/kernel/mips/macros_msa.h b/kernel/mips/macros_msa.h
new file mode 100644
index 000000000..3bcc59629
--- /dev/null
+++ b/kernel/mips/macros_msa.h
@@ -0,0 +1,79 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#ifndef __MACROS_MSA_H__
+#define __MACROS_MSA_H__
+
+#include <msa.h>
+
+#define LD_D(RTYPE, psrc) *((RTYPE *)(psrc))
+#define LD_DP(...)
LD_D(v2f64, __VA_ARGS__)
+
+#define ST_D(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
+#define ST_DP(...) ST_D(v2f64, __VA_ARGS__)
+
+/* Description : Load 2 vectors of double precision floating point elements with stride
+   Arguments   : Inputs  - psrc, stride (added to psrc before the vector cast,
+                           so it counts elements of psrc's own pointee type)
+                 Outputs - out0, out1
+                 Return Type - double precision floating point
+*/
+#define LD_DP2(psrc, stride, out0, out1)  \
+{                                         \
+    out0 = LD_DP((psrc));                 \
+    out1 = LD_DP((psrc) + (stride));      \
+}
+/* LD_DP4: four loads, from psrc, psrc + stride, psrc + 2 * stride and
+   psrc + 3 * stride.  The nested LD_DP2 invocations carry no semicolon, so
+   these macros must stay plain brace blocks. */
+#define LD_DP4(psrc, stride, out0, out1, out2, out3)         \
+{                                                            \
+    LD_DP2((psrc), (stride), out0, out1)                     \
+    LD_DP2((psrc) + 2 * (stride), (stride), out2, out3)      \
+}
+
+/* Description : Store vectors of double precision floating point elements with stride
+   Arguments   : Inputs - in0, in1, pdst, stride
+   Details     : Store 2 double precision floating point elements from 'in0' to (pdst)
+                 Store 2 double precision floating point elements from 'in1' to (pdst + stride)
+*/
+#define ST_DP2(in0, in1, pdst, stride)  \
+{                                       \
+    ST_DP(in0, (pdst));                 \
+    ST_DP(in1, (pdst) + (stride));      \
+}
+/* ST_DP4 / ST_DP8: 4 resp. 8 consecutive stores, 'stride' apart, built from
+   ST_DP2 / ST_DP4. */
+#define ST_DP4(in0, in1, in2, in3, pdst, stride)            \
+{                                                           \
+    ST_DP2(in0, in1, (pdst), (stride));                     \
+    ST_DP2(in2, in3, (pdst) + 2 * (stride), (stride));      \
+}
+
+#define ST_DP8(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
+{                                                                     \
+    ST_DP4(in0, in1, in2, in3, (pdst), (stride));                     \
+    ST_DP4(in4, in5, in6, in7, (pdst) + 4 * (stride), (stride));      \
+}
+
+#endif /* __MACROS_MSA_H__ */
diff --git a/param.h b/param.h
index 93b1220d6..6948e6a76 100644
--- a/param.h
+++ b/param.h
@@ -2185,8 +2185,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define SGEMM_DEFAULT_UNROLL_M 2
 #define SGEMM_DEFAULT_UNROLL_N 2
 
-#define DGEMM_DEFAULT_UNROLL_M 2
-#define DGEMM_DEFAULT_UNROLL_N 2
+#define DGEMM_DEFAULT_UNROLL_M 8
+#define DGEMM_DEFAULT_UNROLL_N 4
 
 #define CGEMM_DEFAULT_UNROLL_M 2
 #define CGEMM_DEFAULT_UNROLL_N 2
@@ -2224,8 +2224,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define SGEMM_DEFAULT_UNROLL_M 2 #define SGEMM_DEFAULT_UNROLL_N 2 -#define DGEMM_DEFAULT_UNROLL_M 2 -#define DGEMM_DEFAULT_UNROLL_N 2 +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 #define CGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_N 2 From efaf30d536374a8a86147d6d76c79f91cf6d55b1 Mon Sep 17 00:00:00 2001 From: Ivan Ukhov Date: Tue, 3 May 2016 21:31:32 +0200 Subject: [PATCH 27/70] Wrap CURDIR and DESTDIR in quotes --- Makefile.install | 98 ++++++++++++++++++++++++------------------------ exports/Makefile | 4 +- 2 files changed, 51 insertions(+), 51 deletions(-) diff --git a/Makefile.install b/Makefile.install index 5da4e68c9..1b9388a8b 100644 --- a/Makefile.install +++ b/Makefile.install @@ -20,75 +20,75 @@ lib.grd : $(error OpenBLAS: Please run "make" firstly) install : lib.grd - @-mkdir -p $(DESTDIR)$(PREFIX) - @-mkdir -p $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) - @-mkdir -p $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) - @-mkdir -p $(DESTDIR)$(OPENBLAS_BINARY_DIR) - @-mkdir -p $(DESTDIR)$(OPENBLAS_CMAKE_DIR) + @-mkdir -p "$(DESTDIR)$(PREFIX)" + @-mkdir -p "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)" + @-mkdir -p "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" + @-mkdir -p "$(DESTDIR)$(OPENBLAS_BINARY_DIR)" + @-mkdir -p "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)" @echo Generating openblas_config.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) #for inc - @echo \#ifndef OPENBLAS_CONFIG_H > $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h - @echo \#define OPENBLAS_CONFIG_H >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h - @$(AWK) 'NF {print $$1, "OPENBLAS_"$$2, $$3}' config_last.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h - @echo \#define OPENBLAS_VERSION \" OpenBLAS $(VERSION) \" >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h - @cat openblas_config_template.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h - @echo \#endif \/\* OPENBLAS_CONFIG_H \*\/ >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h + @echo \#ifndef OPENBLAS_CONFIG_H > 
"$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h" + @echo \#define OPENBLAS_CONFIG_H >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h" + @$(AWK) 'NF {print $$1, "OPENBLAS_"$$2, $$3}' config_last.h >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h" + @echo \#define OPENBLAS_VERSION \" OpenBLAS $(VERSION) \" >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h" + @cat openblas_config_template.h >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h" + @echo \#endif \/\* OPENBLAS_CONFIG_H \*\/ >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h" @echo Generating f77blas.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) - @echo \#ifndef OPENBLAS_F77BLAS_H > $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h - @echo \#define OPENBLAS_F77BLAS_H >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h - @echo \#include \"openblas_config.h\" >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h - @cat common_interface.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h - @echo \#endif >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h + @echo \#ifndef OPENBLAS_F77BLAS_H > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h" + @echo \#define OPENBLAS_F77BLAS_H >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h" + @echo \#include \"openblas_config.h\" >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h" + @cat common_interface.h >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h" + @echo \#endif >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h" ifndef NO_CBLAS @echo Generating cblas.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) - @sed 's/common/openblas_config/g' cblas.h > $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h + @sed 's/common/openblas_config/g' cblas.h > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h" endif ifndef NO_LAPACKE @echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) - @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h - @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h 
$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h - @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h - @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h + @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h" + @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h" + @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h" + @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h" endif #for install static library ifndef NO_STATIC @echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) - @install -pm644 $(LIBNAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) - @cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \ + @install -pm644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" + @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) endif #for install shared library ifndef NO_SHARED @echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS)) - @install -pm755 $(LIBSONAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) - @cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \ + @install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" + @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \ ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) endif ifeq ($(OSNAME), FreeBSD) - @cp $(LIBSONAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) - @cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \ + @cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" + @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ ln -fs $(LIBSONAME) $(LIBPREFIX).so endif ifeq ($(OSNAME), NetBSD) - @cp $(LIBSONAME) 
$(DESTDIR)$(OPENBLAS_LIBRARY_DIR) - @cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \ + @cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" + @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ ln -fs $(LIBSONAME) $(LIBPREFIX).so endif ifeq ($(OSNAME), Darwin) - @-cp $(LIBDYNNAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) - @-install_name_tool -id $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) - @cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \ + @-cp $(LIBDYNNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" + @-install_name_tool -id "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" + @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib endif ifeq ($(OSNAME), WINNT) - @-cp $(LIBDLLNAME) $(DESTDIR)$(OPENBLAS_BINARY_DIR) - @-cp $(LIBDLLNAME).a $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) + @-cp $(LIBDLLNAME) "$(DESTDIR)$(OPENBLAS_BINARY_DIR)" + @-cp $(LIBDLLNAME).a "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" endif ifeq ($(OSNAME), CYGWIN_NT) @-cp $(LIBDLLNAME) $(OPENBLAS_BINARY_DIR) @@ -96,34 +96,34 @@ endif endif #Generating OpenBLASConfig.cmake @echo Generating $(OPENBLAS_CMAKE_CONFIG) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR) - @echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG) - @echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG) + @echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" + @echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" ifndef NO_SHARED #ifeq logical or ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD)) - @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG) + @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> 
"$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" endif ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT)) - @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_BINARY_DIR}/$(LIBDLLNAME))" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG) + @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_BINARY_DIR}/$(LIBDLLNAME))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" endif ifeq ($(OSNAME), Darwin) - @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).dylib)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG) + @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).dylib)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" endif else #only static - @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).$(LIBSUFFIX))" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG) + @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).$(LIBSUFFIX))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" endif #Generating OpenBLASConfigVersion.cmake @echo Generating $(OPENBLAS_CMAKE_CONFIG_VERSION) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR) - @echo "set (PACKAGE_VERSION \"${VERSION}\")" > $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) - @echo "if (PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) - @echo " set (PACKAGE_VERSION_COMPATIBLE FALSE)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) - @echo "else ()" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) - @echo " set (PACKAGE_VERSION_COMPATIBLE TRUE)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) - @echo " if (PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) - @echo " set (PACKAGE_VERSION_EXACT TRUE)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) - @echo " endif ()" >> 
$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) - @echo "endif ()" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) + @echo "set (PACKAGE_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" + @echo "if (PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" + @echo " set (PACKAGE_VERSION_COMPATIBLE FALSE)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" + @echo "else ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" + @echo " set (PACKAGE_VERSION_COMPATIBLE TRUE)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" + @echo " if (PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" + @echo " set (PACKAGE_VERSION_EXACT TRUE)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" + @echo " endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" + @echo "endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" @echo Install OK! 
diff --git a/exports/Makefile b/exports/Makefile index c2b8d9c1c..5632b6fff 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -110,9 +110,9 @@ $(LIBDYNNAME) : ../$(LIBNAME).osx.renamed osx.def endif ifeq ($(NOFORTRAN), $(filter $(NOFORTRAN),1 2)) #only build without Fortran - $(CC) $(CFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) + $(CC) $(CFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) else - $(FC) $(FFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) + $(FC) $(FFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) endif dllinit.$(SUFFIX) : dllinit.c From 573d9218f25071277f71220c6446a3e0c9459ca8 Mon Sep 17 00:00:00 2001 From: Shivraj Patil Date: Mon, 9 May 2016 14:45:12 +0530 Subject: [PATCH 28/70] build fix for MIPS 32 bit Signed-off-by: Shivraj Patil --- f_check | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/f_check b/f_check index 4c9d81e9f..4c03ac768 100644 --- a/f_check +++ b/f_check @@ -223,7 +223,7 @@ if (!$?) { } #For gfortran MIPS if ($?) 
{ - $link = `$compiler $openmp -mabi=n32 -v ftest2.f 2>&1 && rm -f a.out a.exe`; + $link = `$compiler $openmp -mabi=32 -v ftest2.f 2>&1 && rm -f a.out a.exe`; } $binary = "" if ($?); } From edb5980c1366f528645361b55b7e90758fa5949c Mon Sep 17 00:00:00 2001 From: Kaustubh Raste Date: Mon, 9 May 2016 15:15:26 +0530 Subject: [PATCH 29/70] DTRSM optimization for MIPS P5600 and I6400 using MSA Signed-off-by: Kaustubh Raste --- CONTRIBUTORS.md | 3 + kernel/mips/KERNEL.P5600 | 8 +- kernel/mips/dtrsm_kernel_LN_8x4_msa.c | 1416 +++++++++++++++++++++++++ kernel/mips/dtrsm_kernel_LT_8x4_msa.c | 1397 ++++++++++++++++++++++++ kernel/mips/dtrsm_kernel_RN_8x4_msa.c | 963 +++++++++++++++++ kernel/mips/dtrsm_kernel_RT_8x4_msa.c | 866 +++++++++++++++ 6 files changed, 4649 insertions(+), 4 deletions(-) create mode 100644 kernel/mips/dtrsm_kernel_LN_8x4_msa.c create mode 100644 kernel/mips/dtrsm_kernel_LT_8x4_msa.c create mode 100644 kernel/mips/dtrsm_kernel_RN_8x4_msa.c create mode 100644 kernel/mips/dtrsm_kernel_RT_8x4_msa.c diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 999413be2..a13308f71 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -157,3 +157,6 @@ In chronological order: * Shivraj Patil * [2016-05-03] DGEMM optimization for MIPS P5600 and I6400 using MSA + +* Kaustubh Raste + * [2016-05-09] DTRSM optimization for MIPS P5600 and I6400 using MSA diff --git a/kernel/mips/KERNEL.P5600 b/kernel/mips/KERNEL.P5600 index d21575251..0ac30d77c 100644 --- a/kernel/mips/KERNEL.P5600 +++ b/kernel/mips/KERNEL.P5600 @@ -118,10 +118,10 @@ STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +DTRSMKERNEL_LN = ../mips/dtrsm_kernel_LN_8x4_msa.c +DTRSMKERNEL_LT = ../mips/dtrsm_kernel_LT_8x4_msa.c +DTRSMKERNEL_RN = 
../mips/dtrsm_kernel_RN_8x4_msa.c +DTRSMKERNEL_RT = ../mips/dtrsm_kernel_RT_8x4_msa.c CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c diff --git a/kernel/mips/dtrsm_kernel_LN_8x4_msa.c b/kernel/mips/dtrsm_kernel_LN_8x4_msa.c new file mode 100644 index 000000000..9f0eb95a5 --- /dev/null +++ b/kernel/mips/dtrsm_kernel_LN_8x4_msa.c @@ -0,0 +1,1416 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+/* Solve step for one 8x4 tile of C in the "LN" DTRSM kernel.
+ *
+ *   a   : packed panel, 8 values consumed per k step.  After the update the
+ *         64 values immediately before 'a' are read as an 8x8 block with
+ *         entry (i, j) at index 8 * i + j; only entries with j <= i are used.
+ *   b   : packed panel, 4 values consumed per k step.  The 32 values
+ *         immediately before 'b' are overwritten with the solved tile,
+ *         4 values per tile row.
+ *   c   : the tile, four lines of 8 values, 'ldc' elements apart; rewritten
+ *         in place with the solved values.
+ *   bk  : number of k steps of a * b subtracted from the tile before solving.
+ *
+ * The solve is a back-substitution running from tile row 7 down to row 0.
+ * NOTE(review): diagonal entries are multiplied in rather than divided by,
+ * so the packed block presumably holds reciprocals of the diagonal --
+ * confirm against the routine that packs 'a'. */
+static void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
+    v2f64 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7;
+    v2f64 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15;
+    v2f64 res_c8, res_c9, res_c10, res_c11, res_c12, res_c13, res_c14, res_c15;
+    v2f64 src_a0, src_a1, src_a2, src_a3, src_a8, src_a9, src_a16, src_a17;
+    v2f64 src_a18, src_a24, src_a25, src_a26, src_a27, src_a32, src_a33;
+    v2f64 src_a34, src_a35, src_a36, src_a40, src_a41, src_a42, src_a43;
+    v2f64 src_a44, src_a45, src_a48, src_a49, src_a50, src_a51, src_a52;
+    v2f64 src_a53, src_a54, src_a56, src_a57, src_a58, src_a59, src_a60;
+    v2f64 src_a61, src_a62, src_a63;
+    FLOAT *c_nxt1line = c + ldc;
+    FLOAT *c_nxt2line = c + 2 * ldc;
+    FLOAT *c_nxt3line = c + 3 * ldc;
+    /* Load the tile: 8 consecutive values from each of the four lines of c. */
+    LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3);
+    LD_DP4(c_nxt1line, 2, src_c4, src_c5, src_c6, src_c7);
+    LD_DP4(c_nxt2line, 2, src_c8, src_c9, src_c10, src_c11);
+    LD_DP4(c_nxt3line, 2, src_c12, src_c13, src_c14, src_c15);
+    /* Subtract the bk-step product of the a and b panels from the tile.  The
+     * loads for step k + 1 are issued before the arithmetic of step k; the
+     * last step is handled after the loop. */
+    if (bk > 0)
+    {
+        BLASLONG i;
+        FLOAT *pba = a, *pbb = b;
+        v2f64 src_b, src_b0, src_b1, src_b2, src_b3;
+
+        LD_DP4(pba, 2, src_a0, src_a1, src_a2, src_a3);
+        LD_DP2(pbb, 2, src_b0, src_b1);
+
+        for (i = (bk - 1); i--;)
+        {
+            pba += 8;
+            pbb += 4;
+
+            LD_DP4(pba, 2, src_a8, src_a9, src_a16, src_a17);  /* next step's a values (temporaries reused) */
+            LD_DP2(pbb, 2, src_b2, src_b3);                    /* next step's b values */
+
+            src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);  /* low element of src_b0 in both lanes */
+            src_c0 -= src_a0 * src_b;
+            src_c1 -= src_a1 * src_b;
+            src_c2 -= src_a2 * src_b;
+            src_c3 -= src_a3 * src_b;
+
+            src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);  /* high element of src_b0 in both lanes */
+            src_c4 -= src_a0 * src_b;
+            src_c5 -= src_a1 * src_b;
+            src_c6 -= src_a2 * src_b;
+            src_c7 -= src_a3 * src_b;
+
+            src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
+            src_c8 -= src_a0 * src_b;
+            src_c9 -= src_a1 * src_b;
+            src_c10 -= src_a2 * src_b;
+            src_c11 -= src_a3 * src_b;
+
+            src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
+            src_c12 -= src_a0 * src_b;
+            src_c13 -= src_a1 * src_b;
+            src_c14 -= src_a2 * src_b;
+            src_c15 -= src_a3 * src_b;
+            /* Rotate the preloaded values into place for the next step. */
+            src_a0 = src_a8;
+            src_a1 = src_a9;
+            src_a2 = src_a16;
+            src_a3 = src_a17;
+            src_b0 = src_b2;
+            src_b1 = src_b3;
+        }
+        /* Last (or only) k step. */
+        src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
+        src_c0 -= src_a0 * src_b;
+        src_c1 -= src_a1 * src_b;
+        src_c2 -= src_a2 * src_b;
+        src_c3 -= src_a3 * src_b;
+
+        src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+        src_c4 -= src_a0 * src_b;
+        src_c5 -= src_a1 * src_b;
+        src_c6 -= src_a2 * src_b;
+        src_c7 -= src_a3 * src_b;
+
+        src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
+        src_c8 -= src_a0 * src_b;
+        src_c9 -= src_a1 * src_b;
+        src_c10 -= src_a2 * src_b;
+        src_c11 -= src_a3 * src_b;
+
+        src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
+        src_c12 -= src_a0 * src_b;
+        src_c13 -= src_a1 * src_b;
+        src_c14 -= src_a2 * src_b;
+        src_c15 -= src_a3 * src_b;
+    }
+    /* Step back to the 64-value block of 'a' and the 32-value block of 'b'
+     * that sit immediately before the pointers passed in. */
+    a -= 64;
+    b -= 32;
+    /* Regroup by tile row: res_c<i> pairs row i of lines 0 and 1, and
+     * res_c<8 + i> pairs row i of lines 2 and 3 (i = 0..7). */
+    res_c0 = (v2f64) __msa_ilvr_d((v2i64) src_c4, (v2i64) src_c0);
+    res_c1 = (v2f64) __msa_ilvl_d((v2i64) src_c4, (v2i64) src_c0);
+    res_c2 = (v2f64) __msa_ilvr_d((v2i64) src_c5, (v2i64) src_c1);
+    res_c3 = (v2f64) __msa_ilvl_d((v2i64) src_c5, (v2i64) src_c1);
+    res_c4 = (v2f64) __msa_ilvr_d((v2i64) src_c6, (v2i64) src_c2);
+    res_c5 = (v2f64) __msa_ilvl_d((v2i64) src_c6, (v2i64) src_c2);
+    res_c6 = (v2f64) __msa_ilvr_d((v2i64) src_c7, (v2i64) src_c3);
+    res_c7 = (v2f64) __msa_ilvl_d((v2i64) src_c7, (v2i64) src_c3);
+    res_c8 = (v2f64) __msa_ilvr_d((v2i64) src_c12, (v2i64) src_c8);
+    res_c9 = (v2f64) __msa_ilvl_d((v2i64) src_c12, (v2i64) src_c8);
+    res_c10 = (v2f64) __msa_ilvr_d((v2i64) src_c13, (v2i64) src_c9);
+    res_c11 = (v2f64) __msa_ilvl_d((v2i64) src_c13, (v2i64) src_c9);
+    res_c12 = (v2f64) __msa_ilvr_d((v2i64) src_c14, (v2i64) src_c10);
+    res_c13 = (v2f64) __msa_ilvl_d((v2i64) src_c14, (v2i64) src_c10);
+    res_c14 = (v2f64) __msa_ilvr_d((v2i64) src_c15, (v2i64) src_c11);
+    res_c15 = (v2f64) __msa_ilvl_d((v2i64) src_c15, (v2i64) src_c11);
+    /* Splat the block entries needed for rows 7..4: src_a<k> ends up holding
+     * a[k] in both lanes. */
+    src_a54 = __msa_cast_to_vector_double(*(a + 54));
+    src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0);
+    src_a62 = LD_DP(a + 62);
+    src_a63 = (v2f64) __msa_splati_d((v2i64) src_a62, 1);
+    src_a62 = (v2f64) __msa_splati_d((v2i64) src_a62, 0);
+    src_a60 = LD_DP(a + 60);
+    src_a61 = (v2f64) __msa_splati_d((v2i64) src_a60, 1);
+    src_a60 = (v2f64) __msa_splati_d((v2i64) src_a60, 0);
+    src_a52 = LD_DP(a + 52);
+    src_a53 = (v2f64) __msa_splati_d((v2i64) src_a52, 1);
+    src_a52 = (v2f64) __msa_splati_d((v2i64) src_a52, 0);
+    src_a44 = LD_DP(a + 44);
+    src_a45 = (v2f64) __msa_splati_d((v2i64) src_a44, 1);
+    src_a44 = (v2f64) __msa_splati_d((v2i64) src_a44, 0);
+    src_a36 = __msa_cast_to_vector_double(*(a + 36));
+    src_a36 = (v2f64) __msa_splati_d((v2i64) src_a36, 0);
+    /* Rows 7 and 6, for lines 0-1 (res_c7/res_c6) and lines 2-3
+     * (res_c15/res_c14). */
+    res_c7 *= src_a63;
+    res_c6 -= res_c7 * src_a62;
+    res_c6 *= src_a54;
+
+    res_c15 *= src_a63;
+    res_c14 -= res_c15 * src_a62;
+    res_c14 *= src_a54;
+    /* Store rows 6-7 to b (4 values per row), then regroup them per line and
+     * store to c + 6. */
+    ST_DP(res_c7, b + 28);
+    ST_DP(res_c6, b + 24);
+    ST_DP(res_c15, b + 30);
+    ST_DP(res_c14, b + 26);
+    src_c3 = (v2f64) __msa_ilvr_d((v2i64) res_c7, (v2i64) res_c6);
+    src_c7 = (v2f64) __msa_ilvl_d((v2i64) res_c7, (v2i64) res_c6);
+    src_c11 = (v2f64) __msa_ilvr_d((v2i64) res_c15, (v2i64) res_c14);
+    src_c15 = (v2f64) __msa_ilvl_d((v2i64) res_c15, (v2i64) res_c14);
+    ST_DP(src_c3, c + 6);
+    ST_DP(src_c7, c_nxt1line + 6);
+    ST_DP(src_c11, c_nxt2line + 6);
+    ST_DP(src_c15, c_nxt3line + 6);
+    /* Rows 5 and 4. */
+    res_c5 -= res_c7 * src_a61;
+    res_c5 -= res_c6 * src_a53;
+    res_c5 *= src_a45;
+
+    res_c4 -= res_c7 * src_a60;
+    res_c4 -= res_c6 * src_a52;
+    res_c4 -= res_c5 * src_a44;
+    res_c4 *= src_a36;
+
+    res_c13 -= res_c15 * src_a61;
+    res_c13 -= res_c14 * src_a53;
+    res_c13 *= src_a45;
+
+    res_c12 -= res_c15 * src_a60;
+    res_c12 -= res_c14 * src_a52;
+    res_c12 -= res_c13 * src_a44;
+    res_c12 *= src_a36;
+
+    src_a56 = LD_DP(a + 56);
+    src_a57 = (v2f64) __msa_splati_d((v2i64) src_a56, 1);
+    src_a56 = (v2f64) __msa_splati_d((v2i64) src_a56, 0);
+    src_a58 = LD_DP(a + 58);
+    src_a59 = (v2f64) __msa_splati_d((v2i64) src_a58, 1);
+    src_a58 = (v2f64) __msa_splati_d((v2i64) src_a58, 0);
+
+    ST_DP(res_c4, b + 16);
+    ST_DP(res_c5, b + 20);
+    ST_DP(res_c12, b + 18);
+    ST_DP(res_c13, b + 22);
+
+    src_c2 = (v2f64) __msa_ilvr_d((v2i64) res_c5, (v2i64) res_c4);
+    src_c6 = (v2f64) __msa_ilvl_d((v2i64) res_c5, (v2i64) res_c4);
+    src_c10 = (v2f64) __msa_ilvr_d((v2i64) res_c13, (v2i64) res_c12);
+    src_c14 = (v2f64) __msa_ilvl_d((v2i64) res_c13, (v2i64) res_c12);
+    ST_DP(src_c2, c + 4);
+    ST_DP(src_c6, c_nxt1line + 4);
+    ST_DP(src_c10, c_nxt2line + 4);
+    ST_DP(src_c14, c_nxt3line + 4);
+
+    src_a50 = LD_DP(a + 50);
+    src_a51 = (v2f64) __msa_splati_d((v2i64) src_a50, 1);
+    src_a50 = (v2f64) __msa_splati_d((v2i64) src_a50, 0);
+    src_a42 = LD_DP(a + 42);
+    src_a43 = (v2f64) __msa_splati_d((v2i64) src_a42, 1);
+    src_a42 = (v2f64) __msa_splati_d((v2i64) src_a42, 0);
+    src_a34 = LD_DP(a + 34);
+    src_a35 = (v2f64) __msa_splati_d((v2i64) src_a34, 1);
+    src_a34 = (v2f64) __msa_splati_d((v2i64) src_a34, 0);
+    src_a26 = LD_DP(a + 26);
+    src_a27 = (v2f64) __msa_splati_d((v2i64) src_a26, 1);
+    src_a26 = (v2f64) __msa_splati_d((v2i64) src_a26, 0);
+    src_a18 = __msa_cast_to_vector_double(*(a + 18));
+    src_a18 = (v2f64) __msa_splati_d((v2i64) src_a18, 0);
+    /* Remove the row-7 contribution from rows 3..0 first, then finish rows 3
+     * and 2. */
+    res_c3 -= res_c7 * src_a59;
+    res_c2 -= res_c7 * src_a58;
+    res_c1 -= res_c7 * src_a57;
+    res_c0 -= res_c7 * src_a56;
+
+    res_c11 -= res_c15 * src_a59;
+    res_c10 -= res_c15 * src_a58;
+    res_c9 -= res_c15 * src_a57;
+    res_c8 -= res_c15 * src_a56;
+
+    res_c3 -= res_c6 * src_a51;
+    res_c3 -= res_c5 * src_a43;
+    res_c3 -= res_c4 * src_a35;
+    res_c3 *= src_a27;
+
+    res_c2 -= res_c6 * src_a50;
+    res_c2 -= res_c5 * src_a42;
+    res_c2 -= res_c4 * src_a34;
+    res_c2 -= res_c3 * src_a26;
+    res_c2 *= src_a18;
+
+    res_c11 -= res_c14 * src_a51;
+    res_c11 -= res_c13 * src_a43;
+    res_c11 -= res_c12 * src_a35;
+    res_c11 *= src_a27;
+
+    res_c10 -= res_c14 * src_a50;
+    res_c10 -= res_c13 * src_a42;
+    res_c10 -= res_c12 * src_a34;
+    res_c10 -= res_c11 * src_a26;
+    res_c10 *= src_a18;
+
+    src_a48 = LD_DP(a + 48);
+    src_a49 = (v2f64) __msa_splati_d((v2i64) src_a48, 1);
+    src_a48 = (v2f64) __msa_splati_d((v2i64) src_a48, 0);
+    src_a40 = LD_DP(a + 40);
+    src_a41 = (v2f64) __msa_splati_d((v2i64) src_a40, 1);
+    src_a40 = (v2f64) __msa_splati_d((v2i64) src_a40, 0);
+
+    ST_DP(res_c2, b + 8);
+    ST_DP(res_c3, b + 12);
+    ST_DP(res_c10, b + 10);
+    ST_DP(res_c11, b + 14);
+
+    src_a32 = LD_DP(a + 32);
+    src_a33 = (v2f64) __msa_splati_d((v2i64) src_a32, 1);
+    src_a32 = (v2f64) __msa_splati_d((v2i64) src_a32, 0);
+    src_a24 = LD_DP(a + 24);
+    src_a25 = (v2f64) __msa_splati_d((v2i64) src_a24, 1);
+    src_a24 = (v2f64) __msa_splati_d((v2i64) src_a24, 0);
+
+    src_c1 = (v2f64) __msa_ilvr_d((v2i64) res_c3, (v2i64) res_c2);
+    src_c5 = (v2f64) __msa_ilvl_d((v2i64) res_c3, (v2i64) res_c2);
+    src_c9 = (v2f64) __msa_ilvr_d((v2i64) res_c11, (v2i64) res_c10);
+    src_c13 = (v2f64) __msa_ilvl_d((v2i64) res_c11, (v2i64) res_c10);
+    ST_DP(src_c1, c + 2);
+    ST_DP(src_c5, c_nxt1line + 2);
+    ST_DP(src_c9, c_nxt2line + 2);
+    ST_DP(src_c13, c_nxt3line + 2);
+    /* Rows 1 and 0: remove the contributions of rows 6..3, then row 2, then
+     * scale by the diagonal entries a[9] and a[0]. */
+    res_c1 -= res_c6 * src_a49;
+    res_c1 -= res_c5 * src_a41;
+    res_c1 -= res_c4 * src_a33;
+    res_c1 -= res_c3 * src_a25;
+
+    res_c0 -= res_c6 * src_a48;
+    res_c0 -= res_c5 * src_a40;
+    res_c0 -= res_c4 * src_a32;
+    res_c0 -= res_c3 * src_a24;
+
+    res_c9 -= res_c14 * src_a49;
+    res_c9 -= res_c13 * src_a41;
+    res_c9 -= res_c12 * src_a33;
+    res_c9 -= res_c11 * src_a25;
+
+    res_c8 -= res_c14 * src_a48;
+    res_c8 -= res_c13 * src_a40;
+    res_c8 -= res_c12 * src_a32;
+    res_c8 -= res_c11 * src_a24;
+
+    src_a16 = LD_DP(a + 16);
+    src_a17 = (v2f64) __msa_splati_d((v2i64) src_a16, 1);
+    src_a16 = (v2f64) __msa_splati_d((v2i64) src_a16, 0);
+    src_a8 = LD_DP(a + 8);
+    src_a9 = (v2f64) __msa_splati_d((v2i64) src_a8, 1);
+    src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0);
+    src_a0 = __msa_cast_to_vector_double(*(a + 0));
+    src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0);
+
+    res_c1 -= res_c2 * src_a17;
+    res_c1 *= src_a9;
+
+    res_c9 -= res_c10 * src_a17;
+    res_c9 *= src_a9;
+
+    res_c0 -= res_c2 * src_a16;
+    res_c0 -= res_c1 * src_a8;
+    res_c0 *= src_a0;
+
+    res_c8 -= res_c10 * src_a16;
+    res_c8 -= res_c9 * src_a8;
+    res_c8 *= src_a0;
+
+    ST_DP(res_c0, b + 0);
+    ST_DP(res_c8, b + 2);
+    ST_DP(res_c1, b + 4);
+    ST_DP(res_c9, b + 6);
+
+    src_c0 = (v2f64) __msa_ilvr_d((v2i64) res_c1, (v2i64) res_c0);
+    src_c4 = (v2f64) __msa_ilvl_d((v2i64) res_c1, (v2i64) res_c0);
+    src_c8 = (v2f64) __msa_ilvr_d((v2i64) res_c9, (v2i64) res_c8);
+    src_c12 = (v2f64) __msa_ilvl_d((v2i64) res_c9, (v2i64) res_c8);
+
+    ST_DP(src_c0, c);
+    ST_DP(src_c4, c_nxt1line);
+    ST_DP(src_c8, c_nxt2line);
+    ST_DP(src_c12, c_nxt3line);
+}
+
+static void dsolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
+    v2f64 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7;
+    v2f64 src_a0, src_a1, src_a2, src_a3, src_a8, src_a9, src_a16, src_a17;
+    v2f64 src_a18, src_a24, src_a25, src_a26, src_a27, src_a32, src_a33;
+    v2f64 src_a34, src_a35, src_a36, src_a40, src_a41, src_a42, src_a43;
+    v2f64 src_a44, src_a45, src_a48, src_a49, src_a50, src_a51, src_a52;
+    v2f64 src_a53, src_a54, src_a56, src_a57, src_a58, src_a59, src_a60;
+    v2f64 src_a61, src_a62, src_a63;
+
+    LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3);
+    LD_DP4(c + ldc, 2, src_c4, src_c5, src_c6, src_c7);
+
+    if (bk > 0)
+    {
+        BLASLONG i;
+        FLOAT *pba = a, *pbb = b;
+        v2f64 src_b, src_b0, src_b1;
+
+        LD_DP4(pba, 2, src_a0, src_a1, src_a2, src_a3);
+        src_b0 = LD_DP(pbb);
+
+        for (i = bk - 1; i--;)
+        {
+            pba += 8;
+            pbb += 2;
+
+            LD_DP4(pba, 2, src_a8, src_a9, src_a16, src_a17);
+
src_b1 = LD_DP(pbb); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + src_c2 -= src_a2 * src_b; + src_c3 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c4 -= src_a0 * src_b; + src_c5 -= src_a1 * src_b; + src_c6 -= src_a2 * src_b; + src_c7 -= src_a3 * src_b; + + src_a0 = src_a8; + src_a1 = src_a9; + src_a2 = src_a16; + src_a3 = src_a17; + src_b0 = src_b1; + } + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + src_c2 -= src_a2 * src_b; + src_c3 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c4 -= src_a0 * src_b; + src_c5 -= src_a1 * src_b; + src_c6 -= src_a2 * src_b; + src_c7 -= src_a3 * src_b; + } + + res_c0 = (v2f64) __msa_ilvr_d((v2i64) src_c4, (v2i64) src_c0); + res_c1 = (v2f64) __msa_ilvl_d((v2i64) src_c4, (v2i64) src_c0); + res_c2 = (v2f64) __msa_ilvr_d((v2i64) src_c5, (v2i64) src_c1); + res_c3 = (v2f64) __msa_ilvl_d((v2i64) src_c5, (v2i64) src_c1); + res_c4 = (v2f64) __msa_ilvr_d((v2i64) src_c6, (v2i64) src_c2); + res_c5 = (v2f64) __msa_ilvl_d((v2i64) src_c6, (v2i64) src_c2); + res_c6 = (v2f64) __msa_ilvr_d((v2i64) src_c7, (v2i64) src_c3); + res_c7 = (v2f64) __msa_ilvl_d((v2i64) src_c7, (v2i64) src_c3); + + src_a56 = LD_DP(a - 8); + src_a57 = (v2f64) __msa_splati_d((v2i64) src_a56, 1); + src_a56 = (v2f64) __msa_splati_d((v2i64) src_a56, 0); + src_a58 = LD_DP(a - 6); + src_a59 = (v2f64) __msa_splati_d((v2i64) src_a58, 1); + src_a58 = (v2f64) __msa_splati_d((v2i64) src_a58, 0); + src_a60 = LD_DP(a - 4); + src_a61 = (v2f64) __msa_splati_d((v2i64) src_a60, 1); + src_a60 = (v2f64) __msa_splati_d((v2i64) src_a60, 0); + src_a62 = LD_DP(a - 2); + src_a63 = (v2f64) __msa_splati_d((v2i64) src_a62, 1); + src_a62 = (v2f64) __msa_splati_d((v2i64) src_a62, 0); + + res_c7 *= src_a63; + res_c6 -= res_c7 * src_a62; + res_c5 -= res_c7 * src_a61; 
+ res_c4 -= res_c7 * src_a60; + res_c3 -= res_c7 * src_a59; + res_c2 -= res_c7 * src_a58; + res_c1 -= res_c7 * src_a57; + res_c0 -= res_c7 * src_a56; + + src_a48 = LD_DP(a - 16); + src_a49 = (v2f64) __msa_splati_d((v2i64) src_a48, 1); + src_a48 = (v2f64) __msa_splati_d((v2i64) src_a48, 0); + src_a50 = LD_DP(a - 14); + src_a51 = (v2f64) __msa_splati_d((v2i64) src_a50, 1); + src_a50 = (v2f64) __msa_splati_d((v2i64) src_a50, 0); + src_a52 = LD_DP(a - 12); + src_a53 = (v2f64) __msa_splati_d((v2i64) src_a52, 1); + src_a52 = (v2f64) __msa_splati_d((v2i64) src_a52, 0); + src_a54 = __msa_cast_to_vector_double(*(a - 10)); + src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); + + src_a40 = LD_DP(a - 24); + src_a41 = (v2f64) __msa_splati_d((v2i64) src_a40, 1); + src_a40 = (v2f64) __msa_splati_d((v2i64) src_a40, 0); + src_a42 = LD_DP(a - 22); + src_a43 = (v2f64) __msa_splati_d((v2i64) src_a42, 1); + src_a42 = (v2f64) __msa_splati_d((v2i64) src_a42, 0); + src_a44 = LD_DP(a - 20); + src_a45 = (v2f64) __msa_splati_d((v2i64) src_a44, 1); + src_a44 = (v2f64) __msa_splati_d((v2i64) src_a44, 0); + + res_c6 *= src_a54; + res_c5 -= res_c6 * src_a53; + res_c4 -= res_c6 * src_a52; + res_c3 -= res_c6 * src_a51; + res_c2 -= res_c6 * src_a50; + res_c1 -= res_c6 * src_a49; + res_c0 -= res_c6 * src_a48; + + res_c5 *= src_a45; + res_c4 -= res_c5 * src_a44; + res_c3 -= res_c5 * src_a43; + res_c2 -= res_c5 * src_a42; + res_c1 -= res_c5 * src_a41; + res_c0 -= res_c5 * src_a40; + + ST_DP(res_c7, b - 2); + ST_DP(res_c6, b - 4); + ST_DP(res_c5, b - 6); + + src_a32 = LD_DP(a - 32); + src_a33 = (v2f64) __msa_splati_d((v2i64) src_a32, 1); + src_a32 = (v2f64) __msa_splati_d((v2i64) src_a32, 0); + src_a34 = LD_DP(a - 30); + src_a35 = (v2f64) __msa_splati_d((v2i64) src_a34, 1); + src_a34 = (v2f64) __msa_splati_d((v2i64) src_a34, 0); + src_a36 = __msa_cast_to_vector_double(*(a - 28)); + src_a36 = (v2f64) __msa_splati_d((v2i64) src_a36, 0); + + res_c4 *= src_a36; + res_c3 -= res_c4 * src_a35; + res_c2 -= 
res_c4 * src_a34;
+    res_c1 -= res_c4 * src_a33;
+    res_c0 -= res_c4 * src_a32;
+
+    src_a24 = LD_DP(a - 40);
+    src_a25 = (v2f64) __msa_splati_d((v2i64) src_a24, 1);
+    src_a24 = (v2f64) __msa_splati_d((v2i64) src_a24, 0);
+    src_a26 = LD_DP(a - 38);
+    src_a27 = (v2f64) __msa_splati_d((v2i64) src_a26, 1);
+    src_a26 = (v2f64) __msa_splati_d((v2i64) src_a26, 0);
+    src_a16 = LD_DP(a - 48);
+    src_a17 = (v2f64) __msa_splati_d((v2i64) src_a16, 1);
+    src_a16 = (v2f64) __msa_splati_d((v2i64) src_a16, 0);
+    src_a18 = __msa_cast_to_vector_double(*(a - 46));
+    src_a18 = (v2f64) __msa_splati_d((v2i64) src_a18, 0);
+    src_a0 = __msa_cast_to_vector_double(*(a - 64));
+    src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0);
+    src_a8 = LD_DP(a - 56);
+    src_a9 = (v2f64) __msa_splati_d((v2i64) src_a8, 1);
+    src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0);
+
+    res_c3 *= src_a27;
+    res_c2 -= res_c3 * src_a26;
+    res_c1 -= res_c3 * src_a25;
+    res_c0 -= res_c3 * src_a24;
+
+    res_c2 *= src_a18;
+    res_c1 -= res_c2 * src_a17;
+    res_c0 -= res_c2 * src_a16;
+
+    res_c1 *= src_a9;
+    res_c0 -= res_c1 * src_a8;
+
+    res_c0 *= src_a0;
+
+    ST_DP(res_c4, b - 8);
+    ST_DP(res_c3, b - 10);
+    ST_DP(res_c2, b - 12);
+    ST_DP(res_c1, b - 14);
+    ST_DP(res_c0, b - 16);
+
+    src_c0 = (v2f64) __msa_ilvr_d((v2i64) res_c1, (v2i64) res_c0);
+    src_c1 = (v2f64) __msa_ilvr_d((v2i64) res_c3, (v2i64) res_c2);
+    src_c2 = (v2f64) __msa_ilvr_d((v2i64) res_c5, (v2i64) res_c4);
+    src_c3 = (v2f64) __msa_ilvr_d((v2i64) res_c7, (v2i64) res_c6);
+    src_c4 = (v2f64) __msa_ilvl_d((v2i64) res_c1, (v2i64) res_c0);
+    src_c5 = (v2f64) __msa_ilvl_d((v2i64) res_c3, (v2i64) res_c2);
+    src_c6 = (v2f64) __msa_ilvl_d((v2i64) res_c5, (v2i64) res_c4);
+    src_c7 = (v2f64) __msa_ilvl_d((v2i64) res_c7, (v2i64) res_c6);
+
+    ST_DP4(src_c0, src_c1, src_c2, src_c3, c, 2);
+    ST_DP4(src_c4, src_c5, src_c6, src_c7, c + ldc, 2);
+}
+
+/*
+ * Solve one 8x1 tile (scalar code, one column of C).
+ *
+ * a  - on entry, start of the bk trailing 8-element A slices used for the
+ *      update; the 8x8 triangle used for the solve is read from a - 64.
+ * b  - on entry, start of the bk trailing B values; the eight solved values
+ *      are written to b - 8 .. b - 1.
+ * c  - eight consecutive elements of C, updated in place.
+ * bk - number of update terms; nothing is subtracted when bk <= 0.
+ *
+ * Step 1: c[r] -= sum over i < bk of a[8 * i + r] * b[i].
+ * Step 2: back-substitution from row 7 up to row 0 using entries r * 8 + col
+ *         (col <= r) of the triangle.  Diagonal entries are multiplied in,
+ *         not divided by.
+ * NOTE(review): presumably the packing routine stores the diagonal already
+ * inverted - confirm against the TRSM copy kernels.
+ */
+static void dsolve_8x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
+{
+    FLOAT a0, a8, a9, a16, a17, a18, a24, a25, a26, a27, a32, a33, a34, a35;
+    FLOAT a36, a40, a41, a42, a43, a44, a45, a48, a49, a50, a51, a52, a53;
+    FLOAT a54, a56, a57, a58, a59, a60, a61, a62, a63;
+    FLOAT c0, c1, c2, c3, c4, c5, c6, c7;
+
+    c0 = *(c + 0);
+    c1 = *(c + 1);
+    c2 = *(c + 2);
+    c3 = *(c + 3);
+    c4 = *(c + 4);
+    c5 = *(c + 5);
+    c6 = *(c + 6);
+    c7 = *(c + 7);
+
+    if (bk > 0)
+    {
+        /* BLASLONG, like bk and like the sibling kernels: an int counter
+           would truncate a large bk. */
+        BLASLONG i;
+        FLOAT *aa = a, *bb = b;
+        FLOAT b0;
+
+        for (i = bk; i--;)
+        {
+            b0 = bb[0];
+
+            c0 -= aa[0] * b0;
+            c1 -= aa[1] * b0;
+            c2 -= aa[2] * b0;
+            c3 -= aa[3] * b0;
+            c4 -= aa[4] * b0;
+            c5 -= aa[5] * b0;
+            c6 -= aa[6] * b0;
+            c7 -= aa[7] * b0;
+
+            aa += 8;
+            bb += 1;
+        }
+    }
+
+    /* Step back onto the 8x8 triangle and the eight B slots being solved. */
+    a -= 64;
+    b -= 8;
+
+    a0 = *(a + 0);
+    a8 = *(a + 8);
+    a9 = *(a + 9);
+    a16 = *(a + 16);
+    a17 = *(a + 17);
+    a18 = *(a + 18);
+    a24 = *(a + 24);
+    a25 = *(a + 25);
+    a26 = *(a + 26);
+    a27 = *(a + 27);
+    a32 = *(a + 32);
+    a33 = *(a + 33);
+    a34 = *(a + 34);
+    a35 = *(a + 35);
+    a36 = *(a + 36);
+    a40 = *(a + 40);
+    a41 = *(a + 41);
+    a42 = *(a + 42);
+    a43 = *(a + 43);
+    a44 = *(a + 44);
+    a45 = *(a + 45);
+    a48 = *(a + 48);
+    a49 = *(a + 49);
+    a50 = *(a + 50);
+    a51 = *(a + 51);
+    a52 = *(a + 52);
+    a53 = *(a + 53);
+    a54 = *(a + 54);
+    a56 = *(a + 56);
+    a57 = *(a + 57);
+    a58 = *(a + 58);
+    a59 = *(a + 59);
+    a60 = *(a + 60);
+    a61 = *(a + 61);
+    a62 = *(a + 62);
+    a63 = *(a + 63);
+
+    /* Back-substitution, last row first. */
+    c7 *= a63;
+
+    c6 -= c7 * a62;
+    c6 *= a54;
+
+    c5 -= c7 * a61;
+    c5 -= c6 * a53;
+    c5 *= a45;
+
+    c4 -= c7 * a60;
+    c4 -= c6 * a52;
+    c4 -= c5 * a44;
+    c4 *= a36;
+
+    c3 -= c7 * a59;
+    c3 -= c6 * a51;
+    c3 -= c5 * a43;
+    c3 -= c4 * a35;
+    c3 *= a27;
+
+    c2 -= c7 * a58;
+    c2 -= c6 * a50;
+    c2 -= c5 * a42;
+    c2 -= c4 * a34;
+    c2 -= c3 * a26;
+    c2 *= a18;
+
+    c1 -= c7 * a57;
+    c1 -= c6 * a49;
+    c1 -= c5 * a41;
+    c1 -= c4 * a33;
+    c1 -= c3 * a25;
+    c1 -= c2 * a17;
+    c1 *= a9;
+
+    c0 -= c7 * a56;
+    c0 -= c6 * a48;
+    c0 -= c5 * a40;
+    c0 -= c4 * a32;
+    c0 -= c3 * a24;
+    c0 -= c2 * a16;
+    c0 -= c1 * a8;
+    c0 *= a0;
+
+    /* The solution is stored twice: into the packed B panel and into C. */
+    *(b + 7) = c7;
+    *(b + 6) = c6;
+    *(b + 5) = c5;
+    *(b + 4) = c4;
+    *(b + 3) = c3;
+    *(b + 2) = c2;
+    *(b + 1) = c1;
+    *(b + 0) = c0;
+
+    *(c + 7) = c7;
+    *(c + 6) = c6;
+    *(c + 5) = c5;
+    *(c + 4) = c4;
+    *(c + 3) = c3;
+    *(c + 2) = c2;
+    *(c + 1) = c1;
+    *(c + 0) = c0;
+}
+
+static void dsolve_4x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
+    v2f64 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7;
+    v2f64 src_a0, src_a4, src_a5, src_a8, src_a9, src_a10, src_a12, src_a13;
+    v2f64 src_a14, src_a15;
+
+    LD_DP2(c, 2, src_c0, src_c1);
+    LD_DP2(c + ldc, 2, src_c2, src_c3);
+    LD_DP2(c + 2 * ldc, 2, src_c4, src_c5);
+    LD_DP2(c + 3 * ldc, 2, src_c6, src_c7);
+
+    if (bk > 0)
+    {
+        BLASLONG i;
+        FLOAT *aa = a + 16, *bb = b + 16;
+        v2f64 src_a0, src_a1, src_b, src_b0, src_b1;
+
+        for (i = bk; i--;)
+        {
+            LD_DP2(aa, 2, src_a0, src_a1);
+            LD_DP2(bb, 2, src_b0, src_b1);
+
+            src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
+            src_c0 -= src_a0 * src_b;
+            src_c1 -= src_a1 * src_b;
+
+            src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+            src_c2 -= src_a0 * src_b;
+            src_c3 -= src_a1 * src_b;
+
+            src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
+            src_c4 -= src_a0 * src_b;
+            src_c5 -= src_a1 * src_b;
+
+            src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
+            src_c6 -= src_a0 * src_b;
+            src_c7 -= src_a1 * src_b;
+
+            aa += 4;
+            bb += 4;
+        }
+    }
+
+    res_c0 = (v2f64) __msa_ilvr_d((v2i64) src_c2, (v2i64) src_c0);
+    res_c1 = (v2f64) __msa_ilvl_d((v2i64) src_c2, (v2i64) src_c0);
+    res_c2 = (v2f64) __msa_ilvr_d((v2i64) src_c3, (v2i64) src_c1);
+    res_c3 = (v2f64) __msa_ilvl_d((v2i64) src_c3, (v2i64) src_c1);
+    res_c4 = (v2f64) __msa_ilvr_d((v2i64) src_c6, (v2i64) src_c4);
+    res_c5 = (v2f64) __msa_ilvl_d((v2i64) src_c6, (v2i64) src_c4);
+    res_c6 = (v2f64) 
__msa_ilvr_d((v2i64) src_c7, (v2i64) src_c5); + res_c7 = (v2f64) __msa_ilvl_d((v2i64) src_c7, (v2i64) src_c5); + + src_a14 = LD_DP(a + 14); + src_a15 = (v2f64) __msa_splati_d((v2i64) src_a14, 1); + src_a14 = (v2f64) __msa_splati_d((v2i64) src_a14, 0); + + src_a12 = LD_DP(a + 12); + src_a13 = (v2f64) __msa_splati_d((v2i64) src_a12, 1); + src_a12 = (v2f64) __msa_splati_d((v2i64) src_a12, 0); + + src_a9 = LD_DP(a + 9); + src_a10 = (v2f64) __msa_splati_d((v2i64) src_a9, 1); + src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); + + src_a8 = __msa_cast_to_vector_double(*(a + 8)); + src_a0 = __msa_cast_to_vector_double(*(a + 0)); + + src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); + src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); + + src_a4 = LD_DP(a + 4); + src_a5 = (v2f64) __msa_splati_d((v2i64) src_a4, 1); + src_a4 = (v2f64) __msa_splati_d((v2i64) src_a4, 0); + + res_c3 *= src_a15; + res_c7 *= src_a15; + + res_c2 -= res_c3 * src_a14; + res_c6 -= res_c7 * src_a14; + res_c2 *= src_a10; + res_c6 *= src_a10; + + res_c1 -= res_c3 * src_a13; + res_c5 -= res_c7 * src_a13; + res_c1 -= res_c2 * src_a9; + res_c5 -= res_c6 * src_a9; + res_c1 *= src_a5; + res_c5 *= src_a5; + + res_c0 -= res_c3 * src_a12; + res_c4 -= res_c7 * src_a12; + res_c0 -= res_c2 * src_a8; + res_c4 -= res_c6 * src_a8; + res_c0 -= res_c1 * src_a4; + res_c4 -= res_c5 * src_a4; + res_c0 *= src_a0; + res_c4 *= src_a0; + + ST_DP(res_c7, b + 14); + ST_DP(res_c3, b + 12); + ST_DP(res_c6, b + 10); + ST_DP(res_c2, b + 8); + ST_DP(res_c5, b + 6); + ST_DP(res_c1, b + 4); + ST_DP(res_c4, b + 2); + ST_DP(res_c0, b + 0); + + src_c0 = (v2f64) __msa_ilvr_d((v2i64) res_c1, (v2i64) res_c0); + src_c1 = (v2f64) __msa_ilvr_d((v2i64) res_c3, (v2i64) res_c2); + src_c2 = (v2f64) __msa_ilvl_d((v2i64) res_c1, (v2i64) res_c0); + src_c3 = (v2f64) __msa_ilvl_d((v2i64) res_c3, (v2i64) res_c2); + src_c4 = (v2f64) __msa_ilvr_d((v2i64) res_c5, (v2i64) res_c4); + src_c5 = (v2f64) __msa_ilvr_d((v2i64) res_c7, (v2i64) res_c6); + src_c6 
= (v2f64) __msa_ilvl_d((v2i64) res_c5, (v2i64) res_c4);
+    src_c7 = (v2f64) __msa_ilvl_d((v2i64) res_c7, (v2i64) res_c6);
+
+    ST_DP2(src_c0, src_c1, c, 2);
+    ST_DP2(src_c2, src_c3, c + ldc, 2);
+    ST_DP2(src_c4, src_c5, c + 2 * ldc, 2);
+    ST_DP2(src_c6, src_c7, c + 3 * ldc, 2);
+}
+
+/*
+ * Solve one 4x2 tile with MSA vectors (two columns of C, ldc apart).
+ *
+ * a   - 4x4 triangle read at indices 0, 4, 5, 8, 9, 10, 12..15; the bk
+ *       update slices (4 values each) start at a + 16.
+ * b   - receives the solved values at b[0..7]; the bk update pairs start
+ *       at b + 8.
+ * c   - four elements at c and four at c + ldc, updated in place.
+ * bk  - number of update terms; nothing is subtracted when bk <= 0.
+ *
+ * Local names tRC hold the splatted triangle entry at index R * 4 + C.
+ * Diagonal entries are multiplied in, not divided by.
+ * NOTE(review): presumably the diagonal is stored pre-inverted by the
+ * packing routine - confirm.
+ */
+static void dsolve_4x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    v2f64 c0_lo, c0_hi, c1_lo, c1_hi;
+    v2f64 x0, x1, x2, x3;
+    v2f64 t00, t10, t11, t20, t21, t22, t30, t31, t32, t33;
+
+    LD_DP2(c, 2, c0_lo, c0_hi);
+    LD_DP2(c + ldc, 2, c1_lo, c1_hi);
+
+    if (bk > 0)
+    {
+        BLASLONG left = bk;
+        FLOAT *pa = a + 16, *pb = b + 8;
+        v2f64 a_lo, a_hi, b_pair, b_dup;
+
+        while (left--)
+        {
+            LD_DP2(pa, 2, a_lo, a_hi);
+            b_pair = LD_DP(pb);
+
+            /* First B value duplicated into both lanes: first column. */
+            b_dup = (v2f64) __msa_ilvr_d((v2i64) b_pair, (v2i64) b_pair);
+            c0_lo -= a_lo * b_dup;
+            c0_hi -= a_hi * b_dup;
+
+            /* Second B value duplicated: column at c + ldc. */
+            b_dup = (v2f64) __msa_ilvl_d((v2i64) b_pair, (v2i64) b_pair);
+            c1_lo -= a_lo * b_dup;
+            c1_hi -= a_hi * b_dup;
+
+            pa += 4;
+            pb += 2;
+        }
+    }
+
+    /* Regroup the two column vectors pairwise before the solve. */
+    x0 = (v2f64) __msa_ilvr_d((v2i64) c1_lo, (v2i64) c0_lo);
+    x1 = (v2f64) __msa_ilvl_d((v2i64) c1_lo, (v2i64) c0_lo);
+    x2 = (v2f64) __msa_ilvr_d((v2i64) c1_hi, (v2i64) c0_hi);
+    x3 = (v2f64) __msa_ilvl_d((v2i64) c1_hi, (v2i64) c0_hi);
+
+    t00 = __msa_cast_to_vector_double(*(a + 0));
+    t00 = (v2f64) __msa_splati_d((v2i64) t00, 0);
+
+    t10 = LD_DP(a + 4);
+    t11 = (v2f64) __msa_splati_d((v2i64) t10, 1);
+    t10 = (v2f64) __msa_splati_d((v2i64) t10, 0);
+
+    t20 = __msa_cast_to_vector_double(*(a + 8));
+    t20 = (v2f64) __msa_splati_d((v2i64) t20, 0);
+
+    t21 = LD_DP(a + 9);
+    t22 = (v2f64) __msa_splati_d((v2i64) t21, 1);
+    t21 = (v2f64) __msa_splati_d((v2i64) t21, 0);
+
+    t30 = LD_DP(a + 12);
+    t31 = (v2f64) __msa_splati_d((v2i64) t30, 1);
+    t30 = (v2f64) __msa_splati_d((v2i64) t30, 0);
+
+    t32 = LD_DP(a + 14);
+    t33 = (v2f64) __msa_splati_d((v2i64) t32, 1);
+    t32 = (v2f64) __msa_splati_d((v2i64) t32, 0);
+
+    /* Back-substitution, last row first. */
+    x3 *= t33;
+
+    x2 -= x3 * t32;
+    x2 *= t22;
+
+    x1 -= x3 * t31;
+    x1 -= x2 * t21;
+    x1 *= t11;
+
+    x0 -= x3 * t30;
+    x0 -= x2 * t20;
+    x0 -= x1 * t10;
+    x0 *= t00;
+
+    ST_DP(x3, b + 6);
+    ST_DP(x2, b + 4);
+    ST_DP(x1, b + 2);
+    ST_DP(x0, b + 0);
+
+    /* Regroup back into per-column vectors for the stores into C. */
+    c0_lo = (v2f64) __msa_ilvr_d((v2i64) x1, (v2i64) x0);
+    c0_hi = (v2f64) __msa_ilvr_d((v2i64) x3, (v2i64) x2);
+    c1_lo = (v2f64) __msa_ilvl_d((v2i64) x1, (v2i64) x0);
+    c1_hi = (v2f64) __msa_ilvl_d((v2i64) x3, (v2i64) x2);
+
+    ST_DP2(c0_lo, c0_hi, c, 2);
+    ST_DP2(c1_lo, c1_hi, c + ldc, 2);
+}
+
+/*
+ * Solve one 4x1 tile (scalar code, one column of C).
+ *
+ * a  - 4x4 triangle read at indices 0, 4, 5, 8, 9, 10, 12..15; the bk
+ *      update slices (4 values each) start at a + 16.
+ * b  - receives the solved values at b[0..3]; the bk update values start
+ *      at b + 4.
+ * c  - four consecutive elements of C, updated in place.
+ * bk - number of update terms; nothing is subtracted when bk <= 0.
+ */
+static void dsolve_4x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
+{
+    FLOAT t00, t10, t11, t20, t21, t22, t30, t31, t32, t33;
+    FLOAT x0, x1, x2, x3;
+
+    x0 = c[0];
+    x1 = c[1];
+    x2 = c[2];
+    x3 = c[3];
+
+    if (bk > 0)
+    {
+        BLASLONG left = bk;
+        FLOAT *pa = a + 16, *pb = b + 4;
+        FLOAT bv;
+
+        while (left--)
+        {
+            bv = pb[0];
+
+            x0 -= pa[0] * bv;
+            x1 -= pa[1] * bv;
+            x2 -= pa[2] * bv;
+            x3 -= pa[3] * bv;
+
+            pa += 4;
+            pb += 1;
+        }
+    }
+
+    t00 = a[0];
+    t10 = a[4];
+    t11 = a[5];
+    t20 = a[8];
+    t21 = a[9];
+    t22 = a[10];
+    t30 = a[12];
+    t31 = a[13];
+    t32 = a[14];
+    t33 = a[15];
+
+    /* Back-substitution, last row first; diagonal entries are multiplied. */
+    x3 *= t33;
+
+    x2 -= x3 * t32;
+    x2 *= t22;
+
+    x1 -= x3 * t31;
+    x1 -= x2 * t21;
+    x1 *= t11;
+
+    x0 -= x3 * t30;
+    x0 -= x2 * t20;
+    x0 -= x1 * t10;
+    x0 *= t00;
+
+    b[0] = x0;
+    b[1] = x1;
+    b[2] = x2;
+    b[3] = x3;
+
+    c[0] = x0;
+    c[1] = x1;
+    c[2] = x2;
+    c[3] = x3;
+}
+
+/*
+ * Solve one 2x4 tile (scalar code, four columns of C, ldc apart).
+ *
+ * a   - 2x2 triangle read at a[0], a[2], a[3]; the bk update pairs start
+ *       at a + 4.
+ * b   - receives the solved values: b[0..3] take element 0 of each column,
+ *       b[4..7] take element 1; the bk update quads start at b + 8.
+ * c   - elements c[0] and c[1] of four columns spaced ldc apart, updated
+ *       in place.
+ * bk  - number of update terms; nothing is subtracted when bk <= 0.
+ */
+static void dsolve_2x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    FLOAT t00, t10, t11;
+    FLOAT top[4], bot[4];
+    BLASLONG col;
+
+    for (col = 0; col < 4; col++)
+    {
+        top[col] = *(c + 0 + col * ldc);
+        bot[col] = *(c + 1 + col * ldc);
+    }
+
+    if (bk > 0)
+    {
+        BLASLONG left = bk;
+        FLOAT *pa = a + 4, *pb = b + 8;
+
+        while (left--)
+        {
+            for (col = 0; col < 4; col++)
+            {
+                top[col] -= pa[0] * pb[col];
+                bot[col] -= pa[1] * pb[col];
+            }
+
+            pa += 2;
+            pb += 4;
+        }
+    }
+
+    t00 = *(a + 0);
+    t10 = *(a + 2);
+    t11 = *(a + 3);
+
+    /* Same two-step back-substitution applied to every column. */
+    for (col = 0; col < 4; col++)
+    {
+        bot[col] *= t11;
+        top[col] -= bot[col] * t10;
+        top[col] *= t00;
+    }
+
+    for (col = 0; col < 4; col++)
+    {
+        *(b + col) = top[col];
+    }
+
+    for (col = 0; col < 4; col++)
+    {
+        *(b + 4 + col) = bot[col];
+    }
+
+    for (col = 0; col < 4; col++)
+    {
+        *(c + 0 + col * ldc) = top[col];
+        *(c + 1 + col * ldc) = bot[col];
+    }
+}
+
+static void dsolve_2x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    FLOAT a0, a2, a3, c0, c1, c0_nxt, c1_nxt;
+
+    c0 = *(c + 0);
+    c1 = *(c + 1);
+
+    c0_nxt = *(c + 0 + ldc);
+    c1_nxt = *(c + 1 + ldc);
+
+    if (bk > 0)
+    {
+        BLASLONG i;
+        FLOAT *aa = a + 4, *bb = b + 4;
+        FLOAT a0, a1, b0, b1;
+
+        for (i = bk; i--;)
+        {
+            a0 = aa[0];
+            a1 = aa[1];
+
+            b0 = bb[0];
+            c0 -= a0 * b0;
+            c1 -= a1 * b0;
+
+            b1 = bb[1];
+            c0_nxt -= a0 * b1;
+            c1_nxt -= a1 * b1;
+
+            aa += 2;
+            bb += 2;
+        }
+    }
+
+    a0 = *(a + 0);
+    a2 = *(a + 2);
+    a3 = *(a + 3);
+
+    c1 *= a3;
+
+    c0 -= c1 * a2;
+    c0 *= a0;
+
+    c1_nxt *= a3;
+
+    c0_nxt -= c1_nxt * a2;
+    c0_nxt *= 
a0;
+
+    *(b + 0) = c0;
+    *(b + 1) = c0_nxt;
+    *(b + 2) = c1;
+    *(b + 3) = c1_nxt;
+
+    *(c + 0) = c0;
+    *(c + 1) = c1;
+
+    *(c + 0 + ldc) = c0_nxt;
+    *(c + 1 + ldc) = c1_nxt;
+}
+
+/*
+ * Solve one 2x1 tile (scalar code, one column of C).
+ *
+ * a  - 2x2 triangle read at a[0], a[2], a[3]; the bk update pairs start
+ *      at a + 4.
+ * b  - receives the solved values at b[0..1]; the bk update values start
+ *      at b + 2.
+ * c  - two consecutive elements of C, updated in place.
+ * bk - number of update terms; nothing is subtracted when bk <= 0.
+ */
+static void dsolve_2x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
+{
+    FLOAT a0, a2, a3, c0, c1;
+
+    c0 = *(c + 0);
+    c1 = *(c + 1);
+
+    if (bk > 0)
+    {
+        BLASLONG i;
+        FLOAT *aa = a + 4, *bb = b + 2;
+
+        for (i = bk; i--;)
+        {
+            c0 -= aa[0] * bb[0];
+            c1 -= aa[1] * bb[0];
+
+            aa += 2;
+            bb += 1;
+        }
+    }
+
+    a0 = *(a + 0);
+    a2 = *(a + 2);
+    a3 = *(a + 3);
+
+    /* Back-substitution; diagonal entries are multiplied, not divided. */
+    c1 *= a3;
+    c0 -= c1 * a2;
+    c0 *= a0;
+
+    *(b + 0) = c0;
+    *(b + 1) = c1;
+
+    *(c + 0) = c0;
+    *(c + 1) = c1;
+}
+
+/*
+ * Solve one 1x4 tile (one row of C across four columns, ldc apart).
+ *
+ * a   - a[0] is the single triangle entry; the bk update values start at
+ *       a + 1.
+ * b   - receives the solved values at b[0..3]; the bk update quads start
+ *       at b + 4.
+ * c   - one element in each of four columns, updated in place.
+ * bk  - number of update terms; nothing is subtracted when bk <= 0.
+ */
+static void dsolve_1x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    FLOAT a0;
+    FLOAT c0, c0_nxt1, c0_nxt2, c0_nxt3;
+
+    a0 = *a;
+    c0 = *(c + 0);
+    c0_nxt1 = *(c + 1 * ldc);
+    c0_nxt2 = *(c + 2 * ldc);
+    c0_nxt3 = *(c + 3 * ldc);
+
+    if (bk > 0)
+    {
+        BLASLONG i;
+        FLOAT *aa = a + 1, *bb = b + 4;
+
+        for (i = bk; i--;)
+        {
+            c0 -= aa[0] * bb[0];
+            c0_nxt1 -= aa[0] * bb[1];
+            c0_nxt2 -= aa[0] * bb[2];
+            c0_nxt3 -= aa[0] * bb[3];
+
+            aa += 1;
+            bb += 4;
+        }
+    }
+
+    c0 *= a0;
+    c0_nxt1 *= a0;
+    c0_nxt2 *= a0;
+    c0_nxt3 *= a0;
+
+    *(c + 0 * ldc) = c0;
+    *(c + 1 * ldc) = c0_nxt1;
+    *(c + 2 * ldc) = c0_nxt2;
+    *(c + 3 * ldc) = c0_nxt3;
+
+    *(b + 0) = c0;
+    *(b + 1) = c0_nxt1;
+    *(b + 2) = c0_nxt2;
+    *(b + 3) = c0_nxt3;
+}
+
+/*
+ * Solve one 1x2 tile (one row of C across two columns, ldc apart).
+ *
+ * Same pointer convention as dsolve_1x4_ln_msa: a[0] is the triangle
+ * entry, the bk update values start at a + 1 and b + 2, and the solved
+ * pair is written to b[0..1] and to C.
+ *
+ * The bk update was previously missing here, so the tile was only right
+ * when no trailing rows had been solved yet.  With bk <= 0 the result is
+ * the same product as before.
+ */
+static void dsolve_1x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    FLOAT a0, c0, c0_nxt;
+
+    a0 = *a;
+    c0 = *c;
+    c0_nxt = *(c + ldc);
+
+    if (bk > 0)
+    {
+        BLASLONG i;
+        FLOAT *aa = a + 1, *bb = b + 2;
+
+        for (i = bk; i--;)
+        {
+            c0 -= aa[0] * bb[0];
+            c0_nxt -= aa[0] * bb[1];
+
+            aa += 1;
+            bb += 2;
+        }
+    }
+
+    c0 *= a0;
+    c0_nxt *= a0;
+
+    *c = c0;
+    *(c + ldc) = c0_nxt;
+
+    *b = c0;
+    *(b + 1) = c0_nxt;
+}
+
+/*
+ * Solve one 1x1 tile.
+ *
+ * a[0] is the triangle entry, the bk update values start at a + 1 and
+ * b + 1, and the solved value is written to b[0] and to *c.  Replaces the
+ * inline "*cc *= *aa" in CNAME, which skipped the bk update.
+ */
+static void dsolve_1x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
+{
+    FLOAT c0 = *c;
+
+    if (bk > 0)
+    {
+        BLASLONG i;
+        FLOAT *aa = a + 1, *bb = b + 1;
+
+        for (i = bk; i--;)
+        {
+            c0 -= aa[0] * bb[0];
+
+            aa += 1;
+            bb += 1;
+        }
+    }
+
+    c0 *= *a;
+
+    *c = c0;
+    *b = c0;
+}
+
+/*
+ * DTRSM "LN" kernel driver: walks C in column groups of 4, 2 and 1 and,
+ * inside each group, walks the rows from the bottom up - first the m & 1,
+ * m & 2 and m & 4 remainders, then the 8-row tiles.
+ *
+ * m, n   - rows and columns of the C block.
+ * k      - length of one packed row/column panel (stride between tiles).
+ * dummy1 - unused.
+ * a, b   - packed A and B panels; b is also overwritten with the solution.
+ * c, ldc - output block and its column stride.
+ * offset - added to m to form the starting value of kk; k - kk is the
+ *          number of update terms handed to each tile solver.
+ *
+ * kk now starts at m + offset.  It previously started at m, so `offset`
+ * was ignored and every non-zero offset gave wrong pointers and update
+ * lengths.  With offset == 0 the addressing is unchanged.
+ * NOTE(review): the generic OpenBLAS LN kernel uses the same
+ * "kk = m + offset" start - confirm against the caller in the level-3
+ * TRSM driver.
+ *
+ * Always returns 0.
+ */
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
+          FLOAT *c, BLASLONG ldc, BLASLONG offset)
+{
+    BLASLONG kk, i, j;
+    FLOAT *aa, *bb, *cc;
+
+    for (j = (n >> 2); j--;)
+    {
+        kk = m + offset;
+
+        if (m & 1)
+        {
+            aa = a + (m - 1) * k + kk;
+            bb = b + 4 * kk;
+            cc = c + (m - 1);
+
+            dsolve_1x4_ln_msa(aa - 1, bb - 4, cc, ldc, k - kk);
+
+            kk -= 1;
+        }
+
+        if (m & 2)
+        {
+            aa = a + ((m & -2) - 2) * k + 2 * kk;
+            bb = b + 4 * kk;
+            cc = c + ((m & -2) - 2);
+
+            dsolve_2x4_ln_msa(aa - 4, bb - 8, cc, ldc, k - kk);
+
+            kk -= 2;
+        }
+
+        if (m & 4)
+        {
+            aa = a + ((m & -4) - 4) * k + 4 * kk;
+            bb = b + 4 * kk;
+            cc = c + ((m & -4) - 4);
+
+            dsolve_4x4_ln_msa(aa - 16, bb - 16, cc, ldc, k - kk);
+
+            kk -= 4;
+        }
+
+        i = (m >> 3);
+        if (i > 0)
+        {
+            /* Only form these pointers when an 8-row tile exists. */
+            aa = a + ((m & -8) - 8) * k;
+            cc = c + ((m & -8) - 8);
+
+            for (; i > 0; i--)
+            {
+                dsolve_8x4_ln_msa(aa + 8 * kk, b + 4 * kk, cc, ldc, k - kk);
+
+                aa -= 8 * k;
+                cc -= 8;
+                kk -= 8;
+            }
+        }
+
+        b += 4 * k;
+        c += 4 * ldc;
+    }
+
+    if (n & 2)
+    {
+        kk = m + offset;
+
+        if (m & 1)
+        {
+            aa = a + (m - 1) * k;
+            cc = c + (m - 1);
+
+            dsolve_1x2_ln_msa(aa + kk - 1, b + kk * 2 - 2, cc, ldc, k - kk);
+
+            kk -= 1;
+        }
+
+        if (m & 2)
+        {
+            aa = a + ((m & -2) - 2) * k;
+            cc = c + ((m & -2) - 2);
+
+            dsolve_2x2_ln_msa(aa + kk * 2 - 4, b + kk * 2 - 4, cc, ldc, k - kk);
+
+            kk -= 2;
+        }
+
+        if (m & 4)
+        {
+            aa = a + ((m & -4) - 4) * k;
+            cc = c + ((m & -4) - 4);
+
+            dsolve_4x2_ln_msa(aa + kk * 4 - 16, b + kk * 2 - 8, cc, ldc, k - kk);
+
+            kk -= 4;
+        }
+
+        i = (m >> 3);
+        if (i > 0)
+        {
+            aa = a + ((m & -8) - 8) * k;
+            cc = c + ((m & -8) - 8);
+
+            for (; i > 0; i--)
+            {
+                dsolve_8x2_ln_msa(aa + kk * 8, b + kk * 2, cc, ldc, k - kk);
+
+                aa -= 8 * k;
+                cc -= 8;
+                kk -= 8;
+            }
+        }
+
+        b += 2 * k;
+        c += 2 * ldc;
+    }
+
+    if (n & 1)
+    {
+        kk = m + offset;
+
+        if (m & 1)
+        {
+            aa = a + (m - 1) * k;
+            cc = c + (m - 1);
+
+            dsolve_1x1_ln_msa(aa + kk - 1, b + kk - 1, cc, k - kk);
+
+            kk -= 1;
+        }
+
+        if (m & 2)
+        {
+            aa = a + ((m & -2) - 2) * k + kk * 2;
+            cc = c + ((m & -2) - 2);
+
+            dsolve_2x1_ln_msa(aa - 4, b + kk - 2, cc, k - kk);
+
+            kk -= 2;
+        }
+
+        if (m & 4)
+        {
+            aa = a + ((m & -4) - 4) * k;
+            cc = c + ((m & -4) - 4);
+
+            dsolve_4x1_ln_msa(aa + 4 * kk - 16, b + kk - 4, cc, k - kk);
+
+            kk -= 4;
+        }
+
+        i = (m >> 3);
+        if (i > 0)
+        {
+            aa = a + ((m & -8) - 8) * k;
+            cc = c + ((m & -8) - 8);
+
+            for (; i > 0; i--)
+            {
+                dsolve_8x1_ln_msa(aa + 8 * kk, b + kk, cc, k - kk);
+
+                aa -= 8 * k;
+                cc -= 8;
+                kk -= 8;
+            }
+        }
+    }
+
+    return 0;
+}
diff --git a/kernel/mips/dtrsm_kernel_LT_8x4_msa.c b/kernel/mips/dtrsm_kernel_LT_8x4_msa.c
new file mode 100644
index 000000000..da35aa8f9
--- /dev/null
+++ b/kernel/mips/dtrsm_kernel_LT_8x4_msa.c
@@ -0,0 +1,1397 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +static void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v2f64 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15; + v2f64 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7; + v2f64 res_c8, res_c9, res_c10, res_c11, res_c12, res_c13, res_c14, res_c15; + v2f64 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7; + v2f64 src_a9, src_a10, src_a11, src_a12, src_a13, src_a14, src_a15, src_a18; + v2f64 src_a19, src_a20, src_a21, src_a22, src_a23, src_a27, src_a28; + v2f64 src_a29, src_a30, src_a31, src_a36, src_a37, src_a38, src_a39; + v2f64 src_a45, src_a46, src_a47, src_a54, src_a55, src_a63; + FLOAT *c_nxt1line = c + ldc; + FLOAT *c_nxt2line = c + 2 * ldc; + FLOAT *c_nxt3line = c + 3 * ldc; + + LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3); + LD_DP4(c_nxt1line, 2, src_c4, src_c5, src_c6, src_c7); + LD_DP4(c_nxt2line, 2, src_c8, src_c9, src_c10, src_c11); + LD_DP4(c_nxt3line, 2, src_c12, src_c13, src_c14, src_c15); + + if (bk > 0) + { + BLASLONG i; + v2f64 src_b, src_b0, src_b1, src_b2, src_b3; + + LD_DP4(a, 2, src_a0, src_a1, src_a2, src_a3); + LD_DP2(b, 2, src_b0, src_b1); + + for (i = (bk - 1); i--;) + { + a += 8; + b += 4; + + LD_DP4(a, 2, src_a4, src_a5, src_a6, src_a7); + 
LD_DP2(b, 2, src_b2, src_b3); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + src_c2 -= src_a2 * src_b; + src_c3 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c4 -= src_a0 * src_b; + src_c5 -= src_a1 * src_b; + src_c6 -= src_a2 * src_b; + src_c7 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); + src_c8 -= src_a0 * src_b; + src_c9 -= src_a1 * src_b; + src_c10 -= src_a2 * src_b; + src_c11 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + src_c12 -= src_a0 * src_b; + src_c13 -= src_a1 * src_b; + src_c14 -= src_a2 * src_b; + src_c15 -= src_a3 * src_b; + + src_a0 = src_a4; + src_a1 = src_a5; + src_a2 = src_a6; + src_a3 = src_a7; + src_b0 = src_b2; + src_b1 = src_b3; + } + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + src_c2 -= src_a2 * src_b; + src_c3 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c4 -= src_a0 * src_b; + src_c5 -= src_a1 * src_b; + src_c6 -= src_a2 * src_b; + src_c7 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); + src_c8 -= src_a0 * src_b; + src_c9 -= src_a1 * src_b; + src_c10 -= src_a2 * src_b; + src_c11 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + src_c12 -= src_a0 * src_b; + src_c13 -= src_a1 * src_b; + src_c14 -= src_a2 * src_b; + src_c15 -= src_a3 * src_b; + + a += 8; + b += 4; + } + + res_c0 = (v2f64) __msa_ilvr_d((v2i64) src_c4, (v2i64) src_c0); + res_c1 = (v2f64) __msa_ilvl_d((v2i64) src_c4, (v2i64) src_c0); + res_c2 = (v2f64) __msa_ilvr_d((v2i64) src_c5, (v2i64) src_c1); + res_c3 = (v2f64) __msa_ilvl_d((v2i64) src_c5, (v2i64) src_c1); + res_c4 = (v2f64) __msa_ilvr_d((v2i64) src_c6, (v2i64) src_c2); + res_c5 = (v2f64) __msa_ilvl_d((v2i64) src_c6, (v2i64) 
src_c2); + res_c6 = (v2f64) __msa_ilvr_d((v2i64) src_c7, (v2i64) src_c3); + res_c7 = (v2f64) __msa_ilvl_d((v2i64) src_c7, (v2i64) src_c3); + res_c8 = (v2f64) __msa_ilvr_d((v2i64) src_c12, (v2i64) src_c8); + res_c9 = (v2f64) __msa_ilvl_d((v2i64) src_c12, (v2i64) src_c8); + res_c10 = (v2f64) __msa_ilvr_d((v2i64) src_c13, (v2i64) src_c9); + res_c11 = (v2f64) __msa_ilvl_d((v2i64) src_c13, (v2i64) src_c9); + res_c12 = (v2f64) __msa_ilvr_d((v2i64) src_c14, (v2i64) src_c10); + res_c13 = (v2f64) __msa_ilvl_d((v2i64) src_c14, (v2i64) src_c10); + res_c14 = (v2f64) __msa_ilvr_d((v2i64) src_c15, (v2i64) src_c11); + res_c15 = (v2f64) __msa_ilvl_d((v2i64) src_c15, (v2i64) src_c11); + + src_a0 = LD_DP(a + 0); + src_a1 = (v2f64) __msa_splati_d((v2i64) src_a0, 1); + src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); + src_a2 = LD_DP(a + 2); + src_a3 = (v2f64) __msa_splati_d((v2i64) src_a2, 1); + src_a2 = (v2f64) __msa_splati_d((v2i64) src_a2, 0); + src_a4 = LD_DP(a + 4); + src_a5 = (v2f64) __msa_splati_d((v2i64) src_a4, 1); + src_a4 = (v2f64) __msa_splati_d((v2i64) src_a4, 0); + src_a6 = LD_DP(a + 6); + src_a7 = (v2f64) __msa_splati_d((v2i64) src_a6, 1); + src_a6 = (v2f64) __msa_splati_d((v2i64) src_a6, 0); + + res_c0 *= src_a0; + res_c1 -= res_c0 * src_a1; + res_c2 -= res_c0 * src_a2; + res_c3 -= res_c0 * src_a3; + res_c4 -= res_c0 * src_a4; + res_c5 -= res_c0 * src_a5; + res_c6 -= res_c0 * src_a6; + res_c7 -= res_c0 * src_a7; + + res_c8 *= src_a0; + res_c9 -= res_c8 * src_a1; + res_c10 -= res_c8 * src_a2; + res_c11 -= res_c8 * src_a3; + res_c12 -= res_c8 * src_a4; + res_c13 -= res_c8 * src_a5; + res_c14 -= res_c8 * src_a6; + res_c15 -= res_c8 * src_a7; + + src_a9 = __msa_cast_to_vector_double(*(a + 9)); + src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); + src_a10 = LD_DP(a + 10); + src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); + src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); + src_a12 = LD_DP(a + 12); + src_a13 = (v2f64) __msa_splati_d((v2i64) src_a12, 1); + 
src_a12 = (v2f64) __msa_splati_d((v2i64) src_a12, 0); + src_a14 = LD_DP(a + 14); + src_a15 = (v2f64) __msa_splati_d((v2i64) src_a14, 1); + src_a14 = (v2f64) __msa_splati_d((v2i64) src_a14, 0); + + res_c1 *= src_a9; + res_c2 -= res_c1 * src_a10; + res_c3 -= res_c1 * src_a11; + res_c4 -= res_c1 * src_a12; + res_c5 -= res_c1 * src_a13; + res_c6 -= res_c1 * src_a14; + res_c7 -= res_c1 * src_a15; + + res_c9 *= src_a9; + res_c10 -= res_c9 * src_a10; + res_c11 -= res_c9 * src_a11; + res_c12 -= res_c9 * src_a12; + res_c13 -= res_c9 * src_a13; + res_c14 -= res_c9 * src_a14; + res_c15 -= res_c9 * src_a15; + + ST_DP(res_c0, b + 0); + ST_DP(res_c8, b + 2); + ST_DP(res_c1, b + 4); + ST_DP(res_c9, b + 6); + + src_c0 = (v2f64) __msa_ilvr_d((v2i64) res_c1, (v2i64) res_c0); + src_c4 = (v2f64) __msa_ilvl_d((v2i64) res_c1, (v2i64) res_c0); + src_c8 = (v2f64) __msa_ilvr_d((v2i64) res_c9, (v2i64) res_c8); + src_c12 = (v2f64) __msa_ilvl_d((v2i64) res_c9, (v2i64) res_c8); + + ST_DP(src_c0, c); + ST_DP(src_c4, c_nxt1line); + ST_DP(src_c8, c_nxt2line); + ST_DP(src_c12, c_nxt3line); + + src_a18 = LD_DP(a + 18); + src_a19 = (v2f64) __msa_splati_d((v2i64) src_a18, 1); + src_a18 = (v2f64) __msa_splati_d((v2i64) src_a18, 0); + src_a20 = LD_DP(a + 20); + src_a21 = (v2f64) __msa_splati_d((v2i64) src_a20, 1); + src_a20 = (v2f64) __msa_splati_d((v2i64) src_a20, 0); + src_a22 = LD_DP(a + 22); + src_a23 = (v2f64) __msa_splati_d((v2i64) src_a22, 1); + src_a22 = (v2f64) __msa_splati_d((v2i64) src_a22, 0); + + res_c2 *= src_a18; + res_c3 -= res_c2 * src_a19; + res_c4 -= res_c2 * src_a20; + res_c5 -= res_c2 * src_a21; + res_c6 -= res_c2 * src_a22; + res_c7 -= res_c2 * src_a23; + + res_c10 *= src_a18; + res_c11 -= res_c10 * src_a19; + res_c12 -= res_c10 * src_a20; + res_c13 -= res_c10 * src_a21; + res_c14 -= res_c10 * src_a22; + res_c15 -= res_c10 * src_a23; + + src_a27 = __msa_cast_to_vector_double(*(a + 27)); + src_a27 = (v2f64) __msa_splati_d((v2i64) src_a27, 0); + src_a28 = LD_DP(a + 28); + src_a29 = 
(v2f64) __msa_splati_d((v2i64) src_a28, 1); + src_a28 = (v2f64) __msa_splati_d((v2i64) src_a28, 0); + src_a30 = LD_DP(a + 30); + src_a31 = (v2f64) __msa_splati_d((v2i64) src_a30, 1); + src_a30 = (v2f64) __msa_splati_d((v2i64) src_a30, 0); + + res_c3 *= src_a27; + res_c4 -= res_c3 * src_a28; + res_c5 -= res_c3 * src_a29; + res_c6 -= res_c3 * src_a30; + res_c7 -= res_c3 * src_a31; + + res_c11 *= src_a27; + res_c12 -= res_c11 * src_a28; + res_c13 -= res_c11 * src_a29; + res_c14 -= res_c11 * src_a30; + res_c15 -= res_c11 * src_a31; + + ST_DP(res_c2, b + 8); + ST_DP(res_c10, b + 10); + ST_DP(res_c3, b + 12); + ST_DP(res_c11, b + 14); + + src_c1 = (v2f64) __msa_ilvr_d((v2i64) res_c3, (v2i64) res_c2); + src_c5 = (v2f64) __msa_ilvl_d((v2i64) res_c3, (v2i64) res_c2); + src_c9 = (v2f64) __msa_ilvr_d((v2i64) res_c11, (v2i64) res_c10); + src_c13 = (v2f64) __msa_ilvl_d((v2i64) res_c11, (v2i64) res_c10); + + src_a36 = LD_DP(a + 36); + src_a37 = (v2f64) __msa_splati_d((v2i64) src_a36, 1); + src_a36 = (v2f64) __msa_splati_d((v2i64) src_a36, 0); + src_a38 = LD_DP(a + 38); + src_a39 = (v2f64) __msa_splati_d((v2i64) src_a38, 1); + src_a38 = (v2f64) __msa_splati_d((v2i64) src_a38, 0); + + res_c4 *= src_a36; + res_c5 -= res_c4 * src_a37; + res_c6 -= res_c4 * src_a38; + res_c7 -= res_c4 * src_a39; + + res_c12 *= src_a36; + res_c13 -= res_c12 * src_a37; + res_c14 -= res_c12 * src_a38; + res_c15 -= res_c12 * src_a39; + + src_a45 = __msa_cast_to_vector_double(*(a + 45)); + src_a45 = (v2f64) __msa_splati_d((v2i64) src_a45, 0); + src_a46 = LD_DP(a + 46); + src_a47 = (v2f64) __msa_splati_d((v2i64) src_a46, 1); + src_a46 = (v2f64) __msa_splati_d((v2i64) src_a46, 0); + + res_c5 *= src_a45; + res_c6 -= res_c5 * src_a46; + res_c7 -= res_c5 * src_a47; + + res_c13 *= src_a45; + res_c14 -= res_c13 * src_a46; + res_c15 -= res_c13 * src_a47; + + ST_DP(src_c1, c + 2); + ST_DP(src_c5, c_nxt1line + 2); + ST_DP(src_c9, c_nxt2line + 2); + ST_DP(src_c13, c_nxt3line + 2); + + ST_DP(res_c4, b + 16); + 
ST_DP(res_c12, b + 18); + ST_DP(res_c5, b + 20); + ST_DP(res_c13, b + 22); + + src_c2 = (v2f64) __msa_ilvr_d((v2i64) res_c5, (v2i64) res_c4); + src_c6 = (v2f64) __msa_ilvl_d((v2i64) res_c5, (v2i64) res_c4); + src_c10 = (v2f64) __msa_ilvr_d((v2i64) res_c13, (v2i64) res_c12); + src_c14 = (v2f64) __msa_ilvl_d((v2i64) res_c13, (v2i64) res_c12); + + src_a63 = __msa_cast_to_vector_double(*(a + 63)); + src_a63 = (v2f64) __msa_splati_d((v2i64) src_a63, 0); + src_a54 = LD_DP(a + 54); + src_a55 = (v2f64) __msa_splati_d((v2i64) src_a54, 1); + src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); + + res_c6 *= src_a54; + res_c7 -= res_c6 * src_a55; + + res_c14 *= src_a54; + res_c15 -= res_c14 * src_a55; + + res_c7 *= src_a63; + res_c15 *= src_a63; + + ST_DP(src_c2, c + 4); + ST_DP(src_c6, c_nxt1line + 4); + ST_DP(src_c10, c_nxt2line + 4); + ST_DP(src_c14, c_nxt3line + 4); + + ST_DP(res_c6, b + 24); + ST_DP(res_c14, b + 26); + ST_DP(res_c7, b + 28); + ST_DP(res_c15, b + 30); + + src_c3 = (v2f64) __msa_ilvr_d((v2i64) res_c7, (v2i64) res_c6); + src_c7 = (v2f64) __msa_ilvl_d((v2i64) res_c7, (v2i64) res_c6); + src_c11 = (v2f64) __msa_ilvr_d((v2i64) res_c15, (v2i64) res_c14); + src_c15 = (v2f64) __msa_ilvl_d((v2i64) res_c15, (v2i64) res_c14); + + ST_DP(src_c3, c + 6); + ST_DP(src_c7, c_nxt1line + 6); + ST_DP(src_c11, c_nxt2line + 6); + ST_DP(src_c15, c_nxt3line + 6); +} + +static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v2f64 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7; + v2f64 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7; + v2f64 src_a9, src_a10, src_a11, src_a12, src_a13, src_a14, src_a15, src_a18; + v2f64 src_a19, src_a20, src_a21, src_a22, src_a23, src_a27, src_a28; + v2f64 src_a29, src_a30, src_a31, src_a36, src_a37, src_a38, src_a39; + v2f64 src_a45, src_a46, src_a47, src_a54, src_a55, src_a63; + + LD_DP4(c, 2, src_c0, src_c1, 
src_c2, src_c3); + LD_DP4(c + ldc, 2, src_c4, src_c5, src_c6, src_c7); + + if (bk > 0) + { + BLASLONG i; + v2f64 src_a0, src_a1, src_a2, src_a3, src_b, src_b0; + + for (i = bk; i--;) + { + LD_DP4(a, 2, src_a0, src_a1, src_a2, src_a3); + src_b0 = LD_DP(b); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + src_c2 -= src_a2 * src_b; + src_c3 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c4 -= src_a0 * src_b; + src_c5 -= src_a1 * src_b; + src_c6 -= src_a2 * src_b; + src_c7 -= src_a3 * src_b; + + a += 8; + b += 2; + } + } + + res_c0 = (v2f64) __msa_ilvr_d((v2i64) src_c4, (v2i64) src_c0); + res_c1 = (v2f64) __msa_ilvl_d((v2i64) src_c4, (v2i64) src_c0); + res_c2 = (v2f64) __msa_ilvr_d((v2i64) src_c5, (v2i64) src_c1); + res_c3 = (v2f64) __msa_ilvl_d((v2i64) src_c5, (v2i64) src_c1); + res_c4 = (v2f64) __msa_ilvr_d((v2i64) src_c6, (v2i64) src_c2); + res_c5 = (v2f64) __msa_ilvl_d((v2i64) src_c6, (v2i64) src_c2); + res_c6 = (v2f64) __msa_ilvr_d((v2i64) src_c7, (v2i64) src_c3); + res_c7 = (v2f64) __msa_ilvl_d((v2i64) src_c7, (v2i64) src_c3); + + src_a0 = LD_DP(a + 0); + src_a1 = (v2f64) __msa_splati_d((v2i64) src_a0, 1); + src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); + src_a2 = LD_DP(a + 2); + src_a3 = (v2f64) __msa_splati_d((v2i64) src_a2, 1); + src_a2 = (v2f64) __msa_splati_d((v2i64) src_a2, 0); + src_a4 = LD_DP(a + 4); + src_a5 = (v2f64) __msa_splati_d((v2i64) src_a4, 1); + src_a4 = (v2f64) __msa_splati_d((v2i64) src_a4, 0); + src_a6 = LD_DP(a + 6); + src_a7 = (v2f64) __msa_splati_d((v2i64) src_a6, 1); + src_a6 = (v2f64) __msa_splati_d((v2i64) src_a6, 0); + + res_c0 *= src_a0; + res_c1 -= res_c0 * src_a1; + res_c2 -= res_c0 * src_a2; + res_c3 -= res_c0 * src_a3; + res_c4 -= res_c0 * src_a4; + res_c5 -= res_c0 * src_a5; + res_c6 -= res_c0 * src_a6; + res_c7 -= res_c0 * src_a7; + + src_a9 = __msa_cast_to_vector_double(*(a + 9)); + src_a9 = (v2f64) 
__msa_splati_d((v2i64) src_a9, 0); + src_a10 = LD_DP(a + 10); + src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); + src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); + src_a12 = LD_DP(a + 12); + src_a13 = (v2f64) __msa_splati_d((v2i64) src_a12, 1); + src_a12 = (v2f64) __msa_splati_d((v2i64) src_a12, 0); + src_a14 = LD_DP(a + 14); + src_a15 = (v2f64) __msa_splati_d((v2i64) src_a14, 1); + src_a14 = (v2f64) __msa_splati_d((v2i64) src_a14, 0); + + res_c1 *= src_a9; + res_c2 -= res_c1 * src_a10; + res_c3 -= res_c1 * src_a11; + res_c4 -= res_c1 * src_a12; + res_c5 -= res_c1 * src_a13; + res_c6 -= res_c1 * src_a14; + res_c7 -= res_c1 * src_a15; + + src_a18 = LD_DP(a + 18); + src_a19 = (v2f64) __msa_splati_d((v2i64) src_a18, 1); + src_a18 = (v2f64) __msa_splati_d((v2i64) src_a18, 0); + src_a20 = LD_DP(a + 20); + src_a21 = (v2f64) __msa_splati_d((v2i64) src_a20, 1); + src_a20 = (v2f64) __msa_splati_d((v2i64) src_a20, 0); + src_a22 = LD_DP(a + 22); + src_a23 = (v2f64) __msa_splati_d((v2i64) src_a22, 1); + src_a22 = (v2f64) __msa_splati_d((v2i64) src_a22, 0); + + res_c2 *= src_a18; + res_c3 -= res_c2 * src_a19; + res_c4 -= res_c2 * src_a20; + res_c5 -= res_c2 * src_a21; + res_c6 -= res_c2 * src_a22; + res_c7 -= res_c2 * src_a23; + + src_a27 = __msa_cast_to_vector_double(*(a + 27)); + src_a27 = (v2f64) __msa_splati_d((v2i64) src_a27, 0); + src_a28 = LD_DP(a + 28); + src_a29 = (v2f64) __msa_splati_d((v2i64) src_a28, 1); + src_a28 = (v2f64) __msa_splati_d((v2i64) src_a28, 0); + src_a30 = LD_DP(a + 30); + src_a31 = (v2f64) __msa_splati_d((v2i64) src_a30, 1); + src_a30 = (v2f64) __msa_splati_d((v2i64) src_a30, 0); + + res_c3 *= src_a27; + res_c4 -= res_c3 * src_a28; + res_c5 -= res_c3 * src_a29; + res_c6 -= res_c3 * src_a30; + res_c7 -= res_c3 * src_a31; + + ST_DP(res_c0, b + 0); + ST_DP(res_c1, b + 2); + ST_DP(res_c2, b + 4); + ST_DP(res_c3, b + 6); + + src_c0 = (v2f64) __msa_ilvr_d((v2i64) res_c1, (v2i64) res_c0); + src_c4 = (v2f64) __msa_ilvl_d((v2i64) res_c1, (v2i64) 
res_c0); + src_c1 = (v2f64) __msa_ilvr_d((v2i64) res_c3, (v2i64) res_c2); + src_c5 = (v2f64) __msa_ilvl_d((v2i64) res_c3, (v2i64) res_c2); + + ST_DP2(src_c0, src_c1, c, 2); + ST_DP2(src_c4, src_c5, c + ldc, 2); + + src_a36 = LD_DP(a + 36); + src_a37 = (v2f64) __msa_splati_d((v2i64) src_a36, 1); + src_a36 = (v2f64) __msa_splati_d((v2i64) src_a36, 0); + src_a38 = LD_DP(a + 38); + src_a39 = (v2f64) __msa_splati_d((v2i64) src_a38, 1); + src_a38 = (v2f64) __msa_splati_d((v2i64) src_a38, 0); + + res_c4 *= src_a36; + res_c5 -= res_c4 * src_a37; + res_c6 -= res_c4 * src_a38; + res_c7 -= res_c4 * src_a39; + + src_a45 = __msa_cast_to_vector_double(*(a + 45)); + src_a45 = (v2f64) __msa_splati_d((v2i64) src_a45, 0); + src_a46 = LD_DP(a + 46); + src_a47 = (v2f64) __msa_splati_d((v2i64) src_a46, 1); + src_a46 = (v2f64) __msa_splati_d((v2i64) src_a46, 0); + + res_c5 *= src_a45; + res_c6 -= res_c5 * src_a46; + res_c7 -= res_c5 * src_a47; + + src_a63 = __msa_cast_to_vector_double(*(a + 63)); + src_a63 = (v2f64) __msa_splati_d((v2i64) src_a63, 0); + src_a54 = LD_DP(a + 54); + src_a55 = (v2f64) __msa_splati_d((v2i64) src_a54, 1); + src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); + + res_c6 *= src_a54; + res_c7 -= res_c6 * src_a55; + + res_c7 *= src_a63; + + ST_DP(res_c4, b + 8); + ST_DP(res_c5, b + 10); + ST_DP(res_c6, b + 12); + ST_DP(res_c7, b + 14); + + src_c2 = (v2f64) __msa_ilvr_d((v2i64) res_c5, (v2i64) res_c4); + src_c6 = (v2f64) __msa_ilvl_d((v2i64) res_c5, (v2i64) res_c4); + src_c3 = (v2f64) __msa_ilvr_d((v2i64) res_c7, (v2i64) res_c6); + src_c7 = (v2f64) __msa_ilvl_d((v2i64) res_c7, (v2i64) res_c6); + + ST_DP2(src_c2, src_c3, c + 4, 2); + ST_DP2(src_c6, src_c7, c + 4 + ldc, 2); +} +/* dsolve_8x1_lt_msa: scalar solve of one strip of eight values of c. Loads c[0..7], subtracts a[0..7] * b[0] for each of the bk steps (a advances by 8, b by 1 per step), then runs a forward substitution with the entries a[8*i+j], j >= i, of the block a points to after the loop. Each diagonal entry (a[0], a[9], a[18], a[27], a[36], a[45], a[54], a[63]) is applied by multiplication. The eight results are written to c[0..7] and to b[0..7]. NOTE(review): multiplying by the diagonal presumably relies on the packing code storing reciprocals - confirm against the caller. */ +static void dsolve_8x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + FLOAT a0, a1, a2, a3, a4, a5, a6, a7, a9, a10, a11, a12, a13, a14, a15, a18; + FLOAT a19, a20, a21, a22, a23, a27, a28, a29, a30, a31, a36, a37, a38, a39; + FLOAT a45, a46, a47, a54, a55, a63; + FLOAT c0, c1,
c2, c3, c4, c5, c6, c7; +/* Load the eight values of the strip. */ + c0 = *(c + 0); + c1 = *(c + 1); + c2 = *(c + 2); + c3 = *(c + 3); + c4 = *(c + 4); + c5 = *(c + 5); + c6 = *(c + 6); + c7 = *(c + 7); +/* Update step: c[i] -= a[i] * b[0] for each of the bk steps. The counter has the same type as bk (BLASLONG), as in the sibling helpers, so a large bk is not truncated. */ + if (bk > 0) + { + BLASLONG i; + FLOAT a0, a1, a2, a3, a4, a5, a6, a7, b0; + + for (i = bk; i--; ) + { + a0 = a[0]; + a1 = a[1]; + a2 = a[2]; + a3 = a[3]; + a4 = a[4]; + a5 = a[5]; + a6 = a[6]; + a7 = a[7]; + + b0 = b[0]; + c0 -= a0 * b0; + c1 -= a1 * b0; + c2 -= a2 * b0; + c3 -= a3 * b0; + c4 -= a4 * b0; + c5 -= a5 * b0; + c6 -= a6 * b0; + c7 -= a7 * b0; + + a += 8; + b += 1; + } + } +/* Entries of the 8x8 block used by the substitution; only indices 8*i+j with j >= i are read. */ + a0 = *(a + 0); + a1 = *(a + 1); + a2 = *(a + 2); + a3 = *(a + 3); + a4 = *(a + 4); + a5 = *(a + 5); + a6 = *(a + 6); + a7 = *(a + 7); + a9 = *(a + 9); + a10 = *(a + 10); + a11 = *(a + 11); + a12 = *(a + 12); + a13 = *(a + 13); + a14 = *(a + 14); + a15 = *(a + 15); + a18 = *(a + 18); + a19 = *(a + 19); + a20 = *(a + 20); + a21 = *(a + 21); + a22 = *(a + 22); + a23 = *(a + 23); + a27 = *(a + 27); + a28 = *(a + 28); + a29 = *(a + 29); + a30 = *(a + 30); + a31 = *(a + 31); + a36 = *(a + 36); + a37 = *(a + 37); + a38 = *(a + 38); + a39 = *(a + 39); + a45 = *(a + 45); + a46 = *(a + 46); + a47 = *(a + 47); + a54 = *(a + 54); + a55 = *(a + 55); + a63 = *(a + 63); +/* Forward substitution: each value is first reduced by the values already solved, then scaled by its diagonal entry. */ + c0 *= a0; + + c1 -= c0 * a1; + c1 *= a9; + + c2 -= c0 * a2; + c2 -= c1 * a10; + c2 *= a18; + + c3 -= c0 * a3; + c3 -= c1 * a11; + c3 -= c2 * a19; + c3 *= a27; + + c4 -= c0 * a4; + c4 -= c1 * a12; + c4 -= c2 * a20; + c4 -= c3 * a28; + c4 *= a36; + + c5 -= c0 * a5; + c5 -= c1 * a13; + c5 -= c2 * a21; + c5 -= c3 * a29; + c5 -= c4 * a37; + c5 *= a45; + + c6 -= c0 * a6; + c6 -= c1 * a14; + c6 -= c2 * a22; + c6 -= c3 * a30; + c6 -= c4 * a38; + c6 -= c5 * a46; + c6 *= a54; + + c7 -= c0 * a7; + c7 -= c1 * a15; + c7 -= c2 * a23; + c7 -= c3 * a31; + c7 -= c4 * a39; + c7 -= c5 * a47; + c7 -= c6 * a55; + c7 *= a63; +/* Write the solved values back to c and mirror them into b. */ + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 2) = c2; + *(c + 3) = c3; + *(c + 4) = c4; + *(c + 5) = c5; + *(c + 6) = c6; + *(c + 7) = c7; + + *(b + 0) = c0; + *(b + 1)
= c1; + *(b + 2) = c2; + *(b + 3) = c3; + *(b + 4) = c4; + *(b + 5) = c5; + *(b + 6) = c6; + *(b + 7) = c7; +} + +static void dsolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v2f64 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7; + v2f64 src_a0, src_a1, src_a2, src_a3, src_a5, src_a6, src_a7; + v2f64 src_a10, src_a11, src_a15; + + LD_DP2(c, 2, src_c0, src_c1); + LD_DP2(c + ldc, 2, src_c2, src_c3); + LD_DP2(c + 2 * ldc, 2, src_c4, src_c5); + LD_DP2(c + 3 * ldc, 2, src_c6, src_c7); + + if (bk > 0) + { + BLASLONG i; + v2f64 src_a0, src_a1, src_b, src_b0, src_b1; + + for (i = bk; i--;) + { + LD_DP2(a, 2, src_a0, src_a1); + LD_DP2(b, 2, src_b0, src_b1); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c2 -= src_a0 * src_b; + src_c3 -= src_a1 * src_b; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); + src_c4 -= src_a0 * src_b; + src_c5 -= src_a1 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + src_c6 -= src_a0 * src_b; + src_c7 -= src_a1 * src_b; + + a += 4; + b += 4; + } + } + + res_c0 = (v2f64) __msa_ilvr_d((v2i64) src_c2, (v2i64) src_c0); + res_c1 = (v2f64) __msa_ilvl_d((v2i64) src_c2, (v2i64) src_c0); + res_c2 = (v2f64) __msa_ilvr_d((v2i64) src_c3, (v2i64) src_c1); + res_c3 = (v2f64) __msa_ilvl_d((v2i64) src_c3, (v2i64) src_c1); + res_c4 = (v2f64) __msa_ilvr_d((v2i64) src_c6, (v2i64) src_c4); + res_c5 = (v2f64) __msa_ilvl_d((v2i64) src_c6, (v2i64) src_c4); + res_c6 = (v2f64) __msa_ilvr_d((v2i64) src_c7, (v2i64) src_c5); + res_c7 = (v2f64) __msa_ilvl_d((v2i64) src_c7, (v2i64) src_c5); + + src_a0 = LD_DP(a + 0); + src_a1 = (v2f64) __msa_splati_d((v2i64) src_a0, 1); + src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); + src_a2 = LD_DP(a + 2); + src_a3 = (v2f64)
__msa_splati_d((v2i64) src_a2, 1); + src_a2 = (v2f64) __msa_splati_d((v2i64) src_a2, 0); + + res_c0 *= src_a0; + res_c1 -= res_c0 * src_a1; + res_c2 -= res_c0 * src_a2; + res_c3 -= res_c0 * src_a3; + + res_c4 *= src_a0; + res_c5 -= res_c4 * src_a1; + res_c6 -= res_c4 * src_a2; + res_c7 -= res_c4 * src_a3; + + src_a5 = __msa_cast_to_vector_double(*(a + 5)); + src_a5 = (v2f64) __msa_splati_d((v2i64) src_a5, 0); + src_a6 = LD_DP(a + 6); + src_a7 = (v2f64) __msa_splati_d((v2i64) src_a6, 1); + src_a6 = (v2f64) __msa_splati_d((v2i64) src_a6, 0); + + res_c1 *= src_a5; + res_c2 -= res_c1 * src_a6; + res_c3 -= res_c1 * src_a7; + + res_c5 *= src_a5; + res_c6 -= res_c5 * src_a6; + res_c7 -= res_c5 * src_a7; + + src_a10 = LD_DP(a + 10); + src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); + src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); + src_a15 = __msa_cast_to_vector_double(*(a + 15)); + src_a15 = (v2f64) __msa_splati_d((v2i64) src_a15, 0); + + res_c2 *= src_a10; + res_c3 -= res_c2 * src_a11; + res_c3 *= src_a15; + + res_c6 *= src_a10; + res_c7 -= res_c6 * src_a11; + res_c7 *= src_a15; + + ST_DP(res_c0, b + 0); + ST_DP(res_c4, b + 2); + ST_DP(res_c1, b + 4); + ST_DP(res_c5, b + 6); + ST_DP(res_c2, b + 8); + ST_DP(res_c6, b + 10); + ST_DP(res_c3, b + 12); + ST_DP(res_c7, b + 14); + + src_c0 = (v2f64) __msa_ilvr_d((v2i64) res_c1, (v2i64) res_c0); + src_c2 = (v2f64) __msa_ilvl_d((v2i64) res_c1, (v2i64) res_c0); + src_c4 = (v2f64) __msa_ilvr_d((v2i64) res_c5, (v2i64) res_c4); + src_c6 = (v2f64) __msa_ilvl_d((v2i64) res_c5, (v2i64) res_c4); + + src_c1 = (v2f64) __msa_ilvr_d((v2i64) res_c3, (v2i64) res_c2); + src_c3 = (v2f64) __msa_ilvl_d((v2i64) res_c3, (v2i64) res_c2); + src_c5 = (v2f64) __msa_ilvr_d((v2i64) res_c7, (v2i64) res_c6); + src_c7 = (v2f64) __msa_ilvl_d((v2i64) res_c7, (v2i64) res_c6); + + ST_DP2(src_c0, src_c1, c, 2); + ST_DP2(src_c2, src_c3, c + ldc, 2); + ST_DP2(src_c4, src_c5, c + 2 * ldc, 2); + ST_DP2(src_c6, src_c7, c + 3 * ldc, 2); +} + +static void 
dsolve_4x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v2f64 src_c0, src_c1, src_c2, src_c3, res_c0, res_c1, res_c2, res_c3; + v2f64 src_a0, src_a1, src_a2, src_a3, src_a5, src_a6, src_a7; + v2f64 src_a10, src_a11, src_a15; + + LD_DP2(c, 2, src_c0, src_c1); + LD_DP2(c + ldc, 2, src_c2, src_c3); + + if (bk > 0) + { + BLASLONG i; + v2f64 src_a0, src_a1, src_b, src_b0; + + for (i = bk; i--;) + { + LD_DP2(a, 2, src_a0, src_a1); + src_b0 = LD_DP(b); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c2 -= src_a0 * src_b; + src_c3 -= src_a1 * src_b; + + a += 4; + b += 2; + } + } + + res_c0 = (v2f64) __msa_ilvr_d((v2i64) src_c2, (v2i64) src_c0); + res_c1 = (v2f64) __msa_ilvl_d((v2i64) src_c2, (v2i64) src_c0); + res_c2 = (v2f64) __msa_ilvr_d((v2i64) src_c3, (v2i64) src_c1); + res_c3 = (v2f64) __msa_ilvl_d((v2i64) src_c3, (v2i64) src_c1); + + src_a0 = LD_DP(a + 0); + src_a1 = (v2f64) __msa_splati_d((v2i64) src_a0, 1); + src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); + src_a2 = LD_DP(a + 2); + src_a3 = (v2f64) __msa_splati_d((v2i64) src_a2, 1); + src_a2 = (v2f64) __msa_splati_d((v2i64) src_a2, 0); + + res_c0 *= src_a0; + res_c1 -= res_c0 * src_a1; + res_c2 -= res_c0 * src_a2; + res_c3 -= res_c0 * src_a3; + + src_a5 = __msa_cast_to_vector_double(*(a + 5)); + src_a5 = (v2f64) __msa_splati_d((v2i64) src_a5, 0); + src_a6 = LD_DP(a + 6); + src_a7 = (v2f64) __msa_splati_d((v2i64) src_a6, 1); + src_a6 = (v2f64) __msa_splati_d((v2i64) src_a6, 0); + + res_c1 *= src_a5; + res_c2 -= res_c1 * src_a6; + res_c3 -= res_c1 * src_a7; + + src_a10 = LD_DP(a + 10); + src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); + src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); + src_a15 = __msa_cast_to_vector_double(*(a + 15)); + src_a15 = (v2f64) __msa_splati_d((v2i64) src_a15, 0); + + res_c2 *= src_a10; + res_c3 -= res_c2 * 
src_a11; + res_c3 *= src_a15; + + ST_DP(res_c0, b + 0); + ST_DP(res_c1, b + 2); + ST_DP(res_c2, b + 4); + ST_DP(res_c3, b + 6); + + src_c0 = (v2f64) __msa_ilvr_d((v2i64) res_c1, (v2i64) res_c0); + src_c1 = (v2f64) __msa_ilvr_d((v2i64) res_c3, (v2i64) res_c2); + src_c2 = (v2f64) __msa_ilvl_d((v2i64) res_c1, (v2i64) res_c0); + src_c3 = (v2f64) __msa_ilvl_d((v2i64) res_c3, (v2i64) res_c2); + + ST_DP2(src_c0, src_c1, c, 2); + ST_DP2(src_c2, src_c3, c + ldc, 2); +} +/* dsolve_4x1_lt_msa: scalar solve of one strip of four values of c. Loads c[0..3], subtracts a[0..3] * b[0] for each of the bk steps (a advances by 4, b by 1 per step), then solves against the block a points to after the loop using the entries a[4*i+j] with j >= i. The diagonal entries a[0], a[5], a[10], a[15] are applied by multiplication. Results are written to b[0..3] and c[0..3]. NOTE(review): multiplying by the diagonal presumably means the packing code stored it inverted - confirm. */ +static void dsolve_4x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + FLOAT c0, c1, c2, c3; + FLOAT a0, a1, a2, a3, a5, a6, a7, a10, a11, a15; + + c0 = *(c + 0); + c1 = *(c + 1); + c2 = *(c + 2); + c3 = *(c + 3); +/* Update step: c[i] -= a[i] * b[0], repeated bk times. The inner a0..a3 shadow the outer ones only inside this block. */ + if (bk > 0) + { + BLASLONG i; + FLOAT a0, a1, a2, a3, b0; + + for (i = bk; i--;) + { + a0 = a[0]; + a1 = a[1]; + a2 = a[2]; + a3 = a[3]; + + b0 = b[0]; + c0 -= a0 * b0; + c1 -= a1 * b0; + c2 -= a2 * b0; + c3 -= a3 * b0; + + a += 4; + b += 1; + } + } +/* Entries of the 4x4 block used by the substitution below. */ + a0 = *(a + 0); + a1 = *(a + 1); + a2 = *(a + 2); + a3 = *(a + 3); + a5 = *(a + 5); + a6 = *(a + 6); + a7 = *(a + 7); + a10 = *(a + 10); + a11 = *(a + 11); + a15 = *(a + 15); +/* Forward substitution: reduce each value by the ones already solved, then scale by its diagonal entry. */ + c0 *= a0; + + c1 -= c0 * a1; + c1 *= a5; + + c2 -= c0 * a2; + c2 -= c1 * a6; + c2 *= a10; + + c3 -= c0 * a3; + c3 -= c1 * a7; + c3 -= c2 * a11; + c3 *= a15; +/* Store the solved values to both b and c. */ + *(b + 0) = c0; + *(b + 1) = c1; + *(b + 2) = c2; + *(b + 3) = c3; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 2) = c2; + *(c + 3) = c3; +} +/* dsolve_2x4_lt_msa: scalar solve of a strip of two values on four ldc-strided lines of c (c, c+ldc, c+2*ldc, c+3*ldc). For each of the bk steps line q is reduced by a[0..1] * b[q] (a advances by 2, b by 4 per step). Each line is then solved with a[0], a[1] and a[3] of the block a points to after the loop. */ +static void dsolve_2x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT a0, a1, a3; + FLOAT c0, c1, c0_nxt1, c1_nxt1; + FLOAT c0_nxt2, c1_nxt2, c0_nxt3, c1_nxt3; + + c0 = *(c + 0); + c1 = *(c + 1); + c0_nxt1 = *(c + ldc); + c1_nxt1 = *(c + 1 + ldc); + c0_nxt2 = *(c + 2 * ldc); + c1_nxt2 = *(c + 1 + 2 * ldc); + c0_nxt3 = *(c + 3 * ldc); + c1_nxt3 = *(c + 1 + 3 * ldc); + + if (bk > 0) + { + BLASLONG i; + FLOAT a0, a1, b0, b1, b2, b3; + + for (i = bk; i--;) + { + a0 = a[0]; + a1 = a[1]; + + b0 = b[0]; + c0 -= a0 * b0; + c1 -= a1 * b0; + + b1
= b[1]; + c0_nxt1 -= a0 * b1; + c1_nxt1 -= a1 * b1; + + b2 = b[2]; + c0_nxt2 -= a0 * b2; + c1_nxt2 -= a1 * b2; + + b3 = b[3]; + c0_nxt3 -= a0 * b3; + c1_nxt3 -= a1 * b3; + + a += 2; + b += 4; + } + } +/* Entries of the 2x2 block used below: a[0] and a[3] scale the first and second value, a[1] couples the first solved value into the second. */ + a0 = *a; + a1 = *(a + 1); + a3 = *(a + 3); + + c0 *= a0; + c1 -= c0 * a1; + c1 *= a3; + + c0_nxt1 *= a0; + c1_nxt1 -= c0_nxt1 * a1; + c1_nxt1 *= a3; + + c0_nxt2 *= a0; + c1_nxt2 -= c0_nxt2 * a1; + c1_nxt2 *= a3; + + c0_nxt3 *= a0; + c1_nxt3 -= c0_nxt3 * a1; + c1_nxt3 *= a3; +/* b receives the four first-position results (b[0..3]) followed by the four second-position results (b[4..7]). */ + *(b + 0) = c0; + *(b + 1) = c0_nxt1; + *(b + 2) = c0_nxt2; + *(b + 3) = c0_nxt3; + *(b + 4) = c1; + *(b + 5) = c1_nxt1; + *(b + 6) = c1_nxt2; + *(b + 7) = c1_nxt3; +/* c is written back pair by pair, one pair per ldc-strided line. */ + *(c + 0) = c0; + *(c + 1) = c1; + + *(c + 0 + ldc) = c0_nxt1; + *(c + 1 + ldc) = c1_nxt1; + + *(c + 0 + 2 * ldc) = c0_nxt2; + *(c + 1 + 2 * ldc) = c1_nxt2; + + *(c + 0 + 3 * ldc) = c0_nxt3; + *(c + 1 + 3 * ldc) = c1_nxt3; +} +/* dsolve_2x2_lt_msa: scalar solve of a strip of two values on two lines of c (c and c+ldc). For each of the bk steps the first line is reduced by a[0..1] * b[0] and the second by a[0..1] * b[1] (a and b both advance by 2 per step). Each line is then solved with a[0], a[1], a[3] of the block a points to after the loop. Results go to b[0..3] (first-position values, then second-position values) and back to c. NOTE(review): a[0] and a[3] are applied by multiplication, presumably stored inverted by the packing code - confirm. */ +static void dsolve_2x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT a0, a1, a3; + FLOAT c0, c1, c0_nxt, c1_nxt; + + c0 = *(c + 0); + c1 = *(c + 1); + + c0_nxt = *(c + ldc); + c1_nxt = *(c + 1 + ldc); +/* Update step, repeated bk times. The inner a0 and a1 shadow the outer ones only inside this block. */ + if (bk > 0) + { + BLASLONG i; + FLOAT a0, a1, b0, b1; + + for (i = bk; i--;) + { + a0 = a[0]; + a1 = a[1]; + + b0 = b[0]; + c0 -= a0 * b0; + c1 -= a1 * b0; + + b1 = b[1]; + c0_nxt -= a0 * b1; + c1_nxt -= a1 * b1; + + a += 2; + b += 2; + } + } +/* Entries of the 2x2 block used by the substitution; a[2] is never read. */ + a0 = *a; + a1 = *(a + 1); + a3 = *(a + 3); + + c0 *= a0; + c1 -= c0 * a1; + c1 *= a3; + + c0_nxt *= a0; + c1_nxt -= c0_nxt * a1; + c1_nxt *= a3; +/* b is filled as c0, c0_nxt, c1, c1_nxt, which is a different order from the write-back to c. */ + *(b + 0) = c0; + *(b + 1) = c0_nxt; + *(b + 2) = c1; + *(b + 3) = c1_nxt; + + *(c + 0) = c0; + *(c + 1) = c1; + + *(c + 0 + ldc) = c0_nxt; + *(c + 1 + ldc) = c1_nxt; +} +/* dsolve_2x1_lt_msa: scalar solve of a strip of two values of c. For each of the bk steps c[0..1] is reduced by a[0..1] * b[0] (a advances by 2, b by 1 per step); the pair is then solved with a[0], a[1], a[3] and written to b[0..1] and c[0..1]. */ +static void dsolve_2x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + FLOAT a0, a1, a3, c0, c1; + + c0 = *(c + 0); + c1 = *(c + 1); + + if (bk > 0) + { + BLASLONG i; + FLOAT a0, a1, b0; + + for (i = bk; i--;) + { + a0 = a[0]; + a1 = a[1]; + + b0 = b[0]; + c0 -= a0 * b0; + c1 -= a1 * b0; + + a += 2; + b += 1;
+ } + } +/* Solve the pair with a[0], a[1], a[3] of the block a now points to, then store it to b and c. */ + a0 = *(a + 0); + a1 = *(a + 1); + a3 = *(a + 3); + + c0 *= a0; + c1 -= c0 * a1; + c1 *= a3; + + *(b + 0) = c0; + *(b + 1) = c1; + + *(c + 0) = c0; + *(c + 1) = c1; +} +/* dsolve_1x4_lt_msa: single value on each of four ldc-strided lines of c. For each of the bk steps line q is reduced by a[0] * b[q] (a advances by 1, b by 4 per step). All four values are then multiplied by the value a points to after the loop and written to c (stride ldc) and to b[0..3]. NOTE(review): the scale factor is presumably a stored reciprocal of the diagonal - confirm against the packing code. */ +static void dsolve_1x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT a0; + FLOAT c0, c0_nxt1, c0_nxt2, c0_nxt3; + + c0 = *(c + 0); + c0_nxt1 = *(c + 1 * ldc); + c0_nxt2 = *(c + 2 * ldc); + c0_nxt3 = *(c + 3 * ldc); + + if (bk > 0) + { + BLASLONG i; + + for (i = bk; i--;) + { + c0 -= a[0] * b[0]; + c0_nxt1 -= a[0] * b[1]; + c0_nxt2 -= a[0] * b[2]; + c0_nxt3 -= a[0] * b[3]; + + a += 1; + b += 4; + } + } +/* a now points past the bk values consumed by the loop; the value there scales all four results. */ + a0 = *a; + + c0 *= a0; + c0_nxt1 *= a0; + c0_nxt2 *= a0; + c0_nxt3 *= a0; + + *(c + 0 * ldc) = c0; + *(c + 1 * ldc) = c0_nxt1; + *(c + 2 * ldc) = c0_nxt2; + *(c + 3 * ldc) = c0_nxt3; + + *(b + 0) = c0; + *(b + 1) = c0_nxt1; + *(b + 2) = c0_nxt2; + *(b + 3) = c0_nxt3; +} +/* dsolve_1x2_lt_msa: single value on each of two lines of c (c and c+ldc). For each of the bk steps the lines are reduced by *a * b[0] and *a * b[1] (a advances by 1, b by 2 per step). Both values are then multiplied by the value a points to after the loop and written to b[0..1] and back to c. */ +static void dsolve_1x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT c0, c0_nxt; + + c0 = *c; + c0_nxt = *(c + ldc); + + if (bk > 0) + { + BLASLONG i; + + for (i = bk; i--;) + { + c0 -= *a * b[0]; + c0_nxt -= *a * b[1]; + + a += 1; + b += 2; + } + } + + c0 *= *a; + c0_nxt *= *a; + + *(b + 0) = c0; + *(b + 1) = c0_nxt; + + *(c + 0) = c0; + *(c + ldc) = c0_nxt; +} +/* dgmm_dsolve_1x1_msa: single-value case. Subtracts *a * *b from *c for each of the bk steps (a and b advance by 1 per step), multiplies *c by the value a points to afterwards and copies the result into *b. Unlike the other helpers it updates *c in place instead of using a local. Its name does not follow the dsolve_MxN_lt_msa pattern of the sibling helpers. */ +static void dgmm_dsolve_1x1_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + if (bk > 0) + { + BLASLONG i; + + for (i = bk; i--;) + { + *c -= *a * *b; + + a += 1; + b += 1; + } + } + + *c *= *a; + *b = *c; +} +/* CNAME: kernel entry point. Walks c in groups of four, then two, then one ldc-strided lines (n >> 2, n & 2, n & 1). Within each group it walks strips of 8, 4, 2 and 1 values (m >> 3, m & 4, m & 2, m & 1), calling the dsolve helper of matching size. For every group kk restarts at 0 and grows by the strip height after each call; it is passed to the helpers as bk. aa advances by height * k per strip; b advances by width * k and c by width * ldc per group. dummy1 and offset are not referenced in the body. Always returns 0. */ +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, + FLOAT *c, BLASLONG ldc, BLASLONG offset) +{ + BLASLONG i, j, kk; + FLOAT *aa, *cc; + + for (j = (n >> 2); j--;) + { + kk = 0; + aa = a; + cc = c; + + for (i = (m >> 3); i--;) + { + dsolve_8x4_lt_msa(aa, b, cc, ldc, kk); + + aa += 8 * k; + cc += 8; + kk += 8; + } + + if (m & 7) + { + if (m & 4) + { + dsolve_4x4_lt_msa(aa, b, cc, ldc, kk); + + aa += 4 * k; + cc += 4; + kk += 4; + } + + if (m & 2) + { +
dsolve_2x4_lt_msa(aa, b, cc, ldc, kk); + + aa += 2 * k; + cc += 2; + kk += 2; + } + + if (m & 1) + { + dsolve_1x4_lt_msa(aa, b, cc, ldc, kk); + + aa += k; + cc += 1; + kk += 1; + } + } +/* Group of four lines done: step b by 4 * k values and c by four lines of stride ldc. */ + b += 4 * k; + c += 4 * ldc; + } +/* Leftover lines when n is not a multiple of 4: a pair (n & 2) first, then a single line (n & 1). */ + if (n & 3) + { + if (n & 2) + { + kk = 0; + aa = a; + cc = c; + + for (i = (m >> 3); i--;) + { + dsolve_8x2_lt_msa(aa, b, cc, ldc, kk); + + aa += 8 * k; + cc += 8; + kk += 8; + } + + if (m & 7) + { + if (m & 4) + { + dsolve_4x2_lt_msa(aa, b, cc, ldc, kk); + + aa += 4 * k; + cc += 4; + kk += 4; + } + + if (m & 2) + { + dsolve_2x2_lt_msa(aa, b, cc, ldc, kk); + + aa += 2 * k; + cc += 2; + kk += 2; + } + + if (m & 1) + { + dsolve_1x2_lt_msa(aa, b, cc, ldc, kk); + + aa += k; + cc += 1; + kk += 1; + } + } + + b += 2 * k; + c += 2 * ldc; + } +/* Final single line: the one-line helpers take no ldc argument. */ + if (n & 1) + { + kk = 0; + aa = a; + cc = c; + + for (i = (m >> 3); i--;) + { + dsolve_8x1_lt_msa(aa, b, cc, kk); + + aa += 8 * k; + cc += 8; + kk += 8; + } + + if (m & 7) + { + if (m & 4) + { + dsolve_4x1_lt_msa(aa, b, cc, kk); + + aa += 4 * k; + cc += 4; + kk += 4; + } + + if (m & 2) + { + dsolve_2x1_lt_msa(aa, b, cc, kk); + + aa += 2 * k; + cc += 2; + kk += 2; + } + + if (m & 1) + { + dgmm_dsolve_1x1_msa(aa, b, cc, kk); + + aa += k; + cc += 1; + kk += 1; + } + } + + b += k; + c += ldc; + } + } +/* No failure path exists in this function; the result is always 0. */ + return 0; +} diff --git a/kernel/mips/dtrsm_kernel_RN_8x4_msa.c b/kernel/mips/dtrsm_kernel_RN_8x4_msa.c new file mode 100644 index 000000000..659f77266 --- /dev/null +++ b/kernel/mips/dtrsm_kernel_RN_8x4_msa.c @@ -0,0 +1,963 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2.
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +static void dsolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v2f64 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15; + v2f64 src_b0, src_b1, src_b2, src_b3, src_b5, src_b6, src_b7; + v2f64 src_b10, src_b11, src_b15; + FLOAT *c_nxt1line = c + ldc; + FLOAT *c_nxt2line = c + 2 * ldc; + FLOAT *c_nxt3line = c + 3 * ldc; + + LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3); + LD_DP4(c_nxt1line, 2, src_c4, src_c5, src_c6, src_c7); + LD_DP4(c_nxt2line, 2, src_c8, src_c9, src_c10, src_c11); + LD_DP4(c_nxt3line, 2, src_c12, src_c13, src_c14, src_c15); + + if (bk > 0) + { + BLASLONG i; + v2f64 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7; + v2f64 src_b; + + LD_DP4(a, 2, src_a0, src_a1, src_a2, src_a3); + LD_DP2(b, 2, src_b0, src_b1); + + for (i = (bk - 1); i--;) + { + a += 8; + b += 4; + + LD_DP4(a, 2, src_a4, src_a5, src_a6, src_a7); + LD_DP2(b, 2, src_b2, src_b3); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + src_c2 -= src_a2 * src_b; + src_c3 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c4 -= src_a0 * src_b; + src_c5 -= src_a1 * src_b; + src_c6 -= src_a2 * src_b; + src_c7 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); + src_c8 -= src_a0 * src_b; + src_c9 -= src_a1 * src_b; + src_c10 -= src_a2 * src_b; + src_c11 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + src_c12 -= src_a0 * src_b; + src_c13 -= src_a1 * src_b; + src_c14 -= src_a2 * src_b; + src_c15 -= src_a3 * src_b; + + src_a0 = src_a4; + src_a1 = src_a5; + src_a2 = src_a6; + src_a3 = src_a7; + src_b0 = src_b2; + src_b1 = src_b3; + } + + src_b = (v2f64) 
__msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + src_c2 -= src_a2 * src_b; + src_c3 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c4 -= src_a0 * src_b; + src_c5 -= src_a1 * src_b; + src_c6 -= src_a2 * src_b; + src_c7 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); + src_c8 -= src_a0 * src_b; + src_c9 -= src_a1 * src_b; + src_c10 -= src_a2 * src_b; + src_c11 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + src_c12 -= src_a0 * src_b; + src_c13 -= src_a1 * src_b; + src_c14 -= src_a2 * src_b; + src_c15 -= src_a3 * src_b; + + a += 8; + b += 4; + } + + src_b0 = LD_DP(b + 0); + src_b1 = (v2f64) __msa_splati_d((v2i64) src_b0, 1); + src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b2 = LD_DP(b + 2); + src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); + src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); + src_b5 = __msa_cast_to_vector_double(*(b + 5)); + src_b5 = (v2f64) __msa_splati_d((v2i64) src_b5, 0); + src_b6 = LD_DP(b + 6); + src_b7 = (v2f64) __msa_splati_d((v2i64) src_b6, 1); + src_b6 = (v2f64) __msa_splati_d((v2i64) src_b6, 0); + src_b10 = LD_DP(b + 10); + src_b11 = (v2f64) __msa_splati_d((v2i64) src_b10, 1); + src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); + src_b15 = __msa_cast_to_vector_double(*(b + 15)); + src_b15 = (v2f64) __msa_splati_d((v2i64) src_b15, 0); + + src_c0 *= src_b0; + src_c1 *= src_b0; + src_c2 *= src_b0; + src_c3 *= src_b0; + + src_c4 -= src_c0 * src_b1; + src_c5 -= src_c1 * src_b1; + src_c6 -= src_c2 * src_b1; + src_c7 -= src_c3 * src_b1; + + src_c4 *= src_b5; + src_c5 *= src_b5; + src_c6 *= src_b5; + src_c7 *= src_b5; + + src_c8 -= src_c0 * src_b2; + src_c9 -= src_c1 * src_b2; + src_c10 -= src_c2 * src_b2; + src_c11 -= src_c3 * src_b2; + + src_c8 -= src_c4 * src_b6; + src_c9 -= src_c5 * src_b6; + src_c10 -= src_c6 * src_b6; + src_c11 -= src_c7 * src_b6; + 
+ src_c8 *= src_b10; + src_c9 *= src_b10; + src_c10 *= src_b10; + src_c11 *= src_b10; + + src_c12 -= src_c0 * src_b3; + src_c13 -= src_c1 * src_b3; + src_c14 -= src_c2 * src_b3; + src_c15 -= src_c3 * src_b3; + + src_c12 -= src_c4 * src_b7; + src_c13 -= src_c5 * src_b7; + src_c14 -= src_c6 * src_b7; + src_c15 -= src_c7 * src_b7; + + src_c12 -= src_c8 * src_b11; + src_c13 -= src_c9 * src_b11; + src_c14 -= src_c10 * src_b11; + src_c15 -= src_c11 * src_b11; + + src_c12 *= src_b15; + src_c13 *= src_b15; + src_c14 *= src_b15; + src_c15 *= src_b15; + + ST_DP4(src_c0, src_c1, src_c2, src_c3, c, 2); + ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2); + ST_DP4(src_c4, src_c5, src_c6, src_c7, c_nxt1line, 2); + ST_DP4(src_c4, src_c5, src_c6, src_c7, a + 8, 2); + ST_DP4(src_c8, src_c9, src_c10, src_c11, c_nxt2line, 2); + ST_DP4(src_c8, src_c9, src_c10, src_c11, a + 16, 2); + ST_DP4(src_c12, src_c13, src_c14, src_c15, c_nxt3line, 2); + ST_DP4(src_c12, src_c13, src_c14, src_c15, a + 24, 2); +} + +static void dsolve_8x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v2f64 src_b0, src_b1, src_b3; + + LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3); + LD_DP4(c + ldc, 2, src_c4, src_c5, src_c6, src_c7); + + if (bk > 0) + { + BLASLONG i; + v2f64 src_a0, src_a1, src_a2, src_a3, src_b, src_b0; + + for (i = bk; i--;) + { + LD_DP4(a, 2, src_a0, src_a1, src_a2, src_a3); + src_b0 = LD_DP(b); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + src_c2 -= src_a2 * src_b; + src_c3 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c4 -= src_a0 * src_b; + src_c5 -= src_a1 * src_b; + src_c6 -= src_a2 * src_b; + src_c7 -= src_a3 * src_b; + + a += 8; + b += 2; + } + } + + src_b0 = LD_DP(b + 0); + src_b1 = (v2f64) __msa_splati_d((v2i64) src_b0, 1); + src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + 
src_b3 = __msa_cast_to_vector_double(*(b + 3)); + src_b3 = (v2f64) __msa_splati_d((v2i64) src_b3, 0); + + src_c0 *= src_b0; + src_c1 *= src_b0; + src_c2 *= src_b0; + src_c3 *= src_b0; + + src_c4 -= src_c0 * src_b1; + src_c5 -= src_c1 * src_b1; + src_c6 -= src_c2 * src_b1; + src_c7 -= src_c3 * src_b1; + + src_c4 *= src_b3; + src_c5 *= src_b3; + src_c6 *= src_b3; + src_c7 *= src_b3; + + ST_DP4(src_c0, src_c1, src_c2, src_c3, c, 2); + ST_DP4(src_c4, src_c5, src_c6, src_c7, c + ldc, 2); + + ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2); + ST_DP4(src_c4, src_c5, src_c6, src_c7, a + 8, 2); +} + +static void dsolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + v2f64 src_c0, src_c1, src_c2, src_c3; + v2f64 src_b0; + + LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3); + + if (bk > 0) + { + BLASLONG i; + v2f64 src_a0, src_a1, src_a2, src_a3, src_b; + + for (i = bk; i--;) + { + LD_DP4(a, 2, src_a0, src_a1, src_a2, src_a3); + src_b = LD_DP(b); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b, (v2i64) src_b); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + src_c2 -= src_a2 * src_b; + src_c3 -= src_a3 * src_b; + + a += 8; + b += 1; + } + } + + src_b0 = __msa_cast_to_vector_double(*b); + src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + + src_c0 *= src_b0; + src_c1 *= src_b0; + src_c2 *= src_b0; + src_c3 *= src_b0; + + ST_DP4(src_c0, src_c1, src_c2, src_c3, c, 2); + ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2); +} + +static void dsolve_4x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v2f64 src_b0, src_b1, src_b2, src_b3, src_b5, src_b6, src_b7; + v2f64 src_b10, src_b11, src_b15; + + LD_DP2(c, 2, src_c0, src_c1); + LD_DP2(c + ldc, 2, src_c2, src_c3); + LD_DP2(c + 2 * ldc, 2, src_c4, src_c5); + LD_DP2(c + 3 * ldc, 2, src_c6, src_c7); + + if (bk > 0) + { + BLASLONG i; + v2f64 src_a0, src_a1, src_b, src_b0, src_b1; + + for (i = bk; i--;) + { + LD_DP2(a, 2, src_a0, 
src_a1); + LD_DP2(b, 2, src_b0, src_b1); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c2 -= src_a0 * src_b; + src_c3 -= src_a1 * src_b; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); + src_c4 -= src_a0 * src_b; + src_c5 -= src_a1 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + src_c6 -= src_a0 * src_b; + src_c7 -= src_a1 * src_b; + + a += 4; + b += 4; + } + } + + src_b0 = LD_DP(b + 0); + src_b1 = (v2f64) __msa_splati_d((v2i64) src_b0, 1); + src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b2 = LD_DP(b + 2); + src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); + src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); + src_b5 = __msa_cast_to_vector_double(*(b + 5)); + src_b5 = (v2f64) __msa_splati_d((v2i64) src_b5, 0); + src_b6 = LD_DP(b + 6); + src_b7 = (v2f64) __msa_splati_d((v2i64) src_b6, 1); + src_b6 = (v2f64) __msa_splati_d((v2i64) src_b6, 0); + src_b10 = LD_DP(b + 10); + src_b11 = (v2f64) __msa_splati_d((v2i64) src_b10, 1); + src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); + src_b15 = __msa_cast_to_vector_double(*(b + 15)); + src_b15 = (v2f64) __msa_splati_d((v2i64) src_b15, 0); + + src_c0 *= src_b0; + src_c1 *= src_b0; + + src_c2 -= src_c0 * src_b1; + src_c3 -= src_c1 * src_b1; + + src_c2 *= src_b5; + src_c3 *= src_b5; + + src_c4 -= src_c0 * src_b2; + src_c5 -= src_c1 * src_b2; + + src_c4 -= src_c2 * src_b6; + src_c5 -= src_c3 * src_b6; + + src_c4 *= src_b10; + src_c5 *= src_b10; + + src_c6 -= src_c0 * src_b3; + src_c7 -= src_c1 * src_b3; + + src_c6 -= src_c2 * src_b7; + src_c7 -= src_c3 * src_b7; + + src_c6 -= src_c4 * src_b11; + src_c7 -= src_c5 * src_b11; + + src_c6 *= src_b15; + src_c7 *= src_b15; + + ST_DP2(src_c0, src_c1, c, 2); + ST_DP2(src_c2, src_c3, c + ldc, 2); + ST_DP2(src_c4, src_c5, c + 2 * ldc, 2); + ST_DP2(src_c6, src_c7, c + 3 * ldc, 
2); + + ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2); + ST_DP4(src_c4, src_c5, src_c6, src_c7, a + 8, 2); +} + +static void dsolve_4x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v2f64 src_c0, src_c1, src_c2, src_c3, src_b0, src_b1, src_b3; + + LD_DP2(c, 2, src_c0, src_c1); + LD_DP2(c + ldc, 2, src_c2, src_c3); + + if (bk > 0) + { + BLASLONG i; + v2f64 src_a0, src_a1, src_b, src_b0; + + for (i = bk; i--;) + { + LD_DP2(a, 2, src_a0, src_a1); + src_b0 = LD_DP(b); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c2 -= src_a0 * src_b; + src_c3 -= src_a1 * src_b; + + a += 4; + b += 2; + } + } + + src_b0 = LD_DP(b + 0); + src_b1 = (v2f64) __msa_splati_d((v2i64) src_b0, 1); + src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b3 = __msa_cast_to_vector_double(*(b + 3)); + src_b3 = (v2f64) __msa_splati_d((v2i64) src_b3, 0); + + src_c0 *= src_b0; + src_c1 *= src_b0; + + src_c2 -= src_c0 * src_b1; + src_c3 -= src_c1 * src_b1; + + src_c2 *= src_b3; + src_c3 *= src_b3; + + ST_DP2(src_c0, src_c1, c, 2); + ST_DP2(src_c2, src_c3, c + ldc, 2); + + ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2); +} + +static void dsolve_4x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + FLOAT b0, c0, c1, c2, c3; + + c0 = *(c + 0); + c1 = *(c + 1); + c2 = *(c + 2); + c3 = *(c + 3); + + if (bk > 0) + { + BLASLONG i; + FLOAT a0, a1, a2, a3; + + for (i = bk; i--;) + { + a0 = a[0]; + a1 = a[1]; + a2 = a[2]; + a3 = a[3]; + + b0 = b[0]; + c0 -= a0 * b0; + c1 -= a1 * b0; + c2 -= a2 * b0; + c3 -= a3 * b0; + + a += 4; + b += 1; + } + } + + b0 = *b; + + c0 *= b0; + c1 *= b0; + c2 *= b0; + c3 *= b0; + + *(a + 0) = c0; + *(a + 1) = c1; + *(a + 2) = c2; + *(a + 3) = c3; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 2) = c2; + *(c + 3) = c3; +} + +static void dsolve_2x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ 
+ FLOAT b0, b1, b2, b3, b5, b6, b7, b10, b11, b15; + FLOAT c0, c0_nxt1, c0_nxt2, c0_nxt3; + FLOAT c1, c1_nxt1, c1_nxt2, c1_nxt3; + + c0 = *(c + 0); + c1 = *(c + 1); + c0_nxt1 = *(c + 0 + 1 * ldc); + c1_nxt1 = *(c + 1 + 1 * ldc); + c0_nxt2 = *(c + 0 + 2 * ldc); + c1_nxt2 = *(c + 1 + 2 * ldc); + c0_nxt3 = *(c + 0 + 3 * ldc); + c1_nxt3 = *(c + 1 + 3 * ldc); + + if (bk > 0) + { + BLASLONG i; + FLOAT a0, a1, b0, b1, b2, b3; + + for (i = bk; i--;) + { + a0 = a[0]; + a1 = a[1]; + + b0 = b[0]; + c0 -= a0 * b0; + c1 -= a1 * b0; + + b1 = b[1]; + c0_nxt1 -= a0 * b1; + c1_nxt1 -= a1 * b1; + + b2 = b[2]; + c0_nxt2 -= a0 * b2; + c1_nxt2 -= a1 * b2; + + b3 = b[3]; + c0_nxt3 -= a0 * b3; + c1_nxt3 -= a1 * b3; + + a += 2; + b += 4; + } + } + + b0 = *(b + 0); + b1 = *(b + 1); + b2 = *(b + 2); + b3 = *(b + 3); + b5 = *(b + 5); + b6 = *(b + 6); + b7 = *(b + 7); + b10 = *(b + 10); + b11 = *(b + 11); + b15 = *(b + 15); + + c0 *= b0; + c1 *= b0; + + c0_nxt1 -= c0 * b1; + c1_nxt1 -= c1 * b1; + c0_nxt1 *= b5; + c1_nxt1 *= b5; + + c0_nxt2 -= c0 * b2; + c1_nxt2 -= c1 * b2; + c0_nxt2 -= c0_nxt1 * b6; + c1_nxt2 -= c1_nxt1 * b6; + c0_nxt2 *= b10; + c1_nxt2 *= b10; + + c0_nxt3 -= c0 * b3; + c1_nxt3 -= c1 * b3; + c0_nxt3 -= c0_nxt1 * b7; + c1_nxt3 -= c1_nxt1 * b7; + c0_nxt3 -= c0_nxt2 * b11; + c1_nxt3 -= c1_nxt2 * b11; + c0_nxt3 *= b15; + c1_nxt3 *= b15; + + *(a + 0) = c0; + *(a + 1) = c1; + *(a + 2) = c0_nxt1; + *(a + 3) = c1_nxt1; + *(a + 4) = c0_nxt2; + *(a + 5) = c1_nxt2; + *(a + 6) = c0_nxt3; + *(a + 7) = c1_nxt3; + + *(c + 0) = c0; + *(c + 1 * ldc) = c0_nxt1; + *(c + 2 * ldc) = c0_nxt2; + *(c + 3 * ldc) = c0_nxt3; + + *(c + 1) = c1; + *(c + 1 + 1 * ldc) = c1_nxt1; + *(c + 1 + 2 * ldc) = c1_nxt2; + *(c + 1 + 3 * ldc) = c1_nxt3; +} + +static void dsolve_2x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT b0, b1, b3, c0, c0_nxt, c1, c1_nxt; + + c0 = *(c + 0); + c1 = *(c + 1); + + c0_nxt = *(c + 0 + ldc); + c1_nxt = *(c + 1 + ldc); + + if (bk > 0) + { + BLASLONG i; + 
FLOAT a0, a1, b0, b1; + + for (i = bk; i--;) + { + a0 = a[0]; + a1 = a[1]; + + b0 = b[0]; + c0 -= a0 * b0; + c1 -= a1 * b0; + + b1 = b[1]; + c0_nxt -= a0 * b1; + c1_nxt -= a1 * b1; + + a += 2; + b += 2; + } + } + + b0 = *(b + 0); + b1 = *(b + 1); + b3 = *(b + 3); + + c0 *= b0; + c1 *= b0; + + c0_nxt -= c0 * b1; + c1_nxt -= c1 * b1; + + c0_nxt *= b3; + c1_nxt *= b3; + + *(a + 0) = c0; + *(a + 1) = c1; + *(a + 2) = c0_nxt; + *(a + 3) = c1_nxt; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + ldc) = c0_nxt; + *(c + 1 + ldc) = c1_nxt; +} + +static void dsolve_2x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + FLOAT b0, c0, c1; + + c0 = *(c + 0); + c1 = *(c + 1); + + if (bk > 0) + { + BLASLONG i; + FLOAT a0, a1, b0; + + for (i = bk; i--;) + { + a0 = a[0]; + a1 = a[1]; + + b0 = b[0]; + c0 -= a0 * b0; + c1 -= a1 * b0; + + a += 2; + b += 1; + } + } + + b0 = *b; + + c0 *= b0; + c1 *= b0; + + *(a + 0) = c0; + *(a + 1) = c1; + + *(c + 0) = c0; + *(c + 1) = c1; +} + +static void dsolve_1x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT b0, b1, b2, b3, b5, b6, b7, b10, b11, b15; + FLOAT c0, c0_nxt1, c0_nxt2, c0_nxt3; + + c0 = *(c + 0); + c0_nxt1 = *(c + 1 * ldc); + c0_nxt2 = *(c + 2 * ldc); + c0_nxt3 = *(c + 3 * ldc); + + if (bk > 0) + { + BLASLONG i; + + for (i = bk; i--;) + { + c0 -= a[0] * b[0]; + c0_nxt1 -= a[0] * b[1]; + c0_nxt2 -= a[0] * b[2]; + c0_nxt3 -= a[0] * b[3]; + + a += 1; + b += 4; + } + } + + b0 = *(b + 0); + b1 = *(b + 1); + b2 = *(b + 2); + b3 = *(b + 3); + b5 = *(b + 5); + b6 = *(b + 6); + b7 = *(b + 7); + b10 = *(b + 10); + b11 = *(b + 11); + b15 = *(b + 15); + + c0 *= b0; + + c0_nxt1 -= c0 * b1; + c0_nxt1 *= b5; + + c0_nxt2 -= c0 * b2; + c0_nxt2 -= c0_nxt1 * b6; + c0_nxt2 *= b10; + + c0_nxt3 -= c0 * b3; + c0_nxt3 -= c0_nxt1 * b7; + c0_nxt3 -= c0_nxt2 * b11; + c0_nxt3 *= b15; + + *(a + 0) = c0; + *(a + 1) = c0_nxt1; + *(a + 2) = c0_nxt2; + *(a + 3) = c0_nxt3; + + *(c + 0) = c0; + *(c + 1 * ldc) = c0_nxt1; + *(c + 2 * ldc) = 
c0_nxt2; + *(c + 3 * ldc) = c0_nxt3; +} + +static void dsolve_1x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT b0, b1, b3, c0, c0_nxt; + + c0 = *c; + c0_nxt = *(c + ldc); + + if (bk > 0) + { + BLASLONG i; + + for (i = bk; i--;) + { + c0 -= *a * b[0]; + c0_nxt -= *a * b[1]; + + a += 1; + b += 2; + } + } + + b0 = *(b + 0); + b1 = *(b + 1); + b3 = *(b + 3); + + c0 *= b0; + + c0_nxt -= c0 * b1; + c0_nxt *= b3; + + *(a + 0) = c0; + *(a + 1) = c0_nxt; + + *(c + 0) = c0; + *(c + ldc) = c0_nxt; +} + +static void dgmm_dsolve_1x1_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + if (bk > 0) + { + BLASLONG i; + + for (i = bk; i--;) + { + *c -= *a * *b; + + a += 1; + b += 1; + } + } + + *c *= *a; + *b = *c; +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, + FLOAT *c, BLASLONG ldc, BLASLONG offset) +{ + BLASLONG i, j, kk; + FLOAT *aa, *cc; + + kk = 0; + + for (j = (n >> 2); j--;) + { + aa = a; + cc = c; + + for (i = (m >> 3); i--;) + { + dsolve_8x4_rn_msa(aa, b, cc, ldc, kk); + + aa += 8 * k; + cc += 8; + } + + if (m & 7) + { + if (m & 4) + { + dsolve_4x4_rn_msa(aa, b, cc, ldc, kk); + + aa += 4 * k; + cc += 4; + } + + if (m & 2) + { + dsolve_2x4_rn_msa(aa, b, cc, ldc, kk); + + aa += 2 * k; + cc += 2; + } + + if (m & 1) + { + dsolve_1x4_rn_msa(aa, b, cc, ldc, kk); + + aa += k; + cc += 1; + } + } + + kk += 4; + b += 4 * k; + c += 4 * ldc; + } + + if (n & 3) + { + if (n & 2) + { + aa = a; + cc = c; + + for (i = (m >> 3); i--;) + { + dsolve_8x2_rn_msa(aa, b, cc, ldc, kk); + + aa += 8 * k; + cc += 8; + } + + if (m & 7) + { + if (m & 4) + { + dsolve_4x2_rn_msa(aa, b, cc, ldc, kk); + + aa += 4 * k; + cc += 4; + } + + if (m & 2) + { + dsolve_2x2_rn_msa(aa, b, cc, ldc, kk); + + aa += 2 * k; + cc += 2; + } + + if (m & 1) + { + dsolve_1x2_rn_msa(aa, b, cc, ldc, kk); + + aa += k; + cc += 1; + } + } + + b += 2 * k; + c += 2 * ldc; + kk += 2; + } + + if (n & 1) + { + aa = a; + cc = c; + + for (i = (m >> 3); i--;) + { + 
dsolve_8x1_rn_msa(aa, b, cc, kk); + + aa += 8 * k; + cc += 8; + } + + if (m & 7) + { + if (m & 4) + { + dsolve_4x1_rn_msa(aa, b, cc, kk); + + aa += 4 * k; + cc += 4; + } + + if (m & 2) + { + dsolve_2x1_rn_msa(aa, b, cc, kk); + + aa += 2 * k; + cc += 2; + } + + if (m & 1) + { + dgmm_dsolve_1x1_msa(b, aa, cc, kk); + + aa += k; + cc += 1; + } + } + + b += k; + c += ldc; + kk += 1; + } + } + + return 0; +} diff --git a/kernel/mips/dtrsm_kernel_RT_8x4_msa.c b/kernel/mips/dtrsm_kernel_RT_8x4_msa.c new file mode 100644 index 000000000..a90d5fec3 --- /dev/null +++ b/kernel/mips/dtrsm_kernel_RT_8x4_msa.c @@ -0,0 +1,866 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +static void dsolve_8x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v2f64 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15; + v2f64 src_b0, src_b4, src_b5, src_b8, src_b9, src_b10, src_b12, src_b13; + v2f64 src_b14, src_b15; + FLOAT *c_nxt1line = c + ldc; + FLOAT *c_nxt2line = c + 2 * ldc; + FLOAT *c_nxt3line = c + 3 * ldc; + + LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3); + LD_DP4(c_nxt1line, 2, src_c4, src_c5, src_c6, src_c7); + LD_DP4(c_nxt2line, 2, src_c8, src_c9, src_c10, src_c11); + LD_DP4(c_nxt3line, 2, src_c12, src_c13, src_c14, src_c15); + + if (bk > 0) + { + BLASLONG i; + FLOAT *pba = a, *pbb = b; + v2f64 src_b, src_b0, src_b1, src_b2, src_b3; + v2f64 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7; + + LD_DP4(pba, 2, src_a0, src_a1, src_a2, src_a3); + LD_DP2(pbb, 2, src_b0, src_b1); + + for (i = (bk - 1); i--;) + { + pba += 8; + pbb += 4; + + LD_DP4(pba, 2, src_a4, src_a5, src_a6, src_a7); + LD_DP2(pbb, 2, src_b2, src_b3); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + src_c2 -= src_a2 * src_b; + src_c3 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c4 -= src_a0 * 
src_b; + src_c5 -= src_a1 * src_b; + src_c6 -= src_a2 * src_b; + src_c7 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); + src_c8 -= src_a0 * src_b; + src_c9 -= src_a1 * src_b; + src_c10 -= src_a2 * src_b; + src_c11 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + src_c12 -= src_a0 * src_b; + src_c13 -= src_a1 * src_b; + src_c14 -= src_a2 * src_b; + src_c15 -= src_a3 * src_b; + + src_a0 = src_a4; + src_a1 = src_a5; + src_a2 = src_a6; + src_a3 = src_a7; + src_b0 = src_b2; + src_b1 = src_b3; + } + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + src_c2 -= src_a2 * src_b; + src_c3 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c4 -= src_a0 * src_b; + src_c5 -= src_a1 * src_b; + src_c6 -= src_a2 * src_b; + src_c7 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); + src_c8 -= src_a0 * src_b; + src_c9 -= src_a1 * src_b; + src_c10 -= src_a2 * src_b; + src_c11 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + src_c12 -= src_a0 * src_b; + src_c13 -= src_a1 * src_b; + src_c14 -= src_a2 * src_b; + src_c15 -= src_a3 * src_b; + } + + a -= 32; + b -= 16; + + src_b12 = LD_DP(b + 12); + src_b13 = (v2f64) __msa_splati_d((v2i64) src_b12, 1); + src_b12 = (v2f64) __msa_splati_d((v2i64) src_b12, 0); + src_b14 = LD_DP(b + 14); + src_b15 = (v2f64) __msa_splati_d((v2i64) src_b14, 1); + src_b14 = (v2f64) __msa_splati_d((v2i64) src_b14, 0); + + src_b8 = LD_DP(b + 8); + src_b9 = (v2f64) __msa_splati_d((v2i64) src_b8, 1); + src_b8 = (v2f64) __msa_splati_d((v2i64) src_b8, 0); + src_b10 = __msa_cast_to_vector_double(*(b + 10)); + src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); + + src_b0 = __msa_cast_to_vector_double(*(b + 0)); + src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b4 = LD_DP(b + 4); + src_b5 = (v2f64) 
__msa_splati_d((v2i64) src_b4, 1); + src_b4 = (v2f64) __msa_splati_d((v2i64) src_b4, 0); + + src_c12 *= src_b15; + src_c13 *= src_b15; + src_c14 *= src_b15; + src_c15 *= src_b15; + + src_c8 -= src_c12 * src_b14; + src_c9 -= src_c13 * src_b14; + src_c10 -= src_c14 * src_b14; + src_c11 -= src_c15 * src_b14; + + src_c8 *= src_b10; + src_c9 *= src_b10; + src_c10 *= src_b10; + src_c11 *= src_b10; + + src_c4 -= src_c12 * src_b13; + src_c5 -= src_c13 * src_b13; + src_c6 -= src_c14 * src_b13; + src_c7 -= src_c15 * src_b13; + + src_c4 -= src_c8 * src_b9; + src_c5 -= src_c9 * src_b9; + src_c6 -= src_c10 * src_b9; + src_c7 -= src_c11 * src_b9; + + src_c4 *= src_b5; + src_c5 *= src_b5; + src_c6 *= src_b5; + src_c7 *= src_b5; + + src_c0 -= src_c12 * src_b12; + src_c1 -= src_c13 * src_b12; + src_c2 -= src_c14 * src_b12; + src_c3 -= src_c15 * src_b12; + + src_c0 -= src_c8 * src_b8; + src_c1 -= src_c9 * src_b8; + src_c2 -= src_c10 * src_b8; + src_c3 -= src_c11 * src_b8; + + src_c0 -= src_c4 * src_b4; + src_c1 -= src_c5 * src_b4; + src_c2 -= src_c6 * src_b4; + src_c3 -= src_c7 * src_b4; + + src_c0 *= src_b0; + src_c1 *= src_b0; + src_c2 *= src_b0; + src_c3 *= src_b0; + + ST_DP4(src_c12, src_c13, src_c14, src_c15, c_nxt3line, 2); + ST_DP4(src_c12, src_c13, src_c14, src_c15, a + 24, 2); + ST_DP4(src_c8, src_c9, src_c10, src_c11, c_nxt2line, 2); + ST_DP4(src_c8, src_c9, src_c10, src_c11, a + 16, 2); + ST_DP4(src_c4, src_c5, src_c6, src_c7, c_nxt1line, 2); + ST_DP4(src_c4, src_c5, src_c6, src_c7, a + 8, 2); + ST_DP4(src_c0, src_c1, src_c2, src_c3, c, 2); + ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2); +} + +static void dsolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, int bk) +{ + v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v2f64 src_b0, src_b2, src_b3; + + LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3); + LD_DP4(c + ldc, 2, src_c4, src_c5, src_c6, src_c7); + + if (bk > 0) + { + v2f64 src_a0, src_a1, src_a2, src_a3, src_b, src_b0; + + LD_DP4(a + 
16, 2, src_a0, src_a1, src_a2, src_a3); + src_b0 = LD_DP(b + 4); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + src_c2 -= src_a2 * src_b; + src_c3 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c4 -= src_a0 * src_b; + src_c5 -= src_a1 * src_b; + src_c6 -= src_a2 * src_b; + src_c7 -= src_a3 * src_b; + } + + src_b0 = __msa_cast_to_vector_double(*(b + 0)); + src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b2 = LD_DP(b + 2); + src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); + src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); + + src_c4 *= src_b3; + src_c5 *= src_b3; + src_c6 *= src_b3; + src_c7 *= src_b3; + + src_c0 -= src_c4 * src_b2; + src_c1 -= src_c5 * src_b2; + src_c2 -= src_c6 * src_b2; + src_c3 -= src_c7 * src_b2; + + src_c0 *= src_b0; + src_c1 *= src_b0; + src_c2 *= src_b0; + src_c3 *= src_b0; + + ST_DP4(src_c0, src_c1, src_c2, src_c3, c, 2); + ST_DP4(src_c4, src_c5, src_c6, src_c7, c + ldc, 2); + + ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2); + ST_DP4(src_c4, src_c5, src_c6, src_c7, a + 8, 2); +} + +static void dsolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c) +{ + v2f64 src_c0, src_c1, src_c2, src_c3; + v2f64 src_b0; + + LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3); + + src_b0 = __msa_cast_to_vector_double(*b); + src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + + src_c0 *= src_b0; + src_c1 *= src_b0; + src_c2 *= src_b0; + src_c3 *= src_b0; + + ST_DP4(src_c0, src_c1, src_c2, src_c3, c, 2); + ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2); +} + +static void dsolve_4x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v2f64 src_b0, src_b4, src_b5, src_b8, src_b9, src_b10, src_b12, src_b13; + v2f64 src_b14, src_b15; + + LD_DP2(c, 2, src_c0, src_c1); + LD_DP2(c + ldc, 2, src_c2, src_c3); + LD_DP2(c + 2 * ldc, 2, src_c4, src_c5); + 
LD_DP2(c + 3 * ldc, 2, src_c6, src_c7); + + if (bk > 0) + { + BLASLONG i; + FLOAT *aa = a + 16, *bb = b + 16; + v2f64 src_a0, src_a1, src_b, src_b0, src_b1; + + for (i = bk; i--;) + { + LD_DP2(aa, 2, src_a0, src_a1); + LD_DP2(bb, 2, src_b0, src_b1); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c2 -= src_a0 * src_b; + src_c3 -= src_a1 * src_b; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); + src_c4 -= src_a0 * src_b; + src_c5 -= src_a1 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + src_c6 -= src_a0 * src_b; + src_c7 -= src_a1 * src_b; + + aa += 4; + bb += 4; + } + } + + src_b12 = LD_DP(b + 12); + src_b13 = (v2f64) __msa_splati_d((v2i64) src_b12, 1); + src_b12 = (v2f64) __msa_splati_d((v2i64) src_b12, 0); + src_b14 = LD_DP(b + 14); + src_b15 = (v2f64) __msa_splati_d((v2i64) src_b14, 1); + src_b14 = (v2f64) __msa_splati_d((v2i64) src_b14, 0); + + src_b8 = LD_DP(b + 8); + src_b9 = (v2f64) __msa_splati_d((v2i64) src_b8, 1); + src_b8 = (v2f64) __msa_splati_d((v2i64) src_b8, 0); + src_b10 = __msa_cast_to_vector_double(*(b + 10)); + src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); + + src_b0 = __msa_cast_to_vector_double(*(b + 0)); + src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b4 = LD_DP(b + 4); + src_b5 = (v2f64) __msa_splati_d((v2i64) src_b4, 1); + src_b4 = (v2f64) __msa_splati_d((v2i64) src_b4, 0); + + src_c6 *= src_b15; + src_c7 *= src_b15; + + src_c4 -= src_c6 * src_b14; + src_c5 -= src_c7 * src_b14; + + src_c4 *= src_b10; + src_c5 *= src_b10; + + src_c2 -= src_c6 * src_b13; + src_c3 -= src_c7 * src_b13; + + src_c2 -= src_c4 * src_b9; + src_c3 -= src_c5 * src_b9; + + src_c2 *= src_b5; + src_c3 *= src_b5; + + src_c0 -= src_c6 * src_b12; + src_c1 -= src_c7 * src_b12; + + src_c0 -= src_c4 * src_b8; + src_c1 -= src_c5 * src_b8; + + src_c0 -= src_c2 * src_b4; 
+ src_c1 -= src_c3 * src_b4; + + src_c0 *= src_b0; + src_c1 *= src_b0; + + ST_DP2(src_c6, src_c7, c + 3 * ldc, 2); + ST_DP2(src_c4, src_c5, c + 2 * ldc, 2); + ST_DP2(src_c2, src_c3, c + ldc, 2); + ST_DP2(src_c0, src_c1, c, 2); + + ST_DP4(src_c4, src_c5, src_c6, src_c7, a + 8, 2); + ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2); +} + +static void dsolve_4x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, int bk) +{ + v2f64 src_c0, src_c1, src_c2, src_c3, src_b0, src_b2, src_b3; + + LD_DP2(c, 2, src_c0, src_c1); + LD_DP2(c + ldc, 2, src_c2, src_c3); + + if (bk > 0) + { + v2f64 src_a0, src_a1, src_b, src_b0; + + LD_DP2(a + 8, 2, src_a0, src_a1); + src_b0 = LD_DP(b + 4); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c2 -= src_a0 * src_b; + src_c3 -= src_a1 * src_b; + } + + src_b0 = __msa_cast_to_vector_double(*(b + 0)); + src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b2 = LD_DP(b + 2); + src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); + src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); + + src_c2 *= src_b3; + src_c3 *= src_b3; + + src_c0 -= src_c2 * src_b2; + src_c1 -= src_c3 * src_b2; + + src_c0 *= src_b0; + src_c1 *= src_b0; + + ST_DP2(src_c0, src_c1, c, 2); + ST_DP2(src_c2, src_c3, c + ldc, 2); + + ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2); +} + +static void dsolve_4x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c) +{ + FLOAT b0, c0, c1, c2, c3; + + b0 = *(b + 0); + + c0 = *(c + 0); + c1 = *(c + 1); + c2 = *(c + 2); + c3 = *(c + 3); + + c0 *= b0; + c1 *= b0; + c2 *= b0; + c3 *= b0; + + *(a + 0) = c0; + *(a + 1) = c1; + *(a + 2) = c2; + *(a + 3) = c3; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 2) = c2; + *(c + 3) = c3; +} + +static void dsolve_2x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT b0, b4, b5, b8, b9, b10, b12, b13, b14, b15; + FLOAT c0, c1, c0_nxt1, c1_nxt1, c0_nxt2, 
c1_nxt2, c0_nxt3, c1_nxt3; + + c0 = *(c + 0); + c1 = *(c + 1); + c0_nxt1 = *(c + 0 + 1 * ldc); + c1_nxt1 = *(c + 1 + 1 * ldc); + c0_nxt2 = *(c + 0 + 2 * ldc); + c1_nxt2 = *(c + 1 + 2 * ldc); + c0_nxt3 = *(c + 0 + 3 * ldc); + c1_nxt3 = *(c + 1 + 3 * ldc); + + if (bk > 0) + { + BLASLONG i; + FLOAT *aa = a + 8, *bb = b + 16; + FLOAT a0, a1, b0, b1, b2, b3; + + for (i = bk; i--;) + { + a0 = aa[0]; + a1 = aa[1]; + + b0 = bb[0]; + c0 -= a0 * b0; + c1 -= a1 * b0; + + b1 = bb[1]; + c0_nxt1 -= a0 * b1; + c1_nxt1 -= a1 * b1; + + b2 = bb[2]; + c0_nxt2 -= a0 * b2; + c1_nxt2 -= a1 * b2; + + b3 = bb[3]; + c0_nxt3 -= a0 * b3; + c1_nxt3 -= a1 * b3; + + aa += 2; + bb += 4; + } + } + + b0 = *b; + b4 = *(b + 4); + b5 = *(b + 5); + b8 = *(b + 8); + b9 = *(b + 9); + b10 = *(b + 10); + b12 = *(b + 12); + b13 = *(b + 13); + b14 = *(b + 14); + b15 = *(b + 15); + + c0_nxt3 *= b15; + c1_nxt3 *= b15; + + c0_nxt2 -= c0_nxt3 * b14; + c1_nxt2 -= c1_nxt3 * b14; + c0_nxt2 *= b10; + c1_nxt2 *= b10; + + c0_nxt1 -= c0_nxt3 * b13; + c1_nxt1 -= c1_nxt3 * b13; + c0_nxt1 -= c0_nxt2 * b9; + c1_nxt1 -= c1_nxt2 * b9; + c0_nxt1 *= b5; + c1_nxt1 *= b5; + + c0 -= c0_nxt3 * b12; + c1 -= c1_nxt3 * b12; + c0 -= c0_nxt2 * b8; + c1 -= c1_nxt2 * b8; + c0 -= c0_nxt1 * b4; + c1 -= c1_nxt1 * b4; + c0 *= b0; + c1 *= b0; + + *(a + 0) = c0; + *(a + 1) = c1; + *(a + 2) = c0_nxt1; + *(a + 3) = c1_nxt1; + *(a + 4) = c0_nxt2; + *(a + 5) = c1_nxt2; + *(a + 6) = c0_nxt3; + *(a + 7) = c1_nxt3; + + *(c + 0) = c0; + *(c + 1) = c1; + + *(c + 0 + 1 * ldc) = c0_nxt1; + *(c + 1 + 1 * ldc) = c1_nxt1; + + *(c + 0 + 2 * ldc) = c0_nxt2; + *(c + 1 + 2 * ldc) = c1_nxt2; + + *(c + 0 + 3 * ldc) = c0_nxt3; + *(c + 1 + 3 * ldc) = c1_nxt3; +} + +static void dsolve_2x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, int bk) +{ + FLOAT b0, b2, b3; + FLOAT c0, c1, c0_nxt, c1_nxt; + + c0 = *(c + 0); + c1 = *(c + 1); + + c0_nxt = *(c + 0 + ldc); + c1_nxt = *(c + 1 + ldc); + + if (bk > 0) + { + FLOAT a0, a1, b0, b1; + + a0 = a[4]; + a1 = a[5]; + + 
b0 = b[4]; + c0 -= a0 * b0; + c1 -= a1 * b0; + + b1 = b[5]; + c0_nxt -= a0 * b1; + c1_nxt -= a1 * b1; + } + + b3 = *(b + 3); + b2 = *(b + 2); + b0 = *b; + + c0_nxt *= b3; + c1_nxt *= b3; + + c0 -= c0_nxt * b2; + c0 *= b0; + + c1 -= c1_nxt * b2; + c1 *= b0; + + *(a + 0) = c0; + *(a + 1) = c1; + *(a + 2) = c0_nxt; + *(a + 3) = c1_nxt; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 0 + ldc) = c0_nxt; + *(c + 1 + ldc) = c1_nxt; +} + +static void dsolve_2x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c) +{ + FLOAT b0, c0, c1; + + c0 = *(c + 0); + c1 = *(c + 1); + + b0 = *b; + + c0 *= b0; + c1 *= b0; + + *(a + 0) = c0; + *(a + 1) = c1; + + *(c + 0) = c0; + *(c + 1) = c1; +} + +static void dsolve_1x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT b0, b4, b5, b8, b9, b10, b12, b13, b14, b15; + FLOAT c0, c0_nxt1, c0_nxt2, c0_nxt3; + + c0 = *(c + 0); + c0_nxt1 = *(c + 1 * ldc); + c0_nxt2 = *(c + 2 * ldc); + c0_nxt3 = *(c + 3 * ldc); + + if (bk > 0) + { + BLASLONG i; + FLOAT *aa = a + 4, *bb = b + 16; + + for (i = bk; i--;) + { + c0 -= aa[0] * bb[0]; + c0_nxt1 -= aa[0] * bb[1]; + c0_nxt2 -= aa[0] * bb[2]; + c0_nxt3 -= aa[0] * bb[3]; + + aa += 1; + bb += 4; + } + } + + b0 = *b; + b4 = *(b + 4); + b5 = *(b + 5); + b8 = *(b + 8); + b9 = *(b + 9); + b10 = *(b + 10); + b12 = *(b + 12); + b13 = *(b + 13); + b14 = *(b + 14); + b15 = *(b + 15); + + c0_nxt3 *= b15; + + c0_nxt2 -= c0_nxt3 * b14; + c0_nxt2 *= b10; + + c0_nxt1 -= c0_nxt3 * b13; + c0_nxt1 -= c0_nxt2 * b9; + c0_nxt1 *= b5; + + c0 -= c0_nxt3 * b12; + c0 -= c0_nxt2 * b8; + c0 -= c0_nxt1 * b4; + c0 *= b0; + + *(a + 0) = c0; + *(a + 1) = c0_nxt1; + *(a + 2) = c0_nxt2; + *(a + 3) = c0_nxt3; + + *(c) = c0; + *(c + 1 * ldc) = c0_nxt1; + *(c + 2 * ldc) = c0_nxt2; + *(c + 3 * ldc) = c0_nxt3; +} + +static void dsolve_1x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT b0, b2, b3, c0, c0_nxt; + + c0 = *(c + 0); + c0_nxt = *(c + ldc); + + if (bk > 0) + { + c0 -= a[2] * b[4]; + c0_nxt -= a[2] * 
b[5]; + } + + b3 = *(b + 3); + b2 = *(b + 2); + b0 = *b; + + c0_nxt *= b3; + + c0 -= c0_nxt * b2; + c0 *= b0; + + *(a + 0) = c0; + *(a + 1) = c0_nxt; + + *(c + 0) = c0; + *(c + ldc) = c0_nxt; +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, + FLOAT *c, BLASLONG ldc, BLASLONG offset) +{ + BLASLONG i, j, kk; + FLOAT *aa, *cc, *bb; + + kk = n; + c += n * ldc; + b += n * k; + + if (n & 3) + { + if (n & 1) + { + aa = a; + c -= ldc; + b -= k; + bb = b + (kk - 1); + cc = c; + + for (i = (m >> 3); i--;) + { + dsolve_8x1_rt_msa(aa + 8 * kk - 8, bb, cc); + + aa += 8 * k; + cc += 8; + } + + if (m & 7) + { + if (m & 4) + { + dsolve_4x1_rt_msa(aa + 4 * kk - 4, bb, cc); + + aa += 4 * k; + cc += 4; + } + + if (m & 2) + { + dsolve_2x1_rt_msa(aa + 2 * kk - 2, bb, cc); + + aa += 2 * k; + cc += 2; + } + + if (m & 1) + { + *cc *= *bb; + *(aa + kk - 1) = *cc; + + aa += k; + cc += 1; + } + + } + + kk -= 1; + } + + if (n & 2) + { + aa = a; + c -= 2 * ldc; + b -= 2 * k; + bb = b + 2 * kk; + cc = c; + + for (i = (m >> 3); i--;) + { + dsolve_8x2_rt_msa(aa + 8 * kk - 16, bb - 4, cc, ldc, k - kk); + + aa += 8 * k; + cc += 8; + } + + if (m & 7) + { + if (m & 4) + { + dsolve_4x2_rt_msa(aa + 4 * kk - 8, bb - 4, cc, ldc, k - kk); + + aa += 4 * k; + cc += 4; + } + + if (m & 2) + { + dsolve_2x2_rt_msa(aa + 2 * kk - 4, bb - 4, cc, ldc, k - kk); + + aa += 2 * k; + cc += 2; + } + + if (m & 1) + { + dsolve_1x2_rt_msa(aa + kk - 2, bb - 4, cc, ldc, k - kk); + } + } + + kk -= 2; + } + } + + for (j = (n >> 2); j--;) + { + aa = a; + b -= 4 * k; + bb = b + 4 * kk; + c -= 4 * ldc; + cc = c; + + for (i = (m >> 3); i--;) + { + dsolve_8x4_rt_msa(aa + kk * 8, bb, cc, ldc, k - kk); + + aa += 8 * k; + cc += 8; + } + + if (m & 7) + { + if (m & 4) + { + dsolve_4x4_rt_msa(aa + kk * 4 - 16, bb - 16, cc, ldc, k - kk); + + aa += 4 * k; + cc += 4; + } + + if (m & 2) + { + dsolve_2x4_rt_msa(aa + kk * 2 - 8, bb - 16, cc, ldc, k - kk); + + aa += 2 * k; + cc += 2; + } + + if (m & 1) + { + 
dsolve_1x4_rt_msa(aa + kk - 4, bb - 16, cc, ldc, k - kk); + + aa += k; + cc += 1; + } + } + + kk -= 4; + } + + return 0; +} From 8310d4d3f7e5258965991e9df252fab654d7d368 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Mon, 16 May 2016 14:14:25 +0200 Subject: [PATCH 30/70] optimized dgemm for 20 threads --- Makefile.power | 4 +- common_power.h | 2 +- kernel/power/dgemm_logic_16x4_power8.S | 158 ++++++++++++++++++++++-- kernel/power/dgemm_macros_16x4_power8.S | 74 ++++++----- param.h | 6 +- 5 files changed, 191 insertions(+), 53 deletions(-) diff --git a/Makefile.power b/Makefile.power index 48bcb77f8..589d67441 100644 --- a/Makefile.power +++ b/Makefile.power @@ -13,10 +13,10 @@ endif ifeq ($(CORE), POWER8) ifeq ($(USE_OPENMP), 1) -COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -DALLOC_SHM -DUSE_OPENMP -fno-fast-math -fopenmp +COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp else -COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -DALLOC_SHM -fno-fast-math +COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -fno-fast-math FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fast-math endif endif diff --git a/common_power.h b/common_power.h index b62aca303..e3a1a7aef 100644 --- a/common_power.h +++ b/common_power.h @@ -803,7 +803,7 @@ Lmcount$lazy_ptr: #elif defined(PPC440FP2) #define BUFFER_SIZE ( 16 << 20) #elif defined(POWER8) -#define BUFFER_SIZE ( 32 << 20) +#define BUFFER_SIZE ( 64 << 20) #else #define BUFFER_SIZE ( 16 << 20) #endif diff --git a/kernel/power/dgemm_logic_16x4_power8.S b/kernel/power/dgemm_logic_16x4_power8.S index 718f80bdd..edfcc4bcc 100644 --- a/kernel/power/dgemm_logic_16x4_power8.S +++ b/kernel/power/dgemm_logic_16x4_power8.S @@ -39,13 +39,152 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
POSSIBILITY OF SUCH DAMAGE. LDGEMM_L4_BEGIN: - mr CO, C + li T1, 128 + li T2, 256 mr AO, A - slwi T1, LDC , 2 - add C, C, T1 + + mr CO, C + slwi T3, LDC , 2 + add C, C, T3 + + dcbt A, T1 + dcbt A, T2 + srawi. I, M, 4 ble LDGEMM_L4x16_END + .align 4 +LDGEMM_L4x16_BEGIN_FIRST: + + li L, -128 + + mr T1, CO + add T2, T1, LDC + add T3, T2, LDC + add T4, T3, LDC + + and T1, T1, L + and T2, T2, L + and T3, T3, L + and T4, T4, L + + dcbt T1, r0 + dcbt T2, r0 + dcbt T3, r0 + dcbt T4, r0 + + mr BO, B + srawi. L, K, 2 + + addi T1, T1, 128 + addi T2, T2, 128 + addi T3, T3, 128 + addi T4, T4, 128 + + dcbt T1, r0 + dcbt T2, r0 + dcbt T3, r0 + dcbt T4, r0 + + ble LDGEMM_L4x16_SUB0_FIRST + cmpwi cr0, L, 1 + ble LDGEMM_L4x16_SUB4_FIRST + + .align 4 +LDGEMM_L4x16_LOOP_START_FIRST: + + li T2, 512 + li o40, 40 + li o56, 56 + + dcbt AO, PRE + dcbt BO, T2 + LOAD4x16_1 + dcbt AO, PRE + KERNEL4x16_I1 + dcbt AO, PRE + addic. L, L, -2 + KERNEL4x16_L2 + + dcbt AO, PRE + KERNEL4x16_L1 + dcbt AO, PRE + dcbt BO, T2 + KERNEL4x16_L2 + + ble LDGEMM_L4x16_LOOP_END_FIRST + mtctr L + + .align 4 + +LDGEMM_L4x16_LOOP_FIRST: + + dcbt AO, PRE + KERNEL4x16_L1 + dcbt AO, PRE + KERNEL4x16_L2 + + dcbt AO, PRE + KERNEL4x16_L1 + dcbt AO, PRE + dcbt BO, T2 + KERNEL4x16_L2 + + bdnz LDGEMM_L4x16_LOOP_FIRST + + .align 4 + +LDGEMM_L4x16_LOOP_END_FIRST: + + KERNEL4x16_L1 + KERNEL4x16_L2 + + KERNEL4x16_1 + KERNEL4x16_E2 + + b LDGEMM_L4x16_SUB1_FIRST + +LDGEMM_L4x16_SUB4_FIRST: + + KERNEL4x16_SUBI1 + KERNEL4x16_SUB1 + KERNEL4x16_SUB1 + KERNEL4x16_SUB1 + + b LDGEMM_L4x16_SUB1_FIRST + +LDGEMM_L4x16_SUB0_FIRST: + + andi. L, K, 3 + + KERNEL4x16_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L4x16_SAVE_FIRST + b LDGEMM_L4x16_SUB2_FIRST + +LDGEMM_L4x16_SUB1_FIRST: + + andi. L, K, 3 + ble LDGEMM_L4x16_SAVE_FIRST + +LDGEMM_L4x16_SUB2_FIRST: + + KERNEL4x16_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L4x16_SUB2_FIRST + + .align 4 +LDGEMM_L4x16_SAVE_FIRST: + + SAVE4x16 + + addic. 
I, I, -1 + ble LDGEMM_L4x16_END + +LDGEMM_L4x16_END_FIRST: + .align 4 LDGEMM_L4x16_BEGIN: @@ -79,9 +218,9 @@ LDGEMM_L4x16_BEGIN: dcbt T3, r0 dcbt T4, r0 - ble LDGEMM_L4x16_SUB0 + ble- LDGEMM_L4x16_SUB0 cmpwi cr0, L, 1 - ble LDGEMM_L4x16_SUB4 + ble- LDGEMM_L4x16_SUB4 .align 4 LDGEMM_L4x16_LOOP_START: @@ -97,7 +236,8 @@ LDGEMM_L4x16_LOOP_START: addic. L, L, -2 KERNEL4x16_L2 - ble LDGEMM_L4x16_LOOP_END + ble- LDGEMM_L4x16_LOOP_END + mtctr L .align 4 @@ -107,10 +247,10 @@ LDGEMM_L4x16_LOOP: dcbt AO, PRE KERNEL4x16_L1 dcbt AO, PRE - addic. L, L, -1 + // addic. L, L, -1 KERNEL4x16_L2 - bgt LDGEMM_L4x16_LOOP + bdnz+ LDGEMM_L4x16_LOOP .align 4 @@ -156,7 +296,7 @@ LDGEMM_L4x16_SAVE: SAVE4x16 addic. I, I, -1 - bgt LDGEMM_L4x16_BEGIN + bgt+ LDGEMM_L4x16_BEGIN LDGEMM_L4x16_END: diff --git a/kernel/power/dgemm_macros_16x4_power8.S b/kernel/power/dgemm_macros_16x4_power8.S index 2c7851207..5be517f7c 100644 --- a/kernel/power/dgemm_macros_16x4_power8.S +++ b/kernel/power/dgemm_macros_16x4_power8.S @@ -559,10 +559,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE4x16 - mr T1, CO - add T2, T1, LDC - add T3, T2, LDC - add T4, T3, LDC + add T2, CO, LDC lxvd2x vs0, 0, CO lxvd2x vs1, o16, CO @@ -570,6 +567,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvd2x vs3, o48, CO lxvd2x vs4, o64, CO lxvd2x vs5, o80, CO + add T3, T2, LDC lxvd2x vs6, o96, CO lxvd2x vs7, o112, CO @@ -579,6 +577,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvd2x vs11, o48, T2 lxvd2x vs12, o64, T2 lxvd2x vs13, o80, T2 + add T4, T3, LDC lxvd2x vs14, o96, T2 lxvd2x vs15, o112, T2 @@ -592,21 +591,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
lxvd2x vs31, o112, T3 xvmaddadp vs0, vs32, alpha_r - xvmaddadp vs1, vs33, alpha_r - xvmaddadp vs2, vs34, alpha_r - xvmaddadp vs3, vs35, alpha_r - xvmaddadp vs4, vs36, alpha_r - xvmaddadp vs5, vs37, alpha_r - xvmaddadp vs6, vs38, alpha_r - xvmaddadp vs7, vs39, alpha_r - lxvd2x vs32, 0, T4 + xvmaddadp vs1, vs33, alpha_r lxvd2x vs33, o16, T4 + xvmaddadp vs2, vs34, alpha_r lxvd2x vs34, o32, T4 + xvmaddadp vs3, vs35, alpha_r lxvd2x vs35, o48, T4 + xvmaddadp vs4, vs36, alpha_r lxvd2x vs36, o64, T4 + xvmaddadp vs5, vs37, alpha_r lxvd2x vs37, o80, T4 + xvmaddadp vs6, vs38, alpha_r lxvd2x vs38, o96, T4 + xvmaddadp vs7, vs39, alpha_r lxvd2x vs39, o112, T4 xvmaddadp vs8, vs40, alpha_r @@ -614,58 +612,60 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs10, vs42, alpha_r xvmaddadp vs11, vs43, alpha_r - stxvd2x vs0, 0, T1 - stxvd2x vs1, o16, T1 - stxvd2x vs2, o32, T1 - stxvd2x vs3, o48, T1 - xvmaddadp vs12, vs44, alpha_r xvmaddadp vs13, vs45, alpha_r xvmaddadp vs14, vs46, alpha_r xvmaddadp vs15, vs47, alpha_r - stxvd2x vs4, o64, T1 - stxvd2x vs5, o80, T1 - stxvd2x vs6, o96, T1 - stxvd2x vs7, o112, T1 - xvmaddadp vs24, vs48, alpha_r xvmaddadp vs25, vs49, alpha_r xvmaddadp vs26, vs50, alpha_r xvmaddadp vs27, vs51, alpha_r - stxvd2x vs8, o0, T2 - stxvd2x vs9, o16, T2 - stxvd2x vs10, o32, T2 - stxvd2x vs11, o48, T2 - xvmaddadp vs28, vs52, alpha_r xvmaddadp vs29, vs53, alpha_r xvmaddadp vs30, vs54, alpha_r xvmaddadp vs31, vs55, alpha_r - stxvd2x vs12, o64, T2 - stxvd2x vs13, o80, T2 - stxvd2x vs14, o96, T2 - stxvd2x vs15, o112, T2 + stxvd2x vs0, 0, CO + stxvd2x vs1, o16, CO + stxvd2x vs2, o32, CO + stxvd2x vs3, o48, CO + + stxvd2x vs4, o64, CO + stxvd2x vs5, o80, CO + stxvd2x vs6, o96, CO + stxvd2x vs7, o112, CO xvmaddadp vs32, vs56, alpha_r xvmaddadp vs33, vs57, alpha_r xvmaddadp vs34, vs58, alpha_r xvmaddadp vs35, vs59, alpha_r - stxvd2x vs24, 0, T3 - stxvd2x vs25, o16, T3 - stxvd2x vs26, o32, T3 - stxvd2x vs27, o48, T3 - xvmaddadp vs36, vs60, 
alpha_r xvmaddadp vs37, vs61, alpha_r xvmaddadp vs38, vs62, alpha_r xvmaddadp vs39, vs63, alpha_r + addi CO, CO, 128 + + stxvd2x vs8, o0, T2 + stxvd2x vs9, o16, T2 + stxvd2x vs10, o32, T2 + stxvd2x vs11, o48, T2 + + stxvd2x vs12, o64, T2 + stxvd2x vs13, o80, T2 + stxvd2x vs14, o96, T2 + stxvd2x vs15, o112, T2 + + stxvd2x vs24, 0, T3 + stxvd2x vs25, o16, T3 stxvd2x vs28, o64, T3 stxvd2x vs29, o80, T3 + + stxvd2x vs26, o32, T3 + stxvd2x vs27, o48, T3 stxvd2x vs30, o96, T3 stxvd2x vs31, o112, T3 @@ -674,8 +674,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs34, o32, T4 stxvd2x vs35, o48, T4 - addi CO, CO, 128 - stxvd2x vs36, o64, T4 stxvd2x vs37, o80, T4 stxvd2x vs38, o96, T4 diff --git a/param.h b/param.h index 9046c33d7..489127d2d 100644 --- a/param.h +++ b/param.h @@ -1965,8 +1965,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define DNUMOPT 8 #define GEMM_DEFAULT_OFFSET_A 0 -#define GEMM_DEFAULT_OFFSET_B 4096 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_OFFSET_B 65536 +#define GEMM_DEFAULT_ALIGN 0x0ffffUL #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 8 @@ -1983,7 +1983,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM_DEFAULT_P 320 #define SGEMM_DEFAULT_Q 640 -#define DGEMM_DEFAULT_Q 640 +#define DGEMM_DEFAULT_Q 720 #define CGEMM_DEFAULT_Q 640 #define ZGEMM_DEFAULT_Q 640 From 40af513669992ab9677aa9fb99de1d995ec522a6 Mon Sep 17 00:00:00 2001 From: Jerome Robert Date: Mon, 16 May 2016 13:07:55 +0000 Subject: [PATCH 31/70] Disable multi-threading in swap * Close #873 --- interface/swap.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/interface/swap.c b/interface/swap.c index 23b2e4ec8..7d47d600b 100644 --- a/interface/swap.c +++ b/interface/swap.c @@ -42,6 +42,10 @@ #include "functable.h" #endif +// Disable multi-threading as it does not show any performance +// benefits. 
Keep the multi-threading code for the record. +#undef SMP + #ifndef CBLAS void NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){ From d7cbc7ac13974628f5dff29e45886c5f9aed57b8 Mon Sep 17 00:00:00 2001 From: Kaustubh Raste Date: Tue, 17 May 2016 15:48:02 +0530 Subject: [PATCH 32/70] DTRSM bug fix for MIPS P5600 and I6400 Signed-off-by: Kaustubh Raste --- kernel/mips/dtrsm_kernel_LN_8x4_msa.c | 301 +++++++++------------ kernel/mips/dtrsm_kernel_LT_8x4_msa.c | 323 ++++++++++------------- kernel/mips/dtrsm_kernel_RN_8x4_msa.c | 204 +++++++-------- kernel/mips/dtrsm_kernel_RT_8x4_msa.c | 359 ++++++++++++++++++-------- kernel/mips/macros_msa.h | 14 + 5 files changed, 612 insertions(+), 589 deletions(-) diff --git a/kernel/mips/dtrsm_kernel_LN_8x4_msa.c b/kernel/mips/dtrsm_kernel_LN_8x4_msa.c index 9f0eb95a5..d0792bf85 100644 --- a/kernel/mips/dtrsm_kernel_LN_8x4_msa.c +++ b/kernel/mips/dtrsm_kernel_LN_8x4_msa.c @@ -126,22 +126,14 @@ static void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO a -= 64; b -= 32; - res_c0 = (v2f64) __msa_ilvr_d((v2i64) src_c4, (v2i64) src_c0); - res_c1 = (v2f64) __msa_ilvl_d((v2i64) src_c4, (v2i64) src_c0); - res_c2 = (v2f64) __msa_ilvr_d((v2i64) src_c5, (v2i64) src_c1); - res_c3 = (v2f64) __msa_ilvl_d((v2i64) src_c5, (v2i64) src_c1); - res_c4 = (v2f64) __msa_ilvr_d((v2i64) src_c6, (v2i64) src_c2); - res_c5 = (v2f64) __msa_ilvl_d((v2i64) src_c6, (v2i64) src_c2); - res_c6 = (v2f64) __msa_ilvr_d((v2i64) src_c7, (v2i64) src_c3); - res_c7 = (v2f64) __msa_ilvl_d((v2i64) src_c7, (v2i64) src_c3); - res_c8 = (v2f64) __msa_ilvr_d((v2i64) src_c12, (v2i64) src_c8); - res_c9 = (v2f64) __msa_ilvl_d((v2i64) src_c12, (v2i64) src_c8); - res_c10 = (v2f64) __msa_ilvr_d((v2i64) src_c13, (v2i64) src_c9); - res_c11 = (v2f64) __msa_ilvl_d((v2i64) src_c13, (v2i64) src_c9); - res_c12 = (v2f64) __msa_ilvr_d((v2i64) src_c14, (v2i64) src_c10); - res_c13 = (v2f64) __msa_ilvl_d((v2i64) src_c14, (v2i64) src_c10); - res_c14 = (v2f64) 
__msa_ilvr_d((v2i64) src_c15, (v2i64) src_c11); - res_c15 = (v2f64) __msa_ilvl_d((v2i64) src_c15, (v2i64) src_c11); + ILVRL_D2_DP(src_c4, src_c0, res_c0, res_c1); + ILVRL_D2_DP(src_c5, src_c1, res_c2, res_c3); + ILVRL_D2_DP(src_c6, src_c2, res_c4, res_c5); + ILVRL_D2_DP(src_c7, src_c3, res_c6, res_c7); + ILVRL_D2_DP(src_c12, src_c8, res_c8, res_c9); + ILVRL_D2_DP(src_c13, src_c9, res_c10, res_c11); + ILVRL_D2_DP(src_c14, src_c10, res_c12, res_c13); + ILVRL_D2_DP(src_c15, src_c11, res_c14, res_c15); src_a54 = __msa_cast_to_vector_double(*(a + 54)); src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); @@ -172,10 +164,8 @@ static void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO ST_DP(res_c6, b + 24); ST_DP(res_c15, b + 30); ST_DP(res_c14, b + 26); - src_c3 = (v2f64) __msa_ilvr_d((v2i64) res_c7, (v2i64) res_c6); - src_c7 = (v2f64) __msa_ilvl_d((v2i64) res_c7, (v2i64) res_c6); - src_c11 = (v2f64) __msa_ilvr_d((v2i64) res_c15, (v2i64) res_c14); - src_c15 = (v2f64) __msa_ilvl_d((v2i64) res_c15, (v2i64) res_c14); + ILVRL_D2_DP(res_c7, res_c6, src_c3, src_c7); + ILVRL_D2_DP(res_c15, res_c14, src_c11, src_c15); ST_DP(src_c3, c + 6); ST_DP(src_c7, c_nxt1line + 6); ST_DP(src_c11, c_nxt2line + 6); @@ -211,10 +201,8 @@ static void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO ST_DP(res_c12, b + 18); ST_DP(res_c13, b + 22); - src_c2 = (v2f64) __msa_ilvr_d((v2i64) res_c5, (v2i64) res_c4); - src_c6 = (v2f64) __msa_ilvl_d((v2i64) res_c5, (v2i64) res_c4); - src_c10 = (v2f64) __msa_ilvr_d((v2i64) res_c13, (v2i64) res_c12); - src_c14 = (v2f64) __msa_ilvl_d((v2i64) res_c13, (v2i64) res_c12); + ILVRL_D2_DP(res_c5, res_c4, src_c2, src_c6); + ILVRL_D2_DP(res_c13, res_c12, src_c10, src_c14); ST_DP(src_c2, c + 4); ST_DP(src_c6, c_nxt1line + 4); ST_DP(src_c10, c_nxt2line + 4); @@ -286,10 +274,8 @@ static void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a25 = (v2f64) __msa_splati_d((v2i64) src_a24, 1); src_a24 = 
(v2f64) __msa_splati_d((v2i64) src_a24, 0); - src_c1 = (v2f64) __msa_ilvr_d((v2i64) res_c3, (v2i64) res_c2); - src_c5 = (v2f64) __msa_ilvl_d((v2i64) res_c3, (v2i64) res_c2); - src_c9 = (v2f64) __msa_ilvr_d((v2i64) res_c11, (v2i64) res_c10); - src_c13 = (v2f64) __msa_ilvl_d((v2i64) res_c11, (v2i64) res_c10); + ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c5); + ILVRL_D2_DP(res_c11, res_c10, src_c9, src_c13); ST_DP(src_c1, c + 2); ST_DP(src_c5, c_nxt1line + 2); ST_DP(src_c9, c_nxt2line + 2); @@ -343,10 +329,8 @@ static void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO ST_DP(res_c1, b + 4); ST_DP(res_c9, b + 6); - src_c0 = (v2f64) __msa_ilvr_d((v2i64) res_c1, (v2i64) res_c0); - src_c4 = (v2f64) __msa_ilvl_d((v2i64) res_c1, (v2i64) res_c0); - src_c8 = (v2f64) __msa_ilvr_d((v2i64) res_c9, (v2i64) res_c8); - src_c12 = (v2f64) __msa_ilvl_d((v2i64) res_c9, (v2i64) res_c8); + ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c4); + ILVRL_D2_DP(res_c9, res_c8, src_c8, src_c12); ST_DP(src_c0, c); ST_DP(src_c4, c_nxt1line); @@ -417,14 +401,10 @@ static void dsolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_c7 -= src_a3 * src_b; } - res_c0 = (v2f64) __msa_ilvr_d((v2i64) src_c4, (v2i64) src_c0); - res_c1 = (v2f64) __msa_ilvl_d((v2i64) src_c4, (v2i64) src_c0); - res_c2 = (v2f64) __msa_ilvr_d((v2i64) src_c5, (v2i64) src_c1); - res_c3 = (v2f64) __msa_ilvl_d((v2i64) src_c5, (v2i64) src_c1); - res_c4 = (v2f64) __msa_ilvr_d((v2i64) src_c6, (v2i64) src_c2); - res_c5 = (v2f64) __msa_ilvl_d((v2i64) src_c6, (v2i64) src_c2); - res_c6 = (v2f64) __msa_ilvr_d((v2i64) src_c7, (v2i64) src_c3); - res_c7 = (v2f64) __msa_ilvl_d((v2i64) src_c7, (v2i64) src_c3); + ILVRL_D2_DP(src_c4, src_c0, res_c0, res_c1); + ILVRL_D2_DP(src_c5, src_c1, res_c2, res_c3); + ILVRL_D2_DP(src_c6, src_c2, res_c4, res_c5); + ILVRL_D2_DP(src_c7, src_c3, res_c6, res_c7); src_a56 = LD_DP(a - 8); src_a57 = (v2f64) __msa_splati_d((v2i64) src_a56, 1); @@ -541,14 +521,10 @@ static void 
dsolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO ST_DP(res_c1, b - 14); ST_DP(res_c0, b - 16); - src_c0 = (v2f64) __msa_ilvr_d((v2i64) res_c1, (v2i64) res_c0); - src_c1 = (v2f64) __msa_ilvr_d((v2i64) res_c3, (v2i64) res_c2); - src_c2 = (v2f64) __msa_ilvr_d((v2i64) res_c5, (v2i64) res_c4); - src_c3 = (v2f64) __msa_ilvr_d((v2i64) res_c7, (v2i64) res_c6); - src_c4 = (v2f64) __msa_ilvl_d((v2i64) res_c1, (v2i64) res_c0); - src_c5 = (v2f64) __msa_ilvl_d((v2i64) res_c3, (v2i64) res_c2); - src_c6 = (v2f64) __msa_ilvl_d((v2i64) res_c5, (v2i64) res_c4); - src_c7 = (v2f64) __msa_ilvl_d((v2i64) res_c7, (v2i64) res_c6); + ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c4); + ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c5); + ILVRL_D2_DP(res_c5, res_c4, src_c2, src_c6); + ILVRL_D2_DP(res_c7, res_c6, src_c3, src_c7); ST_DP4(src_c0, src_c1, src_c2, src_c3, c, 2); ST_DP4(src_c4, src_c5, src_c6, src_c7, c + ldc, 2); @@ -572,30 +548,19 @@ static void dsolve_8x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) if (bk > 0) { - int i; + BLASLONG i; FLOAT *aa = a, *bb = b; - FLOAT a0, a1, a2, a3, a4, a5, a6, a7, b0; for (i = bk; i--; ) { - a0 = aa[0]; - a1 = aa[1]; - a2 = aa[2]; - a3 = aa[3]; - a4 = aa[4]; - a5 = aa[5]; - a6 = aa[6]; - a7 = aa[7]; - - b0 = bb[0]; - c0 -= a0 * b0; - c1 -= a1 * b0; - c2 -= a2 * b0; - c3 -= a3 * b0; - c4 -= a4 * b0; - c5 -= a5 * b0; - c6 -= a6 * b0; - c7 -= a7 * b0; + c0 -= aa[0] * bb[0]; + c1 -= aa[1] * bb[0]; + c2 -= aa[2] * bb[0]; + c3 -= aa[3] * bb[0]; + c4 -= aa[4] * bb[0]; + c5 -= aa[5] * bb[0]; + c6 -= aa[6] * bb[0]; + c7 -= aa[7] * bb[0]; aa += 8; bb += 1; @@ -720,7 +685,7 @@ static void dsolve_4x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO if (bk > 0) { BLASLONG i; - FLOAT *aa = a + 16, *bb = b + 16; + FLOAT *aa = a, *bb = b; v2f64 src_a0, src_a1, src_b, src_b0, src_b1; for (i = bk; i--;) @@ -749,14 +714,13 @@ static void dsolve_4x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO } } - res_c0 = (v2f64) 
__msa_ilvr_d((v2i64) src_c2, (v2i64) src_c0); - res_c1 = (v2f64) __msa_ilvl_d((v2i64) src_c2, (v2i64) src_c0); - res_c2 = (v2f64) __msa_ilvr_d((v2i64) src_c3, (v2i64) src_c1); - res_c3 = (v2f64) __msa_ilvl_d((v2i64) src_c3, (v2i64) src_c1); - res_c4 = (v2f64) __msa_ilvr_d((v2i64) src_c6, (v2i64) src_c4); - res_c5 = (v2f64) __msa_ilvl_d((v2i64) src_c6, (v2i64) src_c4); - res_c6 = (v2f64) __msa_ilvr_d((v2i64) src_c7, (v2i64) src_c5); - res_c7 = (v2f64) __msa_ilvl_d((v2i64) src_c7, (v2i64) src_c5); + a -= 16; + b -= 16; + + ILVRL_D2_DP(src_c2, src_c0, res_c0, res_c1); + ILVRL_D2_DP(src_c3, src_c1, res_c2, res_c3); + ILVRL_D2_DP(src_c6, src_c4, res_c4, res_c5); + ILVRL_D2_DP(src_c7, src_c5, res_c6, res_c7); src_a14 = LD_DP(a + 14); src_a15 = (v2f64) __msa_splati_d((v2i64) src_a14, 1); @@ -813,14 +777,10 @@ static void dsolve_4x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO ST_DP(res_c4, b + 2); ST_DP(res_c0, b + 0); - src_c0 = (v2f64) __msa_ilvr_d((v2i64) res_c1, (v2i64) res_c0); - src_c1 = (v2f64) __msa_ilvr_d((v2i64) res_c3, (v2i64) res_c2); - src_c2 = (v2f64) __msa_ilvl_d((v2i64) res_c1, (v2i64) res_c0); - src_c3 = (v2f64) __msa_ilvl_d((v2i64) res_c3, (v2i64) res_c2); - src_c4 = (v2f64) __msa_ilvr_d((v2i64) res_c5, (v2i64) res_c4); - src_c5 = (v2f64) __msa_ilvr_d((v2i64) res_c7, (v2i64) res_c6); - src_c6 = (v2f64) __msa_ilvl_d((v2i64) res_c5, (v2i64) res_c4); - src_c7 = (v2f64) __msa_ilvl_d((v2i64) res_c7, (v2i64) res_c6); + ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c2); + ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c3); + ILVRL_D2_DP(res_c5, res_c4, src_c4, src_c6); + ILVRL_D2_DP(res_c7, res_c6, src_c5, src_c7); ST_DP2(src_c0, src_c1, c, 2); ST_DP2(src_c2, src_c3, c + ldc, 2); @@ -840,7 +800,7 @@ static void dsolve_4x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO if (bk > 0) { BLASLONG i; - FLOAT *aa = a + 16, *bb = b + 8; + FLOAT *aa = a, *bb = b; v2f64 src_a0, src_a1, src_b, src_b0; for (i = bk; i--;) @@ -861,10 +821,11 @@ static void 
dsolve_4x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO } } - res_c0 = (v2f64) __msa_ilvr_d((v2i64) src_c2, (v2i64) src_c0); - res_c1 = (v2f64) __msa_ilvl_d((v2i64) src_c2, (v2i64) src_c0); - res_c2 = (v2f64) __msa_ilvr_d((v2i64) src_c3, (v2i64) src_c1); - res_c3 = (v2f64) __msa_ilvl_d((v2i64) src_c3, (v2i64) src_c1); + a -= 16; + b -= 8; + + ILVRL_D2_DP(src_c2, src_c0, res_c0, res_c1); + ILVRL_D2_DP(src_c3, src_c1, res_c2, res_c3); src_a14 = LD_DP(a + 14); src_a15 = (v2f64) __msa_splati_d((v2i64) src_a14, 1); @@ -907,10 +868,8 @@ static void dsolve_4x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO ST_DP(res_c1, b + 2); ST_DP(res_c0, b + 0); - src_c0 = (v2f64) __msa_ilvr_d((v2i64) res_c1, (v2i64) res_c0); - src_c1 = (v2f64) __msa_ilvr_d((v2i64) res_c3, (v2i64) res_c2); - src_c2 = (v2f64) __msa_ilvl_d((v2i64) res_c1, (v2i64) res_c0); - src_c3 = (v2f64) __msa_ilvl_d((v2i64) res_c3, (v2i64) res_c2); + ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c2); + ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c3); ST_DP2(src_c0, src_c1, c, 2); ST_DP2(src_c2, src_c3, c + ldc, 2); @@ -918,8 +877,7 @@ static void dsolve_4x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void dsolve_4x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { - FLOAT a0, a4, a5, a8, a9, a10, a12, a13, a14, a15; - FLOAT c0, c1, c2, c3; + FLOAT a0, a4, a5, a8, a9, a10, a12, a13, a14, a15, c0, c1, c2, c3; c0 = *(c + 0); c1 = *(c + 1); @@ -929,27 +887,23 @@ static void dsolve_4x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) if (bk > 0) { BLASLONG i; - FLOAT *aa = a + 16, *bb = b + 4; - FLOAT a0, a1, a2, a3, b0; + FLOAT *aa = a, *bb = b; for (i = bk; i--;) { - a0 = aa[0]; - a1 = aa[1]; - a2 = aa[2]; - a3 = aa[3]; - - b0 = bb[0]; - c0 -= a0 * b0; - c1 -= a1 * b0; - c2 -= a2 * b0; - c3 -= a3 * b0; + c0 -= aa[0] * bb[0]; + c1 -= aa[1] * bb[0]; + c2 -= aa[2] * bb[0]; + c3 -= aa[3] * bb[0]; aa += 4; bb += 1; } } + a -= 16; + b -= 4; + a0 = *(a + 0); a4 = *(a + 4); a5 = *(a + 5); 
@@ -1003,35 +957,27 @@ static void dsolve_2x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO if (bk > 0) { BLASLONG i; - FLOAT *aa = a + 4, *bb = b + 8; - FLOAT a0, a1, b0, b1, b2, b3; + FLOAT *aa = a, *bb = b; for (i = bk; i--;) { - a0 = aa[0]; - a1 = aa[1]; - - b0 = bb[0]; - c0 -= a0 * b0; - c1 -= a1 * b0; - - b1 = bb[1]; - c0_nxt1 -= a0 * b1; - c1_nxt1 -= a1 * b1; - - b2 = bb[2]; - c0_nxt2 -= a0 * b2; - c1_nxt2 -= a1 * b2; - - b3 = bb[3]; - c0_nxt3 -= a0 * b3; - c1_nxt3 -= a1 * b3; + c0 -= aa[0] * bb[0]; + c1 -= aa[1] * bb[0]; + c0_nxt1 -= aa[0] * bb[1]; + c1_nxt1 -= aa[1] * bb[1]; + c0_nxt2 -= aa[0] * bb[2]; + c1_nxt2 -= aa[1] * bb[2]; + c0_nxt3 -= aa[0] * bb[3]; + c1_nxt3 -= aa[1] * bb[3]; aa += 2; bb += 4; } } + a -= 4; + b -= 8; + a0 = *(a + 0); a2 = *(a + 2); a3 = *(a + 3); @@ -1063,13 +1009,10 @@ static void dsolve_2x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO *(c + 0) = c0; *(c + 1) = c1; - *(c + 0 + ldc) = c0_nxt1; *(c + 1 + ldc) = c1_nxt1; - *(c + 0 + 2 * ldc) = c0_nxt2; *(c + 1 + 2 * ldc) = c1_nxt2; - *(c + 0 + 3 * ldc) = c0_nxt3; *(c + 1 + 3 * ldc) = c1_nxt3; } @@ -1087,27 +1030,24 @@ static void dsolve_2x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO if (bk > 0) { BLASLONG i; - FLOAT *aa = a + 4, *bb = b + 4; - FLOAT a0, a1, b0, b1; + FLOAT *aa = a, *bb = b; for (i = bk; i--;) { - a0 = aa[0]; - a1 = aa[1]; - - b0 = bb[0]; - c0 -= a0 * b0; - c1 -= a1 * b0; + c0 -= aa[0] * bb[0]; + c1 -= aa[1] * bb[0]; - b1 = bb[1]; - c0_nxt -= a0 * b1; - c1_nxt -= a1 * b1; + c0_nxt -= aa[0] * bb[1]; + c1_nxt -= aa[1] * bb[1]; aa += 2; bb += 2; } } + a -= 4; + b -= 4; + a0 = *(a + 0); a2 = *(a + 2); a3 = *(a + 3); @@ -1144,33 +1084,28 @@ static void dsolve_2x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) if (bk > 0) { BLASLONG i; - FLOAT a0, a1, b0; - FLOAT *aa = a + 4, *bb = b + 2; + FLOAT *aa = a, *bb = b; for (i = bk; i--;) { - a0 = aa[0]; - a1 = aa[1]; - - b0 = bb[0]; - c0 -= a0 * b0; - c1 -= a1 * b0; + c0 -= aa[0] * 
bb[0]; + c1 -= aa[1] * bb[0]; aa += 2; bb += 1; } } - a0 = *(a + 0); - a2 = *(a + 2); - a3 = *(a + 3); + a0 = *(a - 4); + a2 = *(a - 2); + a3 = *(a - 1); c1 *= a3; c0 -= c1 * a2; c0 *= a0; - *(b + 0) = c0; - *(b + 1) = c1; + *(b - 2) = c0; + *(b - 1) = c1; *(c + 0) = c0; *(c + 1) = c1; @@ -1178,46 +1113,44 @@ static void dsolve_2x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) static void dsolve_1x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { - FLOAT a0; - FLOAT c0, c0_nxt1, c0_nxt2, c0_nxt3; + FLOAT c0, c1, c2, c3; - a0 = *a; c0 = *(c + 0); - c0_nxt1 = *(c + 1 * ldc); - c0_nxt2 = *(c + 2 * ldc); - c0_nxt3 = *(c + 3 * ldc); + c1 = *(c + 1 * ldc); + c2 = *(c + 2 * ldc); + c3 = *(c + 3 * ldc); if (bk > 0) { BLASLONG i; - FLOAT *aa = a + 1, *bb = b + 4; + FLOAT *aa = a, *bb = b; for (i = bk; i--;) { c0 -= aa[0] * bb[0]; - c0_nxt1 -= aa[0] * bb[1]; - c0_nxt2 -= aa[0] * bb[2]; - c0_nxt3 -= aa[0] * bb[3]; + c1 -= aa[0] * bb[1]; + c2 -= aa[0] * bb[2]; + c3 -= aa[0] * bb[3]; aa += 1; bb += 4; } } - c0 *= a0; - c0_nxt1 *= a0; - c0_nxt2 *= a0; - c0_nxt3 *= a0; + c0 *= *(a - 1); + c1 *= *(a - 1); + c2 *= *(a - 1); + c3 *= *(a - 1); *(c + 0 * ldc) = c0; - *(c + 1 * ldc) = c0_nxt1; - *(c + 2 * ldc) = c0_nxt2; - *(c + 3 * ldc) = c0_nxt3; - - *(b + 0) = c0; - *(b + 1) = c0_nxt1; - *(b + 2) = c0_nxt2; - *(b + 3) = c0_nxt3; + *(c + 1 * ldc) = c1; + *(c + 2 * ldc) = c2; + *(c + 3 * ldc) = c3; + + *(b - 4) = c0; + *(b - 3) = c1; + *(b - 2) = c2; + *(b - 1) = c3; } static void dsolve_1x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) @@ -1247,7 +1180,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, bb = b + 4 * kk; cc = c + (m - 1); - dsolve_1x4_ln_msa(aa - 1, bb - 4, cc, ldc, k - kk); + dsolve_1x4_ln_msa(aa, bb, cc, ldc, k - kk); kk -= 1; } @@ -1258,7 +1191,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, bb = b + 4 * kk; cc = c + ((m & -2) - 2); - dsolve_2x4_ln_msa(aa - 4, bb - 8, 
cc, ldc, k - kk); + dsolve_2x4_ln_msa(aa, bb, cc, ldc, k - kk); kk -= 2; } @@ -1269,7 +1202,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, bb = b + 4 * kk; cc = c + ((m & -4) - 4); - dsolve_4x4_ln_msa(aa - 16, bb - 16, cc, ldc, k - kk); + dsolve_4x4_ln_msa(aa, bb, cc, ldc, k - kk); kk -= 4; } @@ -1319,7 +1252,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, aa = a + ((m & -2) - 2) * k; cc = c + ((m & -2) - 2); - dsolve_2x2_ln_msa(aa + kk * 2 - 4, b + kk * 2 - 4, cc, ldc, k - kk); + dsolve_2x2_ln_msa(aa + kk * 2, b + kk * 2, cc, ldc, k - kk); kk -= 2; } @@ -1329,7 +1262,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, aa = a + ((m & -4) - 4) * k; cc = c + ((m & -4) - 4); - dsolve_4x2_ln_msa(aa + kk * 4 - 16, b + kk * 2 - 8, cc, ldc, k - kk); + dsolve_4x2_ln_msa(aa + kk * 4, b + kk * 2, cc, ldc, k - kk); kk -= 4; } @@ -1377,7 +1310,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, aa = a + ((m & -2) - 2) * k + kk * 2; cc = c + ((m & -2) - 2); - dsolve_2x1_ln_msa(aa - 4, b + kk - 2, cc, k - kk); + dsolve_2x1_ln_msa(aa, b + kk, cc, k - kk); kk -= 2; } @@ -1387,7 +1320,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, aa = a + ((m & -4) - 4) * k; cc = c + ((m & -4) - 4); - dsolve_4x1_ln_msa(aa + 4 * kk - 16, b + kk - 4, cc, k - kk); + dsolve_4x1_ln_msa(aa + 4 * kk, b + kk, cc, k - kk); kk -= 4; } diff --git a/kernel/mips/dtrsm_kernel_LT_8x4_msa.c b/kernel/mips/dtrsm_kernel_LT_8x4_msa.c index da35aa8f9..db902c0de 100644 --- a/kernel/mips/dtrsm_kernel_LT_8x4_msa.c +++ b/kernel/mips/dtrsm_kernel_LT_8x4_msa.c @@ -48,7 +48,7 @@ static void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO LD_DP4(c_nxt2line, 2, src_c8, src_c9, src_c10, src_c11); LD_DP4(c_nxt3line, 2, src_c12, src_c13, src_c14, src_c15); - if (bk > 0) + if (bk) { BLASLONG i; v2f64 src_b, src_b0, src_b1, src_b2, 
src_b3; @@ -124,22 +124,14 @@ static void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO b += 4; } - res_c0 = (v2f64) __msa_ilvr_d((v2i64) src_c4, (v2i64) src_c0); - res_c1 = (v2f64) __msa_ilvl_d((v2i64) src_c4, (v2i64) src_c0); - res_c2 = (v2f64) __msa_ilvr_d((v2i64) src_c5, (v2i64) src_c1); - res_c3 = (v2f64) __msa_ilvl_d((v2i64) src_c5, (v2i64) src_c1); - res_c4 = (v2f64) __msa_ilvr_d((v2i64) src_c6, (v2i64) src_c2); - res_c5 = (v2f64) __msa_ilvl_d((v2i64) src_c6, (v2i64) src_c2); - res_c6 = (v2f64) __msa_ilvr_d((v2i64) src_c7, (v2i64) src_c3); - res_c7 = (v2f64) __msa_ilvl_d((v2i64) src_c7, (v2i64) src_c3); - res_c8 = (v2f64) __msa_ilvr_d((v2i64) src_c12, (v2i64) src_c8); - res_c9 = (v2f64) __msa_ilvl_d((v2i64) src_c12, (v2i64) src_c8); - res_c10 = (v2f64) __msa_ilvr_d((v2i64) src_c13, (v2i64) src_c9); - res_c11 = (v2f64) __msa_ilvl_d((v2i64) src_c13, (v2i64) src_c9); - res_c12 = (v2f64) __msa_ilvr_d((v2i64) src_c14, (v2i64) src_c10); - res_c13 = (v2f64) __msa_ilvl_d((v2i64) src_c14, (v2i64) src_c10); - res_c14 = (v2f64) __msa_ilvr_d((v2i64) src_c15, (v2i64) src_c11); - res_c15 = (v2f64) __msa_ilvl_d((v2i64) src_c15, (v2i64) src_c11); + ILVRL_D2_DP(src_c4, src_c0, res_c0, res_c1); + ILVRL_D2_DP(src_c5, src_c1, res_c2, res_c3); + ILVRL_D2_DP(src_c6, src_c2, res_c4, res_c5); + ILVRL_D2_DP(src_c7, src_c3, res_c6, res_c7); + ILVRL_D2_DP(src_c12, src_c8, res_c8, res_c9); + ILVRL_D2_DP(src_c13, src_c9, res_c10, res_c11); + ILVRL_D2_DP(src_c14, src_c10, res_c12, res_c13); + ILVRL_D2_DP(src_c15, src_c11, res_c14, res_c15); src_a0 = LD_DP(a + 0); src_a1 = (v2f64) __msa_splati_d((v2i64) src_a0, 1); @@ -205,10 +197,8 @@ static void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO ST_DP(res_c1, b + 4); ST_DP(res_c9, b + 6); - src_c0 = (v2f64) __msa_ilvr_d((v2i64) res_c1, (v2i64) res_c0); - src_c4 = (v2f64) __msa_ilvl_d((v2i64) res_c1, (v2i64) res_c0); - src_c8 = (v2f64) __msa_ilvr_d((v2i64) res_c9, (v2i64) res_c8); - src_c12 = 
(v2f64) __msa_ilvl_d((v2i64) res_c9, (v2i64) res_c8); + ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c4); + ILVRL_D2_DP(res_c9, res_c8, src_c8, src_c12); ST_DP(src_c0, c); ST_DP(src_c4, c_nxt1line); @@ -265,10 +255,8 @@ static void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO ST_DP(res_c3, b + 12); ST_DP(res_c11, b + 14); - src_c1 = (v2f64) __msa_ilvr_d((v2i64) res_c3, (v2i64) res_c2); - src_c5 = (v2f64) __msa_ilvl_d((v2i64) res_c3, (v2i64) res_c2); - src_c9 = (v2f64) __msa_ilvr_d((v2i64) res_c11, (v2i64) res_c10); - src_c13 = (v2f64) __msa_ilvl_d((v2i64) res_c11, (v2i64) res_c10); + ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c5); + ILVRL_D2_DP(res_c11, res_c10, src_c9, src_c13); src_a36 = LD_DP(a + 36); src_a37 = (v2f64) __msa_splati_d((v2i64) src_a36, 1); @@ -311,10 +299,8 @@ static void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO ST_DP(res_c5, b + 20); ST_DP(res_c13, b + 22); - src_c2 = (v2f64) __msa_ilvr_d((v2i64) res_c5, (v2i64) res_c4); - src_c6 = (v2f64) __msa_ilvl_d((v2i64) res_c5, (v2i64) res_c4); - src_c10 = (v2f64) __msa_ilvr_d((v2i64) res_c13, (v2i64) res_c12); - src_c14 = (v2f64) __msa_ilvl_d((v2i64) res_c13, (v2i64) res_c12); + ILVRL_D2_DP(res_c5, res_c4, src_c2, src_c6); + ILVRL_D2_DP(res_c13, res_c12, src_c10, src_c14); src_a63 = __msa_cast_to_vector_double(*(a + 63)); src_a63 = (v2f64) __msa_splati_d((v2i64) src_a63, 0); @@ -341,10 +327,8 @@ static void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO ST_DP(res_c7, b + 28); ST_DP(res_c15, b + 30); - src_c3 = (v2f64) __msa_ilvr_d((v2i64) res_c7, (v2i64) res_c6); - src_c7 = (v2f64) __msa_ilvl_d((v2i64) res_c7, (v2i64) res_c6); - src_c11 = (v2f64) __msa_ilvr_d((v2i64) res_c15, (v2i64) res_c14); - src_c15 = (v2f64) __msa_ilvl_d((v2i64) res_c15, (v2i64) res_c14); + ILVRL_D2_DP(res_c7, res_c6, src_c3, src_c7); + ILVRL_D2_DP(res_c15, res_c14, src_c11, src_c15); ST_DP(src_c3, c + 6); ST_DP(src_c7, c_nxt1line + 6); @@ -365,15 +349,21 @@ static 
void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3); LD_DP4(c + ldc, 2, src_c4, src_c5, src_c6, src_c7); - if (bk > 0) + if (bk) { BLASLONG i; - v2f64 src_a0, src_a1, src_a2, src_a3, src_b, src_b0; + v2f64 src_b, src_b0, src_b1; - for (i = bk; i--;) + LD_DP4(a, 2, src_a0, src_a1, src_a2, src_a3); + src_b0 = LD_DP(b); + + a += 8; + b += 2; + + for (i = (bk - 1); i--;) { - LD_DP4(a, 2, src_a0, src_a1, src_a2, src_a3); - src_b0 = LD_DP(b); + LD_DP4(a, 2, src_a4, src_a5, src_a6, src_a7); + src_b1 = LD_DP(b); src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); src_c0 -= src_a0 * src_b; @@ -387,19 +377,33 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_c6 -= src_a2 * src_b; src_c7 -= src_a3 * src_b; + src_a0 = src_a4; + src_a1 = src_a5; + src_a2 = src_a6; + src_a3 = src_a7; + src_b0 = src_b1; + a += 8; b += 2; } + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + src_c2 -= src_a2 * src_b; + src_c3 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c4 -= src_a0 * src_b; + src_c5 -= src_a1 * src_b; + src_c6 -= src_a2 * src_b; + src_c7 -= src_a3 * src_b; } - res_c0 = (v2f64) __msa_ilvr_d((v2i64) src_c4, (v2i64) src_c0); - res_c1 = (v2f64) __msa_ilvl_d((v2i64) src_c4, (v2i64) src_c0); - res_c2 = (v2f64) __msa_ilvr_d((v2i64) src_c5, (v2i64) src_c1); - res_c3 = (v2f64) __msa_ilvl_d((v2i64) src_c5, (v2i64) src_c1); - res_c4 = (v2f64) __msa_ilvr_d((v2i64) src_c6, (v2i64) src_c2); - res_c5 = (v2f64) __msa_ilvl_d((v2i64) src_c6, (v2i64) src_c2); - res_c6 = (v2f64) __msa_ilvr_d((v2i64) src_c7, (v2i64) src_c3); - res_c7 = (v2f64) __msa_ilvl_d((v2i64) src_c7, (v2i64) src_c3); + ILVRL_D2_DP(src_c4, src_c0, res_c0, res_c1); + ILVRL_D2_DP(src_c5, src_c1, res_c2, res_c3); + ILVRL_D2_DP(src_c6, src_c2, res_c4, res_c5); + ILVRL_D2_DP(src_c7, src_c3, res_c6, 
res_c7); src_a0 = LD_DP(a + 0); src_a1 = (v2f64) __msa_splati_d((v2i64) src_a0, 1); @@ -480,10 +484,8 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO ST_DP(res_c2, b + 4); ST_DP(res_c3, b + 6); - src_c0 = (v2f64) __msa_ilvr_d((v2i64) res_c1, (v2i64) res_c0); - src_c4 = (v2f64) __msa_ilvl_d((v2i64) res_c1, (v2i64) res_c0); - src_c1 = (v2f64) __msa_ilvr_d((v2i64) res_c3, (v2i64) res_c2); - src_c5 = (v2f64) __msa_ilvl_d((v2i64) res_c3, (v2i64) res_c2); + ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c4); + ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c5); ST_DP2(src_c0, src_c1, c, 2); ST_DP2(src_c4, src_c5, c + ldc, 2); @@ -526,10 +528,8 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO ST_DP(res_c6, b + 12); ST_DP(res_c7, b + 14); - src_c2 = (v2f64) __msa_ilvr_d((v2i64) res_c5, (v2i64) res_c4); - src_c6 = (v2f64) __msa_ilvl_d((v2i64) res_c5, (v2i64) res_c4); - src_c3 = (v2f64) __msa_ilvr_d((v2i64) res_c7, (v2i64) res_c6); - src_c7 = (v2f64) __msa_ilvl_d((v2i64) res_c7, (v2i64) res_c6); + ILVRL_D2_DP(res_c5, res_c4, src_c2, src_c6); + ILVRL_D2_DP(res_c7, res_c6, src_c3, src_c7); ST_DP2(src_c2, src_c3, c + 4, 2); ST_DP2(src_c6, src_c7, c + 4 + ldc, 2); @@ -539,8 +539,7 @@ static void dsolve_8x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { FLOAT a0, a1, a2, a3, a4, a5, a6, a7, a9, a10, a11, a12, a13, a14, a15, a18; FLOAT a19, a20, a21, a22, a23, a27, a28, a29, a30, a31, a36, a37, a38, a39; - FLOAT a45, a46, a47, a54, a55, a63; - FLOAT c0, c1, c2, c3, c4, c5, c6, c7; + FLOAT a45, a46, a47, a54, a55, a63, c0, c1, c2, c3, c4, c5, c6, c7; c0 = *(c + 0); c1 = *(c + 1); @@ -551,31 +550,20 @@ static void dsolve_8x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) c6 = *(c + 6); c7 = *(c + 7); - if (bk > 0) + if (bk) { - int i; - FLOAT a0, a1, a2, a3, a4, a5, a6, a7, b0; + BLASLONG i; for (i = bk; i--; ) { - a0 = a[0]; - a1 = a[1]; - a2 = a[2]; - a3 = a[3]; - a4 = a[4]; - a5 = a[5]; - a6 = a[6]; - a7 = a[7]; - - 
b0 = b[0]; - c0 -= a0 * b0; - c1 -= a1 * b0; - c2 -= a2 * b0; - c3 -= a3 * b0; - c4 -= a4 * b0; - c5 -= a5 * b0; - c6 -= a6 * b0; - c7 -= a7 * b0; + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; + c2 -= a[2] * b[0]; + c3 -= a[3] * b[0]; + c4 -= a[4] * b[0]; + c5 -= a[5] * b[0]; + c6 -= a[6] * b[0]; + c7 -= a[7] * b[0]; a += 8; b += 1; @@ -694,7 +682,7 @@ static void dsolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO LD_DP2(c + 2 * ldc, 2, src_c4, src_c5); LD_DP2(c + 3 * ldc, 2, src_c6, src_c7); - if (bk > 0) + if (bk) { BLASLONG i; v2f64 src_a0, src_a1, src_b, src_b0, src_b1; @@ -725,14 +713,10 @@ static void dsolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO } } - res_c0 = (v2f64) __msa_ilvr_d((v2i64) src_c2, (v2i64) src_c0); - res_c1 = (v2f64) __msa_ilvl_d((v2i64) src_c2, (v2i64) src_c0); - res_c2 = (v2f64) __msa_ilvr_d((v2i64) src_c3, (v2i64) src_c1); - res_c3 = (v2f64) __msa_ilvl_d((v2i64) src_c3, (v2i64) src_c1); - res_c4 = (v2f64) __msa_ilvr_d((v2i64) src_c6, (v2i64) src_c4); - res_c5 = (v2f64) __msa_ilvl_d((v2i64) src_c6, (v2i64) src_c4); - res_c6 = (v2f64) __msa_ilvr_d((v2i64) src_c7, (v2i64) src_c5); - res_c7 = (v2f64) __msa_ilvl_d((v2i64) src_c7, (v2i64) src_c5); + ILVRL_D2_DP(src_c2, src_c0, res_c0, res_c1); + ILVRL_D2_DP(src_c3, src_c1, res_c2, res_c3); + ILVRL_D2_DP(src_c6, src_c4, res_c4, res_c5); + ILVRL_D2_DP(src_c7, src_c5, res_c6, res_c7); src_a0 = LD_DP(a + 0); src_a1 = (v2f64) __msa_splati_d((v2i64) src_a0, 1); @@ -788,15 +772,10 @@ static void dsolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO ST_DP(res_c3, b + 12); ST_DP(res_c7, b + 14); - src_c0 = (v2f64) __msa_ilvr_d((v2i64) res_c1, (v2i64) res_c0); - src_c2 = (v2f64) __msa_ilvl_d((v2i64) res_c1, (v2i64) res_c0); - src_c4 = (v2f64) __msa_ilvr_d((v2i64) res_c5, (v2i64) res_c4); - src_c6 = (v2f64) __msa_ilvl_d((v2i64) res_c5, (v2i64) res_c4); - - src_c1 = (v2f64) __msa_ilvr_d((v2i64) res_c3, (v2i64) res_c2); - src_c3 = (v2f64) 
__msa_ilvl_d((v2i64) res_c3, (v2i64) res_c2); - src_c5 = (v2f64) __msa_ilvr_d((v2i64) res_c7, (v2i64) res_c6); - src_c7 = (v2f64) __msa_ilvl_d((v2i64) res_c7, (v2i64) res_c6); + ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c2); + ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c3); + ILVRL_D2_DP(res_c5, res_c4, src_c4, src_c6); + ILVRL_D2_DP(res_c7, res_c6, src_c5, src_c7); ST_DP2(src_c0, src_c1, c, 2); ST_DP2(src_c2, src_c3, c + ldc, 2); @@ -813,7 +792,7 @@ static void dsolve_4x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO LD_DP2(c, 2, src_c0, src_c1); LD_DP2(c + ldc, 2, src_c2, src_c3); - if (bk > 0) + if (bk) { BLASLONG i; v2f64 src_a0, src_a1, src_b, src_b0; @@ -836,10 +815,8 @@ static void dsolve_4x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO } } - res_c0 = (v2f64) __msa_ilvr_d((v2i64) src_c2, (v2i64) src_c0); - res_c1 = (v2f64) __msa_ilvl_d((v2i64) src_c2, (v2i64) src_c0); - res_c2 = (v2f64) __msa_ilvr_d((v2i64) src_c3, (v2i64) src_c1); - res_c3 = (v2f64) __msa_ilvl_d((v2i64) src_c3, (v2i64) src_c1); + ILVRL_D2_DP(src_c2, src_c0, res_c0, res_c1); + ILVRL_D2_DP(src_c3, src_c1, res_c2, res_c3); src_a0 = LD_DP(a + 0); src_a1 = (v2f64) __msa_splati_d((v2i64) src_a0, 1); @@ -878,10 +855,8 @@ static void dsolve_4x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO ST_DP(res_c2, b + 4); ST_DP(res_c3, b + 6); - src_c0 = (v2f64) __msa_ilvr_d((v2i64) res_c1, (v2i64) res_c0); - src_c1 = (v2f64) __msa_ilvr_d((v2i64) res_c3, (v2i64) res_c2); - src_c2 = (v2f64) __msa_ilvl_d((v2i64) res_c1, (v2i64) res_c0); - src_c3 = (v2f64) __msa_ilvl_d((v2i64) res_c3, (v2i64) res_c2); + ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c2); + ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c3); ST_DP2(src_c0, src_c1, c, 2); ST_DP2(src_c2, src_c3, c + ldc, 2); @@ -889,31 +864,23 @@ static void dsolve_4x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void dsolve_4x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { - FLOAT c0, c1, c2, c3; - FLOAT a0, a1, a2, a3, 
a5, a6, a7, a10, a11, a15; + FLOAT a0, a1, a2, a3, a5, a6, a7, a10, a11, a15, c0, c1, c2, c3; c0 = *(c + 0); c1 = *(c + 1); c2 = *(c + 2); c3 = *(c + 3); - if (bk > 0) + if (bk) { BLASLONG i; - FLOAT a0, a1, a2, a3, b0; for (i = bk; i--;) { - a0 = a[0]; - a1 = a[1]; - a2 = a[2]; - a3 = a[3]; - - b0 = b[0]; - c0 -= a0 * b0; - c1 -= a1 * b0; - c2 -= a2 * b0; - c3 -= a3 * b0; + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; + c2 -= a[2] * b[0]; + c3 -= a[3] * b[0]; a += 4; b += 1; @@ -958,8 +925,7 @@ static void dsolve_4x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) static void dsolve_2x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { - FLOAT a0, a1, a3; - FLOAT c0, c1, c0_nxt1, c1_nxt1; + FLOAT a0, a1, a3, c0, c1, c0_nxt1, c1_nxt1; FLOAT c0_nxt2, c1_nxt2, c0_nxt3, c1_nxt3; c0 = *(c + 0); @@ -971,31 +937,20 @@ static void dsolve_2x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO c0_nxt3 = *(c + 3 * ldc); c1_nxt3 = *(c + 1 + 3 * ldc); - if (bk > 0) + if (bk) { BLASLONG i; - FLOAT a0, a1, b0, b1, b2, b3; for (i = bk; i--;) { - a0 = a[0]; - a1 = a[1]; - - b0 = b[0]; - c0 -= a0 * b0; - c1 -= a1 * b0; - - b1 = b[1]; - c0_nxt1 -= a0 * b1; - c1_nxt1 -= a1 * b1; - - b2 = b[2]; - c0_nxt2 -= a0 * b2; - c1_nxt2 -= a1 * b2; - - b3 = b[3]; - c0_nxt3 -= a0 * b3; - c1_nxt3 -= a1 * b3; + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; + c0_nxt1 -= a[0] * b[1]; + c1_nxt1 -= a[1] * b[1]; + c0_nxt2 -= a[0] * b[2]; + c1_nxt2 -= a[1] * b[2]; + c0_nxt3 -= a[0] * b[3]; + c1_nxt3 -= a[1] * b[3]; a += 2; b += 4; @@ -1033,21 +988,17 @@ static void dsolve_2x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO *(c + 0) = c0; *(c + 1) = c1; - *(c + 0 + ldc) = c0_nxt1; *(c + 1 + ldc) = c1_nxt1; - *(c + 0 + 2 * ldc) = c0_nxt2; *(c + 1 + 2 * ldc) = c1_nxt2; - *(c + 0 + 3 * ldc) = c0_nxt3; *(c + 1 + 3 * ldc) = c1_nxt3; } static void dsolve_2x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { - FLOAT a0, a1, a3; - FLOAT c0, c1, c0_nxt, c1_nxt; + FLOAT a0, 
a1, a3, c0, c1, c0_nxt, c1_nxt; c0 = *(c + 0); c1 = *(c + 1); @@ -1055,23 +1006,17 @@ static void dsolve_2x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO c0_nxt = *(c + ldc); c1_nxt = *(c + 1 + ldc); - if (bk > 0) + if (bk) { BLASLONG i; - FLOAT a0, a1, b0, b1; for (i = bk; i--;) { - a0 = a[0]; - a1 = a[1]; - - b0 = b[0]; - c0 -= a0 * b0; - c1 -= a1 * b0; + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; - b1 = b[1]; - c0_nxt -= a0 * b1; - c1_nxt -= a1 * b1; + c0_nxt -= a[0] * b[1]; + c1_nxt -= a[1] * b[1]; a += 2; b += 2; @@ -1109,19 +1054,14 @@ static void dsolve_2x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) c0 = *(c + 0); c1 = *(c + 1); - if (bk > 0) + if (bk) { BLASLONG i; - FLOAT a0, a1, b0; for (i = bk; i--;) { - a0 = a[0]; - a1 = a[1]; - - b0 = b[0]; - c0 -= a0 * b0; - c1 -= a1 * b0; + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; a += 2; b += 1; @@ -1145,63 +1085,60 @@ static void dsolve_2x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) static void dsolve_1x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { - FLOAT a0; - FLOAT c0, c0_nxt1, c0_nxt2, c0_nxt3; + FLOAT c0, c1, c2, c3; c0 = *(c + 0); - c0_nxt1 = *(c + 1 * ldc); - c0_nxt2 = *(c + 2 * ldc); - c0_nxt3 = *(c + 3 * ldc); + c1 = *(c + 1 * ldc); + c2 = *(c + 2 * ldc); + c3 = *(c + 3 * ldc); - if (bk > 0) + if (bk) { BLASLONG i; for (i = bk; i--;) { c0 -= a[0] * b[0]; - c0_nxt1 -= a[0] * b[1]; - c0_nxt2 -= a[0] * b[2]; - c0_nxt3 -= a[0] * b[3]; + c1 -= a[0] * b[1]; + c2 -= a[0] * b[2]; + c3 -= a[0] * b[3]; a += 1; b += 4; } } - a0 = *a; - - c0 *= a0; - c0_nxt1 *= a0; - c0_nxt2 *= a0; - c0_nxt3 *= a0; + c0 *= *a; + c1 *= *a; + c2 *= *a; + c3 *= *a; *(c + 0 * ldc) = c0; - *(c + 1 * ldc) = c0_nxt1; - *(c + 2 * ldc) = c0_nxt2; - *(c + 3 * ldc) = c0_nxt3; + *(c + 1 * ldc) = c1; + *(c + 2 * ldc) = c2; + *(c + 3 * ldc) = c3; *(b + 0) = c0; - *(b + 1) = c0_nxt1; - *(b + 2) = c0_nxt2; - *(b + 3) = c0_nxt3; + *(b + 1) = c1; + *(b + 2) = c2; + *(b + 3) = c3; } static void 
dsolve_1x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { - FLOAT c0, c0_nxt; + FLOAT c0, c1; c0 = *c; - c0_nxt = *(c + ldc); + c1 = *(c + ldc); - if (bk > 0) + if (bk) { BLASLONG i; for (i = bk; i--;) { c0 -= *a * b[0]; - c0_nxt -= *a * b[1]; + c1 -= *a * b[1]; a += 1; b += 2; @@ -1209,18 +1146,18 @@ static void dsolve_1x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO } c0 *= *a; - c0_nxt *= *a; + c1 *= *a; *(b + 0) = c0; - *(b + 1) = c0_nxt; + *(b + 1) = c1; *(c + 0) = c0; - *(c + ldc) = c0_nxt; + *(c + ldc) = c1; } static void dgmm_dsolve_1x1_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { - if (bk > 0) + if (bk) { BLASLONG i; diff --git a/kernel/mips/dtrsm_kernel_RN_8x4_msa.c b/kernel/mips/dtrsm_kernel_RN_8x4_msa.c index 659f77266..518daad13 100644 --- a/kernel/mips/dtrsm_kernel_RN_8x4_msa.c +++ b/kernel/mips/dtrsm_kernel_RN_8x4_msa.c @@ -43,7 +43,7 @@ static void dsolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO LD_DP4(c_nxt2line, 2, src_c8, src_c9, src_c10, src_c11); LD_DP4(c_nxt3line, 2, src_c12, src_c13, src_c14, src_c15); - if (bk > 0) + if (bk) { BLASLONG i; v2f64 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7; @@ -200,20 +200,26 @@ static void dsolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void dsolve_8x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; - v2f64 src_b0, src_b1, src_b3; + v2f64 src_b0, src_b1, src_b3, src_b; LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3); LD_DP4(c + ldc, 2, src_c4, src_c5, src_c6, src_c7); - if (bk > 0) + if (bk) { BLASLONG i; - v2f64 src_a0, src_a1, src_a2, src_a3, src_b, src_b0; + v2f64 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7; - for (i = bk; i--;) + LD_DP4(a, 2, src_a0, src_a1, src_a2, src_a3); + src_b0 = LD_DP(b); + + a += 8; + b += 2; + + for (i = (bk - 1); i--;) { - LD_DP4(a, 2, src_a0, src_a1, src_a2, src_a3); - 
src_b0 = LD_DP(b); + LD_DP4(a, 2, src_a4, src_a5, src_a6, src_a7); + src_b1 = LD_DP(b); src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); src_c0 -= src_a0 * src_b; @@ -227,9 +233,27 @@ static void dsolve_8x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_c6 -= src_a2 * src_b; src_c7 -= src_a3 * src_b; + src_a0 = src_a4; + src_a1 = src_a5; + src_a2 = src_a6; + src_a3 = src_a7; + src_b0 = src_b1; + a += 8; b += 2; } + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + src_c2 -= src_a2 * src_b; + src_c3 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c4 -= src_a0 * src_b; + src_c5 -= src_a1 * src_b; + src_c6 -= src_a2 * src_b; + src_c7 -= src_a3 * src_b; } src_b0 = LD_DP(b + 0); @@ -267,7 +291,7 @@ static void dsolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3); - if (bk > 0) + if (bk) { BLASLONG i; v2f64 src_a0, src_a1, src_a2, src_a3, src_b; @@ -311,7 +335,7 @@ static void dsolve_4x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO LD_DP2(c + 2 * ldc, 2, src_c4, src_c5); LD_DP2(c + 3 * ldc, 2, src_c6, src_c7); - if (bk > 0) + if (bk) { BLASLONG i; v2f64 src_a0, src_a1, src_b, src_b0, src_b1; @@ -405,7 +429,7 @@ static void dsolve_4x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO LD_DP2(c, 2, src_c0, src_c1); LD_DP2(c + ldc, 2, src_c2, src_c3); - if (bk > 0) + if (bk) { BLASLONG i; v2f64 src_a0, src_a1, src_b, src_b0; @@ -451,42 +475,33 @@ static void dsolve_4x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void dsolve_4x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { - FLOAT b0, c0, c1, c2, c3; + FLOAT c0, c1, c2, c3; c0 = *(c + 0); c1 = *(c + 1); c2 = *(c + 2); c3 = *(c + 3); - if (bk > 0) + if (bk) { BLASLONG i; - FLOAT a0, a1, a2, a3; for (i = bk; i--;) { - a0 = a[0]; - a1 = a[1]; - a2 = a[2]; - a3 = a[3]; - - b0 = b[0]; - c0 
-= a0 * b0; - c1 -= a1 * b0; - c2 -= a2 * b0; - c3 -= a3 * b0; + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; + c2 -= a[2] * b[0]; + c3 -= a[3] * b[0]; a += 4; b += 1; } } - b0 = *b; - - c0 *= b0; - c1 *= b0; - c2 *= b0; - c3 *= b0; + c0 *= *b; + c1 *= *b; + c2 *= *b; + c3 *= *b; *(a + 0) = c0; *(a + 1) = c1; @@ -514,31 +529,20 @@ static void dsolve_2x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO c0_nxt3 = *(c + 0 + 3 * ldc); c1_nxt3 = *(c + 1 + 3 * ldc); - if (bk > 0) + if (bk) { BLASLONG i; - FLOAT a0, a1, b0, b1, b2, b3; for (i = bk; i--;) { - a0 = a[0]; - a1 = a[1]; - - b0 = b[0]; - c0 -= a0 * b0; - c1 -= a1 * b0; - - b1 = b[1]; - c0_nxt1 -= a0 * b1; - c1_nxt1 -= a1 * b1; - - b2 = b[2]; - c0_nxt2 -= a0 * b2; - c1_nxt2 -= a1 * b2; - - b3 = b[3]; - c0_nxt3 -= a0 * b3; - c1_nxt3 -= a1 * b3; + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; + c0_nxt1 -= a[0] * b[1]; + c1_nxt1 -= a[1] * b[1]; + c0_nxt2 -= a[0] * b[2]; + c1_nxt2 -= a[1] * b[2]; + c0_nxt3 -= a[0] * b[3]; + c1_nxt3 -= a[1] * b[3]; a += 2; b += 4; @@ -590,13 +594,12 @@ static void dsolve_2x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO *(a + 7) = c1_nxt3; *(c + 0) = c0; - *(c + 1 * ldc) = c0_nxt1; - *(c + 2 * ldc) = c0_nxt2; - *(c + 3 * ldc) = c0_nxt3; - *(c + 1) = c1; + *(c + 1 * ldc) = c0_nxt1; *(c + 1 + 1 * ldc) = c1_nxt1; + *(c + 2 * ldc) = c0_nxt2; *(c + 1 + 2 * ldc) = c1_nxt2; + *(c + 3 * ldc) = c0_nxt3; *(c + 1 + 3 * ldc) = c1_nxt3; } @@ -606,27 +609,20 @@ static void dsolve_2x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO c0 = *(c + 0); c1 = *(c + 1); - c0_nxt = *(c + 0 + ldc); c1_nxt = *(c + 1 + ldc); - if (bk > 0) + if (bk) { BLASLONG i; - FLOAT a0, a1, b0, b1; for (i = bk; i--;) { - a0 = a[0]; - a1 = a[1]; - - b0 = b[0]; - c0 -= a0 * b0; - c1 -= a1 * b0; + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; - b1 = b[1]; - c0_nxt -= a0 * b1; - c1_nxt -= a1 * b1; + c0_nxt -= a[0] * b[1]; + c1_nxt -= a[1] * b[1]; a += 2; b += 2; @@ -653,7 +649,7 @@ static void 
dsolve_2x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO *(c + 0) = c0; *(c + 1) = c1; - *(c + ldc) = c0_nxt; + *(c + 0 + ldc) = c0_nxt; *(c + 1 + ldc) = c1_nxt; } @@ -664,19 +660,14 @@ static void dsolve_2x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) c0 = *(c + 0); c1 = *(c + 1); - if (bk > 0) + if (bk) { BLASLONG i; - FLOAT a0, a1, b0; for (i = bk; i--;) { - a0 = a[0]; - a1 = a[1]; - - b0 = b[0]; - c0 -= a0 * b0; - c1 -= a1 * b0; + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; a += 2; b += 1; @@ -697,24 +688,23 @@ static void dsolve_2x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) static void dsolve_1x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { - FLOAT b0, b1, b2, b3, b5, b6, b7, b10, b11, b15; - FLOAT c0, c0_nxt1, c0_nxt2, c0_nxt3; + FLOAT b0, b1, b2, b3, b5, b6, b7, b10, b11, b15, c0, c1, c2, c3; c0 = *(c + 0); - c0_nxt1 = *(c + 1 * ldc); - c0_nxt2 = *(c + 2 * ldc); - c0_nxt3 = *(c + 3 * ldc); + c1 = *(c + 1 * ldc); + c2 = *(c + 2 * ldc); + c3 = *(c + 3 * ldc); - if (bk > 0) + if (bk) { BLASLONG i; for (i = bk; i--;) { c0 -= a[0] * b[0]; - c0_nxt1 -= a[0] * b[1]; - c0_nxt2 -= a[0] * b[2]; - c0_nxt3 -= a[0] * b[3]; + c1 -= a[0] * b[1]; + c2 -= a[0] * b[2]; + c3 -= a[0] * b[3]; a += 1; b += 4; @@ -734,44 +724,44 @@ static void dsolve_1x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO c0 *= b0; - c0_nxt1 -= c0 * b1; - c0_nxt1 *= b5; + c1 -= c0 * b1; + c1 *= b5; - c0_nxt2 -= c0 * b2; - c0_nxt2 -= c0_nxt1 * b6; - c0_nxt2 *= b10; + c2 -= c0 * b2; + c2 -= c1 * b6; + c2 *= b10; - c0_nxt3 -= c0 * b3; - c0_nxt3 -= c0_nxt1 * b7; - c0_nxt3 -= c0_nxt2 * b11; - c0_nxt3 *= b15; + c3 -= c0 * b3; + c3 -= c1 * b7; + c3 -= c2 * b11; + c3 *= b15; *(a + 0) = c0; - *(a + 1) = c0_nxt1; - *(a + 2) = c0_nxt2; - *(a + 3) = c0_nxt3; + *(a + 1) = c1; + *(a + 2) = c2; + *(a + 3) = c3; *(c + 0) = c0; - *(c + 1 * ldc) = c0_nxt1; - *(c + 2 * ldc) = c0_nxt2; - *(c + 3 * ldc) = c0_nxt3; + *(c + 1 * ldc) = c1; + *(c + 2 * ldc) = c2; + *(c + 3 
* ldc) = c3; } static void dsolve_1x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { - FLOAT b0, b1, b3, c0, c0_nxt; + FLOAT b0, b1, b3, c0, c1; c0 = *c; - c0_nxt = *(c + ldc); + c1 = *(c + ldc); - if (bk > 0) + if (bk) { BLASLONG i; for (i = bk; i--;) { c0 -= *a * b[0]; - c0_nxt -= *a * b[1]; + c1 -= *a * b[1]; a += 1; b += 2; @@ -784,19 +774,19 @@ static void dsolve_1x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO c0 *= b0; - c0_nxt -= c0 * b1; - c0_nxt *= b3; + c1 -= c0 * b1; + c1 *= b3; *(a + 0) = c0; - *(a + 1) = c0_nxt; + *(a + 1) = c1; *(c + 0) = c0; - *(c + ldc) = c0_nxt; + *(c + ldc) = c1; } static void dgmm_dsolve_1x1_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { - if (bk > 0) + if (bk) { BLASLONG i; diff --git a/kernel/mips/dtrsm_kernel_RT_8x4_msa.c b/kernel/mips/dtrsm_kernel_RT_8x4_msa.c index a90d5fec3..bef87d44d 100644 --- a/kernel/mips/dtrsm_kernel_RT_8x4_msa.c +++ b/kernel/mips/dtrsm_kernel_RT_8x4_msa.c @@ -200,7 +200,7 @@ static void dsolve_8x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2); } -static void dsolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, int bk) +static void dsolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; v2f64 src_b0, src_b2, src_b3; @@ -210,10 +210,40 @@ static void dsolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, int bk if (bk > 0) { - v2f64 src_a0, src_a1, src_a2, src_a3, src_b, src_b0; + BLASLONG i; + FLOAT *pba = a, *pbb = b; + v2f64 src_b, src_b1, src_a0, src_a1, src_a2, src_a3; + v2f64 src_a4, src_a5, src_a6, src_a7; + + LD_DP4(pba, 2, src_a0, src_a1, src_a2, src_a3); + src_b0 = LD_DP(pbb); - LD_DP4(a + 16, 2, src_a0, src_a1, src_a2, src_a3); - src_b0 = LD_DP(b + 4); + for (i = bk - 1; i--;) + { + pba += 8; + pbb += 2; + + LD_DP4(pba, 2, src_a4, src_a5, src_a6, src_a7); + src_b1 = LD_DP(pbb); + + src_b = (v2f64) 
__msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + src_c2 -= src_a2 * src_b; + src_c3 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c4 -= src_a0 * src_b; + src_c5 -= src_a1 * src_b; + src_c6 -= src_a2 * src_b; + src_c7 -= src_a3 * src_b; + + src_a0 = src_a4; + src_a1 = src_a5; + src_a2 = src_a6; + src_a3 = src_a7; + src_b0 = src_b1; + } src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); src_c0 -= src_a0 * src_b; @@ -228,6 +258,9 @@ static void dsolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, int bk src_c7 -= src_a3 * src_b; } + a -= 16; + b -= 4; + src_b0 = __msa_cast_to_vector_double(*(b + 0)); src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); src_b2 = LD_DP(b + 2); @@ -256,13 +289,57 @@ static void dsolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, int bk ST_DP4(src_c4, src_c5, src_c6, src_c7, a + 8, 2); } -static void dsolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c) +static void dsolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { v2f64 src_c0, src_c1, src_c2, src_c3; v2f64 src_b0; LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3); + if (bk > 0) + { + BLASLONG i; + FLOAT *aa = a, *bb = b; + v2f64 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7; + v2f64 src_b1; + + LD_DP4(aa, 2, src_a0, src_a1, src_a2, src_a3); + src_b0 = LD_DP(bb); + + aa += 8; + bb += 1; + + for (i = (bk - 1); i--;) + { + LD_DP4(aa, 2, src_a4, src_a5, src_a6, src_a7); + src_b1 = LD_DP(bb); + + src_b0 = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + src_c2 -= src_a2 * src_b0; + src_c3 -= src_a3 * src_b0; + + src_a0 = src_a4; + src_a1 = src_a5; + src_a2 = src_a6; + src_a3 = src_a7; + src_b0 = src_b1; + + aa += 8; + bb += 1; + } + + src_b0 = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + src_c2 -= src_a2 * src_b0; 
+ src_c3 -= src_a3 * src_b0; + } + + a -= 8; + b -= 1; + src_b0 = __msa_cast_to_vector_double(*b); src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); @@ -289,7 +366,7 @@ static void dsolve_4x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO if (bk > 0) { BLASLONG i; - FLOAT *aa = a + 16, *bb = b + 16; + FLOAT *aa = a, *bb = b; v2f64 src_a0, src_a1, src_b, src_b0, src_b1; for (i = bk; i--;) @@ -318,6 +395,9 @@ static void dsolve_4x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO } } + a -= 16; + b -= 16; + src_b12 = LD_DP(b + 12); src_b13 = (v2f64) __msa_splati_d((v2i64) src_b12, 1); src_b12 = (v2f64) __msa_splati_d((v2i64) src_b12, 0); @@ -376,7 +456,7 @@ static void dsolve_4x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2); } -static void dsolve_4x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, int bk) +static void dsolve_4x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { v2f64 src_c0, src_c1, src_c2, src_c3, src_b0, src_b2, src_b3; @@ -385,20 +465,31 @@ static void dsolve_4x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, int bk if (bk > 0) { + BLASLONG i; + FLOAT *aa = a, *bb = b; v2f64 src_a0, src_a1, src_b, src_b0; - LD_DP2(a + 8, 2, src_a0, src_a1); - src_b0 = LD_DP(b + 4); + for (i = bk; i--;) + { + LD_DP2(aa, 2, src_a0, src_a1); + src_b0 = LD_DP(bb); - src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); - src_c0 -= src_a0 * src_b; - src_c1 -= src_a1 * src_b; + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; - src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); - src_c2 -= src_a0 * src_b; - src_c3 -= src_a1 * src_b; + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c2 -= src_a0 * src_b; + src_c3 -= src_a1 * src_b; + + aa += 4; + bb += 2; + } } + a -= 8; + b -= 4; + src_b0 = __msa_cast_to_vector_double(*(b + 0)); src_b0 = (v2f64) 
__msa_splati_d((v2i64) src_b0, 0); src_b2 = LD_DP(b + 2); @@ -420,17 +511,36 @@ static void dsolve_4x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, int bk ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2); } -static void dsolve_4x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c) +static void dsolve_4x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { FLOAT b0, c0, c1, c2, c3; - b0 = *(b + 0); - c0 = *(c + 0); c1 = *(c + 1); c2 = *(c + 2); c3 = *(c + 3); + if (bk > 0) + { + BLASLONG i; + FLOAT *aa = a, *bb = b; + + for (i = bk; i--;) + { + c0 -= aa[0] * bb[0]; + c1 -= aa[1] * bb[0]; + c2 -= aa[2] * bb[0]; + c3 -= aa[3] * bb[0]; + + aa += 4; + bb += 1; + } + } + + a -= 4; + + b0 = *(b - 1); + c0 *= b0; c1 *= b0; c2 *= b0; @@ -464,35 +574,27 @@ static void dsolve_2x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO if (bk > 0) { BLASLONG i; - FLOAT *aa = a + 8, *bb = b + 16; - FLOAT a0, a1, b0, b1, b2, b3; + FLOAT *aa = a, *bb = b; for (i = bk; i--;) { - a0 = aa[0]; - a1 = aa[1]; - - b0 = bb[0]; - c0 -= a0 * b0; - c1 -= a1 * b0; - - b1 = bb[1]; - c0_nxt1 -= a0 * b1; - c1_nxt1 -= a1 * b1; - - b2 = bb[2]; - c0_nxt2 -= a0 * b2; - c1_nxt2 -= a1 * b2; - - b3 = bb[3]; - c0_nxt3 -= a0 * b3; - c1_nxt3 -= a1 * b3; + c0 -= aa[0] * bb[0]; + c1 -= aa[1] * bb[0]; + c0_nxt1 -= aa[0] * bb[1]; + c1_nxt1 -= aa[1] * bb[1]; + c0_nxt2 -= aa[0] * bb[2]; + c1_nxt2 -= aa[1] * bb[2]; + c0_nxt3 -= aa[0] * bb[3]; + c1_nxt3 -= aa[1] * bb[3]; aa += 2; bb += 4; } } + a -= 8; + b -= 16; + b0 = *b; b4 = *(b + 4); b5 = *(b + 5); @@ -539,44 +641,44 @@ static void dsolve_2x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO *(c + 0) = c0; *(c + 1) = c1; - *(c + 0 + 1 * ldc) = c0_nxt1; *(c + 1 + 1 * ldc) = c1_nxt1; - *(c + 0 + 2 * ldc) = c0_nxt2; *(c + 1 + 2 * ldc) = c1_nxt2; - *(c + 0 + 3 * ldc) = c0_nxt3; *(c + 1 + 3 * ldc) = c1_nxt3; } -static void dsolve_2x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, int bk) +static void dsolve_2x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG 
ldc, BLASLONG bk) { - FLOAT b0, b2, b3; - FLOAT c0, c1, c0_nxt, c1_nxt; + FLOAT b0, b2, b3, c0, c1, c0_nxt, c1_nxt; c0 = *(c + 0); c1 = *(c + 1); - c0_nxt = *(c + 0 + ldc); c1_nxt = *(c + 1 + ldc); if (bk > 0) { - FLOAT a0, a1, b0, b1; + BLASLONG i; + FLOAT *aa = a, *bb = b; - a0 = a[4]; - a1 = a[5]; + for (i = bk; i--;) + { + c0 -= aa[0] * bb[0]; + c1 -= aa[1] * bb[0]; - b0 = b[4]; - c0 -= a0 * b0; - c1 -= a1 * b0; + c0_nxt -= aa[0] * bb[1]; + c1_nxt -= aa[1] * bb[1]; - b1 = b[5]; - c0_nxt -= a0 * b1; - c1_nxt -= a1 * b1; + aa += 2; + bb += 2; + } } + a -= 4; + b -= 4; + b3 = *(b + 3); b2 = *(b + 2); b0 = *b; @@ -601,20 +703,35 @@ static void dsolve_2x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, int bk *(c + 1 + ldc) = c1_nxt; } -static void dsolve_2x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c) +static void dsolve_2x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { FLOAT b0, c0, c1; c0 = *(c + 0); c1 = *(c + 1); - b0 = *b; + if (bk > 0) + { + BLASLONG i; + FLOAT *aa = a, *bb = b; + + for (i = bk; i--;) + { + c0 -= aa[0] * bb[0]; + c1 -= aa[1] * bb[0]; + + aa += 2; + bb += 1; + } + } + + b0 = *(b - 1); c0 *= b0; c1 *= b0; - *(a + 0) = c0; - *(a + 1) = c1; + *(a - 2) = c0; + *(a - 1) = c1; *(c + 0) = c0; *(c + 1) = c1; @@ -622,31 +739,33 @@ static void dsolve_2x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c) static void dsolve_1x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { - FLOAT b0, b4, b5, b8, b9, b10, b12, b13, b14, b15; - FLOAT c0, c0_nxt1, c0_nxt2, c0_nxt3; + FLOAT b0, b4, b5, b8, b9, b10, b12, b13, b14, b15, c0, c1, c2, c3; c0 = *(c + 0); - c0_nxt1 = *(c + 1 * ldc); - c0_nxt2 = *(c + 2 * ldc); - c0_nxt3 = *(c + 3 * ldc); + c1 = *(c + 1 * ldc); + c2 = *(c + 2 * ldc); + c3 = *(c + 3 * ldc); if (bk > 0) { BLASLONG i; - FLOAT *aa = a + 4, *bb = b + 16; + FLOAT *aa = a, *bb = b; for (i = bk; i--;) { c0 -= aa[0] * bb[0]; - c0_nxt1 -= aa[0] * bb[1]; - c0_nxt2 -= aa[0] * bb[2]; - c0_nxt3 -= aa[0] * bb[3]; + c1 -= aa[0] * bb[1]; + c2 -= aa[0] * 
bb[2]; + c3 -= aa[0] * bb[3]; aa += 1; bb += 4; } } + a -= 4; + b -= 16; + b0 = *b; b4 = *(b + 4); b5 = *(b + 5); @@ -658,58 +777,86 @@ static void dsolve_1x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO b14 = *(b + 14); b15 = *(b + 15); - c0_nxt3 *= b15; + c3 *= b15; - c0_nxt2 -= c0_nxt3 * b14; - c0_nxt2 *= b10; + c2 -= c3 * b14; + c2 *= b10; - c0_nxt1 -= c0_nxt3 * b13; - c0_nxt1 -= c0_nxt2 * b9; - c0_nxt1 *= b5; + c1 -= c3 * b13; + c1 -= c2 * b9; + c1 *= b5; - c0 -= c0_nxt3 * b12; - c0 -= c0_nxt2 * b8; - c0 -= c0_nxt1 * b4; + c0 -= c3 * b12; + c0 -= c2 * b8; + c0 -= c1 * b4; c0 *= b0; *(a + 0) = c0; - *(a + 1) = c0_nxt1; - *(a + 2) = c0_nxt2; - *(a + 3) = c0_nxt3; - - *(c) = c0; - *(c + 1 * ldc) = c0_nxt1; - *(c + 2 * ldc) = c0_nxt2; - *(c + 3 * ldc) = c0_nxt3; + *(a + 1) = c1; + *(a + 2) = c2; + *(a + 3) = c3; + + *(c + 0 * ldc) = c0; + *(c + 1 * ldc) = c1; + *(c + 2 * ldc) = c2; + *(c + 3 * ldc) = c3; } static void dsolve_1x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { - FLOAT b0, b2, b3, c0, c0_nxt; + FLOAT b0, b2, b3, c0, c1; c0 = *(c + 0); - c0_nxt = *(c + ldc); + c1 = *(c + ldc); if (bk > 0) { - c0 -= a[2] * b[4]; - c0_nxt -= a[2] * b[5]; + BLASLONG i; + FLOAT *aa = a, *bb = b; + + for (i = bk; i--;) + { + c0 -= *aa * bb[0]; + c1 -= *aa * bb[1]; + + aa += 1; + bb += 2; + } } + a -= 2; + b -= 4; + b3 = *(b + 3); b2 = *(b + 2); b0 = *b; - c0_nxt *= b3; + c1 *= b3; - c0 -= c0_nxt * b2; + c0 -= c1 * b2; c0 *= b0; *(a + 0) = c0; - *(a + 1) = c0_nxt; + *(a + 1) = c1; *(c + 0) = c0; - *(c + ldc) = c0_nxt; + *(c + ldc) = c1; +} + +static void dsolve_1x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + if (bk > 0) + { + BLASLONG i; + + for (i = 0; i < bk; i++) + { + *c -= a[i] * b[i]; + } + } + + *c *= *(b - 1); + *(a - 1) = *c; } int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, @@ -729,12 +876,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, aa = a; c -= 
ldc; b -= k; - bb = b + (kk - 1); + bb = b + kk; cc = c; for (i = (m >> 3); i--;) { - dsolve_8x1_rt_msa(aa + 8 * kk - 8, bb, cc); + dsolve_8x1_rt_msa(aa + 8 * kk, bb, cc, k - kk); aa += 8 * k; cc += 8; @@ -744,7 +891,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, { if (m & 4) { - dsolve_4x1_rt_msa(aa + 4 * kk - 4, bb, cc); + dsolve_4x1_rt_msa(aa + 4 * kk, bb, cc, k - kk); aa += 4 * k; cc += 4; @@ -752,7 +899,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, if (m & 2) { - dsolve_2x1_rt_msa(aa + 2 * kk - 2, bb, cc); + dsolve_2x1_rt_msa(aa + 2 * kk, bb, cc, k - kk); aa += 2 * k; cc += 2; @@ -760,8 +907,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, if (m & 1) { - *cc *= *bb; - *(aa + kk - 1) = *cc; + dsolve_1x1_rt_msa(aa + kk, bb, cc, k - kk); aa += k; cc += 1; @@ -782,7 +928,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, for (i = (m >> 3); i--;) { - dsolve_8x2_rt_msa(aa + 8 * kk - 16, bb - 4, cc, ldc, k - kk); + dsolve_8x2_rt_msa(aa + 8 * kk, bb, cc, ldc, k - kk); aa += 8 * k; cc += 8; @@ -792,7 +938,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, { if (m & 4) { - dsolve_4x2_rt_msa(aa + 4 * kk - 8, bb - 4, cc, ldc, k - kk); + dsolve_4x2_rt_msa(aa + 4 * kk, bb, cc, ldc, k - kk); aa += 4 * k; cc += 4; @@ -800,7 +946,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, if (m & 2) { - dsolve_2x2_rt_msa(aa + 2 * kk - 4, bb - 4, cc, ldc, k - kk); + dsolve_2x2_rt_msa(aa + 2 * kk, bb, cc, ldc, k - kk); aa += 2 * k; cc += 2; @@ -808,7 +954,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, if (m & 1) { - dsolve_1x2_rt_msa(aa + kk - 2, bb - 4, cc, ldc, k - kk); + dsolve_1x2_rt_msa(aa + kk, bb, cc, ldc, k - kk); + + aa += k; + cc += 1; } } @@ -836,7 +985,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, 
FLOAT *b, { if (m & 4) { - dsolve_4x4_rt_msa(aa + kk * 4 - 16, bb - 16, cc, ldc, k - kk); + dsolve_4x4_rt_msa(aa + kk * 4, bb, cc, ldc, k - kk); aa += 4 * k; cc += 4; @@ -844,7 +993,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, if (m & 2) { - dsolve_2x4_rt_msa(aa + kk * 2 - 8, bb - 16, cc, ldc, k - kk); + dsolve_2x4_rt_msa(aa + kk * 2, bb, cc, ldc, k - kk); aa += 2 * k; cc += 2; @@ -852,7 +1001,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, if (m & 1) { - dsolve_1x4_rt_msa(aa + kk - 4, bb - 16, cc, ldc, k - kk); + dsolve_1x4_rt_msa(aa + kk, bb, cc, ldc, k - kk); aa += k; cc += 1; diff --git a/kernel/mips/macros_msa.h b/kernel/mips/macros_msa.h index 3bcc59629..d3a4022d6 100644 --- a/kernel/mips/macros_msa.h +++ b/kernel/mips/macros_msa.h @@ -76,4 +76,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ST_DP4(in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ } +/* Description : Interleave both left and right half of input vectors + Arguments : Inputs - in0, in1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Right half of byte elements from 'in0' and 'in1' are + interleaved and written to 'out0' +*/ +#define ILVRL_D2(RTYPE, in0, in1, out0, out1) \ +{ \ + out0 = (RTYPE) __msa_ilvr_d((v2i64) in0, (v2i64) in1); \ + out1 = (RTYPE) __msa_ilvl_d((v2i64) in0, (v2i64) in1); \ +} +#define ILVRL_D2_DP(...) 
ILVRL_D2(v2f64, __VA_ARGS__) + #endif /* __MACROS_MSA_H__ */ From 6a2bde7a2de5a0dc5ae95d6b78884e53426129ad Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Tue, 17 May 2016 14:45:27 +0200 Subject: [PATCH 33/70] optimized dgemm and dgetrf for POWER8 --- common.h | 7 +++ kernel/power/dgemm_logic_16x4_power8.S | 63 ++++++++++++--------- kernel/power/dgemm_ncopy_macros_4_power8.S | 7 +++ kernel/power/dgemm_tcopy_16_power8.S | 2 +- kernel/power/dgemm_tcopy_logic_16_power8.S | 24 ++++---- kernel/power/dgemm_tcopy_macros_16_power8.S | 40 ++++++------- lapack/getrf/getrf_parallel_omp.c | 9 ++- 7 files changed, 90 insertions(+), 62 deletions(-) diff --git a/common.h b/common.h index c6f7ea2fd..a7342db2c 100644 --- a/common.h +++ b/common.h @@ -332,6 +332,13 @@ typedef int blasint; #endif #endif +#ifdef POWER8 +#ifndef YIELDING +#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n"); +#endif +#endif + + /* #ifdef PILEDRIVER #ifndef YIELDING diff --git a/kernel/power/dgemm_logic_16x4_power8.S b/kernel/power/dgemm_logic_16x4_power8.S index edfcc4bcc..cacfab1f6 100644 --- a/kernel/power/dgemm_logic_16x4_power8.S +++ b/kernel/power/dgemm_logic_16x4_power8.S @@ -33,6 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * LAPACK-TEST : OK **************************************************************************************/ +#define MY_ALIGN .align 3 srawi. J, N, 2 ble LDGEMM_L4_END @@ -53,7 +54,7 @@ LDGEMM_L4_BEGIN: srawi. 
I, M, 4 ble LDGEMM_L4x16_END - .align 4 + MY_ALIGN LDGEMM_L4x16_BEGIN_FIRST: li L, -128 @@ -90,7 +91,7 @@ LDGEMM_L4x16_BEGIN_FIRST: cmpwi cr0, L, 1 ble LDGEMM_L4x16_SUB4_FIRST - .align 4 + MY_ALIGN LDGEMM_L4x16_LOOP_START_FIRST: li T2, 512 @@ -115,7 +116,7 @@ LDGEMM_L4x16_LOOP_START_FIRST: ble LDGEMM_L4x16_LOOP_END_FIRST mtctr L - .align 4 + MY_ALIGN LDGEMM_L4x16_LOOP_FIRST: @@ -132,7 +133,7 @@ LDGEMM_L4x16_LOOP_FIRST: bdnz LDGEMM_L4x16_LOOP_FIRST - .align 4 + MY_ALIGN LDGEMM_L4x16_LOOP_END_FIRST: @@ -175,7 +176,7 @@ LDGEMM_L4x16_SUB2_FIRST: addic. L, L, -1 bgt LDGEMM_L4x16_SUB2_FIRST - .align 4 + MY_ALIGN LDGEMM_L4x16_SAVE_FIRST: SAVE4x16 @@ -185,7 +186,8 @@ LDGEMM_L4x16_SAVE_FIRST: LDGEMM_L4x16_END_FIRST: - .align 4 + MY_ALIGN + LDGEMM_L4x16_BEGIN: li L, -128 @@ -222,7 +224,8 @@ LDGEMM_L4x16_BEGIN: cmpwi cr0, L, 1 ble- LDGEMM_L4x16_SUB4 - .align 4 + MY_ALIGN + LDGEMM_L4x16_LOOP_START: li o40, 40 @@ -239,20 +242,19 @@ LDGEMM_L4x16_LOOP_START: ble- LDGEMM_L4x16_LOOP_END mtctr L - .align 4 + MY_ALIGN LDGEMM_L4x16_LOOP: - dcbt AO, PRE KERNEL4x16_L1 dcbt AO, PRE - // addic. L, L, -1 KERNEL4x16_L2 bdnz+ LDGEMM_L4x16_LOOP - .align 4 + + MY_ALIGN LDGEMM_L4x16_LOOP_END: @@ -261,6 +263,8 @@ LDGEMM_L4x16_LOOP_END: b LDGEMM_L4x16_SUB1 + MY_ALIGN + LDGEMM_L4x16_SUB4: KERNEL4x16_SUBI1 @@ -268,6 +272,8 @@ LDGEMM_L4x16_SUB4: b LDGEMM_L4x16_SUB1 + MY_ALIGN + LDGEMM_L4x16_SUB0: andi. L, K, 1 @@ -278,11 +284,15 @@ LDGEMM_L4x16_SUB0: ble LDGEMM_L4x16_SAVE b LDGEMM_L4x16_SUB2 + MY_ALIGN + LDGEMM_L4x16_SUB1: andi. L, K, 1 ble LDGEMM_L4x16_SAVE + MY_ALIGN + LDGEMM_L4x16_SUB2: KERNEL4x16_SUB1 @@ -290,7 +300,8 @@ LDGEMM_L4x16_SUB2: addic. L, L, -1 bgt LDGEMM_L4x16_SUB2 - .align 4 + MY_ALIGN + LDGEMM_L4x16_SAVE: SAVE4x16 @@ -334,7 +345,7 @@ LDGEMM_L4x8_LOOP_START: addic. L, L, -2 ble LDGEMM_L4x8_LOOP_END - .align 5 + MY_ALIGN LDGEMM_L4x8_LOOP: @@ -441,7 +452,7 @@ LDGEMM_L4x4_LOOP_START: addic. 
L, L, -2 ble LDGEMM_L4x4_LOOP_END - .align 5 + MY_ALIGN LDGEMM_L4x4_LOOP: @@ -543,7 +554,7 @@ LDGEMM_L4x2_LOOP_START: addic. L, L, -2 ble LDGEMM_L4x2_LOOP_END - .align 5 + MY_ALIGN LDGEMM_L4x2_LOOP: @@ -643,7 +654,7 @@ LDGEMM_L4x1_LOOP_START: addic. L, L, -2 ble LDGEMM_L4x1_LOOP_END - .align 5 + MY_ALIGN LDGEMM_L4x1_LOOP: @@ -778,7 +789,7 @@ LDGEMM_L2x16_LOOP_START: addic. L, L, -2 ble LDGEMM_L2x16_LOOP_END - .align 5 + MY_ALIGN LDGEMM_L2x16_LOOP: @@ -907,7 +918,7 @@ LDGEMM_L2x8_LOOP_START: addic. L, L, -2 ble LDGEMM_L2x8_LOOP_END - .align 5 + MY_ALIGN LDGEMM_L2x8_LOOP: @@ -1011,7 +1022,7 @@ LDGEMM_L2x4_LOOP_START: addic. L, L, -2 ble LDGEMM_L2x4_LOOP_END - .align 5 + MY_ALIGN LDGEMM_L2x4_LOOP: @@ -1111,7 +1122,7 @@ LDGEMM_L2x2_LOOP_START: addic. L, L, -2 ble LDGEMM_L2x2_LOOP_END - .align 5 + MY_ALIGN LDGEMM_L2x2_LOOP: @@ -1211,7 +1222,7 @@ LDGEMM_L2x1_LOOP_START: addic. L, L, -2 ble LDGEMM_L2x1_LOOP_END - .align 5 + MY_ALIGN LDGEMM_L2x1_LOOP: @@ -1331,7 +1342,7 @@ LDGEMM_L1x16_LOOP_START: addic. L, L, -2 ble LDGEMM_L1x16_LOOP_END - .align 5 + MY_ALIGN LDGEMM_L1x16_LOOP: @@ -1460,7 +1471,7 @@ LDGEMM_L1x8_LOOP_START: addic. L, L, -2 ble LDGEMM_L1x8_LOOP_END - .align 5 + MY_ALIGN LDGEMM_L1x8_LOOP: @@ -1564,7 +1575,7 @@ LDGEMM_L1x4_LOOP_START: addic. L, L, -2 ble LDGEMM_L1x4_LOOP_END - .align 5 + MY_ALIGN LDGEMM_L1x4_LOOP: @@ -1664,7 +1675,7 @@ LDGEMM_L1x2_LOOP_START: addic. L, L, -2 ble LDGEMM_L1x2_LOOP_END - .align 5 + MY_ALIGN LDGEMM_L1x2_LOOP: @@ -1764,7 +1775,7 @@ LDGEMM_L1x1_LOOP_START: addic. L, L, -2 ble LDGEMM_L1x1_LOOP_END - .align 5 + MY_ALIGN LDGEMM_L1x1_LOOP: diff --git a/kernel/power/dgemm_ncopy_macros_4_power8.S b/kernel/power/dgemm_ncopy_macros_4_power8.S index 9b07d73f5..fafb09877 100644 --- a/kernel/power/dgemm_ncopy_macros_4_power8.S +++ b/kernel/power/dgemm_ncopy_macros_4_power8.S @@ -127,6 +127,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxpermdi vs62, vs7, vs15, 3 xxpermdi vs63, vs23, vs31, 3 + dcbt BO, PREB stxvd2x vs32, o0, BO stxvd2x vs33, o16, BO @@ -138,6 +139,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs39, o112, BO addi BO, BO, 128 + dcbt BO, PREB + stxvd2x vs40, o0, BO stxvd2x vs41, o16, BO stxvd2x vs42, o32, BO @@ -148,6 +151,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs47, o112, BO addi BO, BO, 128 + dcbt BO, PREB + stxvd2x vs48, o0, BO stxvd2x vs49, o16, BO stxvd2x vs50, o32, BO @@ -158,6 +163,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs55, o112, BO addi BO, BO, 128 + dcbt BO, PREB + stxvd2x vs56, o0, BO stxvd2x vs57, o16, BO stxvd2x vs58, o32, BO diff --git a/kernel/power/dgemm_tcopy_16_power8.S b/kernel/power/dgemm_tcopy_16_power8.S index eca78bac4..eb37877e0 100644 --- a/kernel/power/dgemm_tcopy_16_power8.S +++ b/kernel/power/dgemm_tcopy_16_power8.S @@ -170,7 +170,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add B2, B2, B add B1, B1, B - li PREA, 256 + li PREA, 384 addi PREB, M16, 128 li o8, 8 diff --git a/kernel/power/dgemm_tcopy_logic_16_power8.S b/kernel/power/dgemm_tcopy_logic_16_power8.S index 28fc74793..3c34a6167 100644 --- a/kernel/power/dgemm_tcopy_logic_16_power8.S +++ b/kernel/power/dgemm_tcopy_logic_16_power8.S @@ -52,31 +52,31 @@ DCOPYT_L4_BEGIN: ble DCOPYT_L4x8_BEGIN mr BO, B16 + addi T2, M16, 384 + mtctr J .align 5 DCOPYT_L4x16_LOOP: -/* - addi T1, PREB, 128 - addi T2, PREB, 256 -*/ + addi T1, M16, 256 + dcbt A0, PREA dcbt A1, PREA dcbt A2, PREA dcbt A3, PREA -/* - dcbtst BO, M16 - dcbtst BO, PREB - dcbtst BO, T1 - dcbtst BO, T2 -*/ + + dcbt BO, M16 + dcbt BO, PREB + dcbt BO, T1 + dcbt BO, T2 + COPY_4x16 add BO, BO, M16 - addic. J, J, -1 - bgt DCOPYT_L4x16_LOOP + // addic. 
J, J, -1 + bdnz+ DCOPYT_L4x16_LOOP DCOPYT_L4x8_BEGIN: diff --git a/kernel/power/dgemm_tcopy_macros_16_power8.S b/kernel/power/dgemm_tcopy_macros_16_power8.S index aef03d7cf..333e23105 100644 --- a/kernel/power/dgemm_tcopy_macros_16_power8.S +++ b/kernel/power/dgemm_tcopy_macros_16_power8.S @@ -46,52 +46,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvd2x vs35, o48, A0 addi A0, A0, 64 - lxvd2x vs36, o0, A0 - lxvd2x vs37, o16, A0 - lxvd2x vs38, o32, A0 - lxvd2x vs39, o48, A0 - addi A0, A0, 64 - - lxvd2x vs40, o0, A1 lxvd2x vs41, o16, A1 lxvd2x vs42, o32, A1 lxvd2x vs43, o48, A1 addi A1, A1, 64 - lxvd2x vs44, o0, A1 - lxvd2x vs45, o16, A1 - lxvd2x vs46, o32, A1 - lxvd2x vs47, o48, A1 - addi A1, A1, 64 - - lxvd2x vs48, o0, A2 lxvd2x vs49, o16, A2 lxvd2x vs50, o32, A2 lxvd2x vs51, o48, A2 addi A2, A2, 64 - lxvd2x vs52, o0, A2 - lxvd2x vs53, o16, A2 - lxvd2x vs54, o32, A2 - lxvd2x vs55, o48, A2 - addi A2, A2, 64 - - lxvd2x vs56, o0, A3 lxvd2x vs57, o16, A3 lxvd2x vs58, o32, A3 lxvd2x vs59, o48, A3 addi A3, A3, 64 + lxvd2x vs36, o0, A0 + lxvd2x vs37, o16, A0 + lxvd2x vs38, o32, A0 + lxvd2x vs39, o48, A0 + addi A0, A0, 64 + + lxvd2x vs44, o0, A1 + lxvd2x vs45, o16, A1 + lxvd2x vs46, o32, A1 + lxvd2x vs47, o48, A1 + addi A1, A1, 64 + + lxvd2x vs52, o0, A2 + lxvd2x vs53, o16, A2 + lxvd2x vs54, o32, A2 + lxvd2x vs55, o48, A2 + addi A2, A2, 64 + lxvd2x vs60, o0, A3 lxvd2x vs61, o16, A3 lxvd2x vs62, o32, A3 lxvd2x vs63, o48, A3 addi A3, A3, 64 - mr T1, BO stxvd2x vs32, o0, T1 diff --git a/lapack/getrf/getrf_parallel_omp.c b/lapack/getrf/getrf_parallel_omp.c index 7e2319718..6b8cbda2f 100644 --- a/lapack/getrf/getrf_parallel_omp.c +++ b/lapack/getrf/getrf_parallel_omp.c @@ -173,10 +173,17 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, blocking = (mn / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); if (blocking > GEMM_Q) blocking = GEMM_Q; - if (blocking <= GEMM_UNROLL_N * 2) { +#ifdef POWER8 + if (blocking <= 
GEMM_UNROLL_N) { info = GETF2(args, NULL, range_n, sa, sb, 0); return info; } +#else + if (blocking <= GEMM_UNROLL_N*2) { + info = GETF2(args, NULL, range_n, sa, sb, 0); + return info; + } +#endif sbb = (FLOAT *)((((BLASULONG)(sb + blocking * blocking * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); From 956be69e1dd0ab298f8f6f5d37119a266166349d Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Tue, 17 May 2016 16:19:53 +0200 Subject: [PATCH 34/70] optimized getrf_single.c for POWER8 --- lapack/getrf/getrf_single.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/lapack/getrf/getrf_single.c b/lapack/getrf/getrf_single.c index e60a16c11..9f0f36b78 100644 --- a/lapack/getrf/getrf_single.c +++ b/lapack/getrf/getrf_single.c @@ -77,10 +77,17 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, blocking = (mn / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); if (blocking > GEMM_Q) blocking = GEMM_Q; +#ifdef POWER8 + if (blocking <= GEMM_UNROLL_N) { + info = GETF2(args, NULL, range_n, sa, sb, 0); + return info; + } +#else if (blocking <= GEMM_UNROLL_N * 2) { info = GETF2(args, NULL, range_n, sa, sb, 0); return info; } +#endif sbb = (FLOAT *)((((BLASULONG)(sb + blocking * blocking * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); From c4ba40e308844aab12d67118cedcb7ffda3be24e Mon Sep 17 00:00:00 2001 From: Shivraj Patil Date: Thu, 19 May 2016 11:04:42 +0530 Subject: [PATCH 35/70] SGEMM optimization for MIPS P5600 and I6400 using MSA. 
Unrolled k loop in DGEMM kernel function Signed-off-by: Shivraj Patil --- kernel/mips/KERNEL.P5600 | 10 +- kernel/mips/dgemm_kernel_8x4_msa.c | 416 ++++++- kernel/mips/macros_msa.h | 67 ++ kernel/mips/sgemm_kernel_8x8_msa.c | 1806 ++++++++++++++++++++++++++++ kernel/mips/sgemm_ncopy_8_msa.c | 177 +++ kernel/mips/sgemm_tcopy_8_msa.c | 292 +++++ param.h | 8 +- 7 files changed, 2755 insertions(+), 21 deletions(-) create mode 100644 kernel/mips/sgemm_kernel_8x8_msa.c create mode 100644 kernel/mips/sgemm_ncopy_8_msa.c create mode 100644 kernel/mips/sgemm_tcopy_8_msa.c diff --git a/kernel/mips/KERNEL.P5600 b/kernel/mips/KERNEL.P5600 index 0ac30d77c..d7d49055f 100644 --- a/kernel/mips/KERNEL.P5600 +++ b/kernel/mips/KERNEL.P5600 @@ -85,11 +85,11 @@ DTRMMKERNEL = ../generic/trmmkernel_2x2.c CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c -SGEMMKERNEL = ../generic/gemmkernel_2x2.c -SGEMMONCOPY = ../generic/gemm_ncopy_2.c -SGEMMOTCOPY = ../generic/gemm_tcopy_2.c -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o +SGEMMKERNEL = ../mips/sgemm_kernel_8x8_msa.c +SGEMMONCOPY = ../mips/sgemm_ncopy_8_msa.c +SGEMMOTCOPY = ../mips/sgemm_tcopy_8_msa.c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o DGEMMKERNEL = ../mips/dgemm_kernel_8x4_msa.c DGEMMINCOPY = ../mips/dgemm_ncopy_8_msa.c diff --git a/kernel/mips/dgemm_kernel_8x4_msa.c b/kernel/mips/dgemm_kernel_8x4_msa.c index 8d9e3455e..1f0a2aee6 100644 --- a/kernel/mips/dgemm_kernel_8x4_msa.c +++ b/kernel/mips/dgemm_kernel_8x4_msa.c @@ -90,7 +90,70 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pa0 += 8; pb0 += 4; - for (l = (k - 1); l--;) + for (l = ((k - 1) / 2); l--;) + { + LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3); + LD_DP2(pb0, 2, src_b0, src_b1); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + res0 += src_a0 * src_b; + res1 += src_a1 * src_b; + res2 += src_a2 * src_b; + res3 += src_a3 * src_b; + + src_b 
= (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + res4 += src_a0 * src_b; + res5 += src_a1 * src_b; + res6 += src_a2 * src_b; + res7 += src_a3 * src_b; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); + res8 += src_a0 * src_b; + res9 += src_a1 * src_b; + res10 += src_a2 * src_b; + res11 += src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + res12 += src_a0 * src_b; + res13 += src_a1 * src_b; + res14 += src_a2 * src_b; + res15 += src_a3 * src_b; + + pa0 += 8; + pb0 += 4; + + LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3); + LD_DP2(pb0, 2, src_b0, src_b1); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + res0 += src_a0 * src_b; + res1 += src_a1 * src_b; + res2 += src_a2 * src_b; + res3 += src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + res4 += src_a0 * src_b; + res5 += src_a1 * src_b; + res6 += src_a2 * src_b; + res7 += src_a3 * src_b; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); + res8 += src_a0 * src_b; + res9 += src_a1 * src_b; + res10 += src_a2 * src_b; + res11 += src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + res12 += src_a0 * src_b; + res13 += src_a1 * src_b; + res14 += src_a2 * src_b; + res15 += src_a3 * src_b; + + pa0 += 8; + pb0 += 4; + } + + if ((k - 1) & 1) { LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3); LD_DP2(pb0, 2, src_b0, src_b1); @@ -185,7 +248,54 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pa0 += 4; pb0 += 4; - for (l = (k - 1); l--;) + for (l = ((k - 1) / 2); l--;) + { + LD_DP2(pa0, 2, src_a0, src_a1); + LD_DP2(pb0, 2, src_b0, src_b1); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + res0 += src_a0 * src_b; + res1 += src_a1 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + res2 += src_a0 * src_b; + res3 += src_a1 * src_b; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); + res4 += 
src_a0 * src_b; + res5 += src_a1 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + res6 += src_a0 * src_b; + res7 += src_a1 * src_b; + + pa0 += 4; + pb0 += 4; + + LD_DP2(pa0, 2, src_a0, src_a1); + LD_DP2(pb0, 2, src_b0, src_b1); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + res0 += src_a0 * src_b; + res1 += src_a1 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + res2 += src_a0 * src_b; + res3 += src_a1 * src_b; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); + res4 += src_a0 * src_b; + res5 += src_a1 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + res6 += src_a0 * src_b; + res7 += src_a1 * src_b; + + pa0 += 4; + pb0 += 4; + } + + if ((k - 1) & 1) { LD_DP2(pa0, 2, src_a0, src_a1); LD_DP2(pb0, 2, src_b0, src_b1); @@ -257,7 +367,46 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pa0 += 2; pb0 += 4; - for (l = (k - 1); l--;) + for (l = ((k - 1) / 2); l--;) + { + src_a0 = LD_DP(pa0); + LD_DP2(pb0, 2, src_b0, src_b1); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + res0 += src_a0 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + res1 += src_a0 * src_b; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); + res2 += src_a0 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + res3 += src_a0 * src_b; + + pa0 += 2; + pb0 += 4; + + src_a0 = LD_DP(pa0); + LD_DP2(pb0, 2, src_b0, src_b1); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + res0 += src_a0 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + res1 += src_a0 * src_b; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); + res2 += src_a0 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + res3 += src_a0 * src_b; + + pa0 += 2; + pb0 += 4; + } + + if ((k - 1) & 1) { src_a0 = LD_DP(pa0); LD_DP2(pb0, 2, 
src_b0, src_b1); @@ -319,7 +468,42 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pa0 += 1; pb0 += 4; - for (l = (k - 1); l--;) + for (l = ((k - 1) / 2); l--;) + { + a0 = pa0[0]; + b0 = pb0[0]; + tmp0 += a0 * b0; + + b1 = pb0[1]; + tmp1 += a0 * b1; + + b2 = pb0[2]; + tmp2 += a0 * b2; + + b3 = pb0[3]; + tmp3 += a0 * b3; + + pa0 += 1; + pb0 += 4; + + a0 = pa0[0]; + b0 = pb0[0]; + tmp0 += a0 * b0; + + b1 = pb0[1]; + tmp1 += a0 * b1; + + b2 = pb0[2]; + tmp2 += a0 * b2; + + b3 = pb0[3]; + tmp3 += a0 * b3; + + pa0 += 1; + pb0 += 4; + } + + if ((k - 1) & 1) { a0 = pa0[0]; b0 = pb0[0]; @@ -389,7 +573,46 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pa0 += 8; pb0 += 2; - for (l = (k - 1); l--;) + for (l = ((k - 1) / 2); l--;) + { + LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3); + src_b0 = LD_DP(pb0); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + res0 += src_a0 * src_b; + res1 += src_a1 * src_b; + res2 += src_a2 * src_b; + res3 += src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + res4 += src_a0 * src_b; + res5 += src_a1 * src_b; + res6 += src_a2 * src_b; + res7 += src_a3 * src_b; + + pa0 += 8; + pb0 += 2; + + LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3); + src_b0 = LD_DP(pb0); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + res0 += src_a0 * src_b; + res1 += src_a1 * src_b; + res2 += src_a2 * src_b; + res3 += src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + res4 += src_a0 * src_b; + res5 += src_a1 * src_b; + res6 += src_a2 * src_b; + res7 += src_a3 * src_b; + + pa0 += 8; + pb0 += 2; + } + + if ((k - 1) & 1) { LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3); src_b0 = LD_DP(pb0); @@ -447,7 +670,38 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pa0 += 4; pb0 += 2; - for (l = (k - 1); l--;) + for (l = ((k - 1) / 2); l--;) + { + LD_DP2(pa0, 2, src_a0, src_a1); + 
src_b0 = LD_DP(pb0); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + res0 += src_a0 * src_b; + res1 += src_a1 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + res2 += src_a0 * src_b; + res3 += src_a1 * src_b; + + pa0 += 4; + pb0 += 2; + + LD_DP2(pa0, 2, src_a0, src_a1); + src_b0 = LD_DP(pb0); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + res0 += src_a0 * src_b; + res1 += src_a1 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + res2 += src_a0 * src_b; + res3 += src_a1 * src_b; + + pa0 += 4; + pb0 += 2; + } + + if ((k - 1) & 1) { LD_DP2(pa0, 2, src_a0, src_a1); src_b0 = LD_DP(pb0); @@ -495,7 +749,34 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pa0 += 2; pb0 += 2; - for (l = (k - 1); l--;) + for (l = ((k - 1) / 2); l--;) + { + src_a0 = LD_DP(pa0); + src_b0 = LD_DP(pb0); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + res0 += src_a0 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + res1 += src_a0 * src_b; + + pa0 += 2; + pb0 += 2; + + src_a0 = LD_DP(pa0); + src_b0 = LD_DP(pb0); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + res0 += src_a0 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + res1 += src_a0 * src_b; + + pa0 += 2; + pb0 += 2; + } + + if ((k - 1) & 1) { src_a0 = LD_DP(pa0); src_b0 = LD_DP(pb0); @@ -537,7 +818,30 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pa0 += 1; pb0 += 2; - for (l = (k - 1); l--;) + for (l = ((k - 1) / 2); l--;) + { + a0 = pa0[0]; + b0 = pb0[0]; + tmp0 += a0 * b0; + + b1 = pb0[1]; + tmp1 += a0 * b1; + + pa0 += 1; + pb0 += 2; + + a0 = pa0[0]; + b0 = pb0[0]; + tmp0 += a0 * b0; + + b1 = pb0[1]; + tmp1 += a0 * b1; + + pa0 += 1; + pb0 += 2; + } + + if ((k - 1) & 1) { a0 = pa0[0]; b0 = pb0[0]; @@ -587,7 +891,34 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, 
FLOAT *B, pa0 += 8; pb0 += 1; - for (l = (k - 1); l--;) + for (l = ((k - 1) / 2); l--;) + { + LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3); + src_b[0] = pb0[0]; + src_b[1] = pb0[0]; + + res0 += src_a0 * src_b; + res1 += src_a1 * src_b; + res2 += src_a2 * src_b; + res3 += src_a3 * src_b; + + pa0 += 8; + pb0 += 1; + + LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3); + src_b[0] = pb0[0]; + src_b[1] = pb0[0]; + + res0 += src_a0 * src_b; + res1 += src_a1 * src_b; + res2 += src_a2 * src_b; + res3 += src_a3 * src_b; + + pa0 += 8; + pb0 += 1; + } + + if ((k - 1) & 1) { LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3); src_b[0] = pb0[0]; @@ -628,7 +959,30 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pa0 += 4; pb0 += 1; - for (l = (k - 1); l--;) + for (l = ((k - 1) / 2); l--;) + { + LD_DP2(pa0, 2, src_a0, src_a1); + src_b[0] = pb0[0]; + src_b[1] = pb0[0]; + + res0 += src_a0 * src_b; + res1 += src_a1 * src_b; + + pa0 += 4; + pb0 += 1; + + LD_DP2(pa0, 2, src_a0, src_a1); + src_b[0] = pb0[0]; + src_b[1] = pb0[0]; + + res0 += src_a0 * src_b; + res1 += src_a1 * src_b; + + pa0 += 4; + pb0 += 1; + } + + if ((k - 1) & 1) { LD_DP2(pa0, 2, src_a0, src_a1); src_b[0] = pb0[0]; @@ -664,7 +1018,28 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pa0 += 2; pb0 += 1; - for (l = (k - 1); l--;) + for (l = ((k - 1) / 2); l--;) + { + src_a0 = LD_DP(pa0); + src_b[0] = pb0[0]; + src_b[1] = pb0[0]; + + res0 += src_a0 * src_b; + + pa0 += 2; + pb0 += 1; + + src_a0 = LD_DP(pa0); + src_b[0] = pb0[0]; + src_b[1] = pb0[0]; + + res0 += src_a0 * src_b; + + pa0 += 2; + pb0 += 1; + } + + if ((k - 1) & 1) { src_a0 = LD_DP(pa0); src_b[0] = pb0[0]; @@ -696,7 +1071,24 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pa0 += 1; pb0 += 1; - for (l = (k - 1); l--;) + for (l = ((k - 1) / 2); l--;) + { + a0 = pa0[0]; + b0 = pb0[0]; + tmp0 += a0 * b0; + + pa0 += 1; + pb0 += 1; + + a0 = pa0[0]; + b0 = pb0[0]; + tmp0 += 
a0 * b0; + + pa0 += 1; + pb0 += 1; + } + + if ((k - 1) & 1) { a0 = pa0[0]; b0 = pb0[0]; diff --git a/kernel/mips/macros_msa.h b/kernel/mips/macros_msa.h index d3a4022d6..fad6dd6cd 100644 --- a/kernel/mips/macros_msa.h +++ b/kernel/mips/macros_msa.h @@ -30,12 +30,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#define LD_W(RTYPE, psrc) *((RTYPE *)(psrc)) +#define LD_SP(...) LD_W(v4f32, __VA_ARGS__) + #define LD_D(RTYPE, psrc) *((RTYPE *)(psrc)) #define LD_DP(...) LD_D(v2f64, __VA_ARGS__) +#define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) +#define ST_SP(...) ST_W(v4f32, __VA_ARGS__) + #define ST_D(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) #define ST_DP(...) ST_D(v2f64, __VA_ARGS__) +/* Description : Load 2 vectors of single precision floating point elements with stride + Arguments : Inputs - psrc, stride + Outputs - out0, out1 + Return Type - single precision floating point +*/ +#define LD_SP2(psrc, stride, out0, out1) \ +{ \ + out0 = LD_SP((psrc)); \ + out1 = LD_SP((psrc) + stride); \ +} + /* Description : Load 2 vectors of double precision floating point elements with stride Arguments : Inputs - psrc, stride Outputs - out0, out1 @@ -53,6 +70,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
LD_DP2(psrc + 2 * stride, stride, out2, out3) \ } +/* Description : Store vectors of single precision floating point elements with stride + Arguments : Inputs - in0, in1, pdst, stride + Details : Store 4 single precision floating point elements from 'in0' to (pdst) + Store 4 single precision floating point elements from 'in1' to (pdst + stride) +*/ +#define ST_SP2(in0, in1, pdst, stride) \ +{ \ + ST_SP(in0, (pdst)); \ + ST_SP(in1, (pdst) + stride); \ +} + +#define ST_SP4(in0, in1, in2, in3, pdst, stride) \ +{ \ + ST_SP2(in0, in1, (pdst), stride); \ + ST_SP2(in2, in3, (pdst + 2 * stride), stride); \ +} + +#define ST_SP8(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \ +{ \ + ST_SP4(in0, in1, in2, in3, (pdst), stride); \ + ST_SP4(in4, in5, in6, in7, (pdst + 4 * stride), stride); \ +} + /* Description : Store vectors of double precision floating point elements with stride Arguments : Inputs - in0, in1, pdst, stride Details : Store 2 double precision floating point elements from 'in0' to (pdst) @@ -83,6 +123,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. Details : Right half of byte elements from 'in0' and 'in1' are interleaved and written to 'out0' */ +#define ILVRL_W2(RTYPE, in0, in1, out0, out1) \ +{ \ + out0 = (RTYPE) __msa_ilvr_w((v4i32) in0, (v4i32) in1); \ + out1 = (RTYPE) __msa_ilvl_w((v4i32) in0, (v4i32) in1); \ +} +#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__) + #define ILVRL_D2(RTYPE, in0, in1, out0, out1) \ { \ out0 = (RTYPE) __msa_ilvr_d((v2i64) in0, (v2i64) in1); \ @@ -90,4 +137,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. } #define ILVRL_D2_DP(...) 
ILVRL_D2(v2f64, __VA_ARGS__) +/* Description : Transpose 4x4 block with word elements in vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1, out2, out3 + Return Type - as per RTYPE +*/ +#define TRANSPOSE4x4_W(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) \ +{ \ + v4i32 s0_m, s1_m, s2_m, s3_m; \ + \ + ILVRL_W2_SW(in1, in0, s0_m, s1_m); \ + ILVRL_W2_SW(in3, in2, s2_m, s3_m); \ + \ + out0 = (RTYPE) __msa_ilvr_d((v2i64) s2_m, (v2i64) s0_m); \ + out1 = (RTYPE) __msa_ilvl_d((v2i64) s2_m, (v2i64) s0_m); \ + out2 = (RTYPE) __msa_ilvr_d((v2i64) s3_m, (v2i64) s1_m); \ + out3 = (RTYPE) __msa_ilvl_d((v2i64) s3_m, (v2i64) s1_m); \ +} + +#define TRANSPOSE4x4_SP_SP(...) TRANSPOSE4x4_W(v4f32, __VA_ARGS__) + #endif /* __MACROS_MSA_H__ */ diff --git a/kernel/mips/sgemm_kernel_8x8_msa.c b/kernel/mips/sgemm_kernel_8x8_msa.c new file mode 100644 index 000000000..611ebabac --- /dev/null +++ b/kernel/mips/sgemm_kernel_8x8_msa.c @@ -0,0 +1,1806 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+/* SGEMM micro-kernel for MIPS MSA: C += alpha * A * B, with n and then m tiled by 8, 4, 2 and 1. */
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, /* m, n, k: tile extents; A and B are read linearly, without a stride argument */
+          FLOAT *C, BLASLONG ldc /* C is read-modify-written; ldc is the element distance between pc0, pc1, ... */
+#ifdef TRMMKERNEL
+          , BLASLONG offset /* only declared for TRMMKERNEL builds; never referenced in this body */
+#endif
+          )
+{
+    BLASLONG i, j, l; /* countdown counters for the m, n and k loops; i and l double as scratch when stepping B and C */
+    FLOAT *pc0, *pc1, *pc2, *pc3, *pc4, *pc5, *pc6, *pc7; /* cursors into up to eight ldc-spaced lines of C */
+    FLOAT *pa0, *pb0; /* read cursors into A and B */
+    FLOAT tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; /* scalar accumulators for the m & 2 and m & 1 tails */
+    FLOAT tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
+    FLOAT a0, a1;
+    FLOAT b0, b1, b2, b3, b4, b5, b6, b7;
+    v4f32 v_alpha = {alpha, alpha, alpha, alpha}; /* alpha replicated in all four lanes */
+    v4f32 src_a0, src_a1, src_b, src_b0, src_b1;
+    v4f32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v4f32 res0, res1, res2, res3, res4, res5, res6, res7; /* vector accumulators for the 8-wide and 4-wide m tiles */
+    v4f32 res8, res9, res10, res11, res12, res13, res14, res15;
+    /* ---- n tiled by 8: pc0..pc7 address eight lines of C that are ldc apart (columns, assuming column-major C). ---- */
+    for (j = (n / 8); j--;)
+    {
+        pc0 = C;
+        pc1 = pc0 + ldc;
+        pc2 = pc1 + ldc;
+        pc3 = pc2 + ldc;
+        pc4 = pc3 + ldc;
+        pc5 = pc4 + ldc;
+        pc6 = pc5 + ldc;
+        pc7 = pc6 + ldc;
+        /* A is re-read from its start for every group of n; B is rewound to the group start for every m tile. */
+        pa0 = A;
+        for (i = (m / 8); i--;) /* 8 x 8 tile: 16 vector accumulators */
+        {
+            pb0 = B;
+            /* k-step 0 initialises res0..res15 with a plain multiply. NOTE(review): this reads A and B unconditionally, so k >= 1 is assumed. */
+            LD_SP2(pa0, 4, src_a0, src_a1); /* presumably loads two 4-float vectors, from pa0 and pa0 + 4 (macro from macros_msa.h) */
+            LD_SP2(pb0, 4, src_b0, src_b1);
+            /* shf.w immediates 0x00 / 0x55 / 0xAA / 0xFF replicate lane 0 / 1 / 2 / 3 of the B vector into all lanes. */
+            src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
+            res0 = src_a0 * src_b;
+            res1 = src_a1 * src_b;
+
+            src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
+            res2 = src_a0 * src_b;
+            res3 = src_a1 * src_b;
+
+            src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
+            res4 = src_a0 * src_b;
+            res5 = src_a1 * src_b;
+
+            src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
+            res6 = src_a0 * src_b;
+            res7 = src_a1 * src_b;
+
+            src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0);
+            res8 = src_a0 * src_b;
+            res9 = src_a1 * src_b;
+
+            src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55);
+            res10 = src_a0 * src_b;
+            res11 = src_a1 * src_b;
+
+            src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA);
+            res12 = src_a0 * src_b;
+            res13 = src_a1 * src_b;
+
+            src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF);
+            res14 = src_a0 * src_b;
+            res15 = src_a1 * src_b;
+
+            pa0 += 8;
+            pb0 += 8;
+            /* Remaining k - 1 steps, two k-steps per loop iteration. */
+            for (l = ((k - 1) / 2); l--;)
+            {
+                LD_SP2(pa0, 4, src_a0, src_a1);
+                LD_SP2(pb0, 4, src_b0, src_b1);
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
+                res0 += src_a0 * src_b;
+                res1 += src_a1 * src_b;
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
+                res2 += src_a0 * src_b;
+                res3 += src_a1 * src_b;
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
+                res4 += src_a0 * src_b;
+                res5 += src_a1 * src_b;
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
+                res6 += src_a0 * src_b;
+                res7 += src_a1 * src_b;
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0);
+                res8 += src_a0 * src_b;
+                res9 += src_a1 * src_b;
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55);
+                res10 += src_a0 * src_b;
+                res11 += src_a1 * src_b;
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA);
+                res12 += src_a0 * src_b;
+                res13 += src_a1 * src_b;
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF);
+                res14 += src_a0 * src_b;
+                res15 += src_a1 * src_b;
+
+                pa0 += 8;
+                pb0 += 8;
+
+                LD_SP2(pa0, 4, src_a0, src_a1);
+                LD_SP2(pb0, 4, src_b0, src_b1);
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
+                res0 += src_a0 * src_b;
+                res1 += src_a1 * src_b;
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
+                res2 += src_a0 * src_b;
+                res3 += src_a1 * src_b;
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
+                res4 += src_a0 * src_b;
+                res5 += src_a1 * src_b;
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
+                res6 += src_a0 * src_b;
+                res7 += src_a1 * src_b;
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0);
+                res8 += src_a0 * src_b;
+                res9 += src_a1 * src_b;
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55);
+                res10 += src_a0 * src_b;
+                res11 += src_a1 * src_b;
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA);
+                res12 += src_a0 * src_b;
+                res13 += src_a1 * src_b;
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF);
+                res14 += src_a0 * src_b;
+                res15 += src_a1 * src_b;
+
+                pa0 += 8;
+                pb0 += 8;
+            }
+            /* One leftover k-step when k - 1 is odd. */
+            if ((k - 1) & 1)
+            {
+                LD_SP2(pa0, 4, src_a0, src_a1);
+                LD_SP2(pb0, 4, src_b0, src_b1);
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
+                res0 += src_a0 * src_b;
+                res1 += src_a1 * src_b;
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
+                res2 += src_a0 * src_b;
+                res3 += src_a1 * src_b;
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
+                res4 += src_a0 * src_b;
+                res5 += src_a1 * src_b;
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
+                res6 += src_a0 * src_b;
+                res7 += src_a1 * src_b;
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0);
+                res8 += src_a0 * src_b;
+                res9 += src_a1 * src_b;
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55);
+                res10 += src_a0 * src_b;
+                res11 += src_a1 * src_b;
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA);
+                res12 += src_a0 * src_b;
+                res13 += src_a1 * src_b;
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF);
+                res14 += src_a0 * src_b;
+                res15 += src_a1 * src_b;
+
+                pa0 += 8;
+                pb0 += 8;
+            }
+            /* Scale by alpha and add into C: res0..res7 feed pc0..pc3, res8..res15 feed pc4..pc7 (two vectors per line). */
+            LD_SP2(pc0, 4, dst0, dst1);
+            LD_SP2(pc1, 4, dst2, dst3);
+            LD_SP2(pc2, 4, dst4, dst5);
+            LD_SP2(pc3, 4, dst6, dst7);
+
+            dst0 += res0 * v_alpha;
+            dst1 += res1 * v_alpha;
+            dst2 += res2 * v_alpha;
+            dst3 += res3 * v_alpha;
+            dst4 += res4 * v_alpha;
+            dst5 += res5 * v_alpha;
+            dst6 += res6 * v_alpha;
+            dst7 += res7 * v_alpha;
+
+            ST_SP2(dst0, dst1, pc0, 4);
+            ST_SP2(dst2, dst3, pc1, 4);
+            ST_SP2(dst4, dst5, pc2, 4);
+            ST_SP2(dst6, dst7, pc3, 4);
+
+            LD_SP2(pc4, 4, dst0, dst1);
+            LD_SP2(pc5, 4, dst2, dst3);
+            LD_SP2(pc6, 4, dst4, dst5);
+            LD_SP2(pc7, 4, dst6, dst7);
+
+            dst0 += res8 * v_alpha;
+            dst1 += res9 * v_alpha;
+            dst2 += res10 * v_alpha;
+            dst3 += res11 * v_alpha;
+            dst4 += res12 * v_alpha;
+            dst5 += res13 * v_alpha;
+            dst6 += res14 * v_alpha;
+            dst7 += res15 * v_alpha;
+
+            ST_SP2(dst0, dst1, pc4, 4);
+            ST_SP2(dst2, dst3, pc5, 4);
+            ST_SP2(dst4, dst5, pc6, 4);
+            ST_SP2(dst6, dst7, pc7, 4);
+
+            pc0 += 8;
+            pc1 += 8;
+            pc2 += 8;
+            pc3 += 8;
+            pc4 += 8;
+            pc5 += 8;
+            pc6 += 8;
+            pc7 += 8;
+        }
+        /* 4 x 8 tile: runs at most once, one A vector per k-step, accumulators res0..res7. */
+        for (i = ((m & 4) / 4); i--;)
+        {
+            pb0 = B;
+
+            src_a0 = LD_SP(pa0);
+            LD_SP2(pb0, 4, src_b0, src_b1);
+
+            src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
+            res0 = src_a0 * src_b;
+
+            src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
+            res1 = src_a0 * src_b;
+
+            src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
+            res2 = src_a0 * src_b;
+
+            src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
+            res3 = src_a0 * src_b;
+
+            src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0);
+            res4 = src_a0 * src_b;
+
+            src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55);
+            res5 = src_a0 * src_b;
+
+            src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA);
+            res6 = src_a0 * src_b;
+
+            src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF);
+            res7 = src_a0 * src_b;
+
+            pa0 += 4;
+            pb0 += 8;
+
+            for (l = ((k - 1) / 2); l--;)
+            {
+                src_a0 = LD_SP(pa0);
+                LD_SP2(pb0, 4, src_b0, src_b1);
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
+                res0 += src_a0 * src_b;
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
+                res1 += src_a0 * src_b;
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
+                res2 += src_a0 * src_b;
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
+                res3 += src_a0 * src_b;
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0);
+                res4 += src_a0 * src_b;
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55);
+                res5 += src_a0 * src_b;
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA);
+                res6 += src_a0 * src_b;
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF);
+                res7 += src_a0 * src_b;
+
+                pa0 += 4;
+                pb0 += 8;
+
+                src_a0 = LD_SP(pa0);
+                LD_SP2(pb0, 4, src_b0, src_b1);
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
+                res0 += src_a0 * src_b;
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
+                res1 += src_a0 * src_b;
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
+                res2 += src_a0 * src_b;
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
+                res3 += src_a0 * src_b;
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0);
+                res4 += src_a0 * src_b;
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55);
+                res5 += src_a0 * src_b;
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA);
+                res6 += src_a0 * src_b;
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF);
+                res7 += src_a0 * src_b;
+
+                pa0 += 4;
+                pb0 += 8;
+            }
+
+            if ((k - 1) & 1)
+            {
+                src_a0 = LD_SP(pa0);
+                LD_SP2(pb0, 4, src_b0, src_b1);
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
+                res0 += src_a0 * src_b;
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
+                res1 += src_a0 * src_b;
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
+                res2 += src_a0 * src_b;
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
+                res3 += src_a0 * src_b;
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0);
+                res4 += src_a0 * src_b;
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55);
+                res5 += src_a0 * src_b;
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA);
+                res6 += src_a0 * src_b;
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF);
+                res7 += src_a0 * src_b;
+
+                pa0 += 4;
+                pb0 += 8;
+            }
+
+            dst0 = LD_SP(pc0);
+            dst1 = LD_SP(pc1);
+            dst2 = LD_SP(pc2);
+            dst3 = LD_SP(pc3);
+
+            dst0 += res0 * v_alpha;
+            dst1 += res1 * v_alpha;
+            dst2 += res2 * v_alpha;
+            dst3 += res3 * v_alpha;
+
+            ST_SP(dst0, pc0);
+            ST_SP(dst1, pc1);
+            ST_SP(dst2, pc2);
+            ST_SP(dst3, pc3);
+
+            dst0 = LD_SP(pc4);
+            dst1 = LD_SP(pc5);
+            dst2 = LD_SP(pc6);
+            dst3 = LD_SP(pc7);
+
+            dst0 += res4 * v_alpha;
+            dst1 += res5 * v_alpha;
+            dst2 += res6 * v_alpha;
+            dst3 += res7 * v_alpha;
+
+            ST_SP(dst0, pc4);
+            ST_SP(dst1, pc5);
+            ST_SP(dst2, pc6);
+            ST_SP(dst3, pc7);
+
+            pc0 += 4;
+            pc1 += 4;
+            pc2 += 4;
+            pc3 += 4;
+            pc4 += 4;
+            pc5 += 4;
+            pc6 += 4;
+            pc7 += 4;
+        }
+        /* 2 x 8 tile: runs at most once, scalar code; tmp(2c) and tmp(2c+1) are elements 0 and 1 of line pc<c>. */
+        for (i = ((m & 2) / 2); i--;)
+        {
+            pb0 = B;
+
+            a0 = pa0[0];
+            b0 = pb0[0];
+            tmp0 = a0 * b0;
+
+            a1 = pa0[1];
+            tmp1 = a1 * b0;
+
+            b1 = pb0[1];
+            tmp2 = a0 * b1;
+            tmp3 = a1 * b1;
+
+            b2 = pb0[2];
+            tmp4 = a0 * b2;
+            tmp5 = a1 * b2;
+
+            b3 = pb0[3];
+            tmp6 = a0 * b3;
+            tmp7 = a1 * b3;
+
+            b4 = pb0[4];
+            tmp8 = a0 * b4;
+            tmp9 = a1 * b4;
+
+            b5 = pb0[5];
+            tmp10 = a0 * b5;
+            tmp11 = a1 * b5;
+
+            b6 = pb0[6];
+            tmp12 = a0 * b6;
+            tmp13 = a1 * b6;
+
+            b7 = pb0[7];
+            tmp14 = a0 * b7;
+            tmp15 = a1 * b7;
+
+            pa0 += 2;
+            pb0 += 8;
+
+            for (l = ((k - 1) / 2); l--;)
+            {
+                a0 = pa0[0];
+                b0 = pb0[0];
+                tmp0 += a0 * b0;
+
+                a1 = pa0[1];
+                tmp1 += a1 * b0;
+
+                b1 = pb0[1];
+                tmp2 += a0 * b1;
+                tmp3 += a1 * b1;
+
+                b2 = pb0[2];
+                tmp4 += a0 * b2;
+                tmp5 += a1 * b2;
+
+                b3 = pb0[3];
+                tmp6 += a0 * b3;
+                tmp7 += a1 * b3;
+
+                b4 = pb0[4];
+                tmp8 += a0 * b4;
+                tmp9 += a1 * b4;
+
+                b5 = pb0[5];
+                tmp10 += a0 * b5;
+                tmp11 += a1 * b5;
+
+                b6 = pb0[6];
+                tmp12 += a0 * b6;
+                tmp13 += a1 * b6;
+
+                b7 = pb0[7];
+                tmp14 += a0 * b7;
+                tmp15 += a1 * b7;
+
+                pa0 += 2;
+                pb0 += 8;
+
+                a0 = pa0[0];
+                b0 = pb0[0];
+                tmp0 += a0 * b0;
+
+                a1 = pa0[1];
+                tmp1 += a1 * b0;
+
+                b1 = pb0[1];
+                tmp2 += a0 * b1;
+                tmp3 += a1 * b1;
+
+                b2 = pb0[2];
+                tmp4 += a0 * b2;
+                tmp5 += a1 * b2;
+
+                b3 = pb0[3];
+                tmp6 += a0 * b3;
+                tmp7 += a1 * b3;
+
+                b4 = pb0[4];
+                tmp8 += a0 * b4;
+                tmp9 += a1 * b4;
+
+                b5 = pb0[5];
+                tmp10 += a0 * b5;
+                tmp11 += a1 * b5;
+
+                b6 = pb0[6];
+                tmp12 += a0 * b6;
+                tmp13 += a1 * b6;
+
+                b7 = pb0[7];
+                tmp14 += a0 * b7;
+                tmp15 += a1 * b7;
+
+                pa0 += 2;
+                pb0 += 8;
+            }
+
+            if ((k - 1) & 1)
+            {
+                a0 = pa0[0];
+                b0 = pb0[0];
+                tmp0 += a0 * b0;
+
+                a1 = pa0[1];
+                tmp1 += a1 * b0;
+
+                b1 = pb0[1];
+                tmp2 += a0 * b1;
+                tmp3 += a1 * b1;
+
+                b2 = pb0[2];
+                tmp4 += a0 * b2;
+                tmp5 += a1 * b2;
+
+                b3 = pb0[3];
+                tmp6 += a0 * b3;
+                tmp7 += a1 * b3;
+
+                b4 = pb0[4];
+                tmp8 += a0 * b4;
+                tmp9 += a1 * b4;
+
+                b5 = pb0[5];
+                tmp10 += a0 * b5;
+                tmp11 += a1 * b5;
+
+                b6 = pb0[6];
+                tmp12 += a0 * b6;
+                tmp13 += a1 * b6;
+
+                b7 = pb0[7];
+                tmp14 += a0 * b7;
+                tmp15 += a1 * b7;
+
+                pa0 += 2;
+                pb0 += 8;
+            }
+
+            tmp0 = alpha * tmp0;
+            tmp2 = alpha * tmp2;
+            tmp4 = alpha * tmp4;
+            tmp6 = alpha * tmp6;
+            tmp8 = alpha * tmp8;
+            tmp10 = alpha * tmp10;
+            tmp12 = alpha * tmp12;
+            tmp14 = alpha * tmp14;
+
+            pc0[0] += tmp0;
+            pc1[0] += tmp2;
+            pc2[0] += tmp4;
+            pc3[0] += tmp6;
+            pc4[0] += tmp8;
+            pc5[0] += tmp10;
+            pc6[0] += tmp12;
+            pc7[0] += tmp14;
+
+            tmp1 = alpha * tmp1;
+            tmp3 = alpha * tmp3;
+            tmp5 = alpha * tmp5;
+            tmp7 = alpha * tmp7;
+            tmp9 = alpha * tmp9;
+            tmp11 = alpha * tmp11;
+            tmp13 = alpha * tmp13;
+            tmp15 = alpha * tmp15;
+
+            pc0[1] += tmp1;
+            pc1[1] += tmp3;
+            pc2[1] += tmp5;
+            pc3[1] += tmp7;
+            pc4[1] += tmp9;
+            pc5[1] += tmp11;
+            pc6[1] += tmp13;
+            pc7[1] += tmp15;
+
+            pc0 += 2;
+            pc1 += 2;
+            pc2 += 2;
+            pc3 += 2;
+            pc4 += 2;
+            pc5 += 2;
+            pc6 += 2;
+            pc7 += 2;
+        }
+        /* 1 x 8 tile: runs at most once; tmp<c> accumulates the single element of line pc<c>. */
+        for (i = (m & 1); i--;)
+        {
+            pb0 = B;
+
+            a0 = pa0[0];
+            b0 = pb0[0];
+            tmp0 = a0 * b0;
+
+            b1 = pb0[1];
+            tmp1 = a0 * b1;
+
+            b2 = pb0[2];
+            tmp2 = a0 * b2;
+
+            b3 = pb0[3];
+            tmp3 = a0 * b3;
+
+            b4 = pb0[4];
+            tmp4 = a0 * b4;
+
+            b5 = pb0[5];
+            tmp5 = a0 * b5;
+
+            b6 = pb0[6];
+            tmp6 = a0 * b6;
+
+            b7 = pb0[7];
+            tmp7 = a0 * b7;
+
+            pa0 += 1;
+            pb0 += 8;
+
+            for (l = ((k - 1) / 2); l--;)
+            {
+                a0 = pa0[0];
+                b0 = pb0[0];
+                tmp0 += a0 * b0;
+
+                b1 = pb0[1];
+                tmp1 += a0 * b1;
+
+                b2 = pb0[2];
+                tmp2 += a0 * b2;
+
+                b3 = pb0[3];
+                tmp3 += a0 * b3;
+
+                b4 = pb0[4];
+                tmp4 += a0 * b4;
+
+                b5 = pb0[5];
+                tmp5 += a0 * b5;
+
+                b6 = pb0[6];
+                tmp6 += a0 * b6;
+
+                b7 = pb0[7];
+                tmp7 += a0 * b7;
+
+                pa0 += 1;
+                pb0 += 8;
+
+                a0 = pa0[0];
+                b0 = pb0[0];
+                tmp0 += a0 * b0;
+
+                b1 = pb0[1];
+                tmp1 += a0 * b1;
+
+                b2 = pb0[2];
+                tmp2 += a0 * b2;
+
+                b3 = pb0[3];
+                tmp3 += a0 * b3;
+
+                b4 = pb0[4];
+                tmp4 += a0 * b4;
+
+                b5 = pb0[5];
+                tmp5 += a0 * b5;
+
+                b6 = pb0[6];
+                tmp6 += a0 * b6;
+
+                b7 = pb0[7];
+                tmp7 += a0 * b7;
+
+                pa0 += 1;
+                pb0 += 8;
+            }
+
+            if ((k - 1) & 1)
+            {
+                a0 = pa0[0];
+                b0 = pb0[0];
+                tmp0 += a0 * b0;
+
+                b1 = pb0[1];
+                tmp1 += a0 * b1;
+
+                b2 = pb0[2];
+                tmp2 += a0 * b2;
+
+                b3 = pb0[3];
+                tmp3 += a0 * b3;
+
+                b4 = pb0[4];
+                tmp4 += a0 * b4;
+
+                b5 = pb0[5];
+                tmp5 += a0 * b5;
+
+                b6 = pb0[6];
+                tmp6 += a0 * b6;
+
+                b7 = pb0[7];
+                tmp7 += a0 * b7;
+
+                pa0 += 1;
+                pb0 += 8;
+            }
+
+            tmp0 = alpha * tmp0;
+            tmp1 = alpha * tmp1;
+            tmp2 = alpha * tmp2;
+            tmp3 = alpha * tmp3;
+            tmp4 = alpha * tmp4;
+            tmp5 = alpha * tmp5;
+            tmp6 = alpha * tmp6;
+            tmp7 = alpha * tmp7;
+
+            pc0[0] += tmp0;
+            pc1[0] += tmp1;
+            pc2[0] += tmp2;
+            pc3[0] += tmp3;
+            pc4[0] += tmp4;
+            pc5[0] += tmp5;
+            pc6[0] += tmp6;
+            pc7[0] += tmp7;
+
+            pc0 += 1;
+            pc1 += 1;
+            pc2 += 1;
+            pc3 += 1;
+            pc4 += 1;
+            pc5 += 1;
+            pc6 += 1;
+            pc7 += 1;
+        }
+        /* i and l are reused as scratch: step B by 8 * k and C by 8 * ldc for the next group. */
+        l = (k << 3);
+        B = B + l;
+        i = (ldc << 3);
+        C = C + i;
+    }
+    /* ---- n remainder of 4 (runs at most once): same scheme with four lines of C and one B vector per k-step. ---- */
+    for (j = ((n & 4) / 4); j--;)
+    {
+        pc0 = C;
+        pc1 = pc0 + ldc;
+        pc2 = pc1 + ldc;
+        pc3 = pc2 + ldc;
+
+        pa0 = A;
+
+        for (i = (m / 8); i--;) /* 8 x 4 tile */
+        {
+            pb0 = B;
+
+            LD_SP2(pa0, 4, src_a0, src_a1);
+            src_b0 = LD_SP(pb0);
+
+            src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
+            res0 = src_a0 * src_b;
+            res1 = src_a1 * src_b;
+
+            src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
+            res2 = src_a0 * src_b;
+            res3 = src_a1 * src_b;
+
+            src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
+            res4 = src_a0 * src_b;
+            res5 = src_a1 * src_b;
+
+            src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
+            res6 = src_a0 * src_b;
+            res7 = src_a1 * src_b;
+
+            pa0 += 8;
+            pb0 += 4;
+
+            for (l = ((k - 1) / 2); l--;)
+            {
+                LD_SP2(pa0, 4, src_a0, src_a1);
+                src_b0 = LD_SP(pb0);
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
+                res0 += src_a0 * src_b;
+                res1 += src_a1 * src_b;
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
+                res2 += src_a0 * src_b;
+                res3 += src_a1 * src_b;
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
+                res4 += src_a0 * src_b;
+                res5 += src_a1 * src_b;
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
+                res6 += src_a0 * src_b;
+                res7 += src_a1 * src_b;
+
+                pa0 += 8;
+                pb0 += 4;
+
+                LD_SP2(pa0, 4, src_a0, src_a1);
+                src_b0 = LD_SP(pb0);
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
+                res0 += src_a0 * src_b;
+                res1 += src_a1 * src_b;
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
+                res2 += src_a0 * src_b;
+                res3 += src_a1 * src_b;
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
+                res4 += src_a0 * src_b;
+                res5 += src_a1 * src_b;
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
+                res6 += src_a0 * src_b;
+                res7 += src_a1 * src_b;
+
+                pa0 += 8;
+                pb0 += 4;
+            }
+
+            if ((k - 1) & 1)
+            {
+                LD_SP2(pa0, 4, src_a0, src_a1);
+                src_b0 = LD_SP(pb0);
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
+                res0 += src_a0 * src_b;
+                res1 += src_a1 * src_b;
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
+                res2 += src_a0 * src_b;
+                res3 += src_a1 * src_b;
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
+                res4 += src_a0 * src_b;
+                res5 += src_a1 * src_b;
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
+                res6 += src_a0 * src_b;
+                res7 += src_a1 * src_b;
+
+                pa0 += 8;
+                pb0 += 4;
+            }
+
+            LD_SP2(pc0, 4, dst0, dst1);
+            LD_SP2(pc1, 4, dst2, dst3);
+            LD_SP2(pc2, 4, dst4, dst5);
+            LD_SP2(pc3, 4, dst6, dst7);
+
+            dst0 += res0 * v_alpha;
+            dst1 += res1 * v_alpha;
+            dst2 += res2 * v_alpha;
+            dst3 += res3 * v_alpha;
+            dst4 += res4 * v_alpha;
+            dst5 += res5 * v_alpha;
+            dst6 += res6 * v_alpha;
+            dst7 += res7 * v_alpha;
+
+            ST_SP2(dst0, dst1, pc0, 4);
+            ST_SP2(dst2, dst3, pc1, 4);
+            ST_SP2(dst4, dst5, pc2, 4);
+            ST_SP2(dst6, dst7, pc3, 4);
+
+            pc0 += 8;
+            pc1 += 8;
+            pc2 += 8;
+            pc3 += 8;
+        }
+        /* 4 x 4 tile (at most once). */
+        for (i = ((m & 4) / 4); i--;)
+        {
+            pb0 = B;
+
+            src_a0 = LD_SP(pa0);
+            src_b0 = LD_SP(pb0);
+
+            src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
+            res0 = src_a0 * src_b;
+
+            src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
+            res1 = src_a0 * src_b;
+
+            src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
+            res2 = src_a0 * src_b;
+
+            src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
+            res3 = src_a0 * src_b;
+
+            pa0 += 4;
+            pb0 += 4;
+
+            for (l = ((k - 1) / 2); l--;)
+            {
+                src_a0 = LD_SP(pa0);
+                src_b0 = LD_SP(pb0);
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
+                res0 += src_a0 * src_b;
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
+                res1 += src_a0 * src_b;
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
+                res2 += src_a0 * src_b;
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
+                res3 += src_a0 * src_b;
+
+                pa0 += 4;
+                pb0 += 4;
+
+                src_a0 = LD_SP(pa0);
+                src_b0 = LD_SP(pb0);
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
+                res0 += src_a0 * src_b;
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
+                res1 += src_a0 * src_b;
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
+                res2 += src_a0 * src_b;
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
+                res3 += src_a0 * src_b;
+
+                pa0 += 4;
+                pb0 += 4;
+            }
+
+            if ((k - 1) & 1)
+            {
+                src_a0 = LD_SP(pa0);
+                src_b0 = LD_SP(pb0);
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
+                res0 += src_a0 * src_b;
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
+                res1 += src_a0 * src_b;
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
+                res2 += src_a0 * src_b;
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
+                res3 += src_a0 * src_b;
+
+                pa0 += 4;
+                pb0 += 4;
+            }
+
+            dst0 = LD_SP(pc0);
+            dst1 = LD_SP(pc1);
+            dst2 = LD_SP(pc2);
+            dst3 = LD_SP(pc3);
+
+            dst0 += res0 * v_alpha;
+            dst1 += res1 * v_alpha;
+            dst2 += res2 * v_alpha;
+            dst3 += res3 * v_alpha;
+
+            ST_SP(dst0, pc0);
+            ST_SP(dst1, pc1);
+            ST_SP(dst2, pc2);
+            ST_SP(dst3, pc3);
+
+            pc0 += 4;
+            pc1 += 4;
+            pc2 += 4;
+            pc3 += 4;
+        }
+        /* 2 x 4 tile (at most once), scalar. */
+        for (i = ((m & 2) / 2); i--;)
+        {
+            pb0 = B;
+
+            a0 = pa0[0];
+            b0 = pb0[0];
+            tmp0 = a0 * b0;
+
+            a1 = pa0[1];
+            tmp1 = a1 * b0;
+
+            b1 = pb0[1];
+            tmp2 = a0 * b1;
+            tmp3 = a1 * b1;
+
+            b2 = pb0[2];
+            tmp4 = a0 * b2;
+            tmp5 = a1 * b2;
+
+            b3 = pb0[3];
+            tmp6 = a0 * b3;
+            tmp7 = a1 * b3;
+
+            pa0 += 2;
+            pb0 += 4;
+
+            for (l = ((k - 1) / 2); l--;)
+            {
+                a0 = pa0[0];
+                b0 = pb0[0];
+                tmp0 += a0 * b0;
+
+                a1 = pa0[1];
+                tmp1 += a1 * b0;
+
+                b1 = pb0[1];
+                tmp2 += a0 * b1;
+                tmp3 += a1 * b1;
+
+                b2 = pb0[2];
+                tmp4 += a0 * b2;
+                tmp5 += a1 * b2;
+
+                b3 = pb0[3];
+                tmp6 += a0 * b3;
+                tmp7 += a1 * b3;
+
+                pa0 += 2;
+                pb0 += 4;
+
+                a0 = pa0[0];
+                b0 = pb0[0];
+                tmp0 += a0 * b0;
+
+                a1 = pa0[1];
+                tmp1 += a1 * b0;
+
+                b1 = pb0[1];
+                tmp2 += a0 * b1;
+                tmp3 += a1 * b1;
+
+                b2 = pb0[2];
+                tmp4 += a0 * b2;
+                tmp5 += a1 * b2;
+
+                b3 = pb0[3];
+                tmp6 += a0 * b3;
+                tmp7 += a1 * b3;
+
+                pa0 += 2;
+                pb0 += 4;
+            }
+
+            if ((k - 1) & 1)
+            {
+                a0 = pa0[0];
+                b0 = pb0[0];
+                tmp0 += a0 * b0;
+
+                a1 = pa0[1];
+                tmp1 += a1 * b0;
+
+                b1 = pb0[1];
+                tmp2 += a0 * b1;
+                tmp3 += a1 * b1;
+
+                b2 = pb0[2];
+                tmp4 += a0 * b2;
+                tmp5 += a1 * b2;
+
+                b3 = pb0[3];
+                tmp6 += a0 * b3;
+                tmp7 += a1 * b3;
+
+                pa0 += 2;
+                pb0 += 4;
+            }
+
+            tmp0 = alpha * tmp0;
+            tmp2 = alpha * tmp2;
+            tmp4 = alpha * tmp4;
+            tmp6 = alpha * tmp6;
+
+            pc0[0] += tmp0;
+            pc1[0] += tmp2;
+            pc2[0] += tmp4;
+            pc3[0] += tmp6;
+
+            tmp1 = alpha * tmp1;
+            tmp3 = alpha * tmp3;
+            tmp5 = alpha * tmp5;
+            tmp7 = alpha * tmp7;
+
+            pc0[1] += tmp1;
+            pc1[1] += tmp3;
+            pc2[1] += tmp5;
+            pc3[1] += tmp7;
+
+            pc0 += 2;
+            pc1 += 2;
+            pc2 += 2;
+            pc3 += 2;
+        }
+        /* 1 x 4 tile (at most once), scalar. */
+        for (i = (m & 1); i--;)
+        {
+            pb0 = B;
+
+            a0 = pa0[0];
+            b0 = pb0[0];
+            tmp0 = a0 * b0;
+
+            b1 = pb0[1];
+            tmp1 = a0 * b1;
+
+            b2 = pb0[2];
+            tmp2 = a0 * b2;
+
+            b3 = pb0[3];
+            tmp3 = a0 * b3;
+
+            pa0 += 1;
+            pb0 += 4;
+
+            for (l = ((k - 1) / 2); l--;)
+            {
+                a0 = pa0[0];
+                b0 = pb0[0];
+                tmp0 += a0 * b0;
+
+                b1 = pb0[1];
+                tmp1 += a0 * b1;
+
+                b2 = pb0[2];
+                tmp2 += a0 * b2;
+
+                b3 = pb0[3];
+                tmp3 += a0 * b3;
+
+                pa0 += 1;
+                pb0 += 4;
+
+                a0 = pa0[0];
+                b0 = pb0[0];
+                tmp0 += a0 * b0;
+
+                b1 = pb0[1];
+                tmp1 += a0 * b1;
+
+                b2 = pb0[2];
+                tmp2 += a0 * b2;
+
+                b3 = pb0[3];
+                tmp3 += a0 * b3;
+
+                pa0 += 1;
+                pb0 += 4;
+            }
+
+            if ((k - 1) & 1)
+            {
+                a0 = pa0[0];
+                b0 = pb0[0];
+                tmp0 += a0 * b0;
+
+                b1 = pb0[1];
+                tmp1 += a0 * b1;
+
+                b2 = pb0[2];
+                tmp2 += a0 * b2;
+
+                b3 = pb0[3];
+                tmp3 += a0 * b3;
+
+                pa0 += 1;
+                pb0 += 4;
+            }
+
+            tmp0 = alpha * tmp0;
+            tmp1 = alpha * tmp1;
+            tmp2 = alpha * tmp2;
+            tmp3 = alpha * tmp3;
+
+            pc0[0] += tmp0;
+            pc1[0] += tmp1;
+            pc2[0] += tmp2;
+            pc3[0] += tmp3;
+
+            pc0 += 1;
+            pc1 += 1;
+            pc2 += 1;
+            pc3 += 1;
+        }
+        /* Step B by 4 * k and C by 4 * ldc. */
+        l = (k << 2);
+        B = B + l;
+        i = (ldc << 2);
+        C = C + i;
+    }
+    /* ---- n remainder of 2 (runs at most once): two lines of C, two B values per k-step. ---- */
+    for (j = ((n & 2) / 2); j--;)
+    {
+        pc0 = C;
+        pc1 = pc0 + ldc;
+
+        pa0 = A;
+
+        for (i = (m / 8); i--;) /* 8 x 2 tile */
+        {
+            pb0 = B;
+
+            LD_SP2(pa0, 4, src_a0, src_a1);
+            src_b0[0] = pb0[0]; /* only lanes 0 and 1 of src_b0 are written here; the two shuffles below read just those lanes */
+            src_b0[1] = pb0[1];
+
+            src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
+            res0 = src_a0 * src_b;
+            res1 = src_a1 * src_b;
+
+            src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
+            res2 = src_a0 * src_b;
+            res3 = src_a1 * src_b;
+
+            pa0 += 8;
+            pb0 += 2;
+
+            for (l = ((k - 1) / 2); l--;)
+            {
+                LD_SP2(pa0, 4, src_a0, src_a1);
+                src_b0[0] = pb0[0];
+                src_b0[1] = pb0[1];
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
+                res0 += src_a0 * src_b;
+                res1 += src_a1 * src_b;
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
+                res2 += src_a0 * src_b;
+                res3 += src_a1 * src_b;
+
+                pa0 += 8;
+                pb0 += 2;
+
+                LD_SP2(pa0, 4, src_a0, src_a1);
+                src_b0[0] = pb0[0];
+                src_b0[1] = pb0[1];
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
+                res0 += src_a0 * src_b;
+                res1 += src_a1 * src_b;
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
+                res2 += src_a0 * src_b;
+                res3 += src_a1 * src_b;
+
+                pa0 += 8;
+                pb0 += 2;
+            }
+
+            if ((k - 1) & 1)
+            {
+                LD_SP2(pa0, 4, src_a0, src_a1);
+                src_b0[0] = pb0[0];
+                src_b0[1] = pb0[1];
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
+                res0 += src_a0 * src_b;
+                res1 += src_a1 * src_b;
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
+                res2 += src_a0 * src_b;
+                res3 += src_a1 * src_b;
+
+                pa0 += 8;
+                pb0 += 2;
+            }
+
+            LD_SP2(pc0, 4, dst0, dst1);
+            LD_SP2(pc1, 4, dst2, dst3);
+
+            dst0 += res0 * v_alpha;
+            dst1 += res1 * v_alpha;
+            dst2 += res2 * v_alpha;
+            dst3 += res3 * v_alpha;
+
+            ST_SP2(dst0, dst1, pc0, 4);
+            ST_SP2(dst2, dst3, pc1, 4);
+
+            pc0 += 8;
+            pc1 += 8;
+        }
+        /* 4 x 2 tile (at most once). */
+        for (i = ((m & 4) / 4); i--;)
+        {
+            pb0 = B;
+
+            src_a0 = LD_SP(pa0);
+            src_b0[0] = pb0[0];
+            src_b0[1] = pb0[1];
+
+            src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
+            res0 = src_a0 * src_b;
+
+            src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
+            res1 = src_a0 * src_b;
+
+            pa0 += 4;
+            pb0 += 2;
+
+            for (l = ((k - 1) / 2); l--;)
+            {
+                src_a0 = LD_SP(pa0);
+                src_b0[0] = pb0[0];
+                src_b0[1] = pb0[1];
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
+                res0 += src_a0 * src_b;
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
+                res1 += src_a0 * src_b;
+
+                pa0 += 4;
+                pb0 += 2;
+
+                src_a0 = LD_SP(pa0);
+                src_b0[0] = pb0[0];
+                src_b0[1] = pb0[1];
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
+                res0 += src_a0 * src_b;
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
+                res1 += src_a0 * src_b;
+
+                pa0 += 4;
+                pb0 += 2;
+            }
+
+            if ((k - 1) & 1)
+            {
+                src_a0 = LD_SP(pa0);
+                src_b0[0] = pb0[0];
+                src_b0[1] = pb0[1];
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
+                res0 += src_a0 * src_b;
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
+                res1 += src_a0 * src_b;
+
+                pa0 += 4;
+                pb0 += 2;
+            }
+
+            dst0 = LD_SP(pc0);
+            dst1 = LD_SP(pc1);
+
+            dst0 += res0 * v_alpha;
+            dst1 += res1 * v_alpha;
+
+            ST_SP(dst0, pc0);
+            ST_SP(dst1, pc1);
+
+            pc0 += 4;
+            pc1 += 4;
+        }
+        /* 2 x 2 tile (at most once), scalar. */
+        for (i = ((m & 2) / 2); i--;)
+        {
+            pb0 = B;
+
+            a0 = pa0[0];
+            b0 = pb0[0];
+            tmp0 = a0 * b0;
+
+            a1 = pa0[1];
+            tmp1 = a1 * b0;
+
+            b1 = pb0[1];
+            tmp2 = a0 * b1;
+            tmp3 = a1 * b1;
+
+            pa0 += 2;
+            pb0 += 2;
+
+            for (l = ((k - 1) / 2); l--;)
+            {
+                a0 = pa0[0];
+                b0 = pb0[0];
+                tmp0 += a0 * b0;
+
+                a1 = pa0[1];
+                tmp1 += a1 * b0;
+
+                b1 = pb0[1];
+                tmp2 += a0 * b1;
+                tmp3 += a1 * b1;
+
+                pa0 += 2;
+                pb0 += 2;
+
+                a0 = pa0[0];
+                b0 = pb0[0];
+                tmp0 += a0 * b0;
+
+                a1 = pa0[1];
+                tmp1 += a1 * b0;
+
+                b1 = pb0[1];
+                tmp2 += a0 * b1;
+                tmp3 += a1 * b1;
+
+                pa0 += 2;
+                pb0 += 2;
+            }
+
+            if ((k - 1) & 1)
+            {
+                a0 = pa0[0];
+                b0 = pb0[0];
+                tmp0 += a0 * b0;
+
+                a1 = pa0[1];
+                tmp1 += a1 * b0;
+
+                b1 = pb0[1];
+                tmp2 += a0 * b1;
+                tmp3 += a1 * b1;
+
+                pa0 += 2;
+                pb0 += 2;
+            }
+
+            tmp0 = alpha * tmp0;
+            tmp2 = alpha * tmp2;
+
+            pc0[0] += tmp0;
+            pc1[0] += tmp2;
+
+            tmp1 = alpha * tmp1;
+            tmp3 = alpha * tmp3;
+
+            pc0[1] += tmp1;
+            pc1[1] += tmp3;
+
+            pc0 += 2;
+            pc1 += 2;
+        }
+        /* 1 x 2 tile (at most once), scalar. */
+        for (i = (m & 1); i--;)
+        {
+            pb0 = B;
+
+            a0 = pa0[0];
+            b0 = pb0[0];
+            tmp0 = a0 * b0;
+
+            b1 = pb0[1];
+            tmp1 = a0 * b1;
+
+            pa0 += 1;
+            pb0 += 2;
+
+            for (l = ((k - 1) / 2); l--;)
+            {
+                a0 = pa0[0];
+                b0 = pb0[0];
+                tmp0 += a0 * b0;
+
+                b1 = pb0[1];
+                tmp1 += a0 * b1;
+
+                pa0 += 1;
+                pb0 += 2;
+
+                a0 = pa0[0];
+                b0 = pb0[0];
+                tmp0 += a0 * b0;
+
+                b1 = pb0[1];
+                tmp1 += a0 * b1;
+
+                pa0 += 1;
+                pb0 += 2;
+            }
+
+            if ((k - 1) & 1)
+            {
+                a0 = pa0[0];
+                b0 = pb0[0];
+                tmp0 += a0 * b0;
+
+                b1 = pb0[1];
+                tmp1 += a0 * b1;
+
+                pa0 += 1;
+                pb0 += 2;
+            }
+
+            tmp0 = alpha * tmp0;
+            tmp1 = alpha * tmp1;
+
+            pc0[0] += tmp0;
+            pc1[0] += tmp1;
+
+            pc0 += 1;
+            pc1 += 1;
+        }
+        /* Step B by 2 * k and C by 2 * ldc. */
+        l = (k << 1);
+        B = B + l;
+        i = (ldc << 1);
+        C = C + i;
+    }
+    /* ---- n remainder of 1 (runs at most once): a single line of C, one B value per k-step. ---- */
+    for (j = (n & 1); j--;)
+    {
+        pc0 = C;
+        pa0 = A;
+
+        for (i = (m / 8); i--;) /* 8 x 1 tile */
+        {
+            pb0 = B;
+
+            LD_SP2(pa0, 4, src_a0, src_a1);
+            src_b0[0] = pb0[0]; /* only lane 0 is written; the shuffle below reads lane 0 only */
+
+            src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
+            res0 = src_a0 * src_b;
+            res1 = src_a1 * src_b;
+
+            pa0 += 8;
+            pb0 += 1;
+
+            for (l = ((k - 1) / 2); l--;)
+            {
+                LD_SP2(pa0, 4, src_a0, src_a1);
+                src_b0[0] = pb0[0];
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
+                res0 += src_a0 * src_b;
+                res1 += src_a1 * src_b;
+
+                pa0 += 8;
+                pb0 += 1;
+
+                LD_SP2(pa0, 4, src_a0, src_a1);
+                src_b0[0] = pb0[0];
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
+                res0 += src_a0 * src_b;
+                res1 += src_a1 * src_b;
+
+                pa0 += 8;
+                pb0 += 1;
+            }
+
+            if ((k - 1) & 1)
+            {
+                LD_SP2(pa0, 4, src_a0, src_a1);
+                src_b0[0] = pb0[0];
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
+                res0 += src_a0 * src_b;
+                res1 += src_a1 * src_b;
+
+                pa0 += 8;
+                pb0 += 1;
+            }
+
+            LD_SP2(pc0, 4, dst0, dst1);
+
+            dst0 += res0 * v_alpha;
+            dst1 += res1 * v_alpha;
+
+            ST_SP2(dst0, dst1, pc0, 4);
+
+            pc0 += 8;
+        }
+        /* 4 x 1 tile (at most once). */
+        for (i = ((m & 4) / 4); i--;)
+        {
+            pb0 = B;
+
+            src_a0 = LD_SP(pa0);
+            src_b0[0] = pb0[0];
+
+            src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
+            res0 = src_a0 * src_b;
+
+            pa0 += 4;
+            pb0 += 1;
+
+            for (l = ((k - 1) / 2); l--;)
+            {
+                src_a0 = LD_SP(pa0);
+                src_b0[0] = pb0[0];
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
+                res0 += src_a0 * src_b;
+
+                pa0 += 4;
+                pb0 += 1;
+
+                src_a0 = LD_SP(pa0);
+                src_b0[0] = pb0[0];
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
+                res0 += src_a0 * src_b;
+
+                pa0 += 4;
+                pb0 += 1;
+            }
+
+            if ((k - 1) & 1)
+            {
+                src_a0 = LD_SP(pa0);
+                src_b0[0] = pb0[0];
+
+                src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
+                res0 += src_a0 * src_b;
+
+                pa0 += 4;
+                pb0 += 1;
+            }
+
+            dst0 = LD_SP(pc0);
+
+            dst0 += res0 * v_alpha;
+
+            ST_SP(dst0, pc0);
+
+            pc0 += 4;
+        }
+        /* 2 x 1 tile (at most once), scalar. */
+        for (i = (m & 2) / 2; i--;)
+        {
+            pb0 = B;
+
+            a0 = pa0[0];
+            b0 = pb0[0];
+            tmp0 = a0 * b0;
+
+            a1 = pa0[1];
+            tmp1 = a1 * b0;
+
+            pa0 += 2;
+            pb0 += 1;
+
+            for (l = ((k - 1) / 2); l--;)
+            {
+                a0 = pa0[0];
+                b0 = pb0[0];
+                tmp0 += a0 * b0;
+
+                a1 = pa0[1];
+                tmp1 += a1 * b0;
+
+                pa0 += 2;
+                pb0 += 1;
+
+                a0 = pa0[0];
+                b0 = pb0[0];
+                tmp0 += a0 * b0;
+
+                a1 = pa0[1];
+                tmp1 += a1 * b0;
+
+                pa0 += 2;
+                pb0 += 1;
+            }
+
+            if ((k - 1) & 1)
+            {
+                a0 = pa0[0];
+                b0 = pb0[0];
+                tmp0 += a0 * b0;
+
+                a1 = pa0[1];
+                tmp1 += a1 * b0;
+
+                pa0 += 2;
+                pb0 += 1;
+            }
+
+            tmp0 = alpha * tmp0;
+            pc0[0] += tmp0;
+
+            tmp1 = alpha * tmp1;
+            pc0[1] += tmp1;
+
+            pc0 += 2;
+        }
+        /* 1 x 1 tile (at most once): a plain dot product over k. */
+        for (i = (m & 1); i--;)
+        {
+            pb0 = B;
+
+            a0 = pa0[0];
+            b0 = pb0[0];
+            tmp0 = a0 * b0;
+
+            pa0 += 1;
+            pb0 += 1;
+
+            for (l = ((k - 1) / 2); l--;)
+            {
+                a0 = pa0[0];
+                b0 = pb0[0];
+                tmp0 += a0 * b0;
+
+                pa0 += 1;
+                pb0 += 1;
+
+                a0 = pa0[0];
+                b0 = pb0[0];
+                tmp0 += a0 * b0;
+
+                pa0 += 1;
+                pb0 += 1;
+            }
+
+            if ((k - 1) & 1)
+            {
+                a0 = pa0[0];
+                b0 = pb0[0];
+                tmp0 += a0 * b0;
+
+                pa0 += 1;
+                pb0 += 1;
+            }
+
+            pc0[0] += alpha * tmp0;
+
+            pc0 += 1;
+        }
+        /* Step B by k and C by ldc (shift by 0 kept for symmetry with the wider groups). */
+        l = (k << 0);
+        B = B + l;
+        i = (ldc << 0);
+        C = C + i;
+    }
+
+    return 0; /* the only return path; always 0 */
+}
diff --git a/kernel/mips/sgemm_ncopy_8_msa.c b/kernel/mips/sgemm_ncopy_8_msa.c
new file mode 100644
index 000000000..71048f1c3
--- /dev/null
+++ b/kernel/mips/sgemm_ncopy_8_msa.c
@@ -0,0 +1,177 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED.
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+/* Copies n lda-spaced lines of src, each m elements long, into dst, interleaving them in groups of 8, 4, 2 and 1 lines. */
+int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, /* m: elements walked along each line; n: number of lines; lda: distance between lines */
+          FLOAT * __restrict dst) /* dst is written strictly sequentially through pdst */
+{
+    BLASLONG i, j;
+    FLOAT *psrc0; /* start of the next unprocessed group of lines */
+    FLOAT *psrc1, *psrc2, *psrc3, *psrc4; /* read cursors, one per line of the current group */
+    FLOAT *psrc5, *psrc6, *psrc7, *psrc8;
+    FLOAT *pdst; /* single write cursor into dst */
+    v4f32 src0, src1, src2, src3, src4, src5, src6, src7;
+    v4f32 src8, src9, src10, src11, src12, src13, src14, src15;
+    v4f32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v4f32 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
+
+    psrc0 = src;
+    pdst = dst;
+    /* Groups of 8 lines. */
+    for (j = (n >> 3); j--;)
+    {
+        psrc1 = psrc0;
+        psrc2 = psrc1 + lda;
+        psrc3 = psrc2 + lda;
+        psrc4 = psrc3 + lda;
+        psrc5 = psrc4 + lda;
+        psrc6 = psrc5 + lda;
+        psrc7 = psrc6 + lda;
+        psrc8 = psrc7 + lda;
+        psrc0 += 8 * lda;
+        /* 8 x 8 blocks: 8 elements from each of the 8 lines in, 64 floats out. */
+        for (i = (m >> 3); i--;)
+        {
+            LD_SP2(psrc1, 4, src0, src1); /* presumably two 4-float vector loads per line (macro from macros_msa.h) */
+            LD_SP2(psrc2, 4, src2, src3);
+            LD_SP2(psrc3, 4, src4, src5);
+            LD_SP2(psrc4, 4, src6, src7);
+            LD_SP2(psrc5, 4, src8, src9);
+            LD_SP2(psrc6, 4, src10, src11);
+            LD_SP2(psrc7, 4, src12, src13);
+            LD_SP2(psrc8, 4, src14, src15);
+            psrc1 += 8;
+            psrc2 += 8;
+            psrc3 += 8;
+            psrc4 += 8;
+            psrc5 += 8;
+            psrc6 += 8;
+            psrc7 += 8;
+            psrc8 += 8;
+            /* Four 4x4 transposes; presumably this yields the same interleaving as the scalar tail loop below - TODO confirm against TRANSPOSE4x4_SP_SP. */
+            TRANSPOSE4x4_SP_SP(src0, src2, src4, src6, dst0, dst2, dst4, dst6);
+            TRANSPOSE4x4_SP_SP(src8, src10, src12, src14, dst1, dst3, dst5,
+                               dst7);
+            TRANSPOSE4x4_SP_SP(src1, src3, src5, src7, dst8, dst10, dst12,
+                               dst14);
+            TRANSPOSE4x4_SP_SP(src9, src11, src13, src15, dst9, dst11, dst13,
+                               dst15);
+
+            ST_SP2(dst0, dst1, pdst, 4);
+            ST_SP2(dst2, dst3, pdst + 8, 4);
+            ST_SP2(dst4, dst5, pdst + 16, 4);
+            ST_SP2(dst6, dst7, pdst + 24, 4);
+            ST_SP2(dst8, dst9, pdst + 32, 4);
+            ST_SP2(dst10, dst11, pdst + 40, 4);
+            ST_SP2(dst12, dst13, pdst + 48, 4);
+            ST_SP2(dst14, dst15, pdst + 56, 4);
+            pdst += 64;
+        }
+        /* Scalar tail for the last m & 7 positions: one element from each of the 8 lines, written back to back. */
+        for (i = (m & 7); i--;)
+        {
+            *pdst++ = *psrc1++;
+            *pdst++ = *psrc2++;
+            *pdst++ = *psrc3++;
+            *pdst++ = *psrc4++;
+            *pdst++ = *psrc5++;
+            *pdst++ = *psrc6++;
+            *pdst++ = *psrc7++;
+            *pdst++ = *psrc8++;
+        }
+    }
+    /* Optional group of 4 lines. */
+    if (n & 4)
+    {
+        psrc1 = psrc0;
+        psrc2 = psrc1 + lda;
+        psrc3 = psrc2 + lda;
+        psrc4 = psrc3 + lda;
+        psrc0 += 4 * lda;
+        /* 4 x 4 blocks: one vector per line, transposed, 16 floats out. */
+        for (i = (m >> 2); i--;)
+        {
+            src0 = LD_SP(psrc1);
+            src1 = LD_SP(psrc2);
+            src2 = LD_SP(psrc3);
+            src3 = LD_SP(psrc4);
+            psrc1 += 4;
+            psrc2 += 4;
+            psrc3 += 4;
+            psrc4 += 4;
+
+            TRANSPOSE4x4_SP_SP(src0, src1, src2, src3, dst0, dst1, dst2, dst3);
+
+            ST_SP2(dst0, dst1, pdst, 4);
+            ST_SP2(dst2, dst3, pdst + 8, 4);
+            pdst += 16;
+        }
+        /* Scalar tail for the last m & 3 positions. */
+        for (i = (m & 3); i--;)
+        {
+            *pdst++ = *psrc1++;
+            *pdst++ = *psrc2++;
+            *pdst++ = *psrc3++;
+            *pdst++ = *psrc4++;
+        }
+    }
+    /* Optional group of 2 lines, scalar only. */
+    if (n & 2)
+    {
+        psrc1 = psrc0;
+        psrc2 = psrc1 + lda;
+        psrc0 += 2 * lda;
+
+        for (i = (m >> 1); i--;) /* two positions per iteration */
+        {
+            *pdst++ = *psrc1++;
+            *pdst++ = *psrc2++;
+            *pdst++ = *psrc1++;
+            *pdst++ = *psrc2++;
+        }
+
+        if (m & 1) /* odd last position */
+        {
+            *pdst++ = *psrc1++;
+            *pdst++ = *psrc2++;
+        }
+    }
+    /* Optional last single line: straight copy of m elements. */
+    if (n & 1)
+    {
+        psrc1 = psrc0;
+
+        for (i = m; i--;)
+        {
+            *pdst++ = *psrc1++;
+        }
+    }
+
+    return 0; /* the only return path; always 0 */
+}
diff --git a/kernel/mips/sgemm_tcopy_8_msa.c b/kernel/mips/sgemm_tcopy_8_msa.c
new file mode 100644
index 000000000..7d4aecb4b
--- /dev/null
+++ b/kernel/mips/sgemm_tcopy_8_msa.c
@@ -0,0 +1,292 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, + FLOAT * __restrict dst) +{ + BLASLONG i, j; + FLOAT *psrc0; + FLOAT *psrc1, *psrc2, *psrc3, *psrc4; + FLOAT *psrc5, *psrc6, *psrc7, *psrc8; + FLOAT *pdst0, *pdst1, *pdst2, *pdst3, *pdst4; + v4f32 src0, src1, src2, src3, src4, src5, src6, src7; + v4f32 src8, src9, src10, src11, src12, src13, src14, src15; + + psrc0 = src; + pdst0 = dst; + + pdst2 = dst + m * (n & ~7); + pdst3 = dst + m * (n & ~3); + pdst4 = dst + m * (n & ~1); + + for (j = (m >> 3); j--;) + { + psrc1 = psrc0; + psrc2 = psrc1 + lda; + psrc3 = psrc2 + lda; + psrc4 = psrc3 + lda; + psrc5 = psrc4 + lda; + psrc6 = psrc5 + lda; + psrc7 = psrc6 + lda; + psrc8 = psrc7 + lda; + psrc0 += 8 * lda; + + pdst1 = pdst0; + pdst0 += 64; + + for (i = (n >> 3); i--;) + { + LD_SP2(psrc1, 4, src0, src1); + LD_SP2(psrc2, 4, src2, src3); + LD_SP2(psrc3, 4, src4, src5); + LD_SP2(psrc4, 4, src6, src7); + LD_SP2(psrc5, 4, src8, src9); + LD_SP2(psrc6, 4, src10, src11); + LD_SP2(psrc7, 4, src12, src13); + LD_SP2(psrc8, 4, src14, src15); + psrc1 += 8; + psrc2 += 8; + psrc3 += 8; + psrc4 += 8; + psrc5 += 8; + psrc6 += 8; + psrc7 += 8; + psrc8 += 8; + + ST_SP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 4); + ST_SP8(src8, src9, src10, src11, src12, src13, src14, src15, + pdst1 + 32, 4); + pdst1 += m * 8; + } + + if (n & 4) + { + src0 = LD_SP(psrc1); + src1 = LD_SP(psrc2); + src2 = LD_SP(psrc3); + src3 = LD_SP(psrc4); + src4 = LD_SP(psrc5); + src5 = LD_SP(psrc6); + src6 = LD_SP(psrc7); + src7 = LD_SP(psrc8); + psrc1 += 4; + psrc2 += 4; + psrc3 += 4; + psrc4 += 4; + psrc5 += 4; + psrc6 += 4; + psrc7 += 4; + psrc8 += 4; + + ST_SP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst2, 4); + pdst2 += 32; + } + + if (n & 2) + { + *pdst3++ = *psrc1++; + *pdst3++ = *psrc1++; + *pdst3++ = *psrc2++; + *pdst3++ = *psrc2++; + 
*pdst3++ = *psrc3++; + *pdst3++ = *psrc3++; + *pdst3++ = *psrc4++; + *pdst3++ = *psrc4++; + *pdst3++ = *psrc5++; + *pdst3++ = *psrc5++; + *pdst3++ = *psrc6++; + *pdst3++ = *psrc6++; + *pdst3++ = *psrc7++; + *pdst3++ = *psrc7++; + *pdst3++ = *psrc8++; + *pdst3++ = *psrc8++; + } + + if (n & 1) + { + *pdst4++ = *psrc1++; + *pdst4++ = *psrc2++; + *pdst4++ = *psrc3++; + *pdst4++ = *psrc4++; + *pdst4++ = *psrc5++; + *pdst4++ = *psrc6++; + *pdst4++ = *psrc7++; + *pdst4++ = *psrc8++; + } + } + + if (m & 4) + { + psrc1 = psrc0; + psrc2 = psrc1 + lda; + psrc3 = psrc2 + lda; + psrc4 = psrc3 + lda; + psrc0 += 4 * lda; + + pdst1 = pdst0; + pdst0 += 32; + + for (i = (n >> 3); i--;) + { + LD_SP2(psrc1, 4, src0, src1); + LD_SP2(psrc2, 4, src2, src3); + LD_SP2(psrc3, 4, src4, src5); + LD_SP2(psrc4, 4, src6, src7); + psrc1 += 8; + psrc2 += 8; + psrc3 += 8; + psrc4 += 8; + + ST_SP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 4); + pdst1 += 8 * m; + } + + if (n & 4) + { + src0 = LD_SP(psrc1); + src1 = LD_SP(psrc2); + src2 = LD_SP(psrc3); + src3 = LD_SP(psrc4); + psrc1 += 4; + psrc2 += 4; + psrc3 += 4; + psrc4 += 4; + + ST_SP4(src0, src1, src2, src3, pdst2, 4); + pdst2 += 16; + } + + if (n & 2) + { + *pdst3++ = *psrc1++; + *pdst3++ = *psrc1++; + *pdst3++ = *psrc2++; + *pdst3++ = *psrc2++; + *pdst3++ = *psrc3++; + *pdst3++ = *psrc3++; + *pdst3++ = *psrc4++; + *pdst3++ = *psrc4++; + } + + if (n & 1) + { + *pdst4++ = *psrc1++; + *pdst4++ = *psrc2++; + *pdst4++ = *psrc3++; + *pdst4++ = *psrc4++; + } + } + + if (m & 2) + { + psrc1 = psrc0; + psrc2 = psrc1 + lda; + psrc0 += 2 * lda; + + pdst1 = pdst0; + pdst0 += 16; + + for (i = (n >> 3); i--;) + { + LD_SP2(psrc1, 4, src0, src1); + LD_SP2(psrc2, 4, src2, src3); + psrc1 += 8; + psrc2 += 8; + + ST_SP4(src0, src1, src2, src3, pdst1, 4); + pdst1 += 8 * m; + } + + if (n & 4) + { + src0 = LD_SP(psrc1); + src1 = LD_SP(psrc2); + psrc1 += 4; + psrc2 += 4; + + ST_SP2(src0, src1, pdst2, 4); + pdst2 += 8; + } + + if (n & 2) + { + *pdst3++ = 
*psrc1++; + *pdst3++ = *psrc1++; + *pdst3++ = *psrc2++; + *pdst3++ = *psrc2++; + } + + if (n & 1) + { + *pdst4++ = *psrc1++; + *pdst4++ = *psrc2++; + } + } + + if (m & 1) + { + psrc1 = psrc0; + psrc0 += lda; + + pdst1 = pdst0; + pdst0 += 8; + + for (i = (n >> 3); i--;) + { + LD_SP2(psrc1, 4, src0, src1); + psrc1 += 8; + + ST_SP2(src0, src1, pdst1, 4); + pdst1 += 8 * m; + } + + if (n & 4) + { + src0 = LD_SP(psrc1); + psrc1 += 4; + + ST_SP(src0, pdst2); + pdst2 += 4; + } + + if (n & 2) + { + *pdst3++ = *psrc1++; + *pdst3++ = *psrc1++; + } + + if (n & 1) + { + *pdst4++ = *psrc1++; + } + } + + return 0; +} diff --git a/param.h b/param.h index 67f0578e8..fdc9d1104 100644 --- a/param.h +++ b/param.h @@ -2182,8 +2182,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x03fffUL -#define SGEMM_DEFAULT_UNROLL_M 2 -#define SGEMM_DEFAULT_UNROLL_N 2 +#define SGEMM_DEFAULT_UNROLL_M 8 +#define SGEMM_DEFAULT_UNROLL_N 8 #define DGEMM_DEFAULT_UNROLL_M 8 #define DGEMM_DEFAULT_UNROLL_N 4 @@ -2221,8 +2221,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x03fffUL -#define SGEMM_DEFAULT_UNROLL_M 2 -#define SGEMM_DEFAULT_UNROLL_N 2 +#define SGEMM_DEFAULT_UNROLL_M 8 +#define SGEMM_DEFAULT_UNROLL_N 8 #define DGEMM_DEFAULT_UNROLL_M 8 #define DGEMM_DEFAULT_UNROLL_N 4 From e12cff87b86615f5a4643d246a6c1963a0e81ca5 Mon Sep 17 00:00:00 2001 From: Vicente Olivert Riera Date: Thu, 19 May 2016 10:35:45 +0100 Subject: [PATCH 36/70] Makefile.system: P5600 and I6400 cores need -mmsa Signed-off-by: Vicente Olivert Riera --- Makefile.system | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/Makefile.system b/Makefile.system index 73361fed1..24a7a6406 100644 --- a/Makefile.system +++ b/Makefile.system @@ -521,6 +521,16 @@ CCOMMON_OPT += -march=mips64 FCOMMON_OPT += -march=mips64 endif +ifeq ($(CORE), P5600) +CCOMMON_OPT += -mmsa +FCOMMON_OPT += -mmsa +endif + +ifeq ($(CORE), I6400) +CCOMMON_OPT += -mmsa +FCOMMON_OPT += -mmsa +endif + ifeq ($(OSNAME), AIX) BINARY_DEFINED = 1 endif From ad9f3178705130d590cec55475b7039d3ae4c1ad Mon Sep 17 00:00:00 2001 From: Kaustubh Raste Date: Fri, 20 May 2016 10:59:03 +0530 Subject: [PATCH 37/70] STRSM optimization for MIPS P5600 and I6400 using MSA Signed-off-by: Kaustubh Raste --- CONTRIBUTORS.md | 1 + kernel/mips/KERNEL.P5600 | 8 +- kernel/mips/dtrsm_kernel_LN_8x4_msa.c | 6 +- kernel/mips/dtrsm_kernel_LT_8x4_msa.c | 6 +- kernel/mips/dtrsm_kernel_RN_8x4_msa.c | 2 +- kernel/mips/dtrsm_kernel_RT_8x4_msa.c | 2 +- kernel/mips/macros_msa.h | 24 + kernel/mips/strsm_kernel_LN_8x8_msa.c | 2133 ++++++++++++++++++++++++ kernel/mips/strsm_kernel_LT_8x8_msa.c | 2099 ++++++++++++++++++++++++ kernel/mips/strsm_kernel_RN_8x8_msa.c | 2162 +++++++++++++++++++++++++ kernel/mips/strsm_kernel_RT_8x8_msa.c | 2118 ++++++++++++++++++++++++ 11 files changed, 8549 insertions(+), 12 deletions(-) create mode 100644 kernel/mips/strsm_kernel_LN_8x8_msa.c create mode 100644 kernel/mips/strsm_kernel_LT_8x8_msa.c create mode 100644 
kernel/mips/strsm_kernel_RN_8x8_msa.c create mode 100644 kernel/mips/strsm_kernel_RT_8x8_msa.c diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index a13308f71..5ecf32b91 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -160,3 +160,4 @@ In chronological order: * Kaustubh Raste * [2016-05-09] DTRSM optimization for MIPS P5600 and I6400 using MSA + * [2016-05-20] STRSM optimization for MIPS P5600 and I6400 using MSA diff --git a/kernel/mips/KERNEL.P5600 b/kernel/mips/KERNEL.P5600 index d7d49055f..802f0e0e5 100644 --- a/kernel/mips/KERNEL.P5600 +++ b/kernel/mips/KERNEL.P5600 @@ -113,10 +113,10 @@ ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMONCOPYOBJ = zgemm_oncopy.o ZGEMMOTCOPYOBJ = zgemm_otcopy.o -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +STRSMKERNEL_LN = ../mips/strsm_kernel_LN_8x8_msa.c +STRSMKERNEL_LT = ../mips/strsm_kernel_LT_8x8_msa.c +STRSMKERNEL_RN = ../mips/strsm_kernel_RN_8x8_msa.c +STRSMKERNEL_RT = ../mips/strsm_kernel_RT_8x8_msa.c DTRSMKERNEL_LN = ../mips/dtrsm_kernel_LN_8x4_msa.c DTRSMKERNEL_LT = ../mips/dtrsm_kernel_LT_8x4_msa.c diff --git a/kernel/mips/dtrsm_kernel_LN_8x4_msa.c b/kernel/mips/dtrsm_kernel_LN_8x4_msa.c index d0792bf85..dc21dab45 100644 --- a/kernel/mips/dtrsm_kernel_LN_8x4_msa.c +++ b/kernel/mips/dtrsm_kernel_LN_8x4_msa.c @@ -1170,7 +1170,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, for (j = (n >> 2); j--;) { - kk = m; + kk = m + offset; if (m & 7) { @@ -1233,7 +1233,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, { if (n & 2) { - kk = m; + kk = m + offset; if (m & 7) { @@ -1291,7 +1291,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, if (n & 1) { - kk = m; + kk = m + offset; if (m & 7) { diff --git a/kernel/mips/dtrsm_kernel_LT_8x4_msa.c b/kernel/mips/dtrsm_kernel_LT_8x4_msa.c index 
db902c0de..897fd313b 100644 --- a/kernel/mips/dtrsm_kernel_LT_8x4_msa.c +++ b/kernel/mips/dtrsm_kernel_LT_8x4_msa.c @@ -1182,7 +1182,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, for (j = (n >> 2); j--;) { - kk = 0; + kk = offset; aa = a; cc = c; @@ -1233,7 +1233,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, { if (n & 2) { - kk = 0; + kk = offset; aa = a; cc = c; @@ -1282,7 +1282,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, if (n & 1) { - kk = 0; + kk = offset; aa = a; cc = c; diff --git a/kernel/mips/dtrsm_kernel_RN_8x4_msa.c b/kernel/mips/dtrsm_kernel_RN_8x4_msa.c index 518daad13..44313241e 100644 --- a/kernel/mips/dtrsm_kernel_RN_8x4_msa.c +++ b/kernel/mips/dtrsm_kernel_RN_8x4_msa.c @@ -809,7 +809,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, BLASLONG i, j, kk; FLOAT *aa, *cc; - kk = 0; + kk = -offset; for (j = (n >> 2); j--;) { diff --git a/kernel/mips/dtrsm_kernel_RT_8x4_msa.c b/kernel/mips/dtrsm_kernel_RT_8x4_msa.c index bef87d44d..49274e5bc 100644 --- a/kernel/mips/dtrsm_kernel_RT_8x4_msa.c +++ b/kernel/mips/dtrsm_kernel_RT_8x4_msa.c @@ -865,7 +865,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, BLASLONG i, j, kk; FLOAT *aa, *cc, *bb; - kk = n; + kk = n - offset; c += n * ldc; b += n * k; diff --git a/kernel/mips/macros_msa.h b/kernel/mips/macros_msa.h index fad6dd6cd..ae85220c6 100644 --- a/kernel/mips/macros_msa.h +++ b/kernel/mips/macros_msa.h @@ -137,6 +137,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. } #define ILVRL_D2_DP(...) 
ILVRL_D2(v2f64, __VA_ARGS__) +/* Description : Indexed word element values are replicated to all + elements in output vector + Arguments : Inputs - in, stidx + Outputs - out0, out1 + Return Type - as per RTYPE + Details : 'stidx' element value from 'in' vector is replicated to all + elements in 'out0' vector + 'stidx + 1' element value from 'in' vector is replicated to all + elements in 'out1' vector + Valid index range for word operation is 0-3 +*/ +#define SPLATI_W2(RTYPE, in, stidx, out0, out1) \ +{ \ + out0 = (RTYPE) __msa_splati_w((v4i32) in, stidx); \ + out1 = (RTYPE) __msa_splati_w((v4i32) in, (stidx+1)); \ +} + +#define SPLATI_W4(RTYPE, in, out0, out1, out2, out3) \ +{ \ + SPLATI_W2(RTYPE, in, 0, out0, out1); \ + SPLATI_W2(RTYPE, in, 2, out2, out3); \ +} +#define SPLATI_W4_SP(...) SPLATI_W4(v4f32, __VA_ARGS__) + /* Description : Transpose 4x4 block with word elements in vectors Arguments : Inputs - in0, in1, in2, in3 Outputs - out0, out1, out2, out3 diff --git a/kernel/mips/strsm_kernel_LN_8x8_msa.c b/kernel/mips/strsm_kernel_LN_8x8_msa.c new file mode 100644 index 000000000..3db7da3c4 --- /dev/null +++ b/kernel/mips/strsm_kernel_LN_8x8_msa.c @@ -0,0 +1,2133 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +static void ssolve_8x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v4f32 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15; + v4f32 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7; + v4f32 res_c8, res_c9, res_c10, res_c11, res_c12, res_c13, res_c14, res_c15; + v4f32 src_a, src_a0, src_a8, src_a9, src_a16, src_a17, src_a18, src_a24; + v4f32 src_a25, src_a26, src_a27, src_a32, src_a33, src_a34, src_a35, src_a36; + v4f32 src_a40, src_a41, src_a42, src_a43, src_a44, src_a45; + v4f32 src_a48, src_a49, src_a50, src_a51, src_a52, src_a53, src_a54; + v4f32 src_a56, src_a57, src_a58, src_a59, src_a60, src_a61, src_a62, src_a63; + FLOAT *c_nxt1line = c + ldc; + FLOAT *c_nxt2line = c + 2 * ldc; + FLOAT *c_nxt3line = c + 3 * ldc; + FLOAT *c_nxt4line = c + 4 * ldc; + FLOAT 
*c_nxt5line = c + 5 * ldc; + FLOAT *c_nxt6line = c + 6 * ldc; + FLOAT *c_nxt7line = c + 7 * ldc; + + if (bk) + { + BLASLONG k; + FLOAT *aa = a, *bb = b; + v4f32 src_b, src_b0, src_b1, src_b2, src_b3, src_a1; + v4f32 res0, res1, res2, res3, res4, res5, res6, res7; + v4f32 res8, res9, res10, res11, res12, res13, res14, res15; + + LD_SP2(aa, 4, src_a0, src_a1); + + src_b = LD_SP(bb + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 = src_a0 * src_b0; + res1 = src_a1 * src_b0; + res2 = src_a0 * src_b1; + res3 = src_a1 * src_b1; + res4 = src_a0 * src_b2; + res5 = src_a1 * src_b2; + res6 = src_a0 * src_b3; + res7 = src_a1 * src_b3; + + src_b = LD_SP(bb + 4); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res8 = src_a0 * src_b0; + res9 = src_a1 * src_b0; + res10 = src_a0 * src_b1; + res11 = src_a1 * src_b1; + res12 = src_a0 * src_b2; + res13 = src_a1 * src_b2; + res14 = src_a0 * src_b3; + res15 = src_a1 * src_b3; + + aa += 8; + bb += 8; + + for (k = (bk - 1); k--;) + { + LD_SP2(aa, 4, src_a0, src_a1); + + src_b = LD_SP(bb + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 += src_a0 * src_b0; + res1 += src_a1 * src_b0; + res2 += src_a0 * src_b1; + res3 += src_a1 * src_b1; + res4 += src_a0 * src_b2; + res5 += src_a1 * src_b2; + res6 += src_a0 * src_b3; + res7 += src_a1 * src_b3; + + src_b = LD_SP(bb + 4); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res8 += src_a0 * src_b0; + res9 += src_a1 * src_b0; + res10 += src_a0 * src_b1; + res11 += src_a1 * src_b1; + res12 += src_a0 * src_b2; + res13 += src_a1 * src_b2; + res14 += src_a0 * src_b3; + res15 += src_a1 * src_b3; + + aa += 8; + bb += 8; + } + + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + LD_SP2(c_nxt2line, 4, src_c4, src_c5); + LD_SP2(c_nxt3line, 4, src_c6, src_c7); + LD_SP2(c_nxt4line, 4, src_c8, src_c9); + LD_SP2(c_nxt5line, 4, src_c10, src_c11); + LD_SP2(c_nxt6line, 4, src_c12, src_c13); + LD_SP2(c_nxt7line, 4, src_c14, src_c15); + + 
src_c0 -= res0; + src_c1 -= res1; + src_c2 -= res2; + src_c3 -= res3; + src_c4 -= res4; + src_c5 -= res5; + src_c6 -= res6; + src_c7 -= res7; + src_c8 -= res8; + src_c9 -= res9; + src_c10 -= res10; + src_c11 -= res11; + src_c12 -= res12; + src_c13 -= res13; + src_c14 -= res14; + src_c15 -= res15; + } + else + { + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + LD_SP2(c_nxt2line, 4, src_c4, src_c5); + LD_SP2(c_nxt3line, 4, src_c6, src_c7); + LD_SP2(c_nxt4line, 4, src_c8, src_c9); + LD_SP2(c_nxt5line, 4, src_c10, src_c11); + LD_SP2(c_nxt6line, 4, src_c12, src_c13); + LD_SP2(c_nxt7line, 4, src_c14, src_c15); + } + + a -= 64; + b -= 64; + + TRANSPOSE4x4_SP_SP(src_c1, src_c3, src_c5, src_c7, + res_c4, res_c5, res_c6, res_c7); + TRANSPOSE4x4_SP_SP(src_c9, src_c11, src_c13, src_c15, + res_c12, res_c13, res_c14, res_c15); + TRANSPOSE4x4_SP_SP(src_c0, src_c2, src_c4, src_c6, + res_c0, res_c1, res_c2, res_c3); + TRANSPOSE4x4_SP_SP(src_c8, src_c10, src_c12, src_c14, + res_c8, res_c9, res_c10, res_c11); + + src_a = LD_SP(a + 60); + SPLATI_W4_SP(src_a, src_a60, src_a61, src_a62, src_a63); + src_a = LD_SP(a + 56); + SPLATI_W4_SP(src_a, src_a56, src_a57, src_a58, src_a59); + + res_c7 *= src_a63; + res_c15 *= src_a63; + + res_c6 -= res_c7 * src_a62; + res_c14 -= res_c15 * src_a62; + + res_c5 -= res_c7 * src_a61; + res_c13 -= res_c15 * src_a61; + + res_c4 -= res_c7 * src_a60; + res_c12 -= res_c15 * src_a60; + + res_c3 -= res_c7 * src_a59; + res_c11 -= res_c15 * src_a59; + + res_c2 -= res_c7 * src_a58; + res_c10 -= res_c15 * src_a58; + + res_c1 -= res_c7 * src_a57; + res_c9 -= res_c15 * src_a57; + + res_c0 -= res_c7 * src_a56; + res_c8 -= res_c15 * src_a56; + + src_a = LD_SP(a + 48); + SPLATI_W4_SP(src_a, src_a48, src_a49, src_a50, src_a51); + src_a52 = LD_SP(a + 52); + src_a54 = (v4f32) __msa_splati_w((v4i32) src_a52, 2); + src_a53 = (v4f32) __msa_splati_w((v4i32) src_a52, 1); + src_a52 = (v4f32) __msa_splati_w((v4i32) src_a52, 0); + + res_c6 *= src_a54; + 
res_c14 *= src_a54; + + res_c5 -= res_c6 * src_a53; + res_c13 -= res_c14 * src_a53; + + res_c4 -= res_c6 * src_a52; + res_c12 -= res_c14 * src_a52; + + res_c3 -= res_c6 * src_a51; + res_c11 -= res_c14 * src_a51; + + res_c2 -= res_c6 * src_a50; + res_c10 -= res_c14 * src_a50; + + res_c1 -= res_c6 * src_a49; + res_c9 -= res_c14 * src_a49; + + res_c0 -= res_c6 * src_a48; + res_c8 -= res_c14 * src_a48; + + src_a = LD_SP(a + 40); + SPLATI_W4_SP(src_a, src_a40, src_a41, src_a42, src_a43); + src_a44 = LD_SP(a + 44); + src_a45 = (v4f32) __msa_splati_w((v4i32) src_a44, 1); + src_a44 = (v4f32) __msa_splati_w((v4i32) src_a44, 0); + + res_c5 *= src_a45; + res_c13 *= src_a45; + + res_c4 -= res_c5 * src_a44; + res_c12 -= res_c13 * src_a44; + + res_c3 -= res_c5 * src_a43; + res_c11 -= res_c13 * src_a43; + + res_c2 -= res_c5 * src_a42; + res_c10 -= res_c13 * src_a42; + + res_c1 -= res_c5 * src_a41; + res_c9 -= res_c13 * src_a41; + + res_c0 -= res_c5 * src_a40; + res_c8 -= res_c13 * src_a40; + + src_a = LD_SP(a + 32); + SPLATI_W4_SP(src_a, src_a32, src_a33, src_a34, src_a35); + src_a36 = __msa_cast_to_vector_float(*(a + 36)); + src_a36 = (v4f32) __msa_splati_w((v4i32) src_a36, 0); + + res_c4 *= src_a36; + res_c12 *= src_a36; + + res_c3 -= res_c4 * src_a35; + res_c11 -= res_c12 * src_a35; + + res_c2 -= res_c4 * src_a34; + res_c10 -= res_c12 * src_a34; + + res_c1 -= res_c4 * src_a33; + res_c9 -= res_c12 * src_a33; + + res_c0 -= res_c4 * src_a32; + res_c8 -= res_c12 * src_a32; + + ST_SP4(res_c4, res_c12, res_c5, res_c13, b + 32, 4); + ST_SP4(res_c6, res_c14, res_c7, res_c15, b + 48, 4); + + TRANSPOSE4x4_SP_SP(res_c4, res_c5, res_c6, res_c7, + src_c1, src_c3, src_c5, src_c7); + TRANSPOSE4x4_SP_SP(res_c12, res_c13, res_c14, res_c15, + src_c9, src_c11, src_c13, src_c15); + + ST_SP(src_c1, c + 4); + ST_SP(src_c3, c_nxt1line + 4); + ST_SP(src_c5, c_nxt2line + 4); + ST_SP(src_c7, c_nxt3line + 4); + ST_SP(src_c9, c_nxt4line + 4); + ST_SP(src_c11, c_nxt5line + 4); + ST_SP(src_c13, c_nxt6line 
+ 4); + ST_SP(src_c15, c_nxt7line + 4); + + src_a = LD_SP(a + 24); + SPLATI_W4_SP(src_a, src_a24, src_a25, src_a26, src_a27); + + res_c3 *= src_a27; + res_c11 *= src_a27; + + res_c2 -= res_c3 * src_a26; + res_c10 -= res_c11 * src_a26; + + res_c1 -= res_c3 * src_a25; + res_c9 -= res_c11 * src_a25; + + res_c0 -= res_c3 * src_a24; + res_c8 -= res_c11 * src_a24; + + src_a16 = LD_SP(a + 16); + src_a18 = (v4f32) __msa_splati_w((v4i32) src_a16, 2); + src_a17 = (v4f32) __msa_splati_w((v4i32) src_a16, 1); + src_a16 = (v4f32) __msa_splati_w((v4i32) src_a16, 0); + + res_c2 *= src_a18; + res_c10 *= src_a18; + + res_c1 -= res_c2 * src_a17; + res_c9 -= res_c10 * src_a17; + + res_c0 -= res_c2 * src_a16; + res_c8 -= res_c10 * src_a16; + + src_a9 = __msa_cast_to_vector_float(*(a + 9)); + src_a9 = (v4f32) __msa_splati_w((v4i32) src_a9, 0); + src_a8 = __msa_cast_to_vector_float(*(a + 8)); + src_a8 = (v4f32) __msa_splati_w((v4i32) src_a8, 0); + src_a0 = __msa_cast_to_vector_float(*(a + 0)); + src_a0 = (v4f32) __msa_splati_w((v4i32) src_a0, 0); + + res_c1 *= src_a9; + res_c9 *= src_a9; + + res_c0 -= res_c1 * src_a8; + res_c8 -= res_c9 * src_a8; + + res_c0 *= src_a0; + res_c8 *= src_a0; + + ST_SP4(res_c0, res_c8, res_c1, res_c9, b, 4); + ST_SP4(res_c2, res_c10, res_c3, res_c11, b + 16, 4); + + TRANSPOSE4x4_SP_SP(res_c0, res_c1, res_c2, res_c3, + src_c0, src_c2, src_c4, src_c6); + TRANSPOSE4x4_SP_SP(res_c8, res_c9, res_c10, res_c11, + src_c8, src_c10, src_c12, src_c14); + + ST_SP(src_c0, c); + ST_SP(src_c2, c_nxt1line); + ST_SP(src_c4, c_nxt2line); + ST_SP(src_c6, c_nxt3line); + ST_SP(src_c8, c_nxt4line); + ST_SP(src_c10, c_nxt5line); + ST_SP(src_c12, c_nxt6line); + ST_SP(src_c14, c_nxt7line); +} + +static void ssolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v4f32 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7; + v4f32 src_a, src_a0, src_a8, src_a9, src_a16, src_a17, 
src_a18, src_a24; + v4f32 src_a25, src_a26, src_a27, src_a32, src_a33, src_a34, src_a35; + v4f32 src_a36, src_a40, src_a41, src_a42, src_a43, src_a44, src_a45; + v4f32 src_a48, src_a49, src_a50, src_a51, src_a52, src_a53, src_a54; + v4f32 src_a56, src_a57, src_a58, src_a59, src_a60, src_a61, src_a62, src_a63; + FLOAT *c_nxt1line = c + ldc; + FLOAT *c_nxt2line = c + 2 * ldc; + FLOAT *c_nxt3line = c + 3 * ldc; + + if (bk > 0) + { + BLASLONG k; + FLOAT *aa = a, *bb = b; + v4f32 src_b, src_b0, src_b1, src_b2, src_b3, src_a1; + v4f32 res0, res1, res2, res3, res4, res5, res6, res7; + + LD_SP2(aa, 4, src_a0, src_a1); + + src_b = LD_SP(bb + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 = src_a0 * src_b0; + res1 = src_a1 * src_b0; + res2 = src_a0 * src_b1; + res3 = src_a1 * src_b1; + res4 = src_a0 * src_b2; + res5 = src_a1 * src_b2; + res6 = src_a0 * src_b3; + res7 = src_a1 * src_b3; + + for (k = (bk - 1); k--;) + { + aa += 8; + bb += 4; + + LD_SP2(aa, 4, src_a0, src_a1); + + src_b = LD_SP(bb + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 += src_a0 * src_b0; + res1 += src_a1 * src_b0; + res2 += src_a0 * src_b1; + res3 += src_a1 * src_b1; + res4 += src_a0 * src_b2; + res5 += src_a1 * src_b2; + res6 += src_a0 * src_b3; + res7 += src_a1 * src_b3; + } + + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + LD_SP2(c_nxt2line, 4, src_c4, src_c5); + LD_SP2(c_nxt3line, 4, src_c6, src_c7); + + src_c0 -= res0; + src_c1 -= res1; + src_c2 -= res2; + src_c3 -= res3; + src_c4 -= res4; + src_c5 -= res5; + src_c6 -= res6; + src_c7 -= res7; + } + else + { + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + LD_SP2(c_nxt2line, 4, src_c4, src_c5); + LD_SP2(c_nxt3line, 4, src_c6, src_c7); + } + + a -= 64; + b -= 32; + + TRANSPOSE4x4_SP_SP(src_c0, src_c2, src_c4, src_c6, + res_c0, res_c1, res_c2, res_c3); + TRANSPOSE4x4_SP_SP(src_c1, src_c3, src_c5, src_c7, + res_c4, res_c5, res_c6, res_c7); + + src_a = LD_SP(a 
+ 60); + SPLATI_W4_SP(src_a, src_a60, src_a61, src_a62, src_a63); + src_a = LD_SP(a + 56); + SPLATI_W4_SP(src_a, src_a56, src_a57, src_a58, src_a59); + + src_a = LD_SP(a + 48); + SPLATI_W4_SP(src_a, src_a48, src_a49, src_a50, src_a51); + src_a52 = LD_SP(a + 52); + src_a54 = (v4f32) __msa_splati_w((v4i32) src_a52, 2); + src_a53 = (v4f32) __msa_splati_w((v4i32) src_a52, 1); + src_a52 = (v4f32) __msa_splati_w((v4i32) src_a52, 0); + + res_c7 *= src_a63; + res_c6 -= res_c7 * src_a62; + res_c5 -= res_c7 * src_a61; + res_c4 -= res_c7 * src_a60; + res_c3 -= res_c7 * src_a59; + res_c2 -= res_c7 * src_a58; + res_c1 -= res_c7 * src_a57; + res_c0 -= res_c7 * src_a56; + + res_c6 *= src_a54; + res_c5 -= res_c6 * src_a53; + res_c4 -= res_c6 * src_a52; + res_c3 -= res_c6 * src_a51; + res_c2 -= res_c6 * src_a50; + res_c1 -= res_c6 * src_a49; + res_c0 -= res_c6 * src_a48; + + src_a = LD_SP(a + 40); + SPLATI_W4_SP(src_a, src_a40, src_a41, src_a42, src_a43); + src_a44 = LD_SP(a + 44); + src_a45 = (v4f32) __msa_splati_w((v4i32) src_a44, 1); + src_a44 = (v4f32) __msa_splati_w((v4i32) src_a44, 0); + + res_c5 *= src_a45; + res_c4 -= res_c5 * src_a44; + res_c3 -= res_c5 * src_a43; + res_c2 -= res_c5 * src_a42; + res_c1 -= res_c5 * src_a41; + res_c0 -= res_c5 * src_a40; + + src_a = LD_SP(a + 32); + SPLATI_W4_SP(src_a, src_a32, src_a33, src_a34, src_a35); + src_a36 = __msa_cast_to_vector_float(*(a + 36)); + src_a36 = (v4f32) __msa_splati_w((v4i32) src_a36, 0); + + res_c4 *= src_a36; + res_c3 -= res_c4 * src_a35; + res_c2 -= res_c4 * src_a34; + res_c1 -= res_c4 * src_a33; + res_c0 -= res_c4 * src_a32; + + src_a = LD_SP(a + 24); + SPLATI_W4_SP(src_a, src_a24, src_a25, src_a26, src_a27); + + res_c3 *= src_a27; + res_c2 -= res_c3 * src_a26; + res_c1 -= res_c3 * src_a25; + res_c0 -= res_c3 * src_a24; + + src_a16 = LD_SP(a + 16); + src_a18 = (v4f32) __msa_splati_w((v4i32) src_a16, 2); + src_a17 = (v4f32) __msa_splati_w((v4i32) src_a16, 1); + src_a16 = (v4f32) __msa_splati_w((v4i32) src_a16, 0); + 
+ res_c2 *= src_a18; + res_c1 -= res_c2 * src_a17; + res_c0 -= res_c2 * src_a16; + + src_a9 = __msa_cast_to_vector_float(*(a + 9)); + src_a9 = (v4f32) __msa_splati_w((v4i32) src_a9, 0); + src_a8 = __msa_cast_to_vector_float(*(a + 8)); + src_a8 = (v4f32) __msa_splati_w((v4i32) src_a8, 0); + src_a0 = __msa_cast_to_vector_float(*(a + 0)); + src_a0 = (v4f32) __msa_splati_w((v4i32) src_a0, 0); + + res_c1 *= src_a9; + res_c0 -= res_c1 * src_a8; + + res_c0 *= src_a0; + + ST_SP4(res_c0, res_c1, res_c2, res_c3, b, 4); + ST_SP4(res_c4, res_c5, res_c6, res_c7, b + 16, 4); + + TRANSPOSE4x4_SP_SP(res_c0, res_c1, res_c2, res_c3, + src_c0, src_c2, src_c4, src_c6); + TRANSPOSE4x4_SP_SP(res_c4, res_c5, res_c6, res_c7, + src_c1, src_c3, src_c5, src_c7); + + ST_SP2(src_c0, src_c1, c, 4); + ST_SP2(src_c2, src_c3, c_nxt1line, 4); + ST_SP2(src_c4, src_c5, c_nxt2line, 4); + ST_SP2(src_c6, src_c7, c_nxt3line, 4); +} + +static void ssolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT a0, a8, a9, a16, a17, a18, a24, a25, a26, a27, a32, a33, a34, a35; + FLOAT a36, a40, a41, a42, a43, a44, a45, a48, a49, a50, a51, a52, a53; + FLOAT a54, a56, a57, a58, a59, a60, a61, a62, a63; + FLOAT c0, c1, c2, c3, c4, c5, c6, c7; + FLOAT c0_nxt, c1_nxt, c2_nxt, c3_nxt, c4_nxt, c5_nxt, c6_nxt, c7_nxt; + + c0 = *(c + 0); + c1 = *(c + 1); + c2 = *(c + 2); + c3 = *(c + 3); + c4 = *(c + 4); + c5 = *(c + 5); + c6 = *(c + 6); + c7 = *(c + 7); + c0_nxt = *(c + 0 + ldc); + c1_nxt = *(c + 1 + ldc); + c2_nxt = *(c + 2 + ldc); + c3_nxt = *(c + 3 + ldc); + c4_nxt = *(c + 4 + ldc); + c5_nxt = *(c + 5 + ldc); + c6_nxt = *(c + 6 + ldc); + c7_nxt = *(c + 7 + ldc); + + if (bk > 0) + { + BLASLONG k; + FLOAT *aa = a, *bb = b; + FLOAT res[16]; + + res[0] = aa[0] * bb[0]; + res[1] = aa[1] * bb[0]; + res[2] = aa[2] * bb[0]; + res[3] = aa[3] * bb[0]; + res[4] = aa[4] * bb[0]; + res[5] = aa[5] * bb[0]; + res[6] = aa[6] * bb[0]; + res[7] = aa[7] * bb[0]; + res[8] = aa[0] * bb[1]; + res[9] = aa[1] * 
bb[1]; + res[10] = aa[2] * bb[1]; + res[11] = aa[3] * bb[1]; + res[12] = aa[4] * bb[1]; + res[13] = aa[5] * bb[1]; + res[14] = aa[6] * bb[1]; + res[15] = aa[7] * bb[1]; + + for (k = (bk - 1); k--;) + { + aa += 8; + bb += 2; + + res[0] += aa[0] * bb[0]; + res[1] += aa[1] * bb[0]; + res[2] += aa[2] * bb[0]; + res[3] += aa[3] * bb[0]; + res[4] += aa[4] * bb[0]; + res[5] += aa[5] * bb[0]; + res[6] += aa[6] * bb[0]; + res[7] += aa[7] * bb[0]; + res[8] += aa[0] * bb[1]; + res[9] += aa[1] * bb[1]; + res[10] += aa[2] * bb[1]; + res[11] += aa[3] * bb[1]; + res[12] += aa[4] * bb[1]; + res[13] += aa[5] * bb[1]; + res[14] += aa[6] * bb[1]; + res[15] += aa[7] * bb[1]; + } + + c0 -= res[0]; + c1 -= res[1]; + c2 -= res[2]; + c3 -= res[3]; + c4 -= res[4]; + c5 -= res[5]; + c6 -= res[6]; + c7 -= res[7]; + + c0_nxt -= res[8]; + c1_nxt -= res[9]; + c2_nxt -= res[10]; + c3_nxt -= res[11]; + c4_nxt -= res[12]; + c5_nxt -= res[13]; + c6_nxt -= res[14]; + c7_nxt -= res[15]; + } + + a -= 64; + b -= 16; + + a0 = *(a + 0); + a8 = *(a + 8); + a9 = *(a + 9); + a16 = *(a + 16); + a17 = *(a + 17); + a18 = *(a + 18); + a24 = *(a + 24); + a25 = *(a + 25); + a26 = *(a + 26); + a27 = *(a + 27); + a32 = *(a + 32); + a33 = *(a + 33); + a34 = *(a + 34); + a35 = *(a + 35); + a36 = *(a + 36); + a40 = *(a + 40); + a41 = *(a + 41); + a42 = *(a + 42); + a43 = *(a + 43); + a44 = *(a + 44); + a45 = *(a + 45); + a48 = *(a + 48); + a49 = *(a + 49); + a50 = *(a + 50); + a51 = *(a + 51); + a52 = *(a + 52); + a53 = *(a + 53); + a54 = *(a + 54); + a56 = *(a + 56); + a57 = *(a + 57); + a58 = *(a + 58); + a59 = *(a + 59); + a60 = *(a + 60); + a61 = *(a + 61); + a62 = *(a + 62); + a63 = *(a + 63); + + c7 *= a63; + c7_nxt *= a63; + c6 -= c7 * a62; + c6_nxt -= c7_nxt * a62; + c5 -= c7 * a61; + c5_nxt -= c7_nxt * a61; + c4 -= c7 * a60; + c4_nxt -= c7_nxt * a60; + c3 -= c7 * a59; + c3_nxt -= c7_nxt * a59; + c2 -= c7 * a58; + c2_nxt -= c7_nxt * a58; + c1 -= c7 * a57; + c1_nxt -= c7_nxt * a57; + c0 -= c7 * a56; + c0_nxt -= 
c7_nxt * a56; + + c6 *= a54; + c6_nxt *= a54; + c5 -= c6 * a53; + c5_nxt -= c6_nxt * a53; + c4 -= c6 * a52; + c4_nxt -= c6_nxt * a52; + c3 -= c6 * a51; + c3_nxt -= c6_nxt * a51; + c2 -= c6 * a50; + c2_nxt -= c6_nxt * a50; + c1 -= c6 * a49; + c1_nxt -= c6_nxt * a49; + c0 -= c6 * a48; + c0_nxt -= c6_nxt * a48; + + c5 *= a45; + c5_nxt *= a45; + c4 -= c5 * a44; + c4_nxt -= c5_nxt * a44; + c3 -= c5 * a43; + c3_nxt -= c5_nxt * a43; + c2 -= c5 * a42; + c2_nxt -= c5_nxt * a42; + c1 -= c5 * a41; + c1_nxt -= c5_nxt * a41; + c0 -= c5 * a40; + c0_nxt -= c5_nxt * a40; + + c4 *= a36; + c4_nxt *= a36; + c3 -= c4 * a35; + c3_nxt -= c4_nxt * a35; + c2 -= c4 * a34; + c2_nxt -= c4_nxt * a34; + c1 -= c4 * a33; + c1_nxt -= c4_nxt * a33; + c0 -= c4 * a32; + c0_nxt -= c4_nxt * a32; + + c3 *= a27; + c3_nxt *= a27; + c2 -= c3 * a26; + c2_nxt -= c3_nxt * a26; + c1 -= c3 * a25; + c1_nxt -= c3_nxt * a25; + c0 -= c3 * a24; + c0_nxt -= c3_nxt * a24; + + c2 *= a18; + c2_nxt *= a18; + c1 -= c2 * a17; + c1_nxt -= c2_nxt * a17; + c0 -= c2 * a16; + c0_nxt -= c2_nxt * a16; + + c1 *= a9; + c1_nxt *= a9; + c0 -= c1 * a8; + c0_nxt -= c1_nxt * a8; + + c0 *= a0; + c0_nxt *= a0; + + *(b + 0) = c0; + *(b + 1) = c0_nxt; + *(b + 2) = c1; + *(b + 3) = c1_nxt; + *(b + 4) = c2; + *(b + 5) = c2_nxt; + *(b + 6) = c3; + *(b + 7) = c3_nxt; + *(b + 8) = c4; + *(b + 9) = c4_nxt; + *(b + 10) = c5; + *(b + 11) = c5_nxt; + *(b + 12) = c6; + *(b + 13) = c6_nxt; + *(b + 14) = c7; + *(b + 15) = c7_nxt; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 2) = c2; + *(c + 3) = c3; + *(c + 4) = c4; + *(c + 5) = c5; + *(c + 6) = c6; + *(c + 7) = c7; + *(c + 0 + ldc) = c0_nxt; + *(c + 1 + ldc) = c1_nxt; + *(c + 2 + ldc) = c2_nxt; + *(c + 3 + ldc) = c3_nxt; + *(c + 4 + ldc) = c4_nxt; + *(c + 5 + ldc) = c5_nxt; + *(c + 6 + ldc) = c6_nxt; + *(c + 7 + ldc) = c7_nxt; +} + +static void ssolve_8x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + FLOAT a0, a8, a9, a16, a17, a18, a24, a25, a26, a27, a32, a33, a34, a35; + FLOAT a36, a40, 
a41, a42, a43, a44, a45, a48, a49, a50, a51, a52, a53; + FLOAT a54, a56, a57, a58, a59, a60, a61, a62, a63; + FLOAT c0, c1, c2, c3, c4, c5, c6, c7; + + c0 = *(c + 0); + c1 = *(c + 1); + c2 = *(c + 2); + c3 = *(c + 3); + c4 = *(c + 4); + c5 = *(c + 5); + c6 = *(c + 6); + c7 = *(c + 7); + + if (bk > 0) + { + BLASLONG k; + FLOAT *aa = a, *bb = b; + FLOAT t0, t1, t2, t3, t4, t5, t6, t7; + + t0 = aa[0] * bb[0]; + t1 = aa[1] * bb[0]; + t2 = aa[2] * bb[0]; + t3 = aa[3] * bb[0]; + t4 = aa[4] * bb[0]; + t5 = aa[5] * bb[0]; + t6 = aa[6] * bb[0]; + t7 = aa[7] * bb[0]; + + for (k = (bk - 1); k--;) + { + aa += 8; + bb += 1; + + t0 += aa[0] * bb[0]; + t1 += aa[1] * bb[0]; + t2 += aa[2] * bb[0]; + t3 += aa[3] * bb[0]; + t4 += aa[4] * bb[0]; + t5 += aa[5] * bb[0]; + t6 += aa[6] * bb[0]; + t7 += aa[7] * bb[0]; + } + + c0 -= t0; + c1 -= t1; + c2 -= t2; + c3 -= t3; + c4 -= t4; + c5 -= t5; + c6 -= t6; + c7 -= t7; + } + + a -= 64; + b -= 8; + + a0 = *(a + 0); + a8 = *(a + 8); + a9 = *(a + 9); + a16 = *(a + 16); + a17 = *(a + 17); + a18 = *(a + 18); + a24 = *(a + 24); + a25 = *(a + 25); + a26 = *(a + 26); + a27 = *(a + 27); + a32 = *(a + 32); + a33 = *(a + 33); + a34 = *(a + 34); + a35 = *(a + 35); + a36 = *(a + 36); + a40 = *(a + 40); + a41 = *(a + 41); + a42 = *(a + 42); + a43 = *(a + 43); + a44 = *(a + 44); + a45 = *(a + 45); + a48 = *(a + 48); + a49 = *(a + 49); + a50 = *(a + 50); + a51 = *(a + 51); + a52 = *(a + 52); + a53 = *(a + 53); + a54 = *(a + 54); + a56 = *(a + 56); + a57 = *(a + 57); + a58 = *(a + 58); + a59 = *(a + 59); + a60 = *(a + 60); + a61 = *(a + 61); + a62 = *(a + 62); + a63 = *(a + 63); + + c7 *= a63; + + c6 -= c7 * a62; + c6 *= a54; + + c5 -= c7 * a61; + c5 -= c6 * a53; + c5 *= a45; + + c4 -= c7 * a60; + c4 -= c6 * a52; + c4 -= c5 * a44; + c4 *= a36; + + c3 -= c7 * a59; + c3 -= c6 * a51; + c3 -= c5 * a43; + c3 -= c4 * a35; + c3 *= a27; + + c2 -= c7 * a58; + c2 -= c6 * a50; + c2 -= c5 * a42; + c2 -= c4 * a34; + c2 -= c3 * a26; + c2 *= a18; + + c1 -= c7 * a57; + c1 
-= c6 * a49; + c1 -= c5 * a41; + c1 -= c4 * a33; + c1 -= c3 * a25; + c1 -= c2 * a17; + c1 *= a9; + + c0 -= c7 * a56; + c0 -= c6 * a48; + c0 -= c5 * a40; + c0 -= c4 * a32; + c0 -= c3 * a24; + c0 -= c2 * a16; + c0 -= c1 * a8; + c0 *= a0; + + *(b + 0) = c0; + *(b + 1) = c1; + *(b + 2) = c2; + *(b + 3) = c3; + *(b + 4) = c4; + *(b + 5) = c5; + *(b + 6) = c6; + *(b + 7) = c7; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 2) = c2; + *(c + 3) = c3; + *(c + 4) = c4; + *(c + 5) = c5; + *(c + 6) = c6; + *(c + 7) = c7; +} + +static void ssolve_4x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v4f32 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7; + v4f32 src_a, src_a0, src_a4, src_a5, src_a8, src_a9, src_a10, src_a12; + v4f32 src_a13, src_a14, src_a15; + FLOAT *c_nxt1line = c + ldc; + FLOAT *c_nxt2line = c + 2 * ldc; + FLOAT *c_nxt3line = c + 3 * ldc; + FLOAT *c_nxt4line = c + 4 * ldc; + FLOAT *c_nxt5line = c + 5 * ldc; + FLOAT *c_nxt6line = c + 6 * ldc; + FLOAT *c_nxt7line = c + 7 * ldc; + + if (bk > 0) + { + BLASLONG k; + FLOAT *aa = a, *bb = b; + v4f32 src_b, src_b0, src_b1, src_b2, src_b3; + v4f32 res0, res1, res2, res3, res4, res5, res6, res7; + + src_a0 = LD_SP(aa); + + src_b = LD_SP(bb); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 = src_a0 * src_b0; + res1 = src_a0 * src_b1; + res2 = src_a0 * src_b2; + res3 = src_a0 * src_b3; + + src_b = LD_SP(bb + 4); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res4 = src_a0 * src_b0; + res5 = src_a0 * src_b1; + res6 = src_a0 * src_b2; + res7 = src_a0 * src_b3; + + for (k = (bk - 1); k--;) + { + aa += 4; + bb += 8; + + src_a0 = LD_SP(aa); + + src_b = LD_SP(bb); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 += src_a0 * src_b0; + res1 += src_a0 * src_b1; + res2 += src_a0 * src_b2; + res3 += src_a0 * src_b3; + + src_b = LD_SP(bb + 4); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, 
src_b3); + res4 += src_a0 * src_b0; + res5 += src_a0 * src_b1; + res6 += src_a0 * src_b2; + res7 += src_a0 * src_b3; + } + + src_c0 = LD_SP(c); + src_c1 = LD_SP(c_nxt1line); + src_c2 = LD_SP(c_nxt2line); + src_c3 = LD_SP(c_nxt3line); + src_c4 = LD_SP(c_nxt4line); + src_c5 = LD_SP(c_nxt5line); + src_c6 = LD_SP(c_nxt6line); + src_c7 = LD_SP(c_nxt7line); + + src_c0 -= res0; + src_c1 -= res1; + src_c2 -= res2; + src_c3 -= res3; + src_c4 -= res4; + src_c5 -= res5; + src_c6 -= res6; + src_c7 -= res7; + } + else + { + src_c0 = LD_SP(c); + src_c1 = LD_SP(c_nxt1line); + src_c2 = LD_SP(c_nxt2line); + src_c3 = LD_SP(c_nxt3line); + src_c4 = LD_SP(c_nxt4line); + src_c5 = LD_SP(c_nxt5line); + src_c6 = LD_SP(c_nxt6line); + src_c7 = LD_SP(c_nxt7line); + } + + a -= 16; + b -= 32; + + TRANSPOSE4x4_SP_SP(src_c0, src_c1, src_c2, src_c3, + res_c0, res_c1, res_c2, res_c3); + TRANSPOSE4x4_SP_SP(src_c4, src_c5, src_c6, src_c7, + res_c4, res_c5, res_c6, res_c7); + + src_a = LD_SP(a + 12); + SPLATI_W4_SP(src_a, src_a12, src_a13, src_a14, src_a15); + src_a8 = LD_SP(a + 8); + src_a10 = (v4f32) __msa_splati_w((v4i32) src_a8, 2); + src_a9 = (v4f32) __msa_splati_w((v4i32) src_a8, 1); + src_a8 = (v4f32) __msa_splati_w((v4i32) src_a8, 0); + src_a5 = __msa_cast_to_vector_float(*(a + 5)); + src_a5 = (v4f32) __msa_splati_w((v4i32) src_a5, 0); + src_a4 = __msa_cast_to_vector_float(*(a + 4)); + src_a4 = (v4f32) __msa_splati_w((v4i32) src_a4, 0); + src_a0 = __msa_cast_to_vector_float(*(a + 0)); + src_a0 = (v4f32) __msa_splati_w((v4i32) src_a0, 0); + + res_c3 *= src_a15; + res_c7 *= src_a15; + res_c2 -= res_c3 * src_a14; + res_c6 -= res_c7 * src_a14; + res_c1 -= res_c3 * src_a13; + res_c5 -= res_c7 * src_a13; + res_c0 -= res_c3 * src_a12; + res_c4 -= res_c7 * src_a12; + + res_c2 *= src_a10; + res_c6 *= src_a10; + res_c1 -= res_c2 * src_a9; + res_c5 -= res_c6 * src_a9; + res_c0 -= res_c2 * src_a8; + res_c4 -= res_c6 * src_a8; + + res_c1 *= src_a5; + res_c5 *= src_a5; + res_c0 -= res_c1 * src_a4; + res_c4 
-= res_c5 * src_a4; + + res_c0 *= src_a0; + res_c4 *= src_a0; + + ST_SP4(res_c0, res_c4, res_c1, res_c5, b, 4); + ST_SP4(res_c2, res_c6, res_c3, res_c7, b + 16, 4); + + TRANSPOSE4x4_SP_SP(res_c0, res_c1, res_c2, res_c3, + src_c0, src_c1, src_c2, src_c3); + TRANSPOSE4x4_SP_SP(res_c4, res_c5, res_c6, res_c7, + src_c4, src_c5, src_c6, src_c7); + + ST_SP(src_c0, c); + ST_SP(src_c1, c_nxt1line); + ST_SP(src_c2, c_nxt2line); + ST_SP(src_c3, c_nxt3line); + ST_SP(src_c4, c_nxt4line); + ST_SP(src_c5, c_nxt5line); + ST_SP(src_c6, c_nxt6line); + ST_SP(src_c7, c_nxt7line); +} + +static void ssolve_4x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v4f32 src_c0, src_c1, src_c2, src_c3, res_c0, res_c1, res_c2, res_c3; + v4f32 src_a, src_a0, src_a4, src_a5, src_a8, src_a9, src_a10, src_a12; + v4f32 src_a13, src_a14, src_a15; + FLOAT *c_nxt1line = c + ldc; + FLOAT *c_nxt2line = c + 2 * ldc; + FLOAT *c_nxt3line = c + 3 * ldc; + + if (bk > 0) + { + BLASLONG k; + FLOAT *aa = a, *bb = b; + v4f32 src_b, src_b0, src_b1, src_b2, src_b3; + v4f32 res0, res1, res2, res3; + + src_a0 = LD_SP(aa); + + src_b = LD_SP(bb); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 = src_a0 * src_b0; + res1 = src_a0 * src_b1; + res2 = src_a0 * src_b2; + res3 = src_a0 * src_b3; + + for (k = ((bk - 1) >> 1); k--;) + { + aa += 4; + bb += 4; + + src_a0 = LD_SP(aa); + + src_b = LD_SP(bb); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 += src_a0 * src_b0; + res1 += src_a0 * src_b1; + res2 += src_a0 * src_b2; + res3 += src_a0 * src_b3; + + aa += 4; + bb += 4; + + src_a0 = LD_SP(aa); + + src_b = LD_SP(bb); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 += src_a0 * src_b0; + res1 += src_a0 * src_b1; + res2 += src_a0 * src_b2; + res3 += src_a0 * src_b3; + } + + if ((bk - 1) & 1) + { + aa += 4; + bb += 4; + + src_a0 = LD_SP(aa); + + src_b = LD_SP(bb); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 += src_a0 * src_b0; + res1 += src_a0 * 
src_b1; + res2 += src_a0 * src_b2; + res3 += src_a0 * src_b3; + } + + src_c0 = LD_SP(c); + src_c1 = LD_SP(c_nxt1line); + src_c2 = LD_SP(c_nxt2line); + src_c3 = LD_SP(c_nxt3line); + + src_c0 -= res0; + src_c1 -= res1; + src_c2 -= res2; + src_c3 -= res3; + } + else + { + src_c0 = LD_SP(c); + src_c1 = LD_SP(c_nxt1line); + src_c2 = LD_SP(c_nxt2line); + src_c3 = LD_SP(c_nxt3line); + } + + a -= 16; + b -= 16; + + TRANSPOSE4x4_SP_SP(src_c0, src_c1, src_c2, src_c3, + res_c0, res_c1, res_c2, res_c3); + + src_a = LD_SP(a + 12); + SPLATI_W4_SP(src_a, src_a12, src_a13, src_a14, src_a15); + src_a8 = LD_SP(a + 8); + src_a10 = (v4f32) __msa_splati_w((v4i32) src_a8, 2); + src_a9 = (v4f32) __msa_splati_w((v4i32) src_a8, 1); + src_a8 = (v4f32) __msa_splati_w((v4i32) src_a8, 0); + src_a5 = __msa_cast_to_vector_float(*(a + 5)); + src_a5 = (v4f32) __msa_splati_w((v4i32) src_a5, 0); + src_a4 = __msa_cast_to_vector_float(*(a + 4)); + src_a4 = (v4f32) __msa_splati_w((v4i32) src_a4, 0); + src_a0 = __msa_cast_to_vector_float(*(a + 0)); + src_a0 = (v4f32) __msa_splati_w((v4i32) src_a0, 0); + + res_c3 *= src_a15; + res_c2 -= res_c3 * src_a14; + res_c1 -= res_c3 * src_a13; + res_c0 -= res_c3 * src_a12; + + res_c2 *= src_a10; + res_c1 -= res_c2 * src_a9; + res_c0 -= res_c2 * src_a8; + + res_c1 *= src_a5; + res_c0 -= res_c1 * src_a4; + + res_c0 *= src_a0; + + ST_SP4(res_c0, res_c1, res_c2, res_c3, b, 4); + + TRANSPOSE4x4_SP_SP(res_c0, res_c1, res_c2, res_c3, + src_c0, src_c1, src_c2, src_c3); + + ST_SP(src_c0, c); + ST_SP(src_c1, c_nxt1line); + ST_SP(src_c2, c_nxt2line); + ST_SP(src_c3, c_nxt3line); +} + +static void ssolve_4x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT a0, a4, a5, a8, a9, a10, a12, a13, a14, a15; + FLOAT c0, c1, c2, c3, c0_nxt, c1_nxt, c2_nxt, c3_nxt; + + c0 = *(c + 0); + c1 = *(c + 1); + c2 = *(c + 2); + c3 = *(c + 3); + c0_nxt = *(c + 0 + ldc); + c1_nxt = *(c + 1 + ldc); + c2_nxt = *(c + 2 + ldc); + c3_nxt = *(c + 3 + ldc); + + if (bk > 0) + { 
+ BLASLONG k; + FLOAT *aa = a, *bb = b; + FLOAT res[8]; + + res[0] = aa[0] * bb[0]; + res[1] = aa[1] * bb[0]; + res[2] = aa[2] * bb[0]; + res[3] = aa[3] * bb[0]; + res[4] = aa[0] * bb[1]; + res[5] = aa[1] * bb[1]; + res[6] = aa[2] * bb[1]; + res[7] = aa[3] * bb[1]; + + for (k = (bk - 1); k--;) + { + aa += 4; + bb += 2; + + res[0] += aa[0] * bb[0]; + res[1] += aa[1] * bb[0]; + res[2] += aa[2] * bb[0]; + res[3] += aa[3] * bb[0]; + res[4] += aa[0] * bb[1]; + res[5] += aa[1] * bb[1]; + res[6] += aa[2] * bb[1]; + res[7] += aa[3] * bb[1]; + } + + c0 -= res[0]; + c1 -= res[1]; + c2 -= res[2]; + c3 -= res[3]; + c0_nxt -= res[4]; + c1_nxt -= res[5]; + c2_nxt -= res[6]; + c3_nxt -= res[7]; + } + + a -= 16; + b -= 8; + + a0 = *(a + 0); + a4 = *(a + 4); + a5 = *(a + 5); + a8 = *(a + 8); + a9 = *(a + 9); + a10 = *(a + 10); + a12 = *(a + 12); + a13 = *(a + 13); + a14 = *(a + 14); + a15 = *(a + 15); + + c3 *= a15; + c3_nxt *= a15; + + c2 -= c3 * a14; + c2_nxt -= c3_nxt * a14; + + c2 *= a10; + c2_nxt *= a10; + + c1 -= c3 * a13; + c1_nxt -= c3_nxt * a13; + + c1 -= c2 * a9; + c1_nxt -= c2_nxt * a9; + + c1 *= a5; + c1_nxt *= a5; + + c0 -= c3 * a12; + c0_nxt -= c3_nxt * a12; + + c0 -= c2 * a8; + c0_nxt -= c2_nxt * a8; + + c0 -= c1 * a4; + c0_nxt -= c1_nxt * a4; + + c0 *= a0; + c0_nxt *= a0; + + *(b + 0) = c0; + *(b + 1) = c0_nxt; + *(b + 2) = c1; + *(b + 3) = c1_nxt; + *(b + 4) = c2; + *(b + 5) = c2_nxt; + *(b + 6) = c3; + *(b + 7) = c3_nxt; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 2) = c2; + *(c + 3) = c3; + *(c + 0 + ldc) = c0_nxt; + *(c + 1 + ldc) = c1_nxt; + *(c + 2 + ldc) = c2_nxt; + *(c + 3 + ldc) = c3_nxt; +} + +static void ssolve_4x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + FLOAT a0, a4, a5, a8, a9, a10, a12, a13, a14, a15, c0, c1, c2, c3; + + c0 = *(c + 0); + c1 = *(c + 1); + c2 = *(c + 2); + c3 = *(c + 3); + + if (bk > 0) + { + BLASLONG k; + FLOAT *aa = a, *bb = b; + FLOAT t0, t1, t2, t3; + + t0 = aa[0] * bb[0]; + t1 = aa[1] * bb[0]; + t2 = aa[2] * bb[0]; + 
t3 = aa[3] * bb[0]; + + for (k = (bk - 1); k--;) + { + aa += 4; + bb += 1; + + t0 += aa[0] * bb[0]; + t1 += aa[1] * bb[0]; + t2 += aa[2] * bb[0]; + t3 += aa[3] * bb[0]; + } + + c0 -= t0; + c1 -= t1; + c2 -= t2; + c3 -= t3; + } + + a -= 16; + b -= 4; + + a0 = *(a + 0); + a4 = *(a + 4); + a5 = *(a + 5); + a8 = *(a + 8); + a9 = *(a + 9); + a10 = *(a + 10); + a12 = *(a + 12); + a13 = *(a + 13); + a14 = *(a + 14); + a15 = *(a + 15); + + c3 *= a15; + + c2 -= c3 * a14; + c2 *= a10; + + c1 -= c3 * a13; + c1 -= c2 * a9; + c1 *= a5; + + c0 -= c3 * a12; + c0 -= c2 * a8; + c0 -= c1 * a4; + c0 *= a0; + + *(b + 0) = c0; + *(b + 1) = c1; + *(b + 2) = c2; + *(b + 3) = c3; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 2) = c2; + *(c + 3) = c3; +} + +static void ssolve_2x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT a0, a2, a3, c0, c1, c0_nxt1, c1_nxt1, c0_nxt2, c1_nxt2, c0_nxt3; + FLOAT c1_nxt3, c0_nxt4, c1_nxt4, c0_nxt5, c1_nxt5, c0_nxt6, c1_nxt6; + FLOAT c0_nxt7, c1_nxt7; + + c0 = *(c + 0); + c1 = *(c + 1); + c0_nxt1 = *(c + 0 + 1 * ldc); + c1_nxt1 = *(c + 1 + 1 * ldc); + c0_nxt2 = *(c + 0 + 2 * ldc); + c1_nxt2 = *(c + 1 + 2 * ldc); + c0_nxt3 = *(c + 0 + 3 * ldc); + c1_nxt3 = *(c + 1 + 3 * ldc); + c0_nxt4 = *(c + 0 + 4 * ldc); + c1_nxt4 = *(c + 1 + 4 * ldc); + c0_nxt5 = *(c + 0 + 5 * ldc); + c1_nxt5 = *(c + 1 + 5 * ldc); + c0_nxt6 = *(c + 0 + 6 * ldc); + c1_nxt6 = *(c + 1 + 6 * ldc); + c0_nxt7 = *(c + 0 + 7 * ldc); + c1_nxt7 = *(c + 1 + 7 * ldc); + + if (bk > 0) + { + BLASLONG k; + FLOAT *aa = a, *bb = b; + FLOAT res[16]; + + res[0] = aa[0] * bb[0]; + res[1] = aa[1] * bb[0]; + res[2] = aa[0] * bb[1]; + res[3] = aa[1] * bb[1]; + res[4] = aa[0] * bb[2]; + res[5] = aa[1] * bb[2]; + res[6] = aa[0] * bb[3]; + res[7] = aa[1] * bb[3]; + res[8] = aa[0] * bb[4]; + res[9] = aa[1] * bb[4]; + res[10] = aa[0] * bb[5]; + res[11] = aa[1] * bb[5]; + res[12] = aa[0] * bb[6]; + res[13] = aa[1] * bb[6]; + res[14] = aa[0] * bb[7]; + res[15] = aa[1] * bb[7]; + + for (k = 
(bk - 1); k--;) + { + aa += 2; + bb += 8; + + res[0] += aa[0] * bb[0]; + res[1] += aa[1] * bb[0]; + res[2] += aa[0] * bb[1]; + res[3] += aa[1] * bb[1]; + res[4] += aa[0] * bb[2]; + res[5] += aa[1] * bb[2]; + res[6] += aa[0] * bb[3]; + res[7] += aa[1] * bb[3]; + res[8] += aa[0] * bb[4]; + res[9] += aa[1] * bb[4]; + res[10] += aa[0] * bb[5]; + res[11] += aa[1] * bb[5]; + res[12] += aa[0] * bb[6]; + res[13] += aa[1] * bb[6]; + res[14] += aa[0] * bb[7]; + res[15] += aa[1] * bb[7]; + } + + c0 -= res[0]; + c1 -= res[1]; + c0_nxt1 -= res[2]; + c1_nxt1 -= res[3]; + c0_nxt2 -= res[4]; + c1_nxt2 -= res[5]; + c0_nxt3 -= res[6]; + c1_nxt3 -= res[7]; + c0_nxt4 -= res[8]; + c1_nxt4 -= res[9]; + c0_nxt5 -= res[10]; + c1_nxt5 -= res[11]; + c0_nxt6 -= res[12]; + c1_nxt6 -= res[13]; + c0_nxt7 -= res[14]; + c1_nxt7 -= res[15]; + } + + a -= 4; + b -= 16; + + a0 = *(a + 0); + a2 = *(a + 2); + a3 = *(a + 3); + + c1 *= a3; + c1_nxt1 *= a3; + c1_nxt2 *= a3; + c1_nxt3 *= a3; + c1_nxt4 *= a3; + c1_nxt5 *= a3; + c1_nxt6 *= a3; + c1_nxt7 *= a3; + + c0 -= c1 * a2; + c0_nxt1 -= c1_nxt1 * a2; + c0_nxt2 -= c1_nxt2 * a2; + c0_nxt3 -= c1_nxt3 * a2; + c0_nxt4 -= c1_nxt4 * a2; + c0_nxt5 -= c1_nxt5 * a2; + c0_nxt6 -= c1_nxt6 * a2; + c0_nxt7 -= c1_nxt7 * a2; + + c0 *= a0; + c0_nxt1 *= a0; + c0_nxt2 *= a0; + c0_nxt3 *= a0; + c0_nxt4 *= a0; + c0_nxt5 *= a0; + c0_nxt6 *= a0; + c0_nxt7 *= a0; + + *(b + 0) = c0; + *(b + 1) = c0_nxt1; + *(b + 2) = c0_nxt2; + *(b + 3) = c0_nxt3; + *(b + 4) = c0_nxt4; + *(b + 5) = c0_nxt5; + *(b + 6) = c0_nxt6; + *(b + 7) = c0_nxt7; + *(b + 8) = c1; + *(b + 9) = c1_nxt1; + *(b + 10) = c1_nxt2; + *(b + 11) = c1_nxt3; + *(b + 12) = c1_nxt4; + *(b + 13) = c1_nxt5; + *(b + 14) = c1_nxt6; + *(b + 15) = c1_nxt7; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 0 + 1 * ldc) = c0_nxt1; + *(c + 1 + 1 * ldc) = c1_nxt1; + *(c + 0 + 2 * ldc) = c0_nxt2; + *(c + 1 + 2 * ldc) = c1_nxt2; + *(c + 0 + 3 * ldc) = c0_nxt3; + *(c + 1 + 3 * ldc) = c1_nxt3; + *(c + 0 + 4 * ldc) = c0_nxt4; + *(c + 1 + 4 * 
ldc) = c1_nxt4; + *(c + 0 + 5 * ldc) = c0_nxt5; + *(c + 1 + 5 * ldc) = c1_nxt5; + *(c + 0 + 6 * ldc) = c0_nxt6; + *(c + 1 + 6 * ldc) = c1_nxt6; + *(c + 0 + 7 * ldc) = c0_nxt7; + *(c + 1 + 7 * ldc) = c1_nxt7; +} + +static void ssolve_2x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT a0, a2, a3, c0, c1, c0_nxt1, c1_nxt1; + FLOAT c0_nxt2, c1_nxt2, c0_nxt3, c1_nxt3; + + c0 = *(c + 0); + c1 = *(c + 1); + c0_nxt1 = *(c + 0 + ldc); + c1_nxt1 = *(c + 1 + ldc); + c0_nxt2 = *(c + 0 + 2 * ldc); + c1_nxt2 = *(c + 1 + 2 * ldc); + c0_nxt3 = *(c + 0 + 3 * ldc); + c1_nxt3 = *(c + 1 + 3 * ldc); + + if (bk > 0) + { + BLASLONG k; + FLOAT *aa = a, *bb = b; + FLOAT res[8]; + + res[0] = aa[0] * bb[0]; + res[1] = aa[1] * bb[0]; + res[2] = aa[0] * bb[1]; + res[3] = aa[1] * bb[1]; + res[4] = aa[0] * bb[2]; + res[5] = aa[1] * bb[2]; + res[6] = aa[0] * bb[3]; + res[7] = aa[1] * bb[3]; + + for (k = (bk - 1); k--;) + { + aa += 2; + bb += 4; + + res[0] += aa[0] * bb[0]; + res[1] += aa[1] * bb[0]; + res[2] += aa[0] * bb[1]; + res[3] += aa[1] * bb[1]; + res[4] += aa[0] * bb[2]; + res[5] += aa[1] * bb[2]; + res[6] += aa[0] * bb[3]; + res[7] += aa[1] * bb[3]; + } + + c0 -= res[0]; + c1 -= res[1]; + c0_nxt1 -= res[2]; + c1_nxt1 -= res[3]; + c0_nxt2 -= res[4]; + c1_nxt2 -= res[5]; + c0_nxt3 -= res[6]; + c1_nxt3 -= res[7]; + } + + a -= 4; + b -= 8; + + a0 = *(a + 0); + a2 = *(a + 2); + a3 = *(a + 3); + + c1 *= a3; + c1_nxt1 *= a3; + c1_nxt2 *= a3; + c1_nxt3 *= a3; + + c0 -= c1 * a2; + c0_nxt1 -= c1_nxt1 * a2; + c0_nxt2 -= c1_nxt2 * a2; + c0_nxt3 -= c1_nxt3 * a2; + + c0 *= a0; + c0_nxt1 *= a0; + c0_nxt2 *= a0; + c0_nxt3 *= a0; + + *(b + 0) = c0; + *(b + 1) = c0_nxt1; + *(b + 2) = c0_nxt2; + *(b + 3) = c0_nxt3; + *(b + 4) = c1; + *(b + 5) = c1_nxt1; + *(b + 6) = c1_nxt2; + *(b + 7) = c1_nxt3; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 0 + ldc) = c0_nxt1; + *(c + 1 + ldc) = c1_nxt1; + *(c + 0 + 2 * ldc) = c0_nxt2; + *(c + 1 + 2 * ldc) = c1_nxt2; + *(c + 0 + 3 * ldc) = 
c0_nxt3; + *(c + 1 + 3 * ldc) = c1_nxt3; +} + +static void ssolve_2x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT a0, a2, a3, c0, c1, c0_nxt, c1_nxt; + + c0 = *(c + 0); + c1 = *(c + 1); + c0_nxt = *(c + 0 + ldc); + c1_nxt = *(c + 1 + ldc); + + if (bk > 0) + { + BLASLONG k; + FLOAT *aa = a, *bb = b; + FLOAT res0, res1, res2, res3; + + res0 = aa[0] * bb[0]; + res1 = aa[1] * bb[0]; + res2 = aa[0] * bb[1]; + res3 = aa[1] * bb[1]; + + for (k = (bk - 1); k--;) + { + aa += 2; + bb += 2; + + res0 += aa[0] * bb[0]; + res1 += aa[1] * bb[0]; + res2 += aa[0] * bb[1]; + res3 += aa[1] * bb[1]; + } + + c0 -= res0; + c1 -= res1; + c0_nxt -= res2; + c1_nxt -= res3; + } + + a -= 4; + b -= 4; + + a0 = *(a + 0); + a2 = *(a + 2); + a3 = *(a + 3); + + c1 *= a3; + c1_nxt *= a3; + + c0 -= c1 * a2; + c0_nxt -= c1_nxt * a2; + + c0 *= a0; + c0_nxt *= a0; + + *(b + 0) = c0; + *(b + 1) = c0_nxt; + *(b + 2) = c1; + *(b + 3) = c1_nxt; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 0 + ldc) = c0_nxt; + *(c + 1 + ldc) = c1_nxt; +} + +static void ssolve_2x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + FLOAT a0, a2, a3, c0, c1; + + c0 = *(c + 0); + c1 = *(c + 1); + + if (bk > 0) + { + BLASLONG k; + FLOAT *aa = a, *bb = b; + FLOAT res0, res1; + + res0 = aa[0] * bb[0]; + res1 = aa[1] * bb[0]; + + for (k = (bk - 1); k--;) + { + aa += 2; + bb += 1; + + res0 += aa[0] * bb[0]; + res1 += aa[1] * bb[0]; + } + + c0 -= res0; + c1 -= res1; + } + + a -= 4; + b -= 2; + + a0 = *(a + 0); + a2 = *(a + 2); + a3 = *(a + 3); + + c1 *= a3; + + c0 -= c1 * a2; + c0 *= a0; + + *(b + 0) = c0; + *(b + 1) = c1; + + *(c + 0) = c0; + *(c + 1) = c1; +} + +static void ssolve_1x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT a0, c0, c1, c2, c3, c4, c5, c6, c7; + + c0 = *(c + 0 * ldc); + c1 = *(c + 1 * ldc); + c2 = *(c + 2 * ldc); + c3 = *(c + 3 * ldc); + c4 = *(c + 4 * ldc); + c5 = *(c + 5 * ldc); + c6 = *(c + 6 * ldc); + c7 = *(c + 7 * ldc); + + if (bk > 0) + { + 
FLOAT *aa = a, *bb = b; + BLASLONG k; + FLOAT r0, r1, r2, r3, r4, r5, r6, r7; + + r0 = aa[0] * bb[0]; + r1 = aa[0] * bb[1]; + r2 = aa[0] * bb[2]; + r3 = aa[0] * bb[3]; + r4 = aa[0] * bb[4]; + r5 = aa[0] * bb[5]; + r6 = aa[0] * bb[6]; + r7 = aa[0] * bb[7]; + + for (k = (bk - 1); k--;) + { + aa += 1; + bb += 8; + + r0 += aa[0] * bb[0]; + r1 += aa[0] * bb[1]; + r2 += aa[0] * bb[2]; + r3 += aa[0] * bb[3]; + r4 += aa[0] * bb[4]; + r5 += aa[0] * bb[5]; + r6 += aa[0] * bb[6]; + r7 += aa[0] * bb[7]; + } + + c0 -= r0; + c1 -= r1; + c2 -= r2; + c3 -= r3; + c4 -= r4; + c5 -= r5; + c6 -= r6; + c7 -= r7; + } + + a0 = *(a - 1); + + c0 *= a0; + c1 *= a0; + c2 *= a0; + c3 *= a0; + c4 *= a0; + c5 *= a0; + c6 *= a0; + c7 *= a0; + + *(b - 8) = c0; + *(b - 7) = c1; + *(b - 6) = c2; + *(b - 5) = c3; + *(b - 4) = c4; + *(b - 3) = c5; + *(b - 2) = c6; + *(b - 1) = c7; + + *(c + 0 * ldc) = c0; + *(c + 1 * ldc) = c1; + *(c + 2 * ldc) = c2; + *(c + 3 * ldc) = c3; + *(c + 4 * ldc) = c4; + *(c + 5 * ldc) = c5; + *(c + 6 * ldc) = c6; + *(c + 7 * ldc) = c7; +} + +static void ssolve_1x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) +{ + FLOAT a0, c0, c1, c2, c3; + + a0 = *(a - 1); + + c0 = *(c + 0 * ldc) * a0; + c1 = *(c + 1 * ldc) * a0; + c2 = *(c + 2 * ldc) * a0; + c3 = *(c + 3 * ldc) * a0; + + *(b - 4) = c0; + *(b - 3) = c1; + *(b - 2) = c2; + *(b - 1) = c3; + + *(c + 0 * ldc) = c0; + *(c + 1 * ldc) = c1; + *(c + 2 * ldc) = c2; + *(c + 3 * ldc) = c3; +} + +static void ssolve_1x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) +{ + FLOAT a0, c0, c1; + + a0 = *(a - 1); + + c0 = *(c + 0 * ldc) * a0; + c1 = *(c + 1 * ldc) * a0; + + *(b - 2) = c0; + *(b - 1) = c1; + + *(c + 0 * ldc) = c0; + *(c + 1 * ldc) = c1; +} + +static void ssolve_1x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c) +{ + *c *= *(a - 1); + *(b - 1) = *c; +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, + FLOAT *c, BLASLONG ldc, BLASLONG offset) +{ + FLOAT *aa, *cc; + BLASLONG i, j, kk; + + for 
(j = (n >> 3); j--;) + { + kk = m + offset; + if (m & 7) + { + if (m & 1) + { + aa = a + (m - 1) * k + kk; + cc = c + (m - 1); + + ssolve_1x8_ln_msa(aa, b + 8 * kk, cc, ldc, (k - kk)); + + kk -= 1; + } + + if (m & 2) + { + aa = a + ((m & ~1) - 2) * k + 2 * kk; + cc = c + ((m & ~1) - 2); + + ssolve_2x8_ln_msa(aa, b + 8 * kk, cc, ldc, (k - kk)); + + kk -= 2; + } + + if (m & 4) + { + aa = a + ((m & ~3) - 4) * k + 4 * kk; + cc = c + ((m & ~3) - 4); + + ssolve_4x8_ln_msa(aa, b + 8 * kk, cc, ldc, (k - kk)); + + kk -= 4; + } + } + + i = (m >> 3); + if (i > 0) + { + aa = a + ((m & ~7) - 8) * k; + cc = c + ((m & ~7) - 8); + + do + { + ssolve_8x8_ln_msa(aa + 8 * kk, b + 8 * kk, cc, ldc, (k - kk)); + + aa -= 8 * k; + cc -= 8; + kk -= 8; + i --; + } while (i > 0); + } + + b += 8 * k; + c += 8 * ldc; + } + + if (n & 7) + { + if (n & 4) + { + kk = m + offset; + + if (m & 7) + { + if (m & 1) + { + aa = a + (m - 1) * k + kk; + cc = c + (m - 1); + + ssolve_1x4_ln_msa(aa, b + 4 * kk, cc, ldc); + + kk -= 1; + } + + if (m & 2) + { + aa = a + ((m & ~1) - 2) * k + 2 * kk; + cc = c + ((m & ~1) - 2); + + ssolve_2x4_ln_msa(aa, b + 4 * kk, cc, ldc, (k - kk)); + + kk -= 2; + } + + if (m & 4) + { + aa = a + ((m & ~3) - 4) * k + 4 * kk; + cc = c + ((m & ~3) - 4); + + ssolve_4x4_ln_msa(aa, b + 4 * kk, cc, ldc, (k - kk)); + + kk -= 4; + } + } + + i = (m >> 3); + if (i > 0) + { + aa = a + ((m & ~7) - 8) * k; + cc = c + ((m & ~7) - 8); + + do + { + ssolve_8x4_ln_msa(aa + 8 * kk, b + 4 * kk, cc, ldc, (k - kk)); + + aa -= 8 * k; + cc -= 8; + kk -= 8; + i --; + } while (i > 0); + } + + b += 4 * k; + c += 4 * ldc; + } + + if (n & 2) + { + kk = m + offset; + + if (m & 7) + { + if (m & 1) + { + aa = a + (m - 1) * k + kk; + cc = c + (m - 1); + + ssolve_1x2_ln_msa(aa, b + 2 * kk, cc, ldc); + + kk -= 1; + } + + if (m & 2) + { + aa = a + ((m & ~1) - 2) * k + 2 * kk; + cc = c + ((m & ~1) - 2); + + ssolve_2x2_ln_msa(aa, b + 2 * kk, cc, ldc, (k - kk)); + + kk -= 2; + } + + if (m & 4) + { + aa = a + ((m & ~3) - 
4) * k + 4 * kk; + cc = c + ((m & ~3) - 4); + + ssolve_4x2_ln_msa(aa, b + 2 * kk, cc, ldc, (k - kk)); + + kk -= 4; + } + } + + i = (m >> 3); + if (i > 0) + { + aa = a + ((m & ~7) - 8) * k; + cc = c + ((m & ~7) - 8); + + do + { + ssolve_8x2_ln_msa(aa + 8 * kk, b + 2 * kk, cc, ldc, k -kk); + + aa -= 8 * k; + cc -= 8; + kk -= 8; + i --; + } while (i > 0); + } + + b += 2 * k; + c += 2 * ldc; + } + + if (n & 1) + { + kk = m + offset; + + if (m & 7) + { + if (m & 1) + { + aa = a + (m - 1) * k + kk; + cc = c + (m - 1); + + ssolve_1x1_ln_msa(aa, b + kk, cc); + + kk -= 1; + } + + if (m & 2) + { + aa = a + ((m & ~1) - 2) * k + 2 * kk; + cc = c + ((m & ~1) - 2); + + ssolve_2x1_ln_msa(aa, b + kk, cc, (k - kk)); + + kk -= 2; + } + + if (m & 4) + { + aa = a + ((m & ~3) - 4) * k + 4 * kk; + cc = c + ((m & ~3) - 4); + + ssolve_4x1_ln_msa(aa, b + kk, cc, (k - kk)); + + kk -= 4; + } + } + + i = (m >> 3); + if (i > 0) + { + aa = a + ((m & ~7) - 8) * k; + cc = c + ((m & ~7) - 8); + + do + { + ssolve_8x1_ln_msa(aa + 8 * kk, b + kk, cc, (k - kk)); + + aa -= 8 * k; + cc -= 8; + kk -= 8; + i --; + } while (i > 0); + } + + b += k; + c += ldc; + } + } + + return 0; +} diff --git a/kernel/mips/strsm_kernel_LT_8x8_msa.c b/kernel/mips/strsm_kernel_LT_8x8_msa.c new file mode 100644 index 000000000..0c61d3618 --- /dev/null +++ b/kernel/mips/strsm_kernel_LT_8x8_msa.c @@ -0,0 +1,2099 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +static void ssolve_8x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v4f32 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15; + v4f32 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7; + v4f32 res_c8, res_c9, res_c10, res_c11, res_c12, res_c13, res_c14, res_c15; + v4f32 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7; + v4f32 src_a9, src_a10, src_a11, src_a12, src_a13, src_a14, src_a15, src_a18; + v4f32 src_a19, src_a20, src_a21, src_a22, src_a23, src_a27, src_a28; + v4f32 src_a29, src_a30, src_a31, src_a36, src_a37, src_a38, src_a39; + v4f32 src_a45, src_a46, src_a47, src_a54, src_a55, src_a63, src_a; + FLOAT *c_nxt1line = c + ldc; + FLOAT *c_nxt2line = c + 2 * ldc; + FLOAT *c_nxt3line = c + 3 * ldc; + FLOAT *c_nxt4line = c + 4 * ldc; + FLOAT *c_nxt5line = c + 5 * ldc; + FLOAT *c_nxt6line = c + 6 * ldc; + FLOAT *c_nxt7line = c + 7 * ldc; + + if (bk) + { + BLASLONG k; + v4f32 src_b, src_b0, src_b1, src_b2, src_b3; + v4f32 res0, res1, res2, res3, res4, res5, res6, res7; + v4f32 res8, res9, res10, res11, res12, res13, res14, res15; + + LD_SP2(a, 4, src_a0, src_a1); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 = src_a0 * src_b0; + res1 = src_a1 * src_b0; + res2 = src_a0 * src_b1; + res3 = src_a1 * src_b1; + res4 = src_a0 * src_b2; + res5 = src_a1 * src_b2; + res6 = src_a0 * src_b3; + res7 = src_a1 * src_b3; + + src_b = LD_SP(b + 4); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res8 = src_a0 * src_b0; + res9 = src_a1 * src_b0; + res10 = src_a0 * src_b1; + res11 = src_a1 * src_b1; + res12 = src_a0 * src_b2; + res13 = src_a1 * src_b2; + res14 = src_a0 * src_b3; + res15 = src_a1 * src_b3; + + a += 8; + b += 8; + + for (k = (bk - 1); k--;) + { + 
LD_SP2(a, 4, src_a0, src_a1); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 += src_a0 * src_b0; + res1 += src_a1 * src_b0; + res2 += src_a0 * src_b1; + res3 += src_a1 * src_b1; + res4 += src_a0 * src_b2; + res5 += src_a1 * src_b2; + res6 += src_a0 * src_b3; + res7 += src_a1 * src_b3; + + src_b = LD_SP(b + 4); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res8 += src_a0 * src_b0; + res9 += src_a1 * src_b0; + res10 += src_a0 * src_b1; + res11 += src_a1 * src_b1; + res12 += src_a0 * src_b2; + res13 += src_a1 * src_b2; + res14 += src_a0 * src_b3; + res15 += src_a1 * src_b3; + + a += 8; + b += 8; + } + + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + LD_SP2(c_nxt2line, 4, src_c4, src_c5); + LD_SP2(c_nxt3line, 4, src_c6, src_c7); + LD_SP2(c_nxt4line, 4, src_c8, src_c9); + LD_SP2(c_nxt5line, 4, src_c10, src_c11); + LD_SP2(c_nxt6line, 4, src_c12, src_c13); + LD_SP2(c_nxt7line, 4, src_c14, src_c15); + + src_c0 -= res0; + src_c1 -= res1; + src_c2 -= res2; + src_c3 -= res3; + src_c4 -= res4; + src_c5 -= res5; + src_c6 -= res6; + src_c7 -= res7; + src_c8 -= res8; + src_c9 -= res9; + src_c10 -= res10; + src_c11 -= res11; + src_c12 -= res12; + src_c13 -= res13; + src_c14 -= res14; + src_c15 -= res15; + } + else + { + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + LD_SP2(c_nxt2line, 4, src_c4, src_c5); + LD_SP2(c_nxt3line, 4, src_c6, src_c7); + LD_SP2(c_nxt4line, 4, src_c8, src_c9); + LD_SP2(c_nxt5line, 4, src_c10, src_c11); + LD_SP2(c_nxt6line, 4, src_c12, src_c13); + LD_SP2(c_nxt7line, 4, src_c14, src_c15); + } + + TRANSPOSE4x4_SP_SP(src_c0, src_c2, src_c4, src_c6, + res_c0, res_c1, res_c2, res_c3); + TRANSPOSE4x4_SP_SP(src_c8, src_c10, src_c12, src_c14, + res_c8, res_c9, res_c10, res_c11); + TRANSPOSE4x4_SP_SP(src_c1, src_c3, src_c5, src_c7, + res_c4, res_c5, res_c6, res_c7); + TRANSPOSE4x4_SP_SP(src_c9, src_c11, src_c13, src_c15, + res_c12, res_c13, res_c14, res_c15); + + 
src_a = LD_SP(a + 0); + SPLATI_W4_SP(src_a, src_a0, src_a1, src_a2, src_a3); + src_a = LD_SP(a + 4); + SPLATI_W4_SP(src_a, src_a4, src_a5, src_a6, src_a7); + + res_c0 *= src_a0; + res_c8 *= src_a0; + res_c1 -= res_c0 * src_a1; + res_c9 -= res_c8 * src_a1; + res_c2 -= res_c0 * src_a2; + res_c10 -= res_c8 * src_a2; + res_c3 -= res_c0 * src_a3; + res_c11 -= res_c8 * src_a3; + res_c4 -= res_c0 * src_a4; + res_c12 -= res_c8 * src_a4; + res_c5 -= res_c0 * src_a5; + res_c13 -= res_c8 * src_a5; + res_c6 -= res_c0 * src_a6; + res_c14 -= res_c8 * src_a6; + res_c7 -= res_c0 * src_a7; + res_c15 -= res_c8 * src_a7; + + src_a = LD_SP(a + 9); + SPLATI_W4_SP(src_a, src_a9, src_a10, src_a11, src_a12); + src_a13 = LD_SP(a + 13); + src_a15 = (v4f32) __msa_splati_w((v4i32) src_a13, 2); + src_a14 = (v4f32) __msa_splati_w((v4i32) src_a13, 1); + src_a13 = (v4f32) __msa_splati_w((v4i32) src_a13, 0); + + res_c1 *= src_a9; + res_c9 *= src_a9; + res_c2 -= res_c1 * src_a10; + res_c10 -= res_c9 * src_a10; + res_c3 -= res_c1 * src_a11; + res_c11 -= res_c9 * src_a11; + res_c4 -= res_c1 * src_a12; + res_c12 -= res_c9 * src_a12; + res_c5 -= res_c1 * src_a13; + res_c13 -= res_c9 * src_a13; + res_c6 -= res_c1 * src_a14; + res_c14 -= res_c9 * src_a14; + res_c7 -= res_c1 * src_a15; + res_c15 -= res_c9 * src_a15; + + src_a = LD_SP(a + 18); + SPLATI_W4_SP(src_a, src_a18, src_a19, src_a20, src_a21); + src_a22 = LD_SP(a + 22); + src_a23 = (v4f32) __msa_splati_w((v4i32) src_a22, 1); + src_a22 = (v4f32) __msa_splati_w((v4i32) src_a22, 0); + + res_c2 *= src_a18; + res_c10 *= src_a18; + res_c3 -= res_c2 * src_a19; + res_c11 -= res_c10 * src_a19; + res_c4 -= res_c2 * src_a20; + res_c12 -= res_c10 * src_a20; + res_c5 -= res_c2 * src_a21; + res_c13 -= res_c10 * src_a21; + res_c6 -= res_c2 * src_a22; + res_c14 -= res_c10 * src_a22; + res_c7 -= res_c2 * src_a23; + res_c15 -= res_c10 * src_a23; + + src_a = LD_SP(a + 27); + SPLATI_W4_SP(src_a, src_a27, src_a28, src_a29, src_a30); + src_a31 = 
__msa_cast_to_vector_float(*(a + 31)); + src_a31 = (v4f32) __msa_splati_w((v4i32) src_a31, 0); + + res_c3 *= src_a27; + res_c11 *= src_a27; + res_c4 -= res_c3 * src_a28; + res_c12 -= res_c11 * src_a28; + res_c5 -= res_c3 * src_a29; + res_c13 -= res_c11 * src_a29; + res_c6 -= res_c3 * src_a30; + res_c14 -= res_c11 * src_a30; + res_c7 -= res_c3 * src_a31; + res_c15 -= res_c11 * src_a31; + + ST_SP4(res_c0, res_c8, res_c1, res_c9, b, 4); + ST_SP4(res_c2, res_c10, res_c3, res_c11, b + 16, 4); + + TRANSPOSE4x4_SP_SP(res_c0, res_c1, res_c2, res_c3, + src_c0, src_c2, src_c4, src_c6); + TRANSPOSE4x4_SP_SP(res_c8, res_c9, res_c10, res_c11, + src_c8, src_c10, src_c12, src_c14); + + ST_SP(src_c0, c); + ST_SP(src_c2, c_nxt1line); + ST_SP(src_c4, c_nxt2line); + ST_SP(src_c6, c_nxt3line); + ST_SP(src_c8, c_nxt4line); + ST_SP(src_c10, c_nxt5line); + ST_SP(src_c12, c_nxt6line); + ST_SP(src_c14, c_nxt7line); + + src_a = LD_SP(a + 36); + SPLATI_W4_SP(src_a, src_a36, src_a37, src_a38, src_a39); + + res_c4 *= src_a36; + res_c12 *= src_a36; + res_c5 -= res_c4 * src_a37; + res_c13 -= res_c12 * src_a37; + res_c6 -= res_c4 * src_a38; + res_c14 -= res_c12 * src_a38; + res_c7 -= res_c4 * src_a39; + res_c15 -= res_c12 * src_a39; + + src_a45 = LD_SP(a + 45); + src_a47 = (v4f32) __msa_splati_w((v4i32) src_a45, 2); + src_a46 = (v4f32) __msa_splati_w((v4i32) src_a45, 1); + src_a45 = (v4f32) __msa_splati_w((v4i32) src_a45, 0); + + res_c5 *= src_a45; + res_c13 *= src_a45; + res_c6 -= res_c5 * src_a46; + res_c14 -= res_c13 * src_a46; + res_c7 -= res_c5 * src_a47; + res_c15 -= res_c13 * src_a47; + + src_a54 = __msa_cast_to_vector_float(*(a + 54)); + src_a54 = (v4f32) __msa_splati_w((v4i32) src_a54, 0); + src_a55 = __msa_cast_to_vector_float(*(a + 55)); + src_a55 = (v4f32) __msa_splati_w((v4i32) src_a55, 0); + src_a63 = __msa_cast_to_vector_float(*(a + 63)); + src_a63 = (v4f32) __msa_splati_w((v4i32) src_a63, 0); + + res_c6 *= src_a54; + res_c14 *= src_a54; + res_c7 -= res_c6 * src_a55; + res_c15 -= 
res_c14 * src_a55; + + res_c7 *= src_a63; + res_c15 *= src_a63; + + ST_SP4(res_c4, res_c12, res_c5, res_c13, b + 32, 4); + ST_SP4(res_c6, res_c14, res_c7, res_c15, b + 48, 4); + + TRANSPOSE4x4_SP_SP(res_c4, res_c5, res_c6, res_c7, + src_c1, src_c3, src_c5, src_c7); + TRANSPOSE4x4_SP_SP(res_c12, res_c13, res_c14, res_c15, + src_c9, src_c11, src_c13, src_c15); + + ST_SP(src_c1, c + 4); + ST_SP(src_c3, c_nxt1line + 4); + ST_SP(src_c5, c_nxt2line + 4); + ST_SP(src_c7, c_nxt3line + 4); + ST_SP(src_c9, c_nxt4line + 4); + ST_SP(src_c11, c_nxt5line + 4); + ST_SP(src_c13, c_nxt6line + 4); + ST_SP(src_c15, c_nxt7line + 4); +} + +static void ssolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v4f32 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7; + v4f32 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7; + v4f32 src_a9, src_a10, src_a11, src_a12, src_a13, src_a14, src_a15, src_a18; + v4f32 src_a19, src_a20, src_a21, src_a22, src_a23, src_a27, src_a28; + v4f32 src_a29, src_a30, src_a31, src_a36, src_a37, src_a38, src_a39; + v4f32 src_a45, src_a46, src_a47, src_a54, src_a55, src_a63, src_a; + FLOAT *c_nxt1line = c + ldc; + FLOAT *c_nxt2line = c + 2 * ldc; + FLOAT *c_nxt3line = c + 3 * ldc; + + if (bk) + { + BLASLONG k; + v4f32 src_b, src_b0, src_b1, src_b2, src_b3; + v4f32 res0, res1, res2, res3, res4, res5, res6, res7; + + LD_SP2(a, 4, src_a0, src_a1); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 = src_a0 * src_b0; + res1 = src_a1 * src_b0; + res2 = src_a0 * src_b1; + res3 = src_a1 * src_b1; + res4 = src_a0 * src_b2; + res5 = src_a1 * src_b2; + res6 = src_a0 * src_b3; + res7 = src_a1 * src_b3; + + a += 8; + b += 4; + + for (k = (bk - 1); k--;) + { + LD_SP2(a, 4, src_a0, src_a1); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 += src_a0 * src_b0; + res1 += src_a1 * 
src_b0; + res2 += src_a0 * src_b1; + res3 += src_a1 * src_b1; + res4 += src_a0 * src_b2; + res5 += src_a1 * src_b2; + res6 += src_a0 * src_b3; + res7 += src_a1 * src_b3; + + a += 8; + b += 4; + } + + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + LD_SP2(c_nxt2line, 4, src_c4, src_c5); + LD_SP2(c_nxt3line, 4, src_c6, src_c7); + + src_c0 -= res0; + src_c1 -= res1; + src_c2 -= res2; + src_c3 -= res3; + src_c4 -= res4; + src_c5 -= res5; + src_c6 -= res6; + src_c7 -= res7; + } + else + { + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + LD_SP2(c_nxt2line, 4, src_c4, src_c5); + LD_SP2(c_nxt3line, 4, src_c6, src_c7); + } + + TRANSPOSE4x4_SP_SP(src_c0, src_c2, src_c4, src_c6, + res_c0, res_c1, res_c2, res_c3); + TRANSPOSE4x4_SP_SP(src_c1, src_c3, src_c5, src_c7, + res_c4, res_c5, res_c6, res_c7); + + src_a = LD_SP(a + 0); + SPLATI_W4_SP(src_a, src_a0, src_a1, src_a2, src_a3); + src_a = LD_SP(a + 4); + SPLATI_W4_SP(src_a, src_a4, src_a5, src_a6, src_a7); + + res_c0 *= src_a0; + res_c1 -= res_c0 * src_a1; + res_c2 -= res_c0 * src_a2; + res_c3 -= res_c0 * src_a3; + res_c4 -= res_c0 * src_a4; + res_c5 -= res_c0 * src_a5; + res_c6 -= res_c0 * src_a6; + res_c7 -= res_c0 * src_a7; + + src_a = LD_SP(a + 9); + SPLATI_W4_SP(src_a, src_a9, src_a10, src_a11, src_a12); + src_a13 = LD_SP(a + 13); + src_a15 = (v4f32) __msa_splati_w((v4i32) src_a13, 2); + src_a14 = (v4f32) __msa_splati_w((v4i32) src_a13, 1); + src_a13 = (v4f32) __msa_splati_w((v4i32) src_a13, 0); + + res_c1 *= src_a9; + res_c2 -= res_c1 * src_a10; + res_c3 -= res_c1 * src_a11; + res_c4 -= res_c1 * src_a12; + res_c5 -= res_c1 * src_a13; + res_c6 -= res_c1 * src_a14; + res_c7 -= res_c1 * src_a15; + + src_a = LD_SP(a + 18); + SPLATI_W4_SP(src_a, src_a18, src_a19, src_a20, src_a21); + src_a22 = LD_SP(a + 22); + src_a23 = (v4f32) __msa_splati_w((v4i32) src_a22, 1); + src_a22 = (v4f32) __msa_splati_w((v4i32) src_a22, 0); + + res_c2 *= src_a18; + res_c3 -= res_c2 * src_a19; + 
res_c4 -= res_c2 * src_a20; + res_c5 -= res_c2 * src_a21; + res_c6 -= res_c2 * src_a22; + res_c7 -= res_c2 * src_a23; + + src_a = LD_SP(a + 27); + SPLATI_W4_SP(src_a, src_a27, src_a28, src_a29, src_a30); + src_a31 = __msa_cast_to_vector_float(*(a + 31)); + src_a31 = (v4f32) __msa_splati_w((v4i32) src_a31, 0); + + res_c3 *= src_a27; + res_c4 -= res_c3 * src_a28; + res_c5 -= res_c3 * src_a29; + res_c6 -= res_c3 * src_a30; + res_c7 -= res_c3 * src_a31; + + src_a = LD_SP(a + 36); + SPLATI_W4_SP(src_a, src_a36, src_a37, src_a38, src_a39); + + res_c4 *= src_a36; + res_c5 -= res_c4 * src_a37; + res_c6 -= res_c4 * src_a38; + res_c7 -= res_c4 * src_a39; + + src_a45 = LD_SP(a + 45); + src_a47 = (v4f32) __msa_splati_w((v4i32) src_a45, 2); + src_a46 = (v4f32) __msa_splati_w((v4i32) src_a45, 1); + src_a45 = (v4f32) __msa_splati_w((v4i32) src_a45, 0); + + res_c5 *= src_a45; + res_c6 -= res_c5 * src_a46; + res_c7 -= res_c5 * src_a47; + + src_a54 = __msa_cast_to_vector_float(*(a + 54)); + src_a54 = (v4f32) __msa_splati_w((v4i32) src_a54, 0); + src_a55 = __msa_cast_to_vector_float(*(a + 55)); + src_a55 = (v4f32) __msa_splati_w((v4i32) src_a55, 0); + src_a63 = __msa_cast_to_vector_float(*(a + 63)); + src_a63 = (v4f32) __msa_splati_w((v4i32) src_a63, 0); + + res_c6 *= src_a54; + res_c7 -= res_c6 * src_a55; + res_c7 *= src_a63; + + ST_SP4(res_c0, res_c1, res_c2, res_c3, b, 4); + b += 16; + ST_SP4(res_c4, res_c5, res_c6, res_c7, b, 4); + + TRANSPOSE4x4_SP_SP(res_c0, res_c1, res_c2, res_c3, + src_c0, src_c2, src_c4, src_c6); + TRANSPOSE4x4_SP_SP(res_c4, res_c5, res_c6, res_c7, + src_c1, src_c3, src_c5, src_c7); + + ST_SP2(src_c0, src_c1, c, 4); + ST_SP2(src_c2, src_c3, c_nxt1line, 4); + ST_SP2(src_c4, src_c5, c_nxt2line, 4); + ST_SP2(src_c6, src_c7, c_nxt3line, 4); +} + +static void ssolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT a0, a1, a2, a3, a4, a5, a6, a7, a9, a10, a11, a12, a13, a14, a15, a18; + FLOAT a19, a20, a21, a22, a23, a27, a28, a29, 
a30, a31, a36, a37, a38, a39; + FLOAT a45, a46, a47, a54, a55, a63; + FLOAT c0, c1, c2, c3, c4, c5, c6, c7; + FLOAT c0_nxt, c1_nxt, c2_nxt, c3_nxt, c4_nxt, c5_nxt, c6_nxt, c7_nxt; + + c0 = *(c + 0); + c1 = *(c + 1); + c2 = *(c + 2); + c3 = *(c + 3); + c4 = *(c + 4); + c5 = *(c + 5); + c6 = *(c + 6); + c7 = *(c + 7); + c0_nxt = *(c + 0 + ldc); + c1_nxt = *(c + 1 + ldc); + c2_nxt = *(c + 2 + ldc); + c3_nxt = *(c + 3 + ldc); + c4_nxt = *(c + 4 + ldc); + c5_nxt = *(c + 5 + ldc); + c6_nxt = *(c + 6 + ldc); + c7_nxt = *(c + 7 + ldc); + + if (bk > 0) + { + BLASLONG k; + FLOAT res[16]; + + res[0] = a[0] * b[0]; + res[1] = a[1] * b[0]; + res[2] = a[2] * b[0]; + res[3] = a[3] * b[0]; + res[4] = a[4] * b[0]; + res[5] = a[5] * b[0]; + res[6] = a[6] * b[0]; + res[7] = a[7] * b[0]; + res[8] = a[0] * b[1]; + res[9] = a[1] * b[1]; + res[10] = a[2] * b[1]; + res[11] = a[3] * b[1]; + res[12] = a[4] * b[1]; + res[13] = a[5] * b[1]; + res[14] = a[6] * b[1]; + res[15] = a[7] * b[1]; + + for (k = (bk - 1); k--;) + { + a += 8; + b += 2; + + res[0] += a[0] * b[0]; + res[1] += a[1] * b[0]; + res[2] += a[2] * b[0]; + res[3] += a[3] * b[0]; + res[4] += a[4] * b[0]; + res[5] += a[5] * b[0]; + res[6] += a[6] * b[0]; + res[7] += a[7] * b[0]; + res[8] += a[0] * b[1]; + res[9] += a[1] * b[1]; + res[10] += a[2] * b[1]; + res[11] += a[3] * b[1]; + res[12] += a[4] * b[1]; + res[13] += a[5] * b[1]; + res[14] += a[6] * b[1]; + res[15] += a[7] * b[1]; + } + + c0 -= res[0]; + c1 -= res[1]; + c2 -= res[2]; + c3 -= res[3]; + c4 -= res[4]; + c5 -= res[5]; + c6 -= res[6]; + c7 -= res[7]; + c0_nxt -= res[8]; + c1_nxt -= res[9]; + c2_nxt -= res[10]; + c3_nxt -= res[11]; + c4_nxt -= res[12]; + c5_nxt -= res[13]; + c6_nxt -= res[14]; + c7_nxt -= res[15]; + + a += 8; + b += 2; + } + + a0 = *(a + 0); + a1 = *(a + 1); + a2 = *(a + 2); + a3 = *(a + 3); + a4 = *(a + 4); + a5 = *(a + 5); + a6 = *(a + 6); + a7 = *(a + 7); + a9 = *(a + 9); + a10 = *(a + 10); + a11 = *(a + 11); + a12 = *(a + 12); + a13 = *(a + 13); + 
a14 = *(a + 14); + a15 = *(a + 15); + a18 = *(a + 18); + a19 = *(a + 19); + a20 = *(a + 20); + a21 = *(a + 21); + a22 = *(a + 22); + a23 = *(a + 23); + a27 = *(a + 27); + a28 = *(a + 28); + a29 = *(a + 29); + a30 = *(a + 30); + a31 = *(a + 31); + a36 = *(a + 36); + a37 = *(a + 37); + a38 = *(a + 38); + a39 = *(a + 39); + a45 = *(a + 45); + a46 = *(a + 46); + a47 = *(a + 47); + a54 = *(a + 54); + a55 = *(a + 55); + a63 = *(a + 63); + + c0 *= a0; + c0_nxt *= a0; + + c1 -= c0 * a1; + c1_nxt -= c0_nxt * a1; + c1 *= a9; + c1_nxt *= a9; + + c2 -= c0 * a2; + c2_nxt -= c0_nxt * a2; + c2 -= c1 * a10; + c2_nxt -= c1_nxt * a10; + c2 *= a18; + c2_nxt *= a18; + + c3 -= c0 * a3; + c3_nxt -= c0_nxt * a3; + c3 -= c1 * a11; + c3_nxt -= c1_nxt * a11; + c3 -= c2 * a19; + c3_nxt -= c2_nxt * a19; + c3 *= a27; + c3_nxt *= a27; + + c4 -= c0 * a4; + c4_nxt -= c0_nxt * a4; + c4 -= c1 * a12; + c4_nxt -= c1_nxt * a12; + c4 -= c2 * a20; + c4_nxt -= c2_nxt * a20; + c4 -= c3 * a28; + c4_nxt -= c3_nxt * a28; + c4 *= a36; + c4_nxt *= a36; + + c5 -= c0 * a5; + c5_nxt -= c0_nxt * a5; + c5 -= c1 * a13; + c5_nxt -= c1_nxt * a13; + c5 -= c2 * a21; + c5_nxt -= c2_nxt * a21; + c5 -= c3 * a29; + c5_nxt -= c3_nxt * a29; + c5 -= c4 * a37; + c5_nxt -= c4_nxt * a37; + c5 *= a45; + c5_nxt *= a45; + + c6 -= c0 * a6; + c6_nxt -= c0_nxt * a6; + c6 -= c1 * a14; + c6_nxt -= c1_nxt * a14; + c6 -= c2 * a22; + c6_nxt -= c2_nxt * a22; + c6 -= c3 * a30; + c6_nxt -= c3_nxt * a30; + c6 -= c4 * a38; + c6_nxt -= c4_nxt * a38; + c6 -= c5 * a46; + c6_nxt -= c5_nxt * a46; + c6 *= a54; + c6_nxt *= a54; + + c7 -= c0 * a7; + c7_nxt -= c0_nxt * a7; + c7 -= c1 * a15; + c7_nxt -= c1_nxt * a15; + c7 -= c2 * a23; + c7_nxt -= c2_nxt * a23; + c7 -= c3 * a31; + c7_nxt -= c3_nxt * a31; + c7 -= c4 * a39; + c7_nxt -= c4_nxt * a39; + c7 -= c5 * a47; + c7_nxt -= c5_nxt * a47; + c7 -= c6 * a55; + c7_nxt -= c6_nxt * a55; + c7 *= a63; + c7_nxt *= a63; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 2) = c2; + *(c + 3) = c3; + *(c + 4) = c4; + *(c + 
5) = c5; + *(c + 6) = c6; + *(c + 7) = c7; + *(c + 0 + ldc) = c0_nxt; + *(c + 1 + ldc) = c1_nxt; + *(c + 2 + ldc) = c2_nxt; + *(c + 3 + ldc) = c3_nxt; + *(c + 4 + ldc) = c4_nxt; + *(c + 5 + ldc) = c5_nxt; + *(c + 6 + ldc) = c6_nxt; + *(c + 7 + ldc) = c7_nxt; + + *(b + 0) = c0; + *(b + 1) = c0_nxt; + *(b + 2) = c1; + *(b + 3) = c1_nxt; + *(b + 4) = c2; + *(b + 5) = c2_nxt; + *(b + 6) = c3; + *(b + 7) = c3_nxt; + *(b + 8) = c4; + *(b + 9) = c4_nxt; + *(b + 10) = c5; + *(b + 11) = c5_nxt; + *(b + 12) = c6; + *(b + 13) = c6_nxt; + *(b + 14) = c7; + *(b + 15) = c7_nxt; +} + +static void ssolve_8x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + FLOAT a0, a1, a2, a3, a4, a5, a6, a7, a9, a10, a11, a12, a13, a14, a15, a18; + FLOAT a19, a20, a21, a22, a23, a27, a28, a29, a30, a31, a36, a37, a38, a39; + FLOAT a45, a46, a47, a54, a55, a63, c0, c1, c2, c3, c4, c5, c6, c7; + + c0 = *(c + 0); + c1 = *(c + 1); + c2 = *(c + 2); + c3 = *(c + 3); + c4 = *(c + 4); + c5 = *(c + 5); + c6 = *(c + 6); + c7 = *(c + 7); + + if (bk > 0) + { + BLASLONG i; + FLOAT a0, a1, a2, a3, a4, a5, a6, a7; + + a0 = a[0] * b[0]; + a1 = a[1] * b[0]; + a2 = a[2] * b[0]; + a3 = a[3] * b[0]; + a4 = a[4] * b[0]; + a5 = a[5] * b[0]; + a6 = a[6] * b[0]; + a7 = a[7] * b[0]; + + for (i = (bk - 1); i--; ) + { + a += 8; + b += 1; + + a0 += a[0] * b[0]; + a1 += a[1] * b[0]; + a2 += a[2] * b[0]; + a3 += a[3] * b[0]; + a4 += a[4] * b[0]; + a5 += a[5] * b[0]; + a6 += a[6] * b[0]; + a7 += a[7] * b[0]; + } + + c0 -= a0; + c1 -= a1; + c2 -= a2; + c3 -= a3; + c4 -= a4; + c5 -= a5; + c6 -= a6; + c7 -= a7; + + a += 8; + b += 1; + } + + a0 = *(a + 0); + a1 = *(a + 1); + a2 = *(a + 2); + a3 = *(a + 3); + a4 = *(a + 4); + a5 = *(a + 5); + a6 = *(a + 6); + a7 = *(a + 7); + a9 = *(a + 9); + a10 = *(a + 10); + a11 = *(a + 11); + a12 = *(a + 12); + a13 = *(a + 13); + a14 = *(a + 14); + a15 = *(a + 15); + a18 = *(a + 18); + a19 = *(a + 19); + a20 = *(a + 20); + a21 = *(a + 21); + a22 = *(a + 22); + a23 = *(a + 23); + a27 = 
*(a + 27); + a28 = *(a + 28); + a29 = *(a + 29); + a30 = *(a + 30); + a31 = *(a + 31); + a36 = *(a + 36); + a37 = *(a + 37); + a38 = *(a + 38); + a39 = *(a + 39); + a45 = *(a + 45); + a46 = *(a + 46); + a47 = *(a + 47); + a54 = *(a + 54); + a55 = *(a + 55); + a63 = *(a + 63); + + c0 *= a0; + + c1 -= c0 * a1; + c1 *= a9; + + c2 -= c0 * a2; + c2 -= c1 * a10; + c2 *= a18; + + c3 -= c0 * a3; + c3 -= c1 * a11; + c3 -= c2 * a19; + c3 *= a27; + + c4 -= c0 * a4; + c4 -= c1 * a12; + c4 -= c2 * a20; + c4 -= c3 * a28; + c4 *= a36; + + c5 -= c0 * a5; + c5 -= c1 * a13; + c5 -= c2 * a21; + c5 -= c3 * a29; + c5 -= c4 * a37; + c5 *= a45; + + c6 -= c0 * a6; + c6 -= c1 * a14; + c6 -= c2 * a22; + c6 -= c3 * a30; + c6 -= c4 * a38; + c6 -= c5 * a46; + c6 *= a54; + + c7 -= c0 * a7; + c7 -= c1 * a15; + c7 -= c2 * a23; + c7 -= c3 * a31; + c7 -= c4 * a39; + c7 -= c5 * a47; + c7 -= c6 * a55; + c7 *= a63; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 2) = c2; + *(c + 3) = c3; + *(c + 4) = c4; + *(c + 5) = c5; + *(c + 6) = c6; + *(c + 7) = c7; + + *(b + 0) = c0; + *(b + 1) = c1; + *(b + 2) = c2; + *(b + 3) = c3; + *(b + 4) = c4; + *(b + 5) = c5; + *(b + 6) = c6; + *(b + 7) = c7; +} + +static void ssolve_4x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v4f32 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7; + v4f32 src_a0, src_a1, src_a2, src_a3, src_a5, src_a6, src_a7; + v4f32 src_a10, src_a11, src_a15, src_a; + FLOAT *c_nxt1line = c + ldc; + FLOAT *c_nxt2line = c + 2 * ldc; + FLOAT *c_nxt3line = c + 3 * ldc; + FLOAT *c_nxt4line = c + 4 * ldc; + FLOAT *c_nxt5line = c + 5 * ldc; + FLOAT *c_nxt6line = c + 6 * ldc; + FLOAT *c_nxt7line = c + 7 * ldc; + + if (bk > 0) + { + BLASLONG k; + v4f32 src_b, src_b0, src_b1, src_b2, src_b3; + v4f32 res0, res1, res2, res3, res4, res5, res6, res7; + + src_a0 = LD_SP(a); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + 
res0 = src_a0 * src_b0; + res1 = src_a0 * src_b1; + res2 = src_a0 * src_b2; + res3 = src_a0 * src_b3; + + src_b = LD_SP(b + 4); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res4 = src_a0 * src_b0; + res5 = src_a0 * src_b1; + res6 = src_a0 * src_b2; + res7 = src_a0 * src_b3; + + a += 4; + b += 8; + + for (k = (bk - 1); k--;) + { + src_a0 = LD_SP(a); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 += src_a0 * src_b0; + res1 += src_a0 * src_b1; + res2 += src_a0 * src_b2; + res3 += src_a0 * src_b3; + + src_b = LD_SP(b + 4); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res4 += src_a0 * src_b0; + res5 += src_a0 * src_b1; + res6 += src_a0 * src_b2; + res7 += src_a0 * src_b3; + + a += 4; + b += 8; + } + + src_c0 = LD_SP(c); + src_c1 = LD_SP(c_nxt1line); + src_c2 = LD_SP(c_nxt2line); + src_c3 = LD_SP(c_nxt3line); + src_c4 = LD_SP(c_nxt4line); + src_c5 = LD_SP(c_nxt5line); + src_c6 = LD_SP(c_nxt6line); + src_c7 = LD_SP(c_nxt7line); + + src_c0 -= res0; + src_c1 -= res1; + src_c2 -= res2; + src_c3 -= res3; + src_c4 -= res4; + src_c5 -= res5; + src_c6 -= res6; + src_c7 -= res7; + } + else + { + src_c0 = LD_SP(c); + src_c1 = LD_SP(c_nxt1line); + src_c2 = LD_SP(c_nxt2line); + src_c3 = LD_SP(c_nxt3line); + src_c4 = LD_SP(c_nxt4line); + src_c5 = LD_SP(c_nxt5line); + src_c6 = LD_SP(c_nxt6line); + src_c7 = LD_SP(c_nxt7line); + } + + TRANSPOSE4x4_SP_SP(src_c0, src_c1, src_c2, src_c3, + res_c0, res_c1, res_c2, res_c3); + TRANSPOSE4x4_SP_SP(src_c4, src_c5, src_c6, src_c7, + res_c4, res_c5, res_c6, res_c7); + + src_a = LD_SP(a + 0); + SPLATI_W4_SP(src_a, src_a0, src_a1, src_a2, src_a3); + src_a5 = LD_SP(a + 5); + src_a7 = (v4f32) __msa_splati_w((v4i32) src_a5, 2); + src_a6 = (v4f32) __msa_splati_w((v4i32) src_a5, 1); + src_a5 = (v4f32) __msa_splati_w((v4i32) src_a5, 0); + src_a10 = __msa_cast_to_vector_float(*(a + 10)); + src_a10 = (v4f32) __msa_splati_w((v4i32) src_a10, 0); + src_a11 = __msa_cast_to_vector_float(*(a + 11)); 
+ src_a11 = (v4f32) __msa_splati_w((v4i32) src_a11, 0); + src_a15 = __msa_cast_to_vector_float(*(a + 15)); + src_a15 = (v4f32) __msa_splati_w((v4i32) src_a15, 0); + + res_c0 *= src_a0; + res_c4 *= src_a0; + res_c1 -= res_c0 * src_a1; + res_c5 -= res_c4 * src_a1; + res_c2 -= res_c0 * src_a2; + res_c6 -= res_c4 * src_a2; + res_c3 -= res_c0 * src_a3; + res_c7 -= res_c4 * src_a3; + + res_c1 *= src_a5; + res_c5 *= src_a5; + res_c2 -= res_c1 * src_a6; + res_c6 -= res_c5 * src_a6; + res_c3 -= res_c1 * src_a7; + res_c7 -= res_c5 * src_a7; + + res_c2 *= src_a10; + res_c6 *= src_a10; + res_c3 -= res_c2 * src_a11; + res_c7 -= res_c6 * src_a11; + + res_c3 *= src_a15; + res_c7 *= src_a15; + + ST_SP4(res_c0, res_c4, res_c1, res_c5, b, 4); + ST_SP4(res_c2, res_c6, res_c3, res_c7, b + 16, 4); + + TRANSPOSE4x4_SP_SP(res_c0, res_c1, res_c2, res_c3, + src_c0, src_c1, src_c2, src_c3); + TRANSPOSE4x4_SP_SP(res_c4, res_c5, res_c6, res_c7, + src_c4, src_c5, src_c6, src_c7); + + ST_SP(src_c0, c); + ST_SP(src_c1, c_nxt1line); + ST_SP(src_c2, c_nxt2line); + ST_SP(src_c3, c_nxt3line); + ST_SP(src_c4, c_nxt4line); + ST_SP(src_c5, c_nxt5line); + ST_SP(src_c6, c_nxt6line); + ST_SP(src_c7, c_nxt7line); +} + +static void ssolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v4f32 src_c0, src_c1, src_c2, src_c3, res_c0, res_c1, res_c2, res_c3; + v4f32 src_a0, src_a1, src_a2, src_a3, src_a5, src_a6, src_a7; + v4f32 src_a10, src_a11, src_a15, src_a; + FLOAT *c_nxt1line = c + ldc; + FLOAT *c_nxt2line = c + 2 * ldc; + FLOAT *c_nxt3line = c + 3 * ldc; + + if (bk > 0) + { + BLASLONG k; + v4f32 src_b, src_b0, src_b1, src_b2, src_b3; + v4f32 res0, res1, res2, res3; + + src_a0 = LD_SP(a); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 = src_a0 * src_b0; + res1 = src_a0 * src_b1; + res2 = src_a0 * src_b2; + res3 = src_a0 * src_b3; + + a += 4; + b += 4; + + for (k = (bk - 1) >> 1; k--;) + { + src_a0 = LD_SP(a); + + src_b = LD_SP(b + 0); + 
SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 += src_a0 * src_b0; + res1 += src_a0 * src_b1; + res2 += src_a0 * src_b2; + res3 += src_a0 * src_b3; + + a += 4; + b += 4; + + src_a0 = LD_SP(a); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 += src_a0 * src_b0; + res1 += src_a0 * src_b1; + res2 += src_a0 * src_b2; + res3 += src_a0 * src_b3; + + a += 4; + b += 4; + } + + if ((bk - 1) & 1) + { + src_a0 = LD_SP(a); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 += src_a0 * src_b0; + res1 += src_a0 * src_b1; + res2 += src_a0 * src_b2; + res3 += src_a0 * src_b3; + + a += 4; + b += 4; + } + + src_c0 = LD_SP(c); + src_c1 = LD_SP(c_nxt1line); + src_c2 = LD_SP(c_nxt2line); + src_c3 = LD_SP(c_nxt3line); + + src_c0 -= res0; + src_c1 -= res1; + src_c2 -= res2; + src_c3 -= res3; + } + else + { + src_c0 = LD_SP(c); + src_c1 = LD_SP(c_nxt1line); + src_c2 = LD_SP(c_nxt2line); + src_c3 = LD_SP(c_nxt3line); + } + + TRANSPOSE4x4_SP_SP(src_c0, src_c1, src_c2, src_c3, + res_c0, res_c1, res_c2, res_c3); + + src_a = LD_SP(a + 0); + SPLATI_W4_SP(src_a, src_a0, src_a1, src_a2, src_a3); + src_a5 = LD_SP(a + 5); + src_a7 = (v4f32) __msa_splati_w((v4i32) src_a5, 2); + src_a6 = (v4f32) __msa_splati_w((v4i32) src_a5, 1); + src_a5 = (v4f32) __msa_splati_w((v4i32) src_a5, 0); + src_a10 = __msa_cast_to_vector_float(*(a + 10)); + src_a10 = (v4f32) __msa_splati_w((v4i32) src_a10, 0); + src_a11 = __msa_cast_to_vector_float(*(a + 11)); + src_a11 = (v4f32) __msa_splati_w((v4i32) src_a11, 0); + src_a15 = __msa_cast_to_vector_float(*(a + 15)); + src_a15 = (v4f32) __msa_splati_w((v4i32) src_a15, 0); + + res_c0 *= src_a0; + res_c1 -= res_c0 * src_a1; + res_c2 -= res_c0 * src_a2; + res_c3 -= res_c0 * src_a3; + + res_c1 *= src_a5; + res_c2 -= res_c1 * src_a6; + res_c3 -= res_c1 * src_a7; + + res_c2 *= src_a10; + res_c3 -= res_c2 * src_a11; + + res_c3 *= src_a15; + + ST_SP4(res_c0, res_c1, res_c2, res_c3, b, 4); + + 
TRANSPOSE4x4_SP_SP(res_c0, res_c1, res_c2, res_c3, + src_c0, src_c1, src_c2, src_c3); + + ST_SP(src_c0, c); + ST_SP(src_c1, c_nxt1line); + ST_SP(src_c2, c_nxt2line); + ST_SP(src_c3, c_nxt3line); +} + +static void ssolve_4x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT c0, c1, c2, c3, c0_nxt, c1_nxt, c2_nxt, c3_nxt; + FLOAT a0, a1, a2, a3, a5, a6, a7, a10, a11, a15; + + c0 = *(c + 0); + c1 = *(c + 1); + c2 = *(c + 2); + c3 = *(c + 3); + c0_nxt = *(c + 0 + ldc); + c1_nxt = *(c + 1 + ldc); + c2_nxt = *(c + 2 + ldc); + c3_nxt = *(c + 3 + ldc); + + if (bk > 0) + { + BLASLONG k; + FLOAT res[8]; + + res[0] = a[0] * b[0]; + res[1] = a[1] * b[0]; + res[2] = a[2] * b[0]; + res[3] = a[3] * b[0]; + res[4] = a[0] * b[1]; + res[5] = a[1] * b[1]; + res[6] = a[2] * b[1]; + res[7] = a[3] * b[1]; + + for (k = (bk - 1); k--;) + { + a += 4; + b += 2; + + res[0] += a[0] * b[0]; + res[1] += a[1] * b[0]; + res[2] += a[2] * b[0]; + res[3] += a[3] * b[0]; + res[4] += a[0] * b[1]; + res[5] += a[1] * b[1]; + res[6] += a[2] * b[1]; + res[7] += a[3] * b[1]; + } + + c0 -= res[0]; + c1 -= res[1]; + c2 -= res[2]; + c3 -= res[3]; + c0_nxt -= res[4]; + c1_nxt -= res[5]; + c2_nxt -= res[6]; + c3_nxt -= res[7]; + + a += 4; + b += 2; + } + + a0 = *(a + 0); + a1 = *(a + 1); + a2 = *(a + 2); + a3 = *(a + 3); + a5 = *(a + 5); + a6 = *(a + 6); + a7 = *(a + 7); + a10 = *(a + 10); + a11 = *(a + 11); + a15 = *(a + 15); + + c0 *= a0; + c0_nxt *= a0; + + c1 -= c0 * a1; + c1_nxt -= c0_nxt * a1; + + c1 *= a5; + c1_nxt *= a5; + + c2 -= c0 * a2; + c2_nxt -= c0_nxt * a2; + + c2 -= c1 * a6; + c2_nxt -= c1_nxt * a6; + + c2 *= a10; + c2_nxt *= a10; + + c3 -= c0 * a3; + c3_nxt -= c0_nxt * a3; + + c3 -= c1 * a7; + c3_nxt -= c1_nxt * a7; + + c3 -= c2 * a11; + c3_nxt -= c2_nxt * a11; + + c3 *= a15; + c3_nxt *= a15; + + *(b + 0) = c0; + *(b + 1) = c0_nxt; + *(b + 2) = c1; + *(b + 3) = c1_nxt; + *(b + 4) = c2; + *(b + 5) = c2_nxt; + *(b + 6) = c3; + *(b + 7) = c3_nxt; + + *(c + 0) = c0; + *(c + 
1) = c1; + *(c + 2) = c2; + *(c + 3) = c3; + *(c + 0 + ldc) = c0_nxt; + *(c + 1 + ldc) = c1_nxt; + *(c + 2 + ldc) = c2_nxt; + *(c + 3 + ldc) = c3_nxt; +} + +static void ssolve_4x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + FLOAT a0, a1, a2, a3, a5, a6, a7, a10, a11, a15, c0, c1, c2, c3; + + c0 = *(c + 0); + c1 = *(c + 1); + c2 = *(c + 2); + c3 = *(c + 3); + + if (bk > 0) + { + BLASLONG k; + FLOAT t0, t1, t2, t3; + + t0 = a[0] * b[0]; + t1 = a[1] * b[0]; + t2 = a[2] * b[0]; + t3 = a[3] * b[0]; + + for (k = (bk - 1); k--;) + { + a += 4; + b += 1; + + t0 += a[0] * b[0]; + t1 += a[1] * b[0]; + t2 += a[2] * b[0]; + t3 += a[3] * b[0]; + } + + c0 -= t0; + c1 -= t1; + c2 -= t2; + c3 -= t3; + + a += 4; + b += 1; + } + + a0 = *(a + 0); + a1 = *(a + 1); + a2 = *(a + 2); + a3 = *(a + 3); + a5 = *(a + 5); + a6 = *(a + 6); + a7 = *(a + 7); + a10 = *(a + 10); + a11 = *(a + 11); + a15 = *(a + 15); + + c0 *= a0; + + c1 -= c0 * a1; + c1 *= a5; + + c2 -= c0 * a2; + c2 -= c1 * a6; + c2 *= a10; + + c3 -= c0 * a3; + c3 -= c1 * a7; + c3 -= c2 * a11; + c3 *= a15; + + *(b + 0) = c0; + *(b + 1) = c1; + *(b + 2) = c2; + *(b + 3) = c3; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 2) = c2; + *(c + 3) = c3; +} + +static void ssolve_2x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT a0, a1, a3, c0, c1, c0_nxt1, c1_nxt1, c0_nxt2, c1_nxt2; + FLOAT c0_nxt3, c1_nxt3, c0_nxt4, c1_nxt4, c0_nxt5, c1_nxt5; + FLOAT c0_nxt6, c1_nxt6, c0_nxt7, c1_nxt7; + + c0 = *(c + 0); + c1 = *(c + 1); + c0_nxt1 = *(c + ldc); + c1_nxt1 = *(c + 1 + ldc); + c0_nxt2 = *(c + 2 * ldc); + c1_nxt2 = *(c + 1 + 2 * ldc); + c0_nxt3 = *(c + 3 * ldc); + c1_nxt3 = *(c + 1 + 3 * ldc); + c0_nxt4 = *(c + 4 * ldc); + c1_nxt4 = *(c + 1 + 4 * ldc); + c0_nxt5 = *(c + 5 * ldc); + c1_nxt5 = *(c + 1 + 5 * ldc); + c0_nxt6 = *(c + 6 * ldc); + c1_nxt6 = *(c + 1 + 6 * ldc); + c0_nxt7 = *(c + 7 * ldc); + c1_nxt7 = *(c + 1 + 7 * ldc); + + if (bk > 0) + { + BLASLONG k; + FLOAT res[16]; + + res[0] = a[0] * b[0]; 
+ res[1] = a[1] * b[0]; + res[2] = a[0] * b[1]; + res[3] = a[1] * b[1]; + res[4] = a[0] * b[2]; + res[5] = a[1] * b[2]; + res[6] = a[0] * b[3]; + res[7] = a[1] * b[3]; + res[8] = a[0] * b[4]; + res[9] = a[1] * b[4]; + res[10] = a[0] * b[5]; + res[11] = a[1] * b[5]; + res[12] = a[0] * b[6]; + res[13] = a[1] * b[6]; + res[14] = a[0] * b[7]; + res[15] = a[1] * b[7]; + + for (k = (bk - 1); k--;) + { + a += 2; + b += 8; + + res[0] += a[0] * b[0]; + res[1] += a[1] * b[0]; + res[2] += a[0] * b[1]; + res[3] += a[1] * b[1]; + res[4] += a[0] * b[2]; + res[5] += a[1] * b[2]; + res[6] += a[0] * b[3]; + res[7] += a[1] * b[3]; + res[8] += a[0] * b[4]; + res[9] += a[1] * b[4]; + res[10] += a[0] * b[5]; + res[11] += a[1] * b[5]; + res[12] += a[0] * b[6]; + res[13] += a[1] * b[6]; + res[14] += a[0] * b[7]; + res[15] += a[1] * b[7]; + } + + c0 -= res[0]; + c1 -= res[1]; + c0_nxt1 -= res[2]; + c1_nxt1 -= res[3]; + c0_nxt2 -= res[4]; + c1_nxt2 -= res[5]; + c0_nxt3 -= res[6]; + c1_nxt3 -= res[7]; + c0_nxt4 -= res[8]; + c1_nxt4 -= res[9]; + c0_nxt5 -= res[10]; + c1_nxt5 -= res[11]; + c0_nxt6 -= res[12]; + c1_nxt6 -= res[13]; + c0_nxt7 -= res[14]; + c1_nxt7 -= res[15]; + + a += 2; + b += 8; + } + + a0 = *a; + a1 = *(a + 1); + a3 = *(a + 3); + + c0 = c0 * a0; + c1 = (c1 - c0 * a1) * a3; + + c0_nxt1 = c0_nxt1 * a0; + c1_nxt1 = (c1_nxt1 - c0_nxt1 * a1) * a3; + + c0_nxt2 = c0_nxt2 * a0; + c1_nxt2 = (c1_nxt2 - c0_nxt2 * a1) * a3; + + c0_nxt3 = c0_nxt3 * a0; + c1_nxt3 = (c1_nxt3 - c0_nxt3 * a1) * a3; + + c0_nxt4 = c0_nxt4 * a0; + c1_nxt4 = (c1_nxt4 - c0_nxt4 * a1) * a3; + + c0_nxt5 = c0_nxt5 * a0; + c1_nxt5 = (c1_nxt5 - c0_nxt5 * a1) * a3; + + c0_nxt6 = c0_nxt6 * a0; + c1_nxt6 = (c1_nxt6 - c0_nxt6 * a1) * a3; + + c0_nxt7 = c0_nxt7 * a0; + c1_nxt7 = (c1_nxt7 - c0_nxt7 * a1) * a3; + + *(b + 0) = c0; + *(b + 1) = c0_nxt1; + *(b + 2) = c0_nxt2; + *(b + 3) = c0_nxt3; + *(b + 4) = c0_nxt4; + *(b + 5) = c0_nxt5; + *(b + 6) = c0_nxt6; + *(b + 7) = c0_nxt7; + *(b + 8) = c1; + *(b + 9) = c1_nxt1; + *(b 
+ 10) = c1_nxt2; + *(b + 11) = c1_nxt3; + *(b + 12) = c1_nxt4; + *(b + 13) = c1_nxt5; + *(b + 14) = c1_nxt6; + *(b + 15) = c1_nxt7; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 0 + ldc) = c0_nxt1; + *(c + 1 + ldc) = c1_nxt1; + *(c + 0 + 2 * ldc) = c0_nxt2; + *(c + 1 + 2 * ldc) = c1_nxt2; + *(c + 0 + 3 * ldc) = c0_nxt3; + *(c + 1 + 3 * ldc) = c1_nxt3; + *(c + 0 + 4 * ldc) = c0_nxt4; + *(c + 1 + 4 * ldc) = c1_nxt4; + *(c + 0 + 5 * ldc) = c0_nxt5; + *(c + 1 + 5 * ldc) = c1_nxt5; + *(c + 0 + 6 * ldc) = c0_nxt6; + *(c + 1 + 6 * ldc) = c1_nxt6; + *(c + 0 + 7 * ldc) = c0_nxt7; + *(c + 1 + 7 * ldc) = c1_nxt7; +} + +static void ssolve_2x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT a0, a1, a3, c0, c1, c0_nxt1, c1_nxt1; + FLOAT c0_nxt2, c1_nxt2, c0_nxt3, c1_nxt3; + + c0 = *(c + 0); + c1 = *(c + 1); + c0_nxt1 = *(c + ldc); + c1_nxt1 = *(c + 1 + ldc); + c0_nxt2 = *(c + 2 * ldc); + c1_nxt2 = *(c + 1 + 2 * ldc); + c0_nxt3 = *(c + 3 * ldc); + c1_nxt3 = *(c + 1 + 3 * ldc); + + if (bk > 0) + { + BLASLONG k; + FLOAT res[8]; + + res[0] = a[0] * b[0]; + res[1] = a[1] * b[0]; + res[2] = a[0] * b[1]; + res[3] = a[1] * b[1]; + res[4] = a[0] * b[2]; + res[5] = a[1] * b[2]; + res[6] = a[0] * b[3]; + res[7] = a[1] * b[3]; + + for (k = (bk - 1); k--;) + { + a += 2; + b += 4; + + res[0] += a[0] * b[0]; + res[1] += a[1] * b[0]; + res[2] += a[0] * b[1]; + res[3] += a[1] * b[1]; + res[4] += a[0] * b[2]; + res[5] += a[1] * b[2]; + res[6] += a[0] * b[3]; + res[7] += a[1] * b[3]; + } + + c0 -= res[0]; + c1 -= res[1]; + c0_nxt1 -= res[2]; + c1_nxt1 -= res[3]; + c0_nxt2 -= res[4]; + c1_nxt2 -= res[5]; + c0_nxt3 -= res[6]; + c1_nxt3 -= res[7]; + + a += 2; + b += 4; + } + + a0 = *a; + a1 = *(a + 1); + a3 = *(a + 3); + + c0 *= a0; + c0_nxt1 *= a0; + c0_nxt2 *= a0; + c0_nxt3 *= a0; + + c1 -= c0 * a1; + c1_nxt1 -= c0_nxt1 * a1; + c1_nxt2 -= c0_nxt2 * a1; + c1_nxt3 -= c0_nxt3 * a1; + c1 *= a3; + c1_nxt1 *= a3; + c1_nxt2 *= a3; + c1_nxt3 *= a3; + + *(b + 0) = c0; + *(b + 1) = 
c0_nxt1; + *(b + 2) = c0_nxt2; + *(b + 3) = c0_nxt3; + *(b + 4) = c1; + *(b + 5) = c1_nxt1; + *(b + 6) = c1_nxt2; + *(b + 7) = c1_nxt3; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 0 + ldc) = c0_nxt1; + *(c + 1 + ldc) = c1_nxt1; + *(c + 0 + 2 * ldc) = c0_nxt2; + *(c + 1 + 2 * ldc) = c1_nxt2; + *(c + 0 + 3 * ldc) = c0_nxt3; + *(c + 1 + 3 * ldc) = c1_nxt3; +} + +static void ssolve_2x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT a0, a1, a3, c0, c1, c0_nxt, c1_nxt; + + c0 = *(c + 0); + c1 = *(c + 1); + c0_nxt = *(c + ldc); + c1_nxt = *(c + 1 + ldc); + + if (bk > 0) + { + BLASLONG k; + FLOAT res0, res1, res2, res3; + + res0 = a[0] * b[0]; + res1 = a[1] * b[0]; + res2 = a[0] * b[1]; + res3 = a[1] * b[1]; + + for (k = (bk - 1); k--;) + { + a += 2; + b += 2; + + res0 += a[0] * b[0]; + res1 += a[1] * b[0]; + res2 += a[0] * b[1]; + res3 += a[1] * b[1]; + } + + c0 -= res0; + c1 -= res1; + + c0_nxt -= res2; + c1_nxt -= res3; + + a += 2; + b += 2; + } + + a0 = *a; + a1 = *(a + 1); + a3 = *(a + 3); + + c0 *= a0; + c0_nxt *= a0; + c1 -= c0 * a1; + c1_nxt -= c0_nxt * a1; + c1 *= a3; + c1_nxt *= a3; + + *(b + 0) = c0; + *(b + 1) = c0_nxt; + *(b + 2) = c1; + *(b + 3) = c1_nxt; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 0 + ldc) = c0_nxt; + *(c + 1 + ldc) = c1_nxt; +} + +static void ssolve_2x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + FLOAT c0, c1; + + c0 = *(c + 0); + c1 = *(c + 1); + + if (bk > 0) + { + BLASLONG k; + FLOAT res0, res1; + + res0 = a[0] * b[0]; + res1 = a[1] * b[0]; + + for (k = (bk - 1); k--;) + { + a += 2; + b += 1; + + res0 += a[0] * b[0]; + res1 += a[1] * b[0]; + } + + c0 -= res0; + c1 -= res1; + + a += 2; + b += 1; + } + + c0 *= *(a + 0); + + c1 -= c0 * *(a + 1); + c1 *= *(a + 3); + + *(b + 0) = c0; + *(b + 1) = c1; + + *(c + 0) = c0; + *(c + 1) = c1; +} + +static void ssolve_1x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + if (bk > 0) + { + BLASLONG k; + FLOAT c0, c1, c2, c3, c4, c5, c6, c7; + + 
c0 = a[0] * b[0]; + c1 = a[0] * b[1]; + c2 = a[0] * b[2]; + c3 = a[0] * b[3]; + c4 = a[0] * b[4]; + c5 = a[0] * b[5]; + c6 = a[0] * b[6]; + c7 = a[0] * b[7]; + + for (k = (bk - 1); k--;) + { + a += 1; + b += 8; + + c0 += a[0] * b[0]; + c1 += a[0] * b[1]; + c2 += a[0] * b[2]; + c3 += a[0] * b[3]; + c4 += a[0] * b[4]; + c5 += a[0] * b[5]; + c6 += a[0] * b[6]; + c7 += a[0] * b[7]; + } + + *(c + 0 * ldc) -= c0; + *(c + 1 * ldc) -= c1; + *(c + 2 * ldc) -= c2; + *(c + 3 * ldc) -= c3; + *(c + 4 * ldc) -= c4; + *(c + 5 * ldc) -= c5; + *(c + 6 * ldc) -= c6; + *(c + 7 * ldc) -= c7; + + a += 1; + b += 8; + } + + *c *= *a; + *(c + ldc) *= *a; + *(c + 2 * ldc) *= *a; + *(c + 3 * ldc) *= *a; + *(c + 4 * ldc) *= *a; + *(c + 5 * ldc) *= *a; + *(c + 6 * ldc) *= *a; + *(c + 7 * ldc) *= *a; + + *b = *c; + *(b + 1) = *(c + ldc); + *(b + 2) = *(c + 2 * ldc); + *(b + 3) = *(c + 3 * ldc); + *(b + 4) = *(c + 4 * ldc); + *(b + 5) = *(c + 5 * ldc); + *(b + 6) = *(c + 6 * ldc); + *(b + 7) = *(c + 7 * ldc); +} + +static void ssolve_1x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT c0, c1, c2, c3; + + c0 = *(c + 0 * ldc); + c1 = *(c + 1 * ldc); + c2 = *(c + 2 * ldc); + c3 = *(c + 3 * ldc); + + if (bk > 0) + { + BLASLONG k; + FLOAT res0, res1, res2, res3; + + res0 = a[0] * b[0]; + res1 = a[0] * b[1]; + res2 = a[0] * b[2]; + res3 = a[0] * b[3]; + + for (k = (bk - 1); k--;) + { + a += 1; + b += 4; + + res0 += a[0] * b[0]; + res1 += a[0] * b[1]; + res2 += a[0] * b[2]; + res3 += a[0] * b[3]; + } + + c0 -= res0; + c1 -= res1; + c2 -= res2; + c3 -= res3; + a += 1; + b += 4; + } + + c0 *= *a; + c1 *= *a; + c2 *= *a; + c3 *= *a; + + *c = c0; + *(c + ldc) = c1; + *(c + 2 * ldc) = c2; + *(c + 3 * ldc) = c3; + + *b = *c; + *(b + 1) = *(c + ldc); + *(b + 2) = *(c + 2 * ldc); + *(b + 3) = *(c + 3 * ldc); +} + +static void ssolve_1x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT c0, c1; + + c0 = *c; + c1 = *(c + ldc); + + if (bk > 0) + { + BLASLONG k; 
+ FLOAT res0, res1; + + res0 = a[0] * b[0]; + res1 = a[0] * b[1]; + + for (k = (bk - 1); k--;) + { + a += 1; + b += 2; + + res0 += a[0] * b[0]; + res1 += a[0] * b[1]; + } + + c0 -= res0; + c1 -= res1; + + a += 1; + b += 2; + } + + *c = c0 * *a; + *(c + ldc) = c1 * *a; + + *b = *c; + *(b + 1) = *(c + ldc); +} + +static void ssolve_1x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + if (bk) + { + BLASLONG k; + FLOAT res; + + res = a[0] * b[0]; + + for (k = (bk - 1); k--;) + { + a++; + b++; + + res += a[0] * b[0]; + } + + *c -= res; + + a++; + b++; + } + + *c *= *a; + *b = *c; +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, + FLOAT *c, BLASLONG ldc, BLASLONG offset) +{ + FLOAT *aa, *cc; + BLASLONG i, j, kk; + + for (j = (n >> 3); j--;) + { + kk = offset; + aa = a; + cc = c; + + for (i = (m >> 3); i--;) + { + ssolve_8x8_lt_msa(aa, b, cc, ldc, kk); + + aa += 8 * k; + cc += 8; + kk += 8; + } + + if (m & 7) + { + if (m & 4) + { + ssolve_4x8_lt_msa(aa, b, cc, ldc, kk); + + aa += 4 * k; + cc += 4; + kk += 4; + } + + if (m & 2) + { + ssolve_2x8_lt_msa(aa, b, cc, ldc, kk); + + aa += 2 * k; + cc += 2; + kk += 2; + } + + if (m & 1) + { + ssolve_1x8_lt_msa(aa, b, cc, ldc, kk); + + aa += k; + cc += 1; + kk += 1; + } + } + + b += 8 * k; + c += 8 * ldc; + } + + if (n & 7) + { + if (n & 4) + { + kk = offset; + aa = a; + cc = c; + + for (i = (m >> 3); i--;) + { + ssolve_8x4_lt_msa(aa, b, cc, ldc, kk); + + aa += 8 * k; + cc += 8; + kk += 8; + } + + if (m & 7) + { + if (m & 4) + { + ssolve_4x4_lt_msa(aa, b, cc, ldc, kk); + + aa += 4 * k; + cc += 4; + kk += 4; + } + + if (m & 2) + { + ssolve_2x4_lt_msa(aa, b, cc, ldc, kk); + + aa += 2 * k; + cc += 2; + kk += 2; + } + + if (m & 1) + { + ssolve_1x4_lt_msa(aa, b, cc, ldc, kk); + + aa += k; + cc += 1; + kk += 1; + } + } + + b += 4 * k; + c += 4 * ldc; + } + + if (n & 2) + { + kk = offset; + aa = a; + cc = c; + + for (i = (m >> 3); i--;) + { + ssolve_8x2_lt_msa(aa, b, cc, ldc, kk); + + aa += 8 * 
k; + cc += 8; + kk += 8; + } + + if (m & 7) + { + if (m & 4) + { + ssolve_4x2_lt_msa(aa, b, cc, ldc, kk); + + aa += 4 * k; + cc += 4; + kk += 4; + } + + if (m & 2) + { + ssolve_2x2_lt_msa(aa, b, cc, ldc, kk); + + aa += 2 * k; + cc += 2; + kk += 2; + } + + if (m & 1) + { + ssolve_1x2_lt_msa(aa, b, cc, ldc, kk); + + aa += k; + cc += 1; + kk += 1; + } + } + + b += 2 * k; + c += 2 * ldc; + } + + if (n & 1) + { + kk = offset; + aa = a; + cc = c; + + for (i = (m >> 3); i--;) + { + ssolve_8x1_lt_msa(aa, b, cc, kk); + + aa += 8 * k; + cc += 8; + kk += 8; + } + + if (m & 7) + { + if (m & 4) + { + ssolve_4x1_lt_msa(aa, b, cc, kk); + + aa += 4 * k; + cc += 4; + kk += 4; + } + + if (m & 2) + { + ssolve_2x1_lt_msa(aa, b, cc, kk); + + aa += 2 * k; + cc += 2; + kk += 2; + } + + if (m & 1) + { + ssolve_1x1_lt_msa(aa, b, cc, kk); + + aa += k; + cc += 1; + kk += 1; + } + } + + b += k; + c += ldc; + } + } + + return 0; +} diff --git a/kernel/mips/strsm_kernel_RN_8x8_msa.c b/kernel/mips/strsm_kernel_RN_8x8_msa.c new file mode 100644 index 000000000..04bca1b12 --- /dev/null +++ b/kernel/mips/strsm_kernel_RN_8x8_msa.c @@ -0,0 +1,2162 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +static void ssolve_8x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v4f32 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15; + v4f32 src_b0, src_b1, src_b2, src_b3, src_b4, src_b5, src_b6, src_b7; + v4f32 src_b9, src_b10, src_b11, src_b12, src_b13, src_b14, src_b15, src_b18; + v4f32 src_b19, src_b20, src_b21, src_b22, src_b23, src_b27, src_b28; + v4f32 src_b29, src_b30, src_b31, src_b36, src_b37, src_b38, src_b39; + v4f32 src_b45, src_b46, src_b47, src_b54, src_b55, src_b63, src_b; + FLOAT *c_nxt1line = c + ldc; + FLOAT *c_nxt2line = c + 2 * ldc; + FLOAT *c_nxt3line = c + 3 * ldc; + FLOAT *c_nxt4line = c + 4 * ldc; + FLOAT *c_nxt5line = c + 5 * ldc; + FLOAT *c_nxt6line = c + 6 * ldc; + FLOAT *c_nxt7line = c + 7 * ldc; + + if (bk > 0) + { + BLASLONG k; + v4f32 src_a0, src_a1, res0, res1, res2, res3, res4, res5, res6, res7; + v4f32 res8, res9, res10, res11, res12, res13, res14, res15; + + LD_SP2(a, 4, src_a0, src_a1); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, 
src_b1, src_b2, src_b3); + res0 = src_a0 * src_b0; + res1 = src_a1 * src_b0; + res2 = src_a0 * src_b1; + res3 = src_a1 * src_b1; + res4 = src_a0 * src_b2; + res5 = src_a1 * src_b2; + res6 = src_a0 * src_b3; + res7 = src_a1 * src_b3; + + src_b = LD_SP(b + 4); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res8 = src_a0 * src_b0; + res9 = src_a1 * src_b0; + res10 = src_a0 * src_b1; + res11 = src_a1 * src_b1; + res12 = src_a0 * src_b2; + res13 = src_a1 * src_b2; + res14 = src_a0 * src_b3; + res15 = src_a1 * src_b3; + + a += 8; + b += 8; + + for (k = (bk - 1); k--;) + { + LD_SP2(a, 4, src_a0, src_a1); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 += src_a0 * src_b0; + res1 += src_a1 * src_b0; + res2 += src_a0 * src_b1; + res3 += src_a1 * src_b1; + res4 += src_a0 * src_b2; + res5 += src_a1 * src_b2; + res6 += src_a0 * src_b3; + res7 += src_a1 * src_b3; + + src_b = LD_SP(b + 4); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res8 += src_a0 * src_b0; + res9 += src_a1 * src_b0; + res10 += src_a0 * src_b1; + res11 += src_a1 * src_b1; + res12 += src_a0 * src_b2; + res13 += src_a1 * src_b2; + res14 += src_a0 * src_b3; + res15 += src_a1 * src_b3; + + a += 8; + b += 8; + } + + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + LD_SP2(c_nxt2line, 4, src_c4, src_c5); + LD_SP2(c_nxt3line, 4, src_c6, src_c7); + LD_SP2(c_nxt4line, 4, src_c8, src_c9); + LD_SP2(c_nxt5line, 4, src_c10, src_c11); + LD_SP2(c_nxt6line, 4, src_c12, src_c13); + LD_SP2(c_nxt7line, 4, src_c14, src_c15); + + src_c0 -= res0; + src_c1 -= res1; + src_c2 -= res2; + src_c3 -= res3; + src_c4 -= res4; + src_c5 -= res5; + src_c6 -= res6; + src_c7 -= res7; + src_c8 -= res8; + src_c9 -= res9; + src_c10 -= res10; + src_c11 -= res11; + src_c12 -= res12; + src_c13 -= res13; + src_c14 -= res14; + src_c15 -= res15; + } + else + { + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + LD_SP2(c_nxt2line, 4, src_c4, src_c5); 
+ LD_SP2(c_nxt3line, 4, src_c6, src_c7); + LD_SP2(c_nxt4line, 4, src_c8, src_c9); + LD_SP2(c_nxt5line, 4, src_c10, src_c11); + LD_SP2(c_nxt6line, 4, src_c12, src_c13); + LD_SP2(c_nxt7line, 4, src_c14, src_c15); + } + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_b = LD_SP(b + 4); + SPLATI_W4_SP(src_b, src_b4, src_b5, src_b6, src_b7); + + src_b = LD_SP(b + 9); + SPLATI_W4_SP(src_b, src_b9, src_b10, src_b11, src_b12); + src_b13 = LD_SP(b + 13); + src_b15 = (v4f32) __msa_splati_w((v4i32) src_b13, 2); + src_b14 = (v4f32) __msa_splati_w((v4i32) src_b13, 1); + src_b13 = (v4f32) __msa_splati_w((v4i32) src_b13, 0); + + src_c0 *= src_b0; + src_c1 *= src_b0; + src_c2 -= src_c0 * src_b1; + src_c3 -= src_c1 * src_b1; + src_c4 -= src_c0 * src_b2; + src_c5 -= src_c1 * src_b2; + src_c6 -= src_c0 * src_b3; + src_c7 -= src_c1 * src_b3; + src_c8 -= src_c0 * src_b4; + src_c9 -= src_c1 * src_b4; + src_c10 -= src_c0 * src_b5; + src_c11 -= src_c1 * src_b5; + src_c12 -= src_c0 * src_b6; + src_c13 -= src_c1 * src_b6; + src_c14 -= src_c0 * src_b7; + src_c15 -= src_c1 * src_b7; + + ST_SP2(src_c0, src_c1, a, 4); + ST_SP2(src_c0, src_c1, c, 4); + + src_c2 *= src_b9; + src_c3 *= src_b9; + src_c4 -= src_c2 * src_b10; + src_c5 -= src_c3 * src_b10; + src_c6 -= src_c2 * src_b11; + src_c7 -= src_c3 * src_b11; + src_c8 -= src_c2 * src_b12; + src_c9 -= src_c3 * src_b12; + src_c10 -= src_c2 * src_b13; + src_c11 -= src_c3 * src_b13; + src_c12 -= src_c2 * src_b14; + src_c13 -= src_c3 * src_b14; + src_c14 -= src_c2 * src_b15; + src_c15 -= src_c3 * src_b15; + + ST_SP2(src_c2, src_c3, a + 8, 4); + ST_SP2(src_c2, src_c3, c_nxt1line, 4); + + src_b = LD_SP(b + 18); + SPLATI_W4_SP(src_b, src_b18, src_b19, src_b20, src_b21); + src_b22 = LD_SP(b + 22); + src_b23 = (v4f32) __msa_splati_w((v4i32) src_b22, 1); + src_b22 = (v4f32) __msa_splati_w((v4i32) src_b22, 0); + + src_b = LD_SP(b + 27); + SPLATI_W4_SP(src_b, src_b27, src_b28, src_b29, src_b30); + src_b31 = 
__msa_cast_to_vector_float(*(b + 31)); + src_b31 = (v4f32) __msa_splati_w((v4i32) src_b31, 0); + + src_c4 *= src_b18; + src_c5 *= src_b18; + src_c6 -= src_c4 * src_b19; + src_c7 -= src_c5 * src_b19; + src_c8 -= src_c4 * src_b20; + src_c9 -= src_c5 * src_b20; + src_c10 -= src_c4 * src_b21; + src_c11 -= src_c5 * src_b21; + src_c12 -= src_c4 * src_b22; + src_c13 -= src_c5 * src_b22; + src_c14 -= src_c4 * src_b23; + src_c15 -= src_c5 * src_b23; + + ST_SP2(src_c4, src_c5, a + 16, 4); + ST_SP2(src_c4, src_c5, c_nxt2line, 4); + + src_c6 *= src_b27; + src_c7 *= src_b27; + src_c8 -= src_c6 * src_b28; + src_c9 -= src_c7 * src_b28; + src_c10 -= src_c6 * src_b29; + src_c11 -= src_c7 * src_b29; + src_c12 -= src_c6 * src_b30; + src_c13 -= src_c7 * src_b30; + src_c14 -= src_c6 * src_b31; + src_c15 -= src_c7 * src_b31; + + ST_SP2(src_c6, src_c7, a + 24, 4); + ST_SP2(src_c6, src_c7, c_nxt3line, 4); + + src_b = LD_SP(b + 36); + SPLATI_W4_SP(src_b, src_b36, src_b37, src_b38, src_b39); + + src_b45 = LD_SP(b + 45); + src_b47 = (v4f32) __msa_splati_w((v4i32) src_b45, 2); + src_b46 = (v4f32) __msa_splati_w((v4i32) src_b45, 1); + src_b45 = (v4f32) __msa_splati_w((v4i32) src_b45, 0); + + src_b54 = __msa_cast_to_vector_float(*(b + 54)); + src_b54 = (v4f32) __msa_splati_w((v4i32) src_b54, 0); + src_b55 = __msa_cast_to_vector_float(*(b + 55)); + src_b55 = (v4f32) __msa_splati_w((v4i32) src_b55, 0); + src_b63 = __msa_cast_to_vector_float(*(b + 63)); + src_b63 = (v4f32) __msa_splati_w((v4i32) src_b63, 0); + + src_c8 *= src_b36; + src_c9 *= src_b36; + src_c10 -= src_c8 * src_b37; + src_c11 -= src_c9 * src_b37; + src_c12 -= src_c8 * src_b38; + src_c13 -= src_c9 * src_b38; + src_c14 -= src_c8 * src_b39; + src_c15 -= src_c9 * src_b39; + + ST_SP2(src_c8, src_c9, a + 32, 4); + ST_SP2(src_c8, src_c9, c_nxt4line, 4); + + src_c10 *= src_b45; + src_c11 *= src_b45; + src_c12 -= src_c10 * src_b46; + src_c13 -= src_c11 * src_b46; + src_c14 -= src_c10 * src_b47; + src_c15 -= src_c11 * src_b47; + + 
ST_SP2(src_c10, src_c11, a + 40, 4); + ST_SP2(src_c10, src_c11, c_nxt5line, 4); + + src_c12 *= src_b54; + src_c13 *= src_b54; + src_c14 -= src_c12 * src_b55; + src_c15 -= src_c13 * src_b55; + + ST_SP2(src_c12, src_c13, a + 48, 4); + ST_SP2(src_c12, src_c13, c_nxt6line, 4); + + src_c14 *= src_b63; + src_c15 *= src_b63; + + ST_SP2(src_c14, src_c15, a + 56, 4); + ST_SP2(src_c14, src_c15, c_nxt7line, 4); +} + +static void ssolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v4f32 src_b0, src_b1, src_b2, src_b3, src_b5, src_b6, src_b7; + v4f32 src_b10, src_b11, src_b15, src_b; + FLOAT *c_nxt1line = c + ldc; + FLOAT *c_nxt2line = c + 2 * ldc; + FLOAT *c_nxt3line = c + 3 * ldc; + + if (bk) + { + BLASLONG k; + v4f32 src_a0, src_a1, res0, res1, res2, res3, res4, res5, res6, res7; + + LD_SP2(a, 4, src_a0, src_a1); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 = src_a0 * src_b0; + res1 = src_a1 * src_b0; + res2 = src_a0 * src_b1; + res3 = src_a1 * src_b1; + res4 = src_a0 * src_b2; + res5 = src_a1 * src_b2; + res6 = src_a0 * src_b3; + res7 = src_a1 * src_b3; + + a += 8; + b += 4; + + for (k = (bk - 1) / 2; k--;) + { + LD_SP2(a, 4, src_a0, src_a1); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 += src_a0 * src_b0; + res1 += src_a1 * src_b0; + res2 += src_a0 * src_b1; + res3 += src_a1 * src_b1; + res4 += src_a0 * src_b2; + res5 += src_a1 * src_b2; + res6 += src_a0 * src_b3; + res7 += src_a1 * src_b3; + + a += 8; + b += 4; + + LD_SP2(a, 4, src_a0, src_a1); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 += src_a0 * src_b0; + res1 += src_a1 * src_b0; + res2 += src_a0 * src_b1; + res3 += src_a1 * src_b1; + res4 += src_a0 * src_b2; + res5 += src_a1 * src_b2; + res6 += src_a0 * src_b3; + res7 += src_a1 * src_b3; + + a += 8; + b += 4; + } + + if ((bk - 1) & 1) + { + 
LD_SP2(a, 4, src_a0, src_a1); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 += src_a0 * src_b0; + res1 += src_a1 * src_b0; + res2 += src_a0 * src_b1; + res3 += src_a1 * src_b1; + res4 += src_a0 * src_b2; + res5 += src_a1 * src_b2; + res6 += src_a0 * src_b3; + res7 += src_a1 * src_b3; + + a += 8; + b += 4; + } + + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + LD_SP2(c_nxt2line, 4, src_c4, src_c5); + LD_SP2(c_nxt3line, 4, src_c6, src_c7); + + src_c0 -= res0; + src_c1 -= res1; + src_c2 -= res2; + src_c3 -= res3; + src_c4 -= res4; + src_c5 -= res5; + src_c6 -= res6; + src_c7 -= res7; + } + else + { + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + LD_SP2(c_nxt2line, 4, src_c4, src_c5); + LD_SP2(c_nxt3line, 4, src_c6, src_c7); + } + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_b5 = LD_SP(b + 5); + src_b7 = (v4f32) __msa_splati_w((v4i32) src_b5, 2); + src_b6 = (v4f32) __msa_splati_w((v4i32) src_b5, 1); + src_b5 = (v4f32) __msa_splati_w((v4i32) src_b5, 0); + src_b10 = __msa_cast_to_vector_float(*(b + 10)); + src_b10 = (v4f32) __msa_splati_w((v4i32) src_b10, 0); + src_b11 = __msa_cast_to_vector_float(*(b + 11)); + src_b11 = (v4f32) __msa_splati_w((v4i32) src_b11, 0); + src_b15 = __msa_cast_to_vector_float(*(b + 15)); + src_b15 = (v4f32) __msa_splati_w((v4i32) src_b15, 0); + + src_c0 *= src_b0; + src_c1 *= src_b0; + src_c2 -= src_c0 * src_b1; + src_c3 -= src_c1 * src_b1; + src_c4 -= src_c0 * src_b2; + src_c5 -= src_c1 * src_b2; + src_c6 -= src_c0 * src_b3; + src_c7 -= src_c1 * src_b3; + + src_c2 *= src_b5; + src_c3 *= src_b5; + src_c4 -= src_c2 * src_b6; + src_c5 -= src_c3 * src_b6; + src_c6 -= src_c2 * src_b7; + src_c7 -= src_c3 * src_b7; + + src_c4 *= src_b10; + src_c5 *= src_b10; + src_c6 -= src_c4 * src_b11; + src_c7 -= src_c5 * src_b11; + + src_c6 *= src_b15; + src_c7 *= src_b15; + + ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4); + 
ST_SP4(src_c4, src_c5, src_c6, src_c7, a + 16, 4); + + ST_SP2(src_c0, src_c1, c, 4); + ST_SP2(src_c2, src_c3, c_nxt1line, 4); + ST_SP2(src_c4, src_c5, c_nxt2line, 4); + ST_SP2(src_c6, src_c7, c_nxt3line, 4); +} + +static void ssolve_8x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v4f32 src_c0, src_c1, src_c2, src_c3, src_b0, src_b1, src_b3; + FLOAT *c_nxt1line = c + ldc; + + if (bk) + { + BLASLONG k; + v4f32 src_a0, src_a1, res0, res1, res2, res3; + + LD_SP2(a, 4, src_a0, src_a1); + + src_b0 = __msa_cast_to_vector_float(*b); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + src_b1 = __msa_cast_to_vector_float(*(b + 1)); + src_b1 = (v4f32) __msa_splati_w((v4i32) src_b1, 0); + + res0 = src_a0 * src_b0; + res1 = src_a1 * src_b0; + res2 = src_a0 * src_b1; + res3 = src_a1 * src_b1; + + a += 8; + b += 2; + + for (k = (bk - 1) / 2; k--;) + { + LD_SP2(a, 4, src_a0, src_a1); + + src_b0 = __msa_cast_to_vector_float(*b); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + src_b1 = __msa_cast_to_vector_float(*(b + 1)); + src_b1 = (v4f32) __msa_splati_w((v4i32) src_b1, 0); + + res0 += src_a0 * src_b0; + res1 += src_a1 * src_b0; + res2 += src_a0 * src_b1; + res3 += src_a1 * src_b1; + + a += 8; + b += 2; + + LD_SP2(a, 4, src_a0, src_a1); + + src_b0 = __msa_cast_to_vector_float(*b); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + src_b1 = __msa_cast_to_vector_float(*(b + 1)); + src_b1 = (v4f32) __msa_splati_w((v4i32) src_b1, 0); + + res0 += src_a0 * src_b0; + res1 += src_a1 * src_b0; + res2 += src_a0 * src_b1; + res3 += src_a1 * src_b1; + + a += 8; + b += 2; + } + + if ((bk - 1) & 1) + { + LD_SP2(a, 4, src_a0, src_a1); + + src_b0 = __msa_cast_to_vector_float(*b); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + src_b1 = __msa_cast_to_vector_float(*(b + 1)); + src_b1 = (v4f32) __msa_splati_w((v4i32) src_b1, 0); + + res0 += src_a0 * src_b0; + res1 += src_a1 * src_b0; + res2 += src_a0 * src_b1; + res3 += src_a1 * src_b1; + + a += 8; + 
b += 2; + } + + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + + src_c0 -= res0; + src_c1 -= res1; + src_c2 -= res2; + src_c3 -= res3; + } + else + { + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + } + + src_b0 = __msa_cast_to_vector_float(*(b + 0)); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + src_b1 = __msa_cast_to_vector_float(*(b + 1)); + src_b1 = (v4f32) __msa_splati_w((v4i32) src_b1, 0); + src_b3 = __msa_cast_to_vector_float(*(b + 3)); + src_b3 = (v4f32) __msa_splati_w((v4i32) src_b3, 0); + + src_c0 *= src_b0; + src_c1 *= src_b0; + src_c2 -= src_c0 * src_b1; + src_c3 -= src_c1 * src_b1; + src_c2 *= src_b3; + src_c3 *= src_b3; + + ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4); + ST_SP2(src_c0, src_c1, c, 4); + ST_SP2(src_c2, src_c3, c_nxt1line, 4); +} + +static void ssolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v4f32 src_c0, src_c1, src_b0; + + if (bk) + { + BLASLONG k; + v4f32 src_a0, src_a1, res0, res1; + + LD_SP2(a, 4, src_a0, src_a1); + + src_b0 = __msa_cast_to_vector_float(*b); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + res0 = src_a0 * src_b0; + res1 = src_a1 * src_b0; + + a += 8; + b += 1; + + for (k = (bk - 1) >> 2; k--;) + { + LD_SP2(a, 4, src_a0, src_a1); + + src_b0 = __msa_cast_to_vector_float(*b); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + res0 += src_a0 * src_b0; + res1 += src_a1 * src_b0; + + a += 8; + b += 1; + + LD_SP2(a, 4, src_a0, src_a1); + + src_b0 = __msa_cast_to_vector_float(*b); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + res0 += src_a0 * src_b0; + res1 += src_a1 * src_b0; + + a += 8; + b += 1; + + LD_SP2(a, 4, src_a0, src_a1); + + src_b0 = __msa_cast_to_vector_float(*b); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + res0 += src_a0 * src_b0; + res1 += src_a1 * src_b0; + + a += 8; + b += 1; + + LD_SP2(a, 4, src_a0, src_a1); + + src_b0 = __msa_cast_to_vector_float(*b); + src_b0 = (v4f32) 
__msa_splati_w((v4i32) src_b0, 0); + + res0 += src_a0 * src_b0; + res1 += src_a1 * src_b0; + + a += 8; + b += 1; + } + + if ((bk - 1) & 3) + { + if ((bk - 1) & 2) + { + LD_SP2(a, 4, src_a0, src_a1); + + src_b0 = __msa_cast_to_vector_float(*b); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + res0 += src_a0 * src_b0; + res1 += src_a1 * src_b0; + + a += 8; + b += 1; + + LD_SP2(a, 4, src_a0, src_a1); + + src_b0 = __msa_cast_to_vector_float(*b); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + res0 += src_a0 * src_b0; + res1 += src_a1 * src_b0; + + a += 8; + b += 1; + } + + if ((bk - 1) & 1) + { + LD_SP2(a, 4, src_a0, src_a1); + + src_b0 = __msa_cast_to_vector_float(*b); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + res0 += src_a0 * src_b0; + res1 += src_a1 * src_b0; + + a += 8; + b += 1; + } + } + + LD_SP2(c, 4, src_c0, src_c1); + + src_c0 -= res0; + src_c1 -= res1; + } + else + { + LD_SP2(c, 4, src_c0, src_c1); + } + + src_b0 = __msa_cast_to_vector_float(*(b + 0)); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + src_c0 *= src_b0; + src_c1 *= src_b0; + + ST_SP2(src_c0, src_c1, a, 4); + ST_SP2(src_c0, src_c1, c, 4); +} + +static void ssolve_4x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v4f32 src_b0, src_b1, src_b2, src_b3, src_b4, src_b5, src_b6, src_b7; + v4f32 src_b9, src_b10, src_b11, src_b12, src_b13, src_b14, src_b15, src_b18; + v4f32 src_b19, src_b20, src_b21, src_b22, src_b23, src_b27, src_b28; + v4f32 src_b29, src_b30, src_b31, src_b36, src_b37, src_b38, src_b39; + v4f32 src_b45, src_b46, src_b47, src_b54, src_b55, src_b63, src_b; + FLOAT *c_nxt1line = c + ldc; + FLOAT *c_nxt2line = c + 2 * ldc; + FLOAT *c_nxt3line = c + 3 * ldc; + FLOAT *c_nxt4line = c + 4 * ldc; + FLOAT *c_nxt5line = c + 5 * ldc; + FLOAT *c_nxt6line = c + 6 * ldc; + FLOAT *c_nxt7line = c + 7 * ldc; + + if (bk) + { + BLASLONG k; + v4f32 src_a0, res0, res1, 
res2, res3, res4, res5, res6, res7; + + src_a0 = LD_SP(a); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 = src_a0 * src_b0; + res1 = src_a0 * src_b1; + res2 = src_a0 * src_b2; + res3 = src_a0 * src_b3; + + src_b = LD_SP(b + 4); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res4 = src_a0 * src_b0; + res5 = src_a0 * src_b1; + res6 = src_a0 * src_b2; + res7 = src_a0 * src_b3; + + a += 4; + b += 8; + + for (k = (bk - 1) / 2; k--;) + { + src_a0 = LD_SP(a); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 += src_a0 * src_b0; + res1 += src_a0 * src_b1; + res2 += src_a0 * src_b2; + res3 += src_a0 * src_b3; + + src_b = LD_SP(b + 4); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res4 += src_a0 * src_b0; + res5 += src_a0 * src_b1; + res6 += src_a0 * src_b2; + res7 += src_a0 * src_b3; + + a += 4; + b += 8; + + src_a0 = LD_SP(a); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 += src_a0 * src_b0; + res1 += src_a0 * src_b1; + res2 += src_a0 * src_b2; + res3 += src_a0 * src_b3; + + src_b = LD_SP(b + 4); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res4 += src_a0 * src_b0; + res5 += src_a0 * src_b1; + res6 += src_a0 * src_b2; + res7 += src_a0 * src_b3; + + a += 4; + b += 8; + } + + if ((bk - 1) & 1) + { + src_a0 = LD_SP(a); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 += src_a0 * src_b0; + res1 += src_a0 * src_b1; + res2 += src_a0 * src_b2; + res3 += src_a0 * src_b3; + + src_b = LD_SP(b + 4); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res4 += src_a0 * src_b0; + res5 += src_a0 * src_b1; + res6 += src_a0 * src_b2; + res7 += src_a0 * src_b3; + + a += 4; + b += 8; + } + + src_c0 = LD_SP(c); + src_c1 = LD_SP(c_nxt1line); + src_c2 = LD_SP(c_nxt2line); + src_c3 = LD_SP(c_nxt3line); + src_c4 = LD_SP(c_nxt4line); + src_c5 = LD_SP(c_nxt5line); + src_c6 = LD_SP(c_nxt6line); + src_c7 
= LD_SP(c_nxt7line); + + src_c0 -= res0; + src_c1 -= res1; + src_c2 -= res2; + src_c3 -= res3; + src_c4 -= res4; + src_c5 -= res5; + src_c6 -= res6; + src_c7 -= res7; + } + else + { + src_c0 = LD_SP(c); + src_c1 = LD_SP(c_nxt1line); + src_c2 = LD_SP(c_nxt2line); + src_c3 = LD_SP(c_nxt3line); + src_c4 = LD_SP(c_nxt4line); + src_c5 = LD_SP(c_nxt5line); + src_c6 = LD_SP(c_nxt6line); + src_c7 = LD_SP(c_nxt7line); + } + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_b = LD_SP(b + 4); + SPLATI_W4_SP(src_b, src_b4, src_b5, src_b6, src_b7); + + src_b = LD_SP(b + 9); + SPLATI_W4_SP(src_b, src_b9, src_b10, src_b11, src_b12); + src_b13 = LD_SP(b + 13); + src_b15 = (v4f32) __msa_splati_w((v4i32) src_b13, 2); + src_b14 = (v4f32) __msa_splati_w((v4i32) src_b13, 1); + src_b13 = (v4f32) __msa_splati_w((v4i32) src_b13, 0); + + src_b = LD_SP(b + 18); + SPLATI_W4_SP(src_b, src_b18, src_b19, src_b20, src_b21); + src_b22 = LD_SP(b + 22); + src_b23 = (v4f32) __msa_splati_w((v4i32) src_b22, 1); + src_b22 = (v4f32) __msa_splati_w((v4i32) src_b22, 0); + + src_b = LD_SP(b + 27); + SPLATI_W4_SP(src_b, src_b27, src_b28, src_b29, src_b30); + src_b31 = __msa_cast_to_vector_float(*(b + 31)); + src_b31 = (v4f32) __msa_splati_w((v4i32) src_b31, 0); + + src_b = LD_SP(b + 36); + SPLATI_W4_SP(src_b, src_b36, src_b37, src_b38, src_b39); + + src_b45 = LD_SP(b + 45); + src_b47 = (v4f32) __msa_splati_w((v4i32) src_b45, 2); + src_b46 = (v4f32) __msa_splati_w((v4i32) src_b45, 1); + src_b45 = (v4f32) __msa_splati_w((v4i32) src_b45, 0); + + src_b54 = __msa_cast_to_vector_float(*(b + 54)); + src_b54 = (v4f32) __msa_splati_w((v4i32) src_b54, 0); + src_b55 = __msa_cast_to_vector_float(*(b + 55)); + src_b55 = (v4f32) __msa_splati_w((v4i32) src_b55, 0); + src_b63 = __msa_cast_to_vector_float(*(b + 63)); + src_b63 = (v4f32) __msa_splati_w((v4i32) src_b63, 0); + + src_c0 *= src_b0; + src_c1 -= src_c0 * src_b1; + src_c2 -= src_c0 * src_b2; + src_c3 -= src_c0 * src_b3; + src_c4 
-= src_c0 * src_b4; + src_c5 -= src_c0 * src_b5; + src_c6 -= src_c0 * src_b6; + src_c7 -= src_c0 * src_b7; + + src_c1 *= src_b9; + src_c2 -= src_c1 * src_b10; + src_c3 -= src_c1 * src_b11; + src_c4 -= src_c1 * src_b12; + src_c5 -= src_c1 * src_b13; + src_c6 -= src_c1 * src_b14; + src_c7 -= src_c1 * src_b15; + + src_c2 *= src_b18; + src_c3 -= src_c2 * src_b19; + src_c4 -= src_c2 * src_b20; + src_c5 -= src_c2 * src_b21; + src_c6 -= src_c2 * src_b22; + src_c7 -= src_c2 * src_b23; + + src_c3 *= src_b27; + src_c4 -= src_c3 * src_b28; + src_c5 -= src_c3 * src_b29; + src_c6 -= src_c3 * src_b30; + src_c7 -= src_c3 * src_b31; + + src_c4 *= src_b36; + src_c5 -= src_c4 * src_b37; + src_c6 -= src_c4 * src_b38; + src_c7 -= src_c4 * src_b39; + + src_c5 *= src_b45; + src_c6 -= src_c5 * src_b46; + src_c7 -= src_c5 * src_b47; + + src_c6 *= src_b54; + src_c7 -= src_c6 * src_b55; + + src_c7 *= src_b63; + + ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4); + ST_SP4(src_c4, src_c5, src_c6, src_c7, a + 16, 4); + + ST_SP(src_c0, c); + ST_SP(src_c1, c_nxt1line); + ST_SP(src_c2, c_nxt2line); + ST_SP(src_c3, c_nxt3line); + ST_SP(src_c4, c_nxt4line); + ST_SP(src_c5, c_nxt5line); + ST_SP(src_c6, c_nxt6line); + ST_SP(src_c7, c_nxt7line); +} + +static void ssolve_4x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v4f32 src_c0, src_c1, src_c2, src_c3, src_b0, src_b1, src_b2, src_b3; + v4f32 src_b5, src_b6, src_b7, src_b10, src_b11, src_b15, src_b; + FLOAT *c_nxt1line = c + ldc; + FLOAT *c_nxt2line = c + 2 * ldc; + FLOAT *c_nxt3line = c + 3 * ldc; + + if (bk) + { + BLASLONG k; + v4f32 src_a0, res0, res1, res2, res3; + + src_a0 = LD_SP(a); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 = src_a0 * src_b0; + res1 = src_a0 * src_b1; + res2 = src_a0 * src_b2; + res3 = src_a0 * src_b3; + + a += 4; + b += 4; + + for (k = ((bk - 1) >> 1); k--;) + { + src_a0 = LD_SP(a); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, 
src_b3); + res0 += src_a0 * src_b0; + res1 += src_a0 * src_b1; + res2 += src_a0 * src_b2; + res3 += src_a0 * src_b3; + + a += 4; + b += 4; + + src_a0 = LD_SP(a); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 += src_a0 * src_b0; + res1 += src_a0 * src_b1; + res2 += src_a0 * src_b2; + res3 += src_a0 * src_b3; + + a += 4; + b += 4; + } + + if ((bk - 1) & 1) + { + src_a0 = LD_SP(a); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 += src_a0 * src_b0; + res1 += src_a0 * src_b1; + res2 += src_a0 * src_b2; + res3 += src_a0 * src_b3; + + a += 4; + b += 4; + } + + src_c0 = LD_SP(c); + src_c1 = LD_SP(c_nxt1line); + src_c2 = LD_SP(c_nxt2line); + src_c3 = LD_SP(c_nxt3line); + + src_c0 -= res0; + src_c1 -= res1; + src_c2 -= res2; + src_c3 -= res3; + } + else + { + src_c0 = LD_SP(c); + src_c1 = LD_SP(c_nxt1line); + src_c2 = LD_SP(c_nxt2line); + src_c3 = LD_SP(c_nxt3line); + } + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_b5 = LD_SP(b + 5); + src_b7 = (v4f32) __msa_splati_w((v4i32) src_b5, 2); + src_b6 = (v4f32) __msa_splati_w((v4i32) src_b5, 1); + src_b5 = (v4f32) __msa_splati_w((v4i32) src_b5, 0); + src_b10 = __msa_cast_to_vector_float(*(b + 10)); + src_b10 = (v4f32) __msa_splati_w((v4i32) src_b10, 0); + src_b11 = __msa_cast_to_vector_float(*(b + 11)); + src_b11 = (v4f32) __msa_splati_w((v4i32) src_b11, 0); + src_b15 = __msa_cast_to_vector_float(*(b + 15)); + src_b15 = (v4f32) __msa_splati_w((v4i32) src_b15, 0); + + src_c0 *= src_b0; + src_c1 -= src_c0 * src_b1; + src_c2 -= src_c0 * src_b2; + src_c3 -= src_c0 * src_b3; + + src_c1 *= src_b5; + src_c2 -= src_c1 * src_b6; + src_c3 -= src_c1 * src_b7; + + src_c2 *= src_b10; + src_c3 -= src_c2 * src_b11; + + src_c3 *= src_b15; + + ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4); + + ST_SP(src_c0, c); + ST_SP(src_c1, c_nxt1line); + ST_SP(src_c2, c_nxt2line); + ST_SP(src_c3, c_nxt3line); +} + +static void 
ssolve_4x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v4f32 src_c0, src_c1, src_b0, src_b1, src_b3; + FLOAT *c_nxt1line = c + ldc; + + if (bk) + { + BLASLONG k; + v4f32 src_a, res0, res1; + + src_a = LD_SP(a); + src_b0 = LD_SP(b); + src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + res0 = src_a * src_b0; + res1 = src_a * src_b1; + + a += 4; + b += 2; + + for (k = ((bk - 1) >> 1); k--;) + { + src_a = LD_SP(a); + src_b0 = LD_SP(b); + src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + res0 += src_a * src_b0; + res1 += src_a * src_b1; + + a += 4; + b += 2; + + src_a = LD_SP(a); + src_b0 = LD_SP(b); + src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + res0 += src_a * src_b0; + res1 += src_a * src_b1; + + a += 4; + b += 2; + } + + if ((bk - 1) & 1) + { + src_a = LD_SP(a); + src_b0 = LD_SP(b); + src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + res0 += src_a * src_b0; + res1 += src_a * src_b1; + + a += 4; + b += 2; + } + + src_c0 = LD_SP(c); + src_c1 = LD_SP(c_nxt1line); + + src_c0 -= res0; + src_c1 -= res1; + } + else + { + src_c0 = LD_SP(c); + src_c1 = LD_SP(c_nxt1line); + } + + src_b0 = __msa_cast_to_vector_float(*(b + 0)); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + src_b1 = __msa_cast_to_vector_float(*(b + 1)); + src_b1 = (v4f32) __msa_splati_w((v4i32) src_b1, 0); + src_b3 = __msa_cast_to_vector_float(*(b + 3)); + src_b3 = (v4f32) __msa_splati_w((v4i32) src_b3, 0); + + src_c0 *= src_b0; + src_c1 -= src_c0 * src_b1; + src_c1 *= src_b3; + + ST_SP2(src_c0, src_c1, a, 4); + + ST_SP(src_c0, c); + ST_SP(src_c1, c_nxt1line); +} + +static void ssolve_4x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT b0, c0, c1, c2, c3; + + c0 = *(c + 0); + c1 = *(c + 1); + c2 = *(c + 2); + c3 
= *(c + 3); + + if (bk) + { + BLASLONG k; + FLOAT t0, t1, t2, t3; + + t0 = a[0] * b[0]; + t1 = a[1] * b[0]; + t2 = a[2] * b[0]; + t3 = a[3] * b[0]; + + for (k = (bk - 1); k--;) + { + a += 4; + b += 1; + + t0 += a[0] * b[0]; + t1 += a[1] * b[0]; + t2 += a[2] * b[0]; + t3 += a[3] * b[0]; + } + + c0 -= t0; + c1 -= t1; + c2 -= t2; + c3 -= t3; + + a += 4; + b += 1; + } + + b0 = *(b + 0); + + c0 *= b0; + c1 *= b0; + c2 *= b0; + c3 *= b0; + + *(a + 0) = c0; + *(a + 1) = c1; + *(a + 2) = c2; + *(a + 3) = c3; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 2) = c2; + *(c + 3) = c3; +} + +static void ssolve_2x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT b0, b1, b2, b3, b4, b5, b6, b7, b9, b10, b11, b12, b13, b14, b15; + FLOAT b18, b19, b20, b21, b22, b23, b27, b28, b29, b30, b31; + FLOAT b36, b37, b38, b39, b45, b46, b47, b54, b55, b63; + FLOAT c0, c1, c0_nxt1, c1_nxt1, c0_nxt2, c1_nxt2, c0_nxt3, c1_nxt3; + FLOAT c0_nxt4, c1_nxt4, c0_nxt5, c1_nxt5, c0_nxt6, c1_nxt6; + FLOAT c0_nxt7, c1_nxt7; + + c0 = *(c + 0); + c1 = *(c + 1); + c0_nxt1 = *(c + 0 + 1 * ldc); + c1_nxt1 = *(c + 1 + 1 * ldc); + c0_nxt2 = *(c + 0 + 2 * ldc); + c1_nxt2 = *(c + 1 + 2 * ldc); + c0_nxt3 = *(c + 0 + 3 * ldc); + c1_nxt3 = *(c + 1 + 3 * ldc); + c0_nxt4 = *(c + 0 + 4 * ldc); + c1_nxt4 = *(c + 1 + 4 * ldc); + c0_nxt5 = *(c + 0 + 5 * ldc); + c1_nxt5 = *(c + 1 + 5 * ldc); + c0_nxt6 = *(c + 0 + 6 * ldc); + c1_nxt6 = *(c + 1 + 6 * ldc); + c0_nxt7 = *(c + 0 + 7 * ldc); + c1_nxt7 = *(c + 1 + 7 * ldc); + + if (bk) + { + BLASLONG k; + FLOAT res[16]; + + res[0] = a[0] * b[0]; + res[1] = a[1] * b[0]; + res[2] = a[0] * b[1]; + res[3] = a[1] * b[1]; + res[4] = a[0] * b[2]; + res[5] = a[1] * b[2]; + res[6] = a[0] * b[3]; + res[7] = a[1] * b[3]; + res[8] = a[0] * b[4]; + res[9] = a[1] * b[4]; + res[10] = a[0] * b[5]; + res[11] = a[1] * b[5]; + res[12] = a[0] * b[6]; + res[13] = a[1] * b[6]; + res[14] = a[0] * b[7]; + res[15] = a[1] * b[7]; + + for (k = (bk - 1); k--;) + { + a += 2; + b += 
8; + + res[0] += a[0] * b[0]; + res[1] += a[1] * b[0]; + res[2] += a[0] * b[1]; + res[3] += a[1] * b[1]; + res[4] += a[0] * b[2]; + res[5] += a[1] * b[2]; + res[6] += a[0] * b[3]; + res[7] += a[1] * b[3]; + res[8] += a[0] * b[4]; + res[9] += a[1] * b[4]; + res[10] += a[0] * b[5]; + res[11] += a[1] * b[5]; + res[12] += a[0] * b[6]; + res[13] += a[1] * b[6]; + res[14] += a[0] * b[7]; + res[15] += a[1] * b[7]; + } + + c0 -= res[0]; + c1 -= res[1]; + c0_nxt1 -= res[2]; + c1_nxt1 -= res[3]; + c0_nxt2 -= res[4]; + c1_nxt2 -= res[5]; + c0_nxt3 -= res[6]; + c1_nxt3 -= res[7]; + c0_nxt4 -= res[8]; + c1_nxt4 -= res[9]; + c0_nxt5 -= res[10]; + c1_nxt5 -= res[11]; + c0_nxt6 -= res[12]; + c1_nxt6 -= res[13]; + c0_nxt7 -= res[14]; + c1_nxt7 -= res[15]; + + a += 2; + b += 8; + } + + b0 = *(b + 0); + b1 = *(b + 1); + b2 = *(b + 2); + b3 = *(b + 3); + b4 = *(b + 4); + b5 = *(b + 5); + b6 = *(b + 6); + b7 = *(b + 7); + b9 = *(b + 9); + b10 = *(b + 10); + b11 = *(b + 11); + b12 = *(b + 12); + b13 = *(b + 13); + b14 = *(b + 14); + b15 = *(b + 15); + b18 = *(b + 18); + b19 = *(b + 19); + b20 = *(b + 20); + b21 = *(b + 21); + b22 = *(b + 22); + b23 = *(b + 23); + b27 = *(b + 27); + b28 = *(b + 28); + b29 = *(b + 29); + b30 = *(b + 30); + b31 = *(b + 31); + b36 = *(b + 36); + b37 = *(b + 37); + b38 = *(b + 38); + b39 = *(b + 39); + b45 = *(b + 45); + b46 = *(b + 46); + b47 = *(b + 47); + b54 = *(b + 54); + b55 = *(b + 55); + b63 = *(b + 63); + + c0 *= b0; + c1 *= b0; + + c0_nxt1 -= c0 * b1; + c1_nxt1 -= c1 * b1; + + c0_nxt2 -= c0 * b2; + c1_nxt2 -= c1 * b2; + + c0_nxt3 -= c0 * b3; + c1_nxt3 -= c1 * b3; + + c0_nxt4 -= c0 * b4; + c1_nxt4 -= c1 * b4; + + c0_nxt5 -= c0 * b5; + c1_nxt5 -= c1 * b5; + + c0_nxt6 -= c0 * b6; + c1_nxt6 -= c1 * b6; + + c0_nxt7 -= c0 * b7; + c1_nxt7 -= c1 * b7; + + c0_nxt1 *= b9; + c1_nxt1 *= b9; + + c0_nxt2 -= c0_nxt1 * b10; + c1_nxt2 -= c1_nxt1 * b10; + + c0_nxt3 -= c0_nxt1 * b11; + c1_nxt3 -= c1_nxt1 * b11; + + c0_nxt4 -= c0_nxt1 * b12; + c1_nxt4 -= c1_nxt1 * 
b12; + + c0_nxt5 -= c0_nxt1 * b13; + c1_nxt5 -= c1_nxt1 * b13; + + c0_nxt6 -= c0_nxt1 * b14; + c1_nxt6 -= c1_nxt1 * b14; + + c0_nxt7 -= c0_nxt1 * b15; + c1_nxt7 -= c1_nxt1 * b15; + + c0_nxt2 *= b18; + c1_nxt2 *= b18; + + c0_nxt3 -= c0_nxt2 * b19; + c1_nxt3 -= c1_nxt2 * b19; + + c0_nxt4 -= c0_nxt2 * b20; + c1_nxt4 -= c1_nxt2 * b20; + + c0_nxt5 -= c0_nxt2 * b21; + c1_nxt5 -= c1_nxt2 * b21; + + c0_nxt6 -= c0_nxt2 * b22; + c1_nxt6 -= c1_nxt2 * b22; + + c0_nxt7 -= c0_nxt2 * b23; + c1_nxt7 -= c1_nxt2 * b23; + + c0_nxt3 *= b27; + c1_nxt3 *= b27; + + c0_nxt4 -= c0_nxt3 * b28; + c1_nxt4 -= c1_nxt3 * b28; + + c0_nxt5 -= c0_nxt3 * b29; + c1_nxt5 -= c1_nxt3 * b29; + + c0_nxt6 -= c0_nxt3 * b30; + c1_nxt6 -= c1_nxt3 * b30; + + c0_nxt7 -= c0_nxt3 * b31; + c1_nxt7 -= c1_nxt3 * b31; + + c0_nxt4 *= b36; + c1_nxt4 *= b36; + + c0_nxt5 -= c0_nxt4 * b37; + c1_nxt5 -= c1_nxt4 * b37; + + c0_nxt6 -= c0_nxt4 * b38; + c1_nxt6 -= c1_nxt4 * b38; + + c0_nxt7 -= c0_nxt4 * b39; + c1_nxt7 -= c1_nxt4 * b39; + + c0_nxt5 *= b45; + c1_nxt5 *= b45; + + c0_nxt6 -= c0_nxt5 * b46; + c1_nxt6 -= c1_nxt5 * b46; + + c0_nxt7 -= c0_nxt5 * b47; + c1_nxt7 -= c1_nxt5 * b47; + + c0_nxt6 *= b54; + c1_nxt6 *= b54; + + c0_nxt7 -= c0_nxt6 * b55; + c1_nxt7 -= c1_nxt6 * b55; + + c0_nxt7 *= b63; + c1_nxt7 *= b63; + + *(a + 0) = c0; + *(a + 1) = c1; + *(a + 2) = c0_nxt1; + *(a + 3) = c1_nxt1; + *(a + 4) = c0_nxt2; + *(a + 5) = c1_nxt2; + *(a + 6) = c0_nxt3; + *(a + 7) = c1_nxt3; + *(a + 8) = c0_nxt4; + *(a + 9) = c1_nxt4; + *(a + 10) = c0_nxt5; + *(a + 11) = c1_nxt5; + *(a + 12) = c0_nxt6; + *(a + 13) = c1_nxt6; + *(a + 14) = c0_nxt7; + *(a + 15) = c1_nxt7; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 0 + 1 * ldc) = c0_nxt1; + *(c + 1 + 1 * ldc) = c1_nxt1; + *(c + 0 + 2 * ldc) = c0_nxt2; + *(c + 1 + 2 * ldc) = c1_nxt2; + *(c + 0 + 3 * ldc) = c0_nxt3; + *(c + 1 + 3 * ldc) = c1_nxt3; + *(c + 0 + 4 * ldc) = c0_nxt4; + *(c + 1 + 4 * ldc) = c1_nxt4; + *(c + 0 + 5 * ldc) = c0_nxt5; + *(c + 1 + 5 * ldc) = c1_nxt5; + *(c + 0 + 6 * 
ldc) = c0_nxt6; + *(c + 1 + 6 * ldc) = c1_nxt6; + *(c + 0 + 7 * ldc) = c0_nxt7; + *(c + 1 + 7 * ldc) = c1_nxt7; +} + +static void ssolve_2x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT b0, b1, b2, b3, b5, b6, b7, b10, b11, b15, c0, c1; + FLOAT c0_nxt1, c0_nxt2, c0_nxt3, c1_nxt1, c1_nxt2, c1_nxt3; + + c0 = *(c + 0); + c1 = *(c + 1); + c0_nxt1 = *(c + 0 + 1 * ldc); + c1_nxt1 = *(c + 1 + 1 * ldc); + c0_nxt2 = *(c + 0 + 2 * ldc); + c1_nxt2 = *(c + 1 + 2 * ldc); + c0_nxt3 = *(c + 0 + 3 * ldc); + c1_nxt3 = *(c + 1 + 3 * ldc); + + if (bk) + { + BLASLONG k; + FLOAT res[8]; + + res[0] = a[0] * b[0]; + res[1] = a[1] * b[0]; + res[2] = a[0] * b[1]; + res[3] = a[1] * b[1]; + res[4] = a[0] * b[2]; + res[5] = a[1] * b[2]; + res[6] = a[0] * b[3]; + res[7] = a[1] * b[3]; + + for (k = (bk - 1); k--;) + { + a += 2; + b += 4; + + res[0] += a[0] * b[0]; + res[1] += a[1] * b[0]; + res[2] += a[0] * b[1]; + res[3] += a[1] * b[1]; + res[4] += a[0] * b[2]; + res[5] += a[1] * b[2]; + res[6] += a[0] * b[3]; + res[7] += a[1] * b[3]; + } + + c0 -= res[0]; + c1 -= res[1]; + c0_nxt1 -= res[2]; + c1_nxt1 -= res[3]; + c0_nxt2 -= res[4]; + c1_nxt2 -= res[5]; + c0_nxt3 -= res[6]; + c1_nxt3 -= res[7]; + + a += 2; + b += 4; + } + + b0 = *(b + 0); + b1 = *(b + 1); + b2 = *(b + 2); + b3 = *(b + 3); + b5 = *(b + 5); + b6 = *(b + 6); + b7 = *(b + 7); + b10 = *(b + 10); + b11 = *(b + 11); + b15 = *(b + 15); + + c0 *= b0; + c1 *= b0; + + c0_nxt1 -= c0 * b1; + c1_nxt1 -= c1 * b1; + c0_nxt1 *= b5; + c1_nxt1 *= b5; + + c0_nxt2 -= c0 * b2; + c1_nxt2 -= c1 * b2; + c0_nxt2 -= c0_nxt1 * b6; + c1_nxt2 -= c1_nxt1 * b6; + c0_nxt2 *= b10; + c1_nxt2 *= b10; + + c0_nxt3 -= c0 * b3; + c1_nxt3 -= c1 * b3; + c0_nxt3 -= c0_nxt1 * b7; + c1_nxt3 -= c1_nxt1 * b7; + c0_nxt3 -= c0_nxt2 * b11; + c1_nxt3 -= c1_nxt2 * b11; + c0_nxt3 *= b15; + c1_nxt3 *= b15; + + *(a + 0) = c0; + *(a + 1) = c1; + *(a + 2) = c0_nxt1; + *(a + 3) = c1_nxt1; + *(a + 4) = c0_nxt2; + *(a + 5) = c1_nxt2; + *(a + 6) = c0_nxt3; 
+ *(a + 7) = c1_nxt3; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 1 * ldc) = c0_nxt1; + *(c + 1 + 1 * ldc) = c1_nxt1; + *(c + 2 * ldc) = c0_nxt2; + *(c + 1 + 2 * ldc) = c1_nxt2; + *(c + 3 * ldc) = c0_nxt3; + *(c + 1 + 3 * ldc) = c1_nxt3; +} + +static void ssolve_2x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT b0, b1, b3, c0, c0_nxt, c1, c1_nxt; + + c0 = *(c + 0); + c1 = *(c + 1); + c0_nxt = *(c + 0 + ldc); + c1_nxt = *(c + 1 + ldc); + + if (bk) + { + BLASLONG k; + FLOAT res[4]; + + res[0] = a[0] * b[0]; + res[1] = a[1] * b[0]; + res[2] = a[0] * b[1]; + res[3] = a[1] * b[1]; + + for (k = (bk - 1); k--;) + { + a += 2; + b += 2; + + res[0] += a[0] * b[0]; + res[1] += a[1] * b[0]; + res[2] += a[0] * b[1]; + res[3] += a[1] * b[1]; + } + + c0 -= res[0]; + c1 -= res[1]; + c0_nxt -= res[2]; + c1_nxt -= res[3]; + + a += 2; + b += 2; + } + + b0 = *(b + 0); + b1 = *(b + 1); + b3 = *(b + 3); + + c0 *= b0; + c1 *= b0; + + c0_nxt -= c0 * b1; + c1_nxt -= c1 * b1; + + c0_nxt *= b3; + c1_nxt *= b3; + + *(a + 0) = c0; + *(a + 1) = c1; + *(a + 2) = c0_nxt; + *(a + 3) = c1_nxt; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + ldc) = c0_nxt; + *(c + 1 + ldc) = c1_nxt; +} + +static void ssolve_2x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT b0, c0, c1; + + c0 = *(c + 0); + c1 = *(c + 1); + + if (bk) + { + BLASLONG k; + FLOAT res0, res1; + + res0 = a[0] * b[0]; + res1 = a[1] * b[0]; + + for (k = (bk - 1); k--;) + { + a += 2; + b += 1; + + res0 += a[0] * b[0]; + res1 += a[1] * b[0]; + } + + c0 -= res0; + c1 -= res1; + + a += 2; + b += 1; + } + + b0 = *(b + 0); + + c0 *= b0; + c1 *= b0; + + *(a + 0) = c0; + *(a + 1) = c1; + + *(c + 0) = c0; + *(c + 1) = c1; +} + +static void ssolve_1x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT b0, b1, b2, b3, b4, b5, b6, b7, b9, b10, b11, b12, b13, b14, b15; + FLOAT b18, b19, b20, b21, b22, b23, b27, b28, b29, b30, b31, b36, b37, b38; + FLOAT b39, b45, b46, b47, b54, 
b55, b63, c0, c1, c2, c3, c4, c5, c6, c7; + + c0 = *(c + 0); + c1 = *(c + 1 * ldc); + c2 = *(c + 2 * ldc); + c3 = *(c + 3 * ldc); + c4 = *(c + 4 * ldc); + c5 = *(c + 5 * ldc); + c6 = *(c + 6 * ldc); + c7 = *(c + 7 * ldc); + + if (bk) + { + BLASLONG k; + FLOAT t0, t1, t2, t3, t4, t5, t6, t7; + + t0 = a[0] * b[0]; + t1 = a[0] * b[1]; + t2 = a[0] * b[2]; + t3 = a[0] * b[3]; + t4 = a[0] * b[4]; + t5 = a[0] * b[5]; + t6 = a[0] * b[6]; + t7 = a[0] * b[7]; + + for (k = (bk - 1); k--;) + { + a += 1; + b += 8; + + t0 += a[0] * b[0]; + t1 += a[0] * b[1]; + t2 += a[0] * b[2]; + t3 += a[0] * b[3]; + t4 += a[0] * b[4]; + t5 += a[0] * b[5]; + t6 += a[0] * b[6]; + t7 += a[0] * b[7]; + } + + c0 -= t0; + c1 -= t1; + c2 -= t2; + c3 -= t3; + c4 -= t4; + c5 -= t5; + c6 -= t6; + c7 -= t7; + + a += 1; + b += 8; + } + + b0 = *(b + 0); + b1 = *(b + 1); + b2 = *(b + 2); + b3 = *(b + 3); + b4 = *(b + 4); + b5 = *(b + 5); + b6 = *(b + 6); + b7 = *(b + 7); + b9 = *(b + 9); + b10 = *(b + 10); + b11 = *(b + 11); + b12 = *(b + 12); + b13 = *(b + 13); + b14 = *(b + 14); + b15 = *(b + 15); + b18 = *(b + 18); + b19 = *(b + 19); + b20 = *(b + 20); + b21 = *(b + 21); + b22 = *(b + 22); + b23 = *(b + 23); + b27 = *(b + 27); + b28 = *(b + 28); + b29 = *(b + 29); + b30 = *(b + 30); + b31 = *(b + 31); + b36 = *(b + 36); + b37 = *(b + 37); + b38 = *(b + 38); + b39 = *(b + 39); + b45 = *(b + 45); + b46 = *(b + 46); + b47 = *(b + 47); + b54 = *(b + 54); + b55 = *(b + 55); + b63 = *(b + 63); + + c0 *= b0; + + c1 -= c0 * b1; + c1 *= b9; + + c2 -= c0 * b2; + c2 -= c1 * b10; + c2 *= b18; + + c3 -= c0 * b3; + c3 -= c1 * b11; + c3 -= c2 * b19; + c3 *= b27; + + c4 -= c0 * b4; + c4 -= c1 * b12; + c4 -= c2 * b20; + c4 -= c3 * b28; + c4 *= b36; + + c5 -= c0 * b5; + c5 -= c1 * b13; + c5 -= c2 * b21; + c5 -= c3 * b29; + c5 -= c4 * b37; + c5 *= b45; + + c6 -= c0 * b6; + c6 -= c1 * b14; + c6 -= c2 * b22; + c6 -= c3 * b30; + c6 -= c4 * b38; + c6 -= c5 * b46; + c6 *= b54; + + c7 -= c0 * b7; + c7 -= c1 * b15; + c7 -= c2 * 
b23; + c7 -= c3 * b31; + c7 -= c4 * b39; + c7 -= c5 * b47; + c7 -= c6 * b55; + c7 *= b63; + + *(a + 0) = c0; + *(a + 1) = c1; + *(a + 2) = c2; + *(a + 3) = c3; + *(a + 4) = c4; + *(a + 5) = c5; + *(a + 6) = c6; + *(a + 7) = c7; + + *(c + 0) = c0; + *(c + 1 * ldc) = c1; + *(c + 2 * ldc) = c2; + *(c + 3 * ldc) = c3; + *(c + 4 * ldc) = c4; + *(c + 5 * ldc) = c5; + *(c + 6 * ldc) = c6; + *(c + 7 * ldc) = c7; +} + +static void ssolve_1x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT b0, b1, b2, b3, b5, b6, b7, b10, b11, b15, c0, c1, c2, c3; + + c0 = *(c + 0); + c1 = *(c + 1 * ldc); + c2 = *(c + 2 * ldc); + c3 = *(c + 3 * ldc); + + if (bk) + { + BLASLONG k; + FLOAT res0, res1, res2, res3; + + res0 = a[0] * b[0]; + res1 = a[0] * b[1]; + res2 = a[0] * b[2]; + res3 = a[0] * b[3]; + + for (k = (bk - 1); k--;) + { + a += 1; + b += 4; + + res0 += a[0] * b[0]; + res1 += a[0] * b[1]; + res2 += a[0] * b[2]; + res3 += a[0] * b[3]; + } + + c0 -= res0; + c1 -= res1; + c2 -= res2; + c3 -= res3; + + a += 1; + b += 4; + } + + b0 = *(b + 0); + b1 = *(b + 1); + b2 = *(b + 2); + b3 = *(b + 3); + b5 = *(b + 5); + b6 = *(b + 6); + b7 = *(b + 7); + b10 = *(b + 10); + b11 = *(b + 11); + b15 = *(b + 15); + + c0 *= b0; + + c1 -= c0 * b1; + c1 *= b5; + + c2 -= c0 * b2; + c2 -= c1 * b6; + c2 *= b10; + + c3 -= c0 * b3; + c3 -= c1 * b7; + c3 -= c2 * b11; + c3 *= b15; + + *(a + 0) = c0; + *(a + 1) = c1; + *(a + 2) = c2; + *(a + 3) = c3; + + *(c + 0) = c0; + *(c + 1 * ldc) = c1; + *(c + 2 * ldc) = c2; + *(c + 3 * ldc) = c3; +} + +static void ssolve_1x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT b0, b1, b3, c0, c1; + + c0 = *(c + 0); + c1 = *(c + ldc); + + if (bk) + { + BLASLONG k; + FLOAT res0, res1; + + res0 = a[0] * b[0]; + res1 = a[0] * b[1]; + + for (k = (bk - 1); k--;) + { + a += 1; + b += 2; + + res0 += a[0] * b[0]; + res1 += a[0] * b[1]; + } + + c0 -= res0; + c1 -= res1; + + a += 1; + b += 2; + } + + b0 = *(b + 0); + b1 = *(b + 1); + 
b3 = *(b + 3); + + c0 *= b0; + + c1 -= c0 * b1; + c1 *= b3; + + *(a + 0) = c0; + *(a + 1) = c1; + + *(c + 0) = c0; + *(c + ldc) = c1; +} + +static void ssolve_1x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + if (bk) + { + BLASLONG k; + FLOAT res; + + res = a[0] * b[0]; + + for (k = (bk - 1); k--;) + { + a++; + b++; + + res += a[0] * b[0]; + } + + *c -= res; + + a++; + b++; + } + + *c *= *b; + *a = *c; +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, + FLOAT *c, BLASLONG ldc, BLASLONG offset) +{ + FLOAT *aa, *cc; + BLASLONG i, j, kk; + + kk = -offset; + + for (j = (n >> 3); j--;) + { + aa = a; + cc = c; + + for (i = (m >> 3); i--;) + { + ssolve_8x8_rn_msa(aa, b, cc, ldc, kk); + + aa += 8 * k; + cc += 8; + } + + if (m & 7) + { + if (m & 4) + { + ssolve_4x8_rn_msa(aa, b, cc, ldc, kk); + + aa += 4 * k; + cc += 4; + } + + if (m & 2) + { + ssolve_2x8_rn_msa(aa, b, cc, ldc, kk); + + aa += 2 * k; + cc += 2; + } + + if (m & 1) + { + ssolve_1x8_rn_msa(aa, b, cc, ldc, kk); + + aa += k; + cc += 1; + } + } + + kk += 8; + b += 8 * k; + c += 8 * ldc; + } + + if (n & 7) + { + if (n & 4) + { + aa = a; + cc = c; + + for (i = (m >> 3); i--;) + { + ssolve_8x4_rn_msa(aa, b, cc, ldc, kk); + + aa += 8 * k; + cc += 8; + } + + if (m & 7) + { + if (m & 4) + { + ssolve_4x4_rn_msa(aa, b, cc, ldc, kk); + + aa += 4 * k; + cc += 4; + } + + if (m & 2) + { + ssolve_2x4_rn_msa(aa, b, cc, ldc, kk); + + aa += 2 * k; + cc += 2; + } + + if (m & 1) + { + ssolve_1x4_rn_msa(aa, b, cc, ldc, kk); + + aa += k; + cc += 1; + } + } + + b += 4 * k; + c += 4 * ldc; + kk += 4; + } + + if (n & 2) + { + aa = a; + cc = c; + + for (i = (m >> 3); i--;) + { + ssolve_8x2_rn_msa(aa, b, cc, ldc, kk); + + aa += 8 * k; + cc += 8; + } + + if (m & 7) + { + if (m & 4) + { + ssolve_4x2_rn_msa(aa, b, cc, ldc, kk); + + aa += 4 * k; + cc += 4; + } + + if (m & 2) + { + ssolve_2x2_rn_msa(aa, b, cc, ldc, kk); + + aa += 2 * k; + cc += 2; + } + + if (m & 1) + { + ssolve_1x2_rn_msa(aa, b, 
cc, ldc, kk); + + aa += k; + cc += 1; + } + } + + b += 2 * k; + c += 2 * ldc; + kk += 2; + } + + if (n & 1) + { + aa = a; + cc = c; + + for (i = (m >> 3); i--;) + { + ssolve_8x1_rn_msa(aa, b, cc, ldc, kk); + + aa += 8 * k; + cc += 8; + } + + if (m & 7) + { + if (m & 4) + { + ssolve_4x1_rn_msa(aa, b, cc, ldc, kk); + + aa += 4 * k; + cc += 4; + } + + if (m & 2) + { + ssolve_2x1_rn_msa(aa, b, cc, ldc, kk); + + aa += 2 * k; + cc += 2; + } + + if (m & 1) + { + ssolve_1x1_rn_msa(aa, b, cc, kk); + + aa += k; + cc += 1; + } + } + + b += k; + c += ldc; + kk += 1; + } + } + + return 0; +} diff --git a/kernel/mips/strsm_kernel_RT_8x8_msa.c b/kernel/mips/strsm_kernel_RT_8x8_msa.c new file mode 100644 index 000000000..25a8a0b6e --- /dev/null +++ b/kernel/mips/strsm_kernel_RT_8x8_msa.c @@ -0,0 +1,2118 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +static void ssolve_8x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v4f32 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15; + v4f32 src_b, src_b0, src_b8, src_b9, src_b16, src_b17, src_b18, src_b24; + v4f32 src_b25, src_b26, src_b27, src_b32, src_b33, src_b34, src_b35; + v4f32 src_b36, src_b40, src_b41, src_b42, src_b43, src_b44, src_b45; + v4f32 src_b48, src_b49, src_b50, src_b51, src_b52, src_b53, src_b54; + v4f32 src_b56, src_b57, src_b58, src_b59, src_b60, src_b61, src_b62, src_b63; + FLOAT *c_nxt1line = c + ldc; + FLOAT *c_nxt2line = c + 2 * ldc; + FLOAT *c_nxt3line = c + 3 * ldc; + FLOAT *c_nxt4line = c + 4 * ldc; + FLOAT *c_nxt5line = c + 5 * ldc; + FLOAT *c_nxt6line = c + 6 * ldc; + FLOAT *c_nxt7line = c + 7 * ldc; + + if (bk > 0) + { + BLASLONG k; + FLOAT *aa = a, *bb = b; + v4f32 src_a0, src_a1, src_b1, src_b2, src_b3; + v4f32 res0, res1, res2, res3, res4, res5, res6, res7; + v4f32 res8, res9, res10, res11, res12, res13, res14, res15; + + LD_SP2(aa, 4, src_a0, src_a1); + + src_b = LD_SP(bb + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 = src_a0 * src_b0; + res1 = src_a1 * src_b0; + res2 = src_a0 * src_b1; + res3 = src_a1 * src_b1; + res4 = src_a0 * src_b2; + res5 = src_a1 * 
src_b2; + res6 = src_a0 * src_b3; + res7 = src_a1 * src_b3; + + src_b = LD_SP(bb + 4); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res8 = src_a0 * src_b0; + res9 = src_a1 * src_b0; + res10 = src_a0 * src_b1; + res11 = src_a1 * src_b1; + res12 = src_a0 * src_b2; + res13 = src_a1 * src_b2; + res14 = src_a0 * src_b3; + res15 = src_a1 * src_b3; + + for (k = (bk - 1); k--;) + { + aa += 8; + bb += 8; + + LD_SP2(aa, 4, src_a0, src_a1); + + src_b = LD_SP(bb + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 += src_a0 * src_b0; + res1 += src_a1 * src_b0; + res2 += src_a0 * src_b1; + res3 += src_a1 * src_b1; + res4 += src_a0 * src_b2; + res5 += src_a1 * src_b2; + res6 += src_a0 * src_b3; + res7 += src_a1 * src_b3; + + src_b = LD_SP(bb + 4); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res8 += src_a0 * src_b0; + res9 += src_a1 * src_b0; + res10 += src_a0 * src_b1; + res11 += src_a1 * src_b1; + res12 += src_a0 * src_b2; + res13 += src_a1 * src_b2; + res14 += src_a0 * src_b3; + res15 += src_a1 * src_b3; + } + + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + LD_SP2(c_nxt2line, 4, src_c4, src_c5); + LD_SP2(c_nxt3line, 4, src_c6, src_c7); + LD_SP2(c_nxt4line, 4, src_c8, src_c9); + LD_SP2(c_nxt5line, 4, src_c10, src_c11); + LD_SP2(c_nxt6line, 4, src_c12, src_c13); + LD_SP2(c_nxt7line, 4, src_c14, src_c15); + + src_c0 -= res0; + src_c1 -= res1; + src_c2 -= res2; + src_c3 -= res3; + src_c4 -= res4; + src_c5 -= res5; + src_c6 -= res6; + src_c7 -= res7; + src_c8 -= res8; + src_c9 -= res9; + src_c10 -= res10; + src_c11 -= res11; + src_c12 -= res12; + src_c13 -= res13; + src_c14 -= res14; + src_c15 -= res15; + } + else + { + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + LD_SP2(c_nxt2line, 4, src_c4, src_c5); + LD_SP2(c_nxt3line, 4, src_c6, src_c7); + LD_SP2(c_nxt4line, 4, src_c8, src_c9); + LD_SP2(c_nxt5line, 4, src_c10, src_c11); + LD_SP2(c_nxt6line, 4, src_c12, src_c13); + LD_SP2(c_nxt7line, 
4, src_c14, src_c15); + } + + b -= 64; + + src_b = LD_SP(b + 60); + SPLATI_W4_SP(src_b, src_b60, src_b61, src_b62, src_b63); + src_b = LD_SP(b + 56); + SPLATI_W4_SP(src_b, src_b56, src_b57, src_b58, src_b59); + + src_c15 *= src_b63; + src_c14 *= src_b63; + src_c13 -= src_c15 * src_b62; + src_c12 -= src_c14 * src_b62; + src_c11 -= src_c15 * src_b61; + src_c10 -= src_c14 * src_b61; + src_c9 -= src_c15 * src_b60; + src_c8 -= src_c14 * src_b60; + src_c7 -= src_c15 * src_b59; + src_c6 -= src_c14 * src_b59; + src_c5 -= src_c15 * src_b58; + src_c4 -= src_c14 * src_b58; + src_c3 -= src_c15 * src_b57; + src_c2 -= src_c14 * src_b57; + src_c1 -= src_c15 * src_b56; + src_c0 -= src_c14 * src_b56; + + src_b = LD_SP(b + 48); + SPLATI_W4_SP(src_b, src_b48, src_b49, src_b50, src_b51); + src_b52 = LD_SP(b + 52); + src_b54 = (v4f32) __msa_splati_w((v4i32) src_b52, 2); + src_b53 = (v4f32) __msa_splati_w((v4i32) src_b52, 1); + src_b52 = (v4f32) __msa_splati_w((v4i32) src_b52, 0); + + src_c12 *= src_b54; + src_c13 *= src_b54; + src_c10 -= src_c12 * src_b53; + src_c11 -= src_c13 * src_b53; + src_c8 -= src_c12 * src_b52; + src_c9 -= src_c13 * src_b52; + src_c6 -= src_c12 * src_b51; + src_c7 -= src_c13 * src_b51; + src_c4 -= src_c12 * src_b50; + src_c5 -= src_c13 * src_b50; + src_c2 -= src_c12 * src_b49; + src_c3 -= src_c13 * src_b49; + src_c0 -= src_c12 * src_b48; + src_c1 -= src_c13 * src_b48; + + ST_SP4(src_c12, src_c13, src_c14, src_c15, a - 16, 4); + ST_SP2(src_c12, src_c13, c_nxt6line, 4); + ST_SP2(src_c14, src_c15, c_nxt7line, 4); + + src_b = LD_SP(b + 40); + SPLATI_W4_SP(src_b, src_b40, src_b41, src_b42, src_b43); + src_b44 = LD_SP(b + 44); + src_b45 = (v4f32) __msa_splati_w((v4i32) src_b44, 1); + src_b44 = (v4f32) __msa_splati_w((v4i32) src_b44, 0); + + src_c10 *= src_b45; + src_c11 *= src_b45; + src_c8 -= src_c10 * src_b44; + src_c9 -= src_c11 * src_b44; + src_c6 -= src_c10 * src_b43; + src_c7 -= src_c11 * src_b43; + src_c4 -= src_c10 * src_b42; + src_c5 -= src_c11 * src_b42; + 
src_c2 -= src_c10 * src_b41; + src_c3 -= src_c11 * src_b41; + src_c0 -= src_c10 * src_b40; + src_c1 -= src_c11 * src_b40; + + src_b = LD_SP(b + 32); + SPLATI_W4_SP(src_b, src_b32, src_b33, src_b34, src_b35); + src_b36 = __msa_cast_to_vector_float(*(b + 36)); + src_b36 = (v4f32) __msa_splati_w((v4i32) src_b36, 0); + + src_c8 *= src_b36; + src_c9 *= src_b36; + src_c6 -= src_c8 * src_b35; + src_c7 -= src_c9 * src_b35; + src_c4 -= src_c8 * src_b34; + src_c5 -= src_c9 * src_b34; + src_c2 -= src_c8 * src_b33; + src_c3 -= src_c9 * src_b33; + src_c0 -= src_c8 * src_b32; + src_c1 -= src_c9 * src_b32; + + ST_SP4(src_c8, src_c9, src_c10, src_c11, a - 32, 4); + ST_SP2(src_c8, src_c9, c_nxt4line, 4); + ST_SP2(src_c10, src_c11, c_nxt5line, 4); + + src_b = LD_SP(b + 24); + SPLATI_W4_SP(src_b, src_b24, src_b25, src_b26, src_b27); + + src_c6 *= src_b27; + src_c7 *= src_b27; + src_c4 -= src_c6 * src_b26; + src_c5 -= src_c7 * src_b26; + src_c2 -= src_c6 * src_b25; + src_c3 -= src_c7 * src_b25; + src_c0 -= src_c6 * src_b24; + src_c1 -= src_c7 * src_b24; + + src_b16 = LD_SP(b + 16); + src_b18 = (v4f32) __msa_splati_w((v4i32) src_b16, 2); + src_b17 = (v4f32) __msa_splati_w((v4i32) src_b16, 1); + src_b16 = (v4f32) __msa_splati_w((v4i32) src_b16, 0); + + src_c4 *= src_b18; + src_c5 *= src_b18; + src_c2 -= src_c4 * src_b17; + src_c3 -= src_c5 * src_b17; + src_c0 -= src_c4 * src_b16; + src_c1 -= src_c5 * src_b16; + + ST_SP4(src_c4, src_c5, src_c6, src_c7, a - 48, 4); + ST_SP2(src_c4, src_c5, c_nxt2line, 4); + ST_SP2(src_c6, src_c7, c_nxt3line, 4); + + src_b9 = __msa_cast_to_vector_float(*(b + 9)); + src_b9 = (v4f32) __msa_splati_w((v4i32) src_b9, 0); + src_b8 = __msa_cast_to_vector_float(*(b + 8)); + src_b8 = (v4f32) __msa_splati_w((v4i32) src_b8, 0); + src_b0 = __msa_cast_to_vector_float(*(b + 0)); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + src_c2 *= src_b9; + src_c3 *= src_b9; + src_c0 -= src_c2 * src_b8; + src_c1 -= src_c3 * src_b8; + + src_c0 *= src_b0; + src_c1 *= 
src_b0; + + ST_SP4(src_c0, src_c1, src_c2, src_c3, a - 64, 4); + + ST_SP2(src_c0, src_c1, c, 4); + ST_SP2(src_c2, src_c3, c_nxt1line, 4); +} + +static void ssolve_8x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v4f32 src_b, src_b0, src_b4, src_b5, src_b8, src_b9, src_b10, src_b12; + v4f32 src_b13, src_b14, src_b15; + FLOAT *c_nxt1line = c + ldc; + FLOAT *c_nxt2line = c + 2 * ldc; + FLOAT *c_nxt3line = c + 3 * ldc; + + if (bk > 0) + { + BLASLONG k; + FLOAT *aa = a, *bb = b; + v4f32 src_a0, src_a1, src_b1, src_b2, src_b3; + v4f32 res0, res1, res2, res3, res4, res5, res6, res7; + + LD_SP2(aa, 4, src_a0, src_a1); + + src_b = LD_SP(bb + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 = src_a0 * src_b0; + res1 = src_a1 * src_b0; + res2 = src_a0 * src_b1; + res3 = src_a1 * src_b1; + res4 = src_a0 * src_b2; + res5 = src_a1 * src_b2; + res6 = src_a0 * src_b3; + res7 = src_a1 * src_b3; + + for (k = (bk - 1) / 2; k--;) + { + aa += 8; + bb += 4; + + LD_SP2(aa, 4, src_a0, src_a1); + + src_b = LD_SP(bb + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + + res0 += src_a0 * src_b0; + res1 += src_a1 * src_b0; + res2 += src_a0 * src_b1; + res3 += src_a1 * src_b1; + res4 += src_a0 * src_b2; + res5 += src_a1 * src_b2; + res6 += src_a0 * src_b3; + res7 += src_a1 * src_b3; + + aa += 8; + bb += 4; + + LD_SP2(aa, 4, src_a0, src_a1); + + src_b = LD_SP(bb + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + + res0 += src_a0 * src_b0; + res1 += src_a1 * src_b0; + res2 += src_a0 * src_b1; + res3 += src_a1 * src_b1; + res4 += src_a0 * src_b2; + res5 += src_a1 * src_b2; + res6 += src_a0 * src_b3; + res7 += src_a1 * src_b3; + } + + if ((bk - 1) & 1) + { + aa += 8; + bb += 4; + + LD_SP2(aa, 4, src_a0, src_a1); + + src_b = LD_SP(bb + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 += src_a0 * src_b0; + res1 += src_a1 * src_b0; + res2 += src_a0 * 
src_b1; + res3 += src_a1 * src_b1; + res4 += src_a0 * src_b2; + res5 += src_a1 * src_b2; + res6 += src_a0 * src_b3; + res7 += src_a1 * src_b3; + + } + + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + LD_SP2(c_nxt2line, 4, src_c4, src_c5); + LD_SP2(c_nxt3line, 4, src_c6, src_c7); + + src_c0 -= res0; + src_c1 -= res1; + src_c2 -= res2; + src_c3 -= res3; + src_c4 -= res4; + src_c5 -= res5; + src_c6 -= res6; + src_c7 -= res7; + } + else + { + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + LD_SP2(c_nxt2line, 4, src_c4, src_c5); + LD_SP2(c_nxt3line, 4, src_c6, src_c7); + } + + a -= 32; + b -= 16; + + src_b = LD_SP(b + 12); + SPLATI_W4_SP(src_b, src_b12, src_b13, src_b14, src_b15); + src_b8 = LD_SP(b + 8); + src_b10 = (v4f32) __msa_splati_w((v4i32) src_b8, 2); + src_b9 = (v4f32) __msa_splati_w((v4i32) src_b8, 1); + src_b8 = (v4f32) __msa_splati_w((v4i32) src_b8, 0); + src_b5 = __msa_cast_to_vector_float(*(b + 5)); + src_b5 = (v4f32) __msa_splati_w((v4i32) src_b5, 0); + src_b4 = __msa_cast_to_vector_float(*(b + 4)); + src_b4 = (v4f32) __msa_splati_w((v4i32) src_b4, 0); + src_b0 = __msa_cast_to_vector_float(*(b + 0)); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + src_c7 *= src_b15; + src_c6 *= src_b15; + src_c5 -= src_c7 * src_b14; + src_c4 -= src_c6 * src_b14; + src_c3 -= src_c7 * src_b13; + src_c2 -= src_c6 * src_b13; + src_c1 -= src_c7 * src_b12; + src_c0 -= src_c6 * src_b12; + + src_c5 *= src_b10; + src_c4 *= src_b10; + src_c3 -= src_c5 * src_b9; + src_c2 -= src_c4 * src_b9; + src_c1 -= src_c5 * src_b8; + src_c0 -= src_c4 * src_b8; + + src_c3 *= src_b5; + src_c2 *= src_b5; + src_c1 -= src_c3 * src_b4; + src_c0 -= src_c2 * src_b4; + + src_c1 *= src_b0; + src_c0 *= src_b0; + + ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4); + ST_SP4(src_c4, src_c5, src_c6, src_c7, a + 16, 4); + + ST_SP2(src_c0, src_c1, c, 4); + ST_SP2(src_c2, src_c3, c_nxt1line, 4); + ST_SP2(src_c4, src_c5, c_nxt2line, 4); + ST_SP2(src_c6, 
src_c7, c_nxt3line, 4); +} + +static void ssolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v4f32 src_c0, src_c1, src_c2, src_c3, src_b0, src_b2, src_b3; + FLOAT *c_nxt1line = c + ldc; + + if (bk > 0) + { + BLASLONG k; + FLOAT *aa = a, *bb = b; + v4f32 src_a0, src_a1, src_b1, res0, res1, res2, res3; + + LD_SP2(aa, 4, src_a0, src_a1); + + src_b0 = __msa_cast_to_vector_float(*bb); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + src_b1 = __msa_cast_to_vector_float(*(bb + 1)); + src_b1 = (v4f32) __msa_splati_w((v4i32) src_b1, 0); + + res0 = src_a0 * src_b0; + res1 = src_a1 * src_b0; + res2 = src_a0 * src_b1; + res3 = src_a1 * src_b1; + + for (k = (bk - 1) >> 1; k--;) + { + aa += 8; + bb += 2; + + LD_SP2(aa, 4, src_a0, src_a1); + + src_b0 = __msa_cast_to_vector_float(*bb); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + src_b1 = __msa_cast_to_vector_float(*(bb + 1)); + src_b1 = (v4f32) __msa_splati_w((v4i32) src_b1, 0); + + res0 += src_a0 * src_b0; + res1 += src_a1 * src_b0; + res2 += src_a0 * src_b1; + res3 += src_a1 * src_b1; + + aa += 8; + bb += 2; + + LD_SP2(aa, 4, src_a0, src_a1); + + src_b0 = __msa_cast_to_vector_float(*bb); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + src_b1 = __msa_cast_to_vector_float(*(bb + 1)); + src_b1 = (v4f32) __msa_splati_w((v4i32) src_b1, 0); + + res0 += src_a0 * src_b0; + res1 += src_a1 * src_b0; + res2 += src_a0 * src_b1; + res3 += src_a1 * src_b1; + } + + if ((bk - 1) & 1) + { + aa += 8; + bb += 2; + + LD_SP2(aa, 4, src_a0, src_a1); + + src_b0 = __msa_cast_to_vector_float(*bb); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + src_b1 = __msa_cast_to_vector_float(*(bb + 1)); + src_b1 = (v4f32) __msa_splati_w((v4i32) src_b1, 0); + + res0 += src_a0 * src_b0; + res1 += src_a1 * src_b0; + res2 += src_a0 * src_b1; + res3 += src_a1 * src_b1; + } + + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + + src_c0 -= res0; + src_c1 -= res1; + src_c2 -= res2; + 
src_c3 -= res3; + } + else + { + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + } + + a -= 16; + b -= 4; + + src_b0 = __msa_cast_to_vector_float(*(b + 0)); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + src_b2 = __msa_cast_to_vector_float(*(b + 2)); + src_b2 = (v4f32) __msa_splati_w((v4i32) src_b2, 0); + src_b3 = __msa_cast_to_vector_float(*(b + 3)); + src_b3 = (v4f32) __msa_splati_w((v4i32) src_b3, 0); + + src_c2 *= src_b3; + src_c3 *= src_b3; + src_c0 -= src_c2 * src_b2; + src_c1 -= src_c3 * src_b2; + src_c0 *= src_b0; + src_c1 *= src_b0; + + ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4); + ST_SP2(src_c0, src_c1, c, 4); + ST_SP2(src_c2, src_c3, c_nxt1line, 4); +} + +static void ssolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + v4f32 src_c0, src_c1, src_b0; + + if (bk > 0) + { + BLASLONG k; + FLOAT *aa = a, *bb = b; + v4f32 src_a0, src_a1, res0, res1; + + LD_SP2(aa, 4, src_a0, src_a1); + + src_b0 = __msa_cast_to_vector_float(*bb); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + res0 = src_a0 * src_b0; + res1 = src_a1 * src_b0; + + for (k = (bk - 1) >> 2; k--;) + { + aa += 8; + bb += 1; + + LD_SP2(aa, 4, src_a0, src_a1); + + src_b0 = __msa_cast_to_vector_float(*bb); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + res0 += src_a0 * src_b0; + res1 += src_a1 * src_b0; + + aa += 8; + bb += 1; + + LD_SP2(aa, 4, src_a0, src_a1); + + src_b0 = __msa_cast_to_vector_float(*bb); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + res0 += src_a0 * src_b0; + res1 += src_a1 * src_b0; + + aa += 8; + bb += 1; + + LD_SP2(aa, 4, src_a0, src_a1); + + src_b0 = __msa_cast_to_vector_float(*bb); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + res0 += src_a0 * src_b0; + res1 += src_a1 * src_b0; + + aa += 8; + bb += 1; + + LD_SP2(aa, 4, src_a0, src_a1); + + src_b0 = __msa_cast_to_vector_float(*bb); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + res0 += src_a0 * src_b0; + res1 += src_a1 * src_b0; 
+ } + + if ((bk - 1) & 3) + { + if ((bk - 1) & 2) + { + aa += 8; + bb += 1; + + LD_SP2(aa, 4, src_a0, src_a1); + + src_b0 = __msa_cast_to_vector_float(*bb); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + res0 += src_a0 * src_b0; + res1 += src_a1 * src_b0; + + aa += 8; + bb += 1; + + LD_SP2(aa, 4, src_a0, src_a1); + + src_b0 = __msa_cast_to_vector_float(*bb); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + res0 += src_a0 * src_b0; + res1 += src_a1 * src_b0; + } + + if ((bk - 1) & 1) + { + aa += 8; + bb += 1; + + LD_SP2(aa, 4, src_a0, src_a1); + + src_b0 = __msa_cast_to_vector_float(*bb); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + res0 += src_a0 * src_b0; + res1 += src_a1 * src_b0; + } + } + + LD_SP2(c, 4, src_c0, src_c1); + + src_c0 -= res0; + src_c1 -= res1; + } + else + { + LD_SP2(c, 4, src_c0, src_c1); + } + + a -= 8; + b -= 1; + + src_b0 = __msa_cast_to_vector_float(*(b + 0)); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + src_c0 *= src_b0; + src_c1 *= src_b0; + + ST_SP2(src_c0, src_c1, a, 4); + ST_SP2(src_c0, src_c1, c, 4); +} + +static void ssolve_4x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v4f32 src_b, src_b0, src_b8, src_b9, src_b16, src_b17, src_b18, src_b24; + v4f32 src_b25, src_b26, src_b27, src_b32, src_b33, src_b34, src_b35; + v4f32 src_b36, src_b40, src_b41, src_b42, src_b43, src_b44, src_b45; + v4f32 src_b48, src_b49, src_b50, src_b51, src_b52, src_b53, src_b54; + v4f32 src_b56, src_b57, src_b58, src_b59, src_b60, src_b61, src_b62, src_b63; + FLOAT *c_nxt1line = c + ldc; + FLOAT *c_nxt2line = c + 2 * ldc; + FLOAT *c_nxt3line = c + 3 * ldc; + FLOAT *c_nxt4line = c + 4 * ldc; + FLOAT *c_nxt5line = c + 5 * ldc; + FLOAT *c_nxt6line = c + 6 * ldc; + FLOAT *c_nxt7line = c + 7 * ldc; + + if (bk > 0) + { + BLASLONG k; + FLOAT *aa = a, *bb = b; + v4f32 src_a0, src_b1, src_b2, src_b3; + v4f32 res0, res1, res2, res3, 
res4, res5, res6, res7; + + src_a0 = LD_SP(aa); + + src_b = LD_SP(bb + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 = src_a0 * src_b0; + res1 = src_a0 * src_b1; + res2 = src_a0 * src_b2; + res3 = src_a0 * src_b3; + + src_b = LD_SP(bb + 4); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res4 = src_a0 * src_b0; + res5 = src_a0 * src_b1; + res6 = src_a0 * src_b2; + res7 = src_a0 * src_b3; + + for (k = (bk - 1); k--;) + { + aa += 4; + bb += 8; + + src_a0 = LD_SP(aa); + + src_b = LD_SP(bb + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 += src_a0 * src_b0; + res1 += src_a0 * src_b1; + res2 += src_a0 * src_b2; + res3 += src_a0 * src_b3; + + src_b = LD_SP(bb + 4); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res4 += src_a0 * src_b0; + res5 += src_a0 * src_b1; + res6 += src_a0 * src_b2; + res7 += src_a0 * src_b3; + } + + src_c0 = LD_SP(c); + src_c1 = LD_SP(c_nxt1line); + src_c2 = LD_SP(c_nxt2line); + src_c3 = LD_SP(c_nxt3line); + src_c4 = LD_SP(c_nxt4line); + src_c5 = LD_SP(c_nxt5line); + src_c6 = LD_SP(c_nxt6line); + src_c7 = LD_SP(c_nxt7line); + + src_c0 -= res0; + src_c1 -= res1; + src_c2 -= res2; + src_c3 -= res3; + src_c4 -= res4; + src_c5 -= res5; + src_c6 -= res6; + src_c7 -= res7; + } + else + { + src_c0 = LD_SP(c); + src_c1 = LD_SP(c_nxt1line); + src_c2 = LD_SP(c_nxt2line); + src_c3 = LD_SP(c_nxt3line); + src_c4 = LD_SP(c_nxt4line); + src_c5 = LD_SP(c_nxt5line); + src_c6 = LD_SP(c_nxt6line); + src_c7 = LD_SP(c_nxt7line); + } + + a -= 32; + b -= 64; + + src_b = LD_SP(b + 60); + SPLATI_W4_SP(src_b, src_b60, src_b61, src_b62, src_b63); + src_b = LD_SP(b + 56); + SPLATI_W4_SP(src_b, src_b56, src_b57, src_b58, src_b59); + + src_b = LD_SP(b + 48); + SPLATI_W4_SP(src_b, src_b48, src_b49, src_b50, src_b51); + src_b52 = LD_SP(b + 52); + src_b54 = (v4f32) __msa_splati_w((v4i32) src_b52, 2); + src_b53 = (v4f32) __msa_splati_w((v4i32) src_b52, 1); + src_b52 = (v4f32) __msa_splati_w((v4i32) src_b52, 0); + + src_b = 
LD_SP(b + 40); + SPLATI_W4_SP(src_b, src_b40, src_b41, src_b42, src_b43); + src_b44 = LD_SP(b + 44); + src_b45 = (v4f32) __msa_splati_w((v4i32) src_b44, 1); + src_b44 = (v4f32) __msa_splati_w((v4i32) src_b44, 0); + + src_b = LD_SP(b + 32); + SPLATI_W4_SP(src_b, src_b32, src_b33, src_b34, src_b35); + src_b36 = __msa_cast_to_vector_float(*(b + 36)); + src_b36 = (v4f32) __msa_splati_w((v4i32) src_b36, 0); + + src_b = LD_SP(b + 24); + SPLATI_W4_SP(src_b, src_b24, src_b25, src_b26, src_b27); + + src_b16 = LD_SP(b + 16); + src_b18 = (v4f32) __msa_splati_w((v4i32) src_b16, 2); + src_b17 = (v4f32) __msa_splati_w((v4i32) src_b16, 1); + src_b16 = (v4f32) __msa_splati_w((v4i32) src_b16, 0); + + src_b9 = __msa_cast_to_vector_float(*(b + 9)); + src_b9 = (v4f32) __msa_splati_w((v4i32) src_b9, 0); + src_b8 = __msa_cast_to_vector_float(*(b + 8)); + src_b8 = (v4f32) __msa_splati_w((v4i32) src_b8, 0); + src_b0 = __msa_cast_to_vector_float(*(b + 0)); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + src_c7 *= src_b63; + src_c6 -= src_c7 * src_b62; + src_c5 -= src_c7 * src_b61; + src_c4 -= src_c7 * src_b60; + src_c3 -= src_c7 * src_b59; + src_c2 -= src_c7 * src_b58; + src_c1 -= src_c7 * src_b57; + src_c0 -= src_c7 * src_b56; + + src_c6 *= src_b54; + src_c5 -= src_c6 * src_b53; + src_c4 -= src_c6 * src_b52; + src_c3 -= src_c6 * src_b51; + src_c2 -= src_c6 * src_b50; + src_c1 -= src_c6 * src_b49; + src_c0 -= src_c6 * src_b48; + + src_c5 *= src_b45; + src_c4 -= src_c5 * src_b44; + src_c3 -= src_c5 * src_b43; + src_c2 -= src_c5 * src_b42; + src_c1 -= src_c5 * src_b41; + src_c0 -= src_c5 * src_b40; + + src_c4 *= src_b36; + src_c3 -= src_c4 * src_b35; + src_c2 -= src_c4 * src_b34; + src_c1 -= src_c4 * src_b33; + src_c0 -= src_c4 * src_b32; + + src_c3 *= src_b27; + src_c2 -= src_c3 * src_b26; + src_c1 -= src_c3 * src_b25; + src_c0 -= src_c3 * src_b24; + + src_c2 *= src_b18; + src_c1 -= src_c2 * src_b17; + src_c0 -= src_c2 * src_b16; + + src_c1 *= src_b9; + src_c0 -= src_c1 * src_b8; 
+ + src_c0 *= src_b0; + + ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4); + ST_SP4(src_c4, src_c5, src_c6, src_c7, a + 16, 4); + + ST_SP(src_c0, c); + ST_SP(src_c1, c_nxt1line); + ST_SP(src_c2, c_nxt2line); + ST_SP(src_c3, c_nxt3line); + ST_SP(src_c4, c_nxt4line); + ST_SP(src_c5, c_nxt5line); + ST_SP(src_c6, c_nxt6line); + ST_SP(src_c7, c_nxt7line); +} + +static void ssolve_4x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v4f32 src_c0, src_c1, src_c2, src_c3, src_b; + v4f32 src_b0, src_b4, src_b5, src_b8, src_b9, src_b10, src_b12, src_b13; + v4f32 src_b14, src_b15; + FLOAT *c_nxt1line = c + ldc; + FLOAT *c_nxt2line = c + 2 * ldc; + FLOAT *c_nxt3line = c + 3 * ldc; + + if (bk > 0) + { + BLASLONG k; + FLOAT *aa = a, *bb = b; + v4f32 src_a, src_b1, src_b2, src_b3, res0, res1, res2, res3; + + src_a = LD_SP(aa); + + src_b = LD_SP(bb); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 = src_a * src_b0; + res1 = src_a * src_b1; + res2 = src_a * src_b2; + res3 = src_a * src_b3; + + for (k = ((bk - 1) >> 1); k--;) + { + aa += 4; + bb += 4; + + src_a = LD_SP(aa); + + src_b = LD_SP(bb); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + + res0 += src_a * src_b0; + res1 += src_a * src_b1; + res2 += src_a * src_b2; + res3 += src_a * src_b3; + + aa += 4; + bb += 4; + + src_a = LD_SP(aa); + + src_b = LD_SP(bb); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + + res0 += src_a * src_b0; + res1 += src_a * src_b1; + res2 += src_a * src_b2; + res3 += src_a * src_b3; + } + + if ((bk - 1) & 1) + { + aa += 4; + bb += 4; + + src_a = LD_SP(aa); + + src_b = LD_SP(bb); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + + res0 += src_a * src_b0; + res1 += src_a * src_b1; + res2 += src_a * src_b2; + res3 += src_a * src_b3; + } + + src_c0 = LD_SP(c); + src_c1 = LD_SP(c_nxt1line); + src_c2 = LD_SP(c_nxt2line); + src_c3 = LD_SP(c_nxt3line); + + src_c0 -= res0; + src_c1 -= res1; + src_c2 -= res2; + src_c3 -= res3; + } + else + { + src_c0 = 
LD_SP(c); + src_c1 = LD_SP(c_nxt1line); + src_c2 = LD_SP(c_nxt2line); + src_c3 = LD_SP(c_nxt3line); + } + + a -= 16; + b -= 16; + + src_b = LD_SP(b + 12); + SPLATI_W4_SP(src_b, src_b12, src_b13, src_b14, src_b15); + src_b8 = LD_SP(b + 8); + src_b10 = (v4f32) __msa_splati_w((v4i32) src_b8, 2); + src_b9 = (v4f32) __msa_splati_w((v4i32) src_b8, 1); + src_b8 = (v4f32) __msa_splati_w((v4i32) src_b8, 0); + src_b5 = __msa_cast_to_vector_float(*(b + 5)); + src_b5 = (v4f32) __msa_splati_w((v4i32) src_b5, 0); + src_b4 = __msa_cast_to_vector_float(*(b + 4)); + src_b4 = (v4f32) __msa_splati_w((v4i32) src_b4, 0); + src_b0 = __msa_cast_to_vector_float(*(b + 0)); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + src_c3 *= src_b15; + src_c2 -= src_c3 * src_b14; + src_c1 -= src_c3 * src_b13; + src_c0 -= src_c3 * src_b12; + + src_c2 *= src_b10; + src_c1 -= src_c2 * src_b9; + src_c0 -= src_c2 * src_b8; + + src_c1 *= src_b5; + src_c0 -= src_c1 * src_b4; + + src_c0 *= src_b0; + + ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4); + + ST_SP(src_c0, c); + ST_SP(src_c1, c_nxt1line); + ST_SP(src_c2, c_nxt2line); + ST_SP(src_c3, c_nxt3line); +} + +static void ssolve_4x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v4f32 src_c0, src_c1, src_b0, src_b2, src_b3; + FLOAT *c_nxt1line = c + ldc; + + if (bk > 0) + { + BLASLONG k; + FLOAT *aa = a, *bb = b; + v4f32 src_a, src_b1, res0, res1; + + src_a = LD_SP(aa); + src_b0 = LD_SP(bb); + src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + res0 = src_a * src_b0; + res1 = src_a * src_b1; + + for (k = ((bk - 1) >> 1); k--;) + { + aa += 4; + bb += 2; + + src_a = LD_SP(aa); + src_b0 = LD_SP(bb); + src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + res0 += src_a * src_b0; + res1 += src_a * src_b1; + + aa += 4; + bb += 2; + + src_a = LD_SP(aa); + src_b0 = LD_SP(bb); + src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 
1); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + res0 += src_a * src_b0; + res1 += src_a * src_b1; + } + + if ((bk - 1) & 1) + { + aa += 4; + bb += 2; + + src_a = LD_SP(aa); + src_b0 = LD_SP(bb); + src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + res0 += src_a * src_b0; + res1 += src_a * src_b1; + } + + src_c0 = LD_SP(c); + src_c1 = LD_SP(c_nxt1line); + + src_c0 -= res0; + src_c1 -= res1; + } + else + { + src_c0 = LD_SP(c); + src_c1 = LD_SP(c_nxt1line); + } + + a -= 8; + b -= 4; + + src_b3 = __msa_cast_to_vector_float(*(b + 3)); + src_b3 = (v4f32) __msa_splati_w((v4i32) src_b3, 0); + src_b2 = __msa_cast_to_vector_float(*(b + 2)); + src_b2 = (v4f32) __msa_splati_w((v4i32) src_b2, 0); + src_b0 = __msa_cast_to_vector_float(*(b + 0)); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + src_c1 *= src_b3; + src_c0 -= src_c1 * src_b2; + src_c0 *= src_b0; + + ST_SP2(src_c0, src_c1, a, 4); + + ST_SP(src_c0, c); + ST_SP(src_c1, c_nxt1line); +} + +static void ssolve_4x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + FLOAT b0, c0, c1, c2, c3; + + c0 = *(c + 0); + c1 = *(c + 1); + c2 = *(c + 2); + c3 = *(c + 3); + + if (bk > 0) + { + BLASLONG k; + FLOAT *aa = a, *bb = b; + FLOAT t0, t1, t2, t3; + + t0 = aa[0] * bb[0]; + t1 = aa[1] * bb[0]; + t2 = aa[2] * bb[0]; + t3 = aa[3] * bb[0]; + + for (k = (bk - 1); k--;) + { + aa += 4; + bb += 1; + + t0 += aa[0] * bb[0]; + t1 += aa[1] * bb[0]; + t2 += aa[2] * bb[0]; + t3 += aa[3] * bb[0]; + } + + c0 -= t0; + c1 -= t1; + c2 -= t2; + c3 -= t3; + } + + a -= 4; + b -= 1; + + b0 = *b; + + c0 *= b0; + c1 *= b0; + c2 *= b0; + c3 *= b0; + + *(a + 0) = c0; + *(a + 1) = c1; + *(a + 2) = c2; + *(a + 3) = c3; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 2) = c2; + *(c + 3) = c3; +} + +static void ssolve_2x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT b0, b8, b9, b16, b17, b18, b24, b25, b26, b27, b32, b33, b34, b35; + FLOAT b36, b40, 
b41, b42, b43, b44, b45, b48, b49, b50, b51, b52, b53, b54; + FLOAT b56, b57, b58, b59, b60, b61, b62, b63, c0_nxt7, c1_nxt7; + FLOAT c0, c1, c0_nxt1, c1_nxt1, c0_nxt2, c1_nxt2, c0_nxt3, c1_nxt3; + FLOAT c0_nxt4, c1_nxt4, c0_nxt5, c1_nxt5, c0_nxt6, c1_nxt6; + + c0 = *(c + 0); + c1 = *(c + 1); + c0_nxt1 = *(c + 0 + 1 * ldc); + c1_nxt1 = *(c + 1 + 1 * ldc); + c0_nxt2 = *(c + 0 + 2 * ldc); + c1_nxt2 = *(c + 1 + 2 * ldc); + c0_nxt3 = *(c + 0 + 3 * ldc); + c1_nxt3 = *(c + 1 + 3 * ldc); + c0_nxt4 = *(c + 0 + 4 * ldc); + c1_nxt4 = *(c + 1 + 4 * ldc); + c0_nxt5 = *(c + 0 + 5 * ldc); + c1_nxt5 = *(c + 1 + 5 * ldc); + c0_nxt6 = *(c + 0 + 6 * ldc); + c1_nxt6 = *(c + 1 + 6 * ldc); + c0_nxt7 = *(c + 0 + 7 * ldc); + c1_nxt7 = *(c + 1 + 7 * ldc); + + if (bk > 0) + { + BLASLONG k; + FLOAT *aa = a, *bb = b; + FLOAT res[16]; + + res[0] = aa[0] * bb[0]; + res[1] = aa[1] * bb[0]; + res[2] = aa[0] * bb[1]; + res[3] = aa[1] * bb[1]; + res[4] = aa[0] * bb[2]; + res[5] = aa[1] * bb[2]; + res[6] = aa[0] * bb[3]; + res[7] = aa[1] * bb[3]; + res[8] = aa[0] * bb[4]; + res[9] = aa[1] * bb[4]; + res[10] = aa[0] * bb[5]; + res[11] = aa[1] * bb[5]; + res[12] = aa[0] * bb[6]; + res[13] = aa[1] * bb[6]; + res[14] = aa[0] * bb[7]; + res[15] = aa[1] * bb[7]; + + for (k = (bk - 1); k--;) + { + aa += 2; + bb += 8; + + res[0] += aa[0] * bb[0]; + res[1] += aa[1] * bb[0]; + res[2] += aa[0] * bb[1]; + res[3] += aa[1] * bb[1]; + res[4] += aa[0] * bb[2]; + res[5] += aa[1] * bb[2]; + res[6] += aa[0] * bb[3]; + res[7] += aa[1] * bb[3]; + res[8] += aa[0] * bb[4]; + res[9] += aa[1] * bb[4]; + res[10] += aa[0] * bb[5]; + res[11] += aa[1] * bb[5]; + res[12] += aa[0] * bb[6]; + res[13] += aa[1] * bb[6]; + res[14] += aa[0] * bb[7]; + res[15] += aa[1] * bb[7]; + } + + c0 -= res[0]; + c1 -= res[1]; + c0_nxt1 -= res[2]; + c1_nxt1 -= res[3]; + c0_nxt2 -= res[4]; + c1_nxt2 -= res[5]; + c0_nxt3 -= res[6]; + c1_nxt3 -= res[7]; + c0_nxt4 -= res[8]; + c1_nxt4 -= res[9]; + c0_nxt5 -= res[10]; + c1_nxt5 -= res[11]; + c0_nxt6 
-= res[12]; + c1_nxt6 -= res[13]; + c0_nxt7 -= res[14]; + c1_nxt7 -= res[15]; + } + + a -= 16; + b -= 64; + + b0 = *(b + 0); + b8 = *(b + 8); + b9 = *(b + 9); + b16 = *(b + 16); + b17 = *(b + 17); + b18 = *(b + 18); + b24 = *(b + 24); + b25 = *(b + 25); + b26 = *(b + 26); + b27 = *(b + 27); + b32 = *(b + 32); + b33 = *(b + 33); + b34 = *(b + 34); + b35 = *(b + 35); + b36 = *(b + 36); + b40 = *(b + 40); + b41 = *(b + 41); + b42 = *(b + 42); + b43 = *(b + 43); + b44 = *(b + 44); + b45 = *(b + 45); + b48 = *(b + 48); + b49 = *(b + 49); + b50 = *(b + 50); + b51 = *(b + 51); + b52 = *(b + 52); + b53 = *(b + 53); + b54 = *(b + 54); + b56 = *(b + 56); + b57 = *(b + 57); + b58 = *(b + 58); + b59 = *(b + 59); + b60 = *(b + 60); + b61 = *(b + 61); + b62 = *(b + 62); + b63 = *(b + 63); + + c0_nxt7 *= b63; + c1_nxt7 *= b63; + + c0_nxt6 -= c0_nxt7 * b62; + c1_nxt6 -= c1_nxt7 * b62; + + c0_nxt6 *= b54; + c1_nxt6 *= b54; + + c0_nxt5 -= c0_nxt7 * b61; + c1_nxt5 -= c1_nxt7 * b61; + + c0_nxt5 -= c0_nxt6 * b53; + c1_nxt5 -= c1_nxt6 * b53; + + c0_nxt5 *= b45; + c1_nxt5 *= b45; + + c0_nxt4 -= c0_nxt7 * b60; + c1_nxt4 -= c1_nxt7 * b60; + + c0_nxt4 -= c0_nxt6 * b52; + c1_nxt4 -= c1_nxt6 * b52; + + c0_nxt4 -= c0_nxt5 * b44; + c1_nxt4 -= c1_nxt5 * b44; + + c0_nxt4 *= b36; + c1_nxt4 *= b36; + + c0_nxt3 -= c0_nxt7 * b59; + c1_nxt3 -= c1_nxt7 * b59; + + c0_nxt3 -= c0_nxt6 * b51; + c1_nxt3 -= c1_nxt6 * b51; + + c0_nxt3 -= c0_nxt5 * b43; + c1_nxt3 -= c1_nxt5 * b43; + + c0_nxt3 -= c0_nxt4 * b35; + c1_nxt3 -= c1_nxt4 * b35; + + c0_nxt3 *= b27; + c1_nxt3 *= b27; + + c0_nxt2 -= c0_nxt7 * b58; + c1_nxt2 -= c1_nxt7 * b58; + + c0_nxt2 -= c0_nxt6 * b50; + c1_nxt2 -= c1_nxt6 * b50; + + c0_nxt2 -= c0_nxt5 * b42; + c1_nxt2 -= c1_nxt5 * b42; + + c0_nxt2 -= c0_nxt4 * b34; + c1_nxt2 -= c1_nxt4 * b34; + + c0_nxt2 -= c0_nxt3 * b26; + c1_nxt2 -= c1_nxt3 * b26; + + c0_nxt2 *= b18; + c1_nxt2 *= b18; + + c0_nxt1 -= c0_nxt7 * b57; + c1_nxt1 -= c1_nxt7 * b57; + + c0_nxt1 -= c0_nxt6 * b49; + c1_nxt1 -= c1_nxt6 * b49; 
+ + c0_nxt1 -= c0_nxt5 * b41; + c1_nxt1 -= c1_nxt5 * b41; + + c0_nxt1 -= c0_nxt4 * b33; + c1_nxt1 -= c1_nxt4 * b33; + + c0_nxt1 -= c0_nxt3 * b25; + c1_nxt1 -= c1_nxt3 * b25; + + c0_nxt1 -= c0_nxt2 * b17; + c1_nxt1 -= c1_nxt2 * b17; + + c0_nxt1 *= b9; + c1_nxt1 *= b9; + + c0 -= c0_nxt7 * b56; + c1 -= c1_nxt7 * b56; + + c0 -= c0_nxt6 * b48; + c1 -= c1_nxt6 * b48; + + c0 -= c0_nxt5 * b40; + c1 -= c1_nxt5 * b40; + + c0 -= c0_nxt4 * b32; + c1 -= c1_nxt4 * b32; + + c0 -= c0_nxt3 * b24; + c1 -= c1_nxt3 * b24; + + c0 -= c0_nxt2 * b16; + c1 -= c1_nxt2 * b16; + + c0 -= c0_nxt1 * b8; + c1 -= c1_nxt1 * b8; + + c0 *= b0; + c1 *= b0; + + *(a + 0) = c0; + *(a + 1) = c1; + *(a + 2) = c0_nxt1; + *(a + 3) = c1_nxt1; + *(a + 4) = c0_nxt2; + *(a + 5) = c1_nxt2; + *(a + 6) = c0_nxt3; + *(a + 7) = c1_nxt3; + *(a + 8) = c0_nxt4; + *(a + 9) = c1_nxt4; + *(a + 10) = c0_nxt5; + *(a + 11) = c1_nxt5; + *(a + 12) = c0_nxt6; + *(a + 13) = c1_nxt6; + *(a + 14) = c0_nxt7; + *(a + 15) = c1_nxt7; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 0 + 1 * ldc) = c0_nxt1; + *(c + 1 + 1 * ldc) = c1_nxt1; + *(c + 0 + 2 * ldc) = c0_nxt2; + *(c + 1 + 2 * ldc) = c1_nxt2; + *(c + 0 + 3 * ldc) = c0_nxt3; + *(c + 1 + 3 * ldc) = c1_nxt3; + *(c + 0 + 4 * ldc) = c0_nxt4; + *(c + 1 + 4 * ldc) = c1_nxt4; + *(c + 0 + 5 * ldc) = c0_nxt5; + *(c + 1 + 5 * ldc) = c1_nxt5; + *(c + 0 + 6 * ldc) = c0_nxt6; + *(c + 1 + 6 * ldc) = c1_nxt6; + *(c + 0 + 7 * ldc) = c0_nxt7; + *(c + 1 + 7 * ldc) = c1_nxt7; +} + +static void ssolve_2x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT b0, b4, b5, b8, b9, b10, b12, b13, b14, b15; + FLOAT c0, c1, c0_nxt1, c1_nxt1, c0_nxt2, c1_nxt2, c0_nxt3, c1_nxt3; + + c0 = *(c + 0); + c1 = *(c + 1); + c0_nxt1 = *(c + 0 + 1 * ldc); + c1_nxt1 = *(c + 1 + 1 * ldc); + c0_nxt2 = *(c + 0 + 2 * ldc); + c1_nxt2 = *(c + 1 + 2 * ldc); + c0_nxt3 = *(c + 0 + 3 * ldc); + c1_nxt3 = *(c + 1 + 3 * ldc); + + if (bk > 0) + { + BLASLONG k; + FLOAT *aa = a, *bb = b; + FLOAT res[8]; + + res[0] = 
aa[0] * bb[0]; + res[1] = aa[1] * bb[0]; + res[2] = aa[0] * bb[1]; + res[3] = aa[1] * bb[1]; + res[4] = aa[0] * bb[2]; + res[5] = aa[1] * bb[2]; + res[6] = aa[0] * bb[3]; + res[7] = aa[1] * bb[3]; + + for (k = (bk - 1); k--;) + { + aa += 2; + bb += 4; + + res[0] += aa[0] * bb[0]; + res[1] += aa[1] * bb[0]; + res[2] += aa[0] * bb[1]; + res[3] += aa[1] * bb[1]; + res[4] += aa[0] * bb[2]; + res[5] += aa[1] * bb[2]; + res[6] += aa[0] * bb[3]; + res[7] += aa[1] * bb[3]; + } + + c0 -= res[0]; + c1 -= res[1]; + c0_nxt1 -= res[2]; + c1_nxt1 -= res[3]; + c0_nxt2 -= res[4]; + c1_nxt2 -= res[5]; + c0_nxt3 -= res[6]; + c1_nxt3 -= res[7]; + } + + a -= 8; + b -= 16; + + b0 = *b; + b4 = *(b + 4); + b5 = *(b + 5); + b8 = *(b + 8); + b9 = *(b + 9); + b10 = *(b + 10); + b12 = *(b + 12); + b13 = *(b + 13); + b14 = *(b + 14); + b15 = *(b + 15); + + c0_nxt3 *= b15; + c1_nxt3 *= b15; + + c0_nxt2 = (c0_nxt2 - c0_nxt3 * b14) * b10; + c1_nxt2 = (c1_nxt2 - c1_nxt3 * b14) * b10; + + c0_nxt1 = ((c0_nxt1 - c0_nxt3 * b13) - c0_nxt2 * b9) * b5; + c1_nxt1 = ((c1_nxt1 - c1_nxt3 * b13) - c1_nxt2 * b9) * b5; + + c0 = (((c0 - c0_nxt3 * b12) - c0_nxt2 * b8) - c0_nxt1 * b4) * b0; + c1 = (((c1 - c1_nxt3 * b12) - c1_nxt2 * b8) - c1_nxt1 * b4) * b0; + + *(a + 0) = c0; + *(a + 1) = c1; + *(a + 2) = c0_nxt1; + *(a + 3) = c1_nxt1; + *(a + 4) = c0_nxt2; + *(a + 5) = c1_nxt2; + *(a + 6) = c0_nxt3; + *(a + 7) = c1_nxt3; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 0 + 1 * ldc) = c0_nxt1; + *(c + 1 + 1 * ldc) = c1_nxt1; + *(c + 0 + 2 * ldc) = c0_nxt2; + *(c + 1 + 2 * ldc) = c1_nxt2; + *(c + 0 + 3 * ldc) = c0_nxt3; + *(c + 1 + 3 * ldc) = c1_nxt3; +} + +static void ssolve_2x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT b0, b2, b3, c0, c1, c0_nxt, c1_nxt; + + c0 = *(c + 0); + c1 = *(c + 1); + c0_nxt = *(c + 0 + ldc); + c1_nxt = *(c + 1 + ldc); + + if (bk > 0) + { + BLASLONG k; + FLOAT *aa = a, *bb = b; + FLOAT res[4]; + + res[0] = aa[0] * bb[0]; + res[1] = aa[1] * bb[0]; + res[2] = 
aa[0] * bb[1]; + res[3] = aa[1] * bb[1]; + + for (k = (bk - 1); k--;) + { + aa += 2; + bb += 2; + + res[0] += aa[0] * bb[0]; + res[1] += aa[1] * bb[0]; + res[2] += aa[0] * bb[1]; + res[3] += aa[1] * bb[1]; + } + + c0 -= res[0]; + c1 -= res[1]; + c0_nxt -= res[2]; + c1_nxt -= res[3]; + } + + a -= 4; + b -= 4; + + b3 = *(b + 3); + b2 = *(b + 2); + b0 = *b; + + c0_nxt *= b3; + c1_nxt *= b3; + + c0 -= c0_nxt * b2; + c1 -= c1_nxt * b2; + + c0 *= b0; + c1 *= b0; + + *(a + 0) = c0; + *(a + 1) = c1; + *(a + 2) = c0_nxt; + *(a + 3) = c1_nxt; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 0 + ldc) = c0_nxt; + *(c + 1 + ldc) = c1_nxt; +} + +static void ssolve_2x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + FLOAT b0, c0, c1; + + c0 = *(c + 0); + c1 = *(c + 1); + + if (bk > 0) + { + BLASLONG k; + FLOAT *aa = a, *bb = b; + FLOAT res0, res1; + + res0 = aa[0] * bb[0]; + res1 = aa[1] * bb[0]; + + for (k = (bk - 1); k--;) + { + aa += 2; + bb += 1; + + res0 += aa[0] * bb[0]; + res1 += aa[1] * bb[0]; + } + + c0 -= res0; + c1 -= res1; + } + + a -= 2; + b -= 1; + + b0 = *b; + + c0 *= b0; + c1 *= b0; + + *(a + 0) = c0; + *(a + 1) = c1; + + *(c + 0) = c0; + *(c + 1) = c1; +} + +static void ssolve_1x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT b0, b8, b9, b16, b17, b18, b24, b25, b26, b27, b32, b33, b34, b35; + FLOAT b36, b40, b41, b42, b43, b44, b45, b48, b49, b50, b51, b52, b53, b54; + FLOAT b56, b57, b58, b59, b60, b61, b62, b63; + FLOAT c0, c1, c2, c3, c4, c5, c6, c7; + + c0 = *(c + 0); + c1 = *(c + 1 * ldc); + c2 = *(c + 2 * ldc); + c3 = *(c + 3 * ldc); + c4 = *(c + 4 * ldc); + c5 = *(c + 5 * ldc); + c6 = *(c + 6 * ldc); + c7 = *(c + 7 * ldc); + + if (bk > 0) + { + BLASLONG k; + FLOAT *aa = a, *bb = b; + FLOAT t0, t1, t2, t3, t4, t5, t6, t7; + + t0 = aa[0] * bb[0]; + t1 = aa[0] * bb[1]; + t2 = aa[0] * bb[2]; + t3 = aa[0] * bb[3]; + t4 = aa[0] * bb[4]; + t5 = aa[0] * bb[5]; + t6 = aa[0] * bb[6]; + t7 = aa[0] * bb[7]; + + for (k = (bk - 1); k--;) 
+ { + aa += 1; + bb += 8; + + t0 += aa[0] * bb[0]; + t1 += aa[0] * bb[1]; + t2 += aa[0] * bb[2]; + t3 += aa[0] * bb[3]; + t4 += aa[0] * bb[4]; + t5 += aa[0] * bb[5]; + t6 += aa[0] * bb[6]; + t7 += aa[0] * bb[7]; + } + + c0 -= t0; + c1 -= t1; + c2 -= t2; + c3 -= t3; + c4 -= t4; + c5 -= t5; + c6 -= t6; + c7 -= t7; + } + + a -= 8; + b -= 64; + + b0 = *(b + 0); + b8 = *(b + 8); + b9 = *(b + 9); + b16 = *(b + 16); + b17 = *(b + 17); + b18 = *(b + 18); + b24 = *(b + 24); + b25 = *(b + 25); + b26 = *(b + 26); + b27 = *(b + 27); + b32 = *(b + 32); + b33 = *(b + 33); + b34 = *(b + 34); + b35 = *(b + 35); + b36 = *(b + 36); + b40 = *(b + 40); + b41 = *(b + 41); + b42 = *(b + 42); + b43 = *(b + 43); + b44 = *(b + 44); + b45 = *(b + 45); + b48 = *(b + 48); + b49 = *(b + 49); + b50 = *(b + 50); + b51 = *(b + 51); + b52 = *(b + 52); + b53 = *(b + 53); + b54 = *(b + 54); + b56 = *(b + 56); + b57 = *(b + 57); + b58 = *(b + 58); + b59 = *(b + 59); + b60 = *(b + 60); + b61 = *(b + 61); + b62 = *(b + 62); + b63 = *(b + 63); + + c7 *= b63; + + c6 -= c7 * b62; + c6 *= b54; + + c5 -= c7 * b61; + c5 -= c6 * b53; + c5 *= b45; + + c4 -= c7 * b60; + c4 -= c6 * b52; + c4 -= c5 * b44; + c4 *= b36; + + c3 -= c7 * b59; + c3 -= c6 * b51; + c3 -= c5 * b43; + c3 -= c4 * b35; + c3 *= b27; + + c2 -= c7 * b58; + c2 -= c6 * b50; + c2 -= c5 * b42; + c2 -= c4 * b34; + c2 -= c3 * b26; + c2 *= b18; + + c1 -= c7 * b57; + c1 -= c6 * b49; + c1 -= c5 * b41; + c1 -= c4 * b33; + c1 -= c3 * b25; + c1 -= c2 * b17; + c1 *= b9; + + c0 -= c7 * b56; + c0 -= c6 * b48; + c0 -= c5 * b40; + c0 -= c4 * b32; + c0 -= c3 * b24; + c0 -= c2 * b16; + c0 -= c1 * b8; + c0 *= b0; + + *(a + 0) = c0; + *(a + 1) = c1; + *(a + 2) = c2; + *(a + 3) = c3; + *(a + 4) = c4; + *(a + 5) = c5; + *(a + 6) = c6; + *(a + 7) = c7; + + *(c + 0) = c0; + *(c + 1 * ldc) = c1; + *(c + 2 * ldc) = c2; + *(c + 3 * ldc) = c3; + *(c + 4 * ldc) = c4; + *(c + 5 * ldc) = c5; + *(c + 6 * ldc) = c6; + *(c + 7 * ldc) = c7; +} + +static void 
ssolve_1x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT b0, b4, b5, b8, b9, b10, b12, b13, b14, b15; + FLOAT c0, c1, c2, c3; + + c0 = *(c + 0); + c1 = *(c + 1 * ldc); + c2 = *(c + 2 * ldc); + c3 = *(c + 3 * ldc); + + if (bk > 0) + { + BLASLONG k; + FLOAT *aa = a, *bb = b; + FLOAT res0, res1, res2, res3; + + res0 = aa[0] * bb[0]; + res1 = aa[0] * bb[1]; + res2 = aa[0] * bb[2]; + res3 = aa[0] * bb[3]; + + for (k = (bk - 1); k--;) + { + aa += 1; + bb += 4; + + res0 += aa[0] * bb[0]; + res1 += aa[0] * bb[1]; + res2 += aa[0] * bb[2]; + res3 += aa[0] * bb[3]; + } + + c0 -= res0; + c1 -= res1; + c2 -= res2; + c3 -= res3; + } + + a -= 4; + b -= 16; + + b0 = *b; + b4 = *(b + 4); + b5 = *(b + 5); + b8 = *(b + 8); + b9 = *(b + 9); + b10 = *(b + 10); + b12 = *(b + 12); + b13 = *(b + 13); + b14 = *(b + 14); + b15 = *(b + 15); + + c3 *= b15; + c2 = (c2 - c3 * b14) * b10; + c1 = ((c1 - c3 * b13) - c2 * b9) * b5; + c0 = (((c0 - c3 * b12) - c2 * b8) - c1 * b4) * b0; + + *(a + 0) = c0; + *(a + 1) = c1; + *(a + 2) = c2; + *(a + 3) = c3; + + *(c) = c0; + *(c + 1 * ldc) = c1; + *(c + 2 * ldc) = c2; + *(c + 3 * ldc) = c3; +} + +static void ssolve_1x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT b0, b2, b3, c0, c1; + + c0 = *(c + 0); + c1 = *(c + ldc); + + if (bk > 0) + { + BLASLONG k; + FLOAT *aa = a, *bb = b; + FLOAT res0, res1; + + res0 = aa[0] * bb[0]; + res1 = aa[0] * bb[1]; + + for (k = (bk - 1); k--;) + { + aa += 1; + bb += 2; + + res0 += aa[0] * bb[0]; + res1 += aa[0] * bb[1]; + } + + c0 -= res0; + c1 -= res1; + } + + a -= 2; + b -= 4; + + b3 = *(b + 3); + b2 = *(b + 2); + b0 = *b; + + c1 *= b3; + + c0 -= c1 * b2; + c0 *= b0; + + *(a + 0) = c0; + *(a + 1) = c1; + + *(c + 0) = c0; + *(c + ldc) = c1; +} + +static void ssolve_1x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + if (bk > 0) + { + BLASLONG k; + FLOAT *aa = a, *bb = b; + FLOAT res; + + res = *aa * *bb; + + for (k = (bk - 1); k--;) + { + aa++; + bb++; + + 
res += *aa * *bb; + } + + *c -= res; + } + + *c *= *(a - 1); + *(b - 1) = *c; +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, + FLOAT *c, BLASLONG ldc, BLASLONG offset) +{ + FLOAT *aa, *cc; + BLASLONG i, j, kk; + + kk = n - offset; + c += n * ldc; + b += n * k; + + if (n & 7) + { + if (n & 1) + { + aa = a; + b -= k; + c -= ldc; + cc = c; + + for (i = (m >> 3); i--;) + { + ssolve_8x1_rt_msa(aa + 8 * kk, b + kk, cc, (k - kk)); + + aa += 8 * k; + cc += 8; + } + + if (m & 7) + { + if (m & 4) + { + ssolve_4x1_rt_msa(aa + 4 * kk, b + kk, cc, (k - kk)); + + aa += 4 * k; + cc += 4; + } + + if (m & 2) + { + ssolve_2x1_rt_msa(aa + 2 * kk, b + kk, cc, (k - kk)); + + aa += 2 * k; + cc += 2; + } + + if (m & 1) + { + ssolve_1x1_rt_msa(b + kk, aa + kk, cc, (k - kk)); + + aa += k; + cc += 1; + } + } + + kk -= 1; + } + + if (n & 2) + { + aa = a; + b -= 2 * k; + c -= 2 * ldc; + cc = c; + + for (i = (m >> 3); i--;) + { + ssolve_8x2_rt_msa(aa + 8 * kk, b + 2 * kk, cc, ldc, (k - kk)); + + aa += 8 * k; + cc += 8; + } + + if (m & 7) + { + if (m & 4) + { + ssolve_4x2_rt_msa(aa + 4 * kk, b + 2 * kk, cc, ldc, (k - kk)); + + aa += 4 * k; + cc += 4; + } + + if (m & 2) + { + ssolve_2x2_rt_msa(aa + 2 * kk, b + 2 * kk, cc, ldc, (k - kk)); + + aa += 2 * k; + cc += 2; + } + + if (m & 1) + { + ssolve_1x2_rt_msa(aa + kk, b + 2 * kk, cc, ldc, (k - kk)); + + aa += k; + cc += 1; + } + } + + kk -= 2; + } + + if (n & 4) + { + aa = a; + b -= 4 * k; + c -= 4 * ldc; + cc = c; + + for (i = (m >> 3); i--;) + { + ssolve_8x4_rt_msa(aa + 8 * kk, b + 4 * kk, cc, ldc, (k - kk)); + + aa += 8 * k; + cc += 8; + } + + if (m & 7) + { + if (m & 4) + { + ssolve_4x4_rt_msa(aa + 4 * kk, b + 4 * kk, cc, ldc, (k - kk)); + + aa += 4 * k; + cc += 4; + } + + if (m & 2) + { + ssolve_2x4_rt_msa(aa + 2 * kk, b + 4 * kk, cc, ldc, (k - kk)); + + aa += 2 * k; + cc += 2; + } + + if (m & 1) + { + ssolve_1x4_rt_msa(aa + kk, b + 4 * kk, cc, ldc, (k - kk)); + + aa += k; + cc += 1; + } + } + + kk -= 4; 
+ } + } + + for (j = (n >> 3); j--;) + { + aa = a; + b -= 8 * k; + c -= 8 * ldc; + cc = c; + + for (i = (m >> 3); i--;) + { + ssolve_8x8_rt_msa(aa + 8 * kk, b + 8 * kk, cc, ldc, (k - kk)); + + aa += 8 * k; + cc += 8; + } + + if (m & 7) + { + if (m & 4) + { + ssolve_4x8_rt_msa(aa + 4 * kk, b + 8 * kk, cc, ldc, (k - kk)); + + aa += 4 * k; + cc += 4; + } + + if (m & 2) + { + ssolve_2x8_rt_msa(aa + 2 * kk, b + 8 * kk, cc, ldc, (k - kk)); + + aa += 2 * k; + cc += 2; + } + + if (m & 1) + { + ssolve_1x8_rt_msa(aa + kk, b + 8 * kk, cc, ldc, (k - kk)); + + aa += k; + cc += 1; + } + } + + kk -= 8; + } + + return 0; +} From 7d0358475deed393a9f27ca5d9d5055c795df470 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Sun, 22 May 2016 01:08:44 +0800 Subject: [PATCH 38/70] Merge the patch for musl libc. --- utest/ctest.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utest/ctest.h b/utest/ctest.h index a62103ff5..1deea32f6 100644 --- a/utest/ctest.h +++ b/utest/ctest.h @@ -637,7 +637,7 @@ static void *find_symbol(struct ctest *test, const char *fname) static void sighandler(int signum) { char msg[128]; - sprintf(msg, "[SIGNAL %d: %s]", signum, sys_siglist[signum]); + snprintf(msg, sizeof(msg), "[SIGNAL %d: %s]", signum, strsignal(signum)); color_print(ANSI_BRED, msg); fflush(stdout); From 8fb5a1aaff3cc9de190a49aad046613dde2f72c2 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Sun, 22 May 2016 13:09:05 +0200 Subject: [PATCH 39/70] added optimized dtrsm_LT kernel for POWER8 --- kernel/power/KERNEL.POWER8 | 2 +- kernel/power/dtrsm_kernel_LT_16x4_power8.S | 293 ++ kernel/power/dtrsm_logic_LT_16x4_power8.S | 716 +++ kernel/power/dtrsm_macros_LT_16x4_power8.S | 4623 ++++++++++++++++++++ 4 files changed, 5633 insertions(+), 1 deletion(-) create mode 100644 kernel/power/dtrsm_kernel_LT_16x4_power8.S create mode 100644 kernel/power/dtrsm_logic_LT_16x4_power8.S create mode 100644 kernel/power/dtrsm_macros_LT_16x4_power8.S diff --git a/kernel/power/KERNEL.POWER8 
b/kernel/power/KERNEL.POWER8 index 8e3d084aa..323b67d05 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -54,7 +54,7 @@ STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c diff --git a/kernel/power/dtrsm_kernel_LT_16x4_power8.S b/kernel/power/dtrsm_kernel_LT_16x4_power8.S new file mode 100644 index 000000000..e1c6249f8 --- /dev/null +++ b/kernel/power/dtrsm_kernel_LT_16x4_power8.S @@ -0,0 +1,293 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA 296(SP) +#define FZERO 304(SP) +#else +#define STACKSIZE 240 +#define ALPHA 224(SP) +#define FZERO 232(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r8 +#define B r9 +#define C r10 +#define LDC r7 +#define OFFSET r6 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#define o0 0 + +#define PRE r15 +#define T4 r16 +#define L r17 +#define T3 r18 +#define T2 r19 +#define KK r20 +#define I r21 +#define J r22 +#define AO r23 +#define BO r24 +#define CO r25 +#define o8 r26 +#define o16 r27 +#define o24 
r28 +#define o32 r29 +#define o48 r30 +#define T1 r31 + +#include "dtrsm_macros_LT_16x4_power8.S" + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) + stw r19, 192(SP) + stw r18, 196(SP) +#endif + + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) +#endif +#endif + + +#if defined(linux) && defined(__64BIT__) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#else + lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP) +#endif +#endif +#endif + + + cmpwi cr0, M, 0 + ble L999 + cmpwi cr0, N, 0 + ble L999 + cmpwi cr0, K, 0 + ble L999 + + slwi LDC, LDC, BASE_SHIFT + + li o8, 8 + li o16, 16 + li o24, 24 + li o32, 32 + li o48, 48 + + mr KK, OFFSET + +#include "dtrsm_logic_LT_16x4_power8.S" + +L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd 
f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) + lwz r19, 192(SP) + lwz r18, 196(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/dtrsm_logic_LT_16x4_power8.S b/kernel/power/dtrsm_logic_LT_16x4_power8.S new file mode 100644 index 000000000..d5d34b422 --- /dev/null +++ b/kernel/power/dtrsm_logic_LT_16x4_power8.S @@ -0,0 +1,716 @@ + srawi. J, N, 2 + ble DSTRM_LT_L4_END + + +DSTRM_LT_L4_BEGIN: + + mr CO, C + mr AO, A + slwi T1, LDC , 2 + add C, C, T1 + + mr KK, OFFSET + srawi. I, M, 4 + ble DSTRM_LT_L4x16_END + + +DSTRM_LT_L4x16_BEGIN: + + mr BO, B + + +DSTRM_LT_L4x16_LOOP_START: + + + INIT_16x4 + + + addic. L, KK, 0 + ble DSTRM_LT_L4x16_SAVE + +DSTRM_LT_L4x16_LOOP: + + + KERNEL_16x4 + + addic. L, L, -1 + bgt DSTRM_LT_L4x16_LOOP + + +DSTRM_LT_L4x16_SAVE: + + SOLVE_LT_16x4 + + addi CO, CO, 16*SIZE + + sub T3, K, KK + sub T4, K, KK + slwi T3, T3, 4+BASE_SHIFT + slwi T4, T4, 2+BASE_SHIFT + add AO, AO, T3 + add BO, BO, T4 + addi KK, KK, 16 + + addic. I, I, -1 + bgt DSTRM_LT_L4x16_BEGIN + +DSTRM_LT_L4x16_END: + + +DSTRM_LT_L4x8_BEGIN: + + andi. T2, M, 15 + ble DSTRM_LT_L4x1_END + + andi. 
T1, M, 8 + ble DSTRM_LT_L4x8_END + + mr BO, B + + +DSTRM_LT_L4x8_LOOP_START: + + + INIT_8x4 + + + addic. L, KK, 0 + ble DSTRM_LT_L4x8_SAVE + +DSTRM_LT_L4x8_LOOP: + + + KERNEL_8x4 + + addic. L, L, -1 + bgt DSTRM_LT_L4x8_LOOP + + +DSTRM_LT_L4x8_SAVE: + + SOLVE_LT_8x4 + + addi CO, CO, 8*SIZE + + sub T3, K, KK + sub T4, K, KK + slwi T3, T3, 3+BASE_SHIFT + slwi T4, T4, 2+BASE_SHIFT + add AO, AO, T3 + add BO, BO, T4 + addi KK, KK, 8 + +DSTRM_LT_L4x8_END: + + +DSTRM_LT_L4x4_BEGIN: + + andi. T1, M, 4 + ble DSTRM_LT_L4x4_END + + mr BO, B + + +DSTRM_LT_L4x4_LOOP_START: + + + INIT_4x4 + + + addic. L, KK, 0 + ble DSTRM_LT_L4x4_SAVE + +DSTRM_LT_L4x4_LOOP: + + + KERNEL_4x4 + + addic. L, L, -1 + bgt DSTRM_LT_L4x4_LOOP + + +DSTRM_LT_L4x4_SAVE: + + SOLVE_LT_4x4 + + addi CO, CO, 4*SIZE + + sub T3, K, KK + sub T4, K, KK + slwi T3, T3, 2+BASE_SHIFT + slwi T4, T4, 2+BASE_SHIFT + add AO, AO, T3 + add BO, BO, T4 + addi KK, KK, 4 + +DSTRM_LT_L4x4_END: + + +DSTRM_LT_L4x2_BEGIN: + + andi. T1, M, 2 + ble DSTRM_LT_L4x2_END + + mr BO, B + + +DSTRM_LT_L4x2_LOOP_START: + + + INIT_2x4 + + + addic. L, KK, 0 + ble DSTRM_LT_L4x2_SAVE + +DSTRM_LT_L4x2_LOOP: + + + KERNEL_2x4 + + addic. L, L, -1 + bgt DSTRM_LT_L4x2_LOOP + + +DSTRM_LT_L4x2_SAVE: + + SOLVE_LT_2x4 + + addi CO, CO, 2*SIZE + + sub T3, K, KK + sub T4, K, KK + slwi T3, T3, 1+BASE_SHIFT + slwi T4, T4, 2+BASE_SHIFT + add AO, AO, T3 + add BO, BO, T4 + addi KK, KK, 2 + +DSTRM_LT_L4x2_END: + + +DSTRM_LT_L4x1_BEGIN: + + andi. T1, M, 1 + ble DSTRM_LT_L4x1_END + + mr BO, B + + +DSTRM_LT_L4x1_LOOP_START: + + + INIT_1x4 + + + addic. L, KK, 0 + ble DSTRM_LT_L4x1_SAVE + +DSTRM_LT_L4x1_LOOP: + + + KERNEL_1x4 + + addic. L, L, -1 + bgt DSTRM_LT_L4x1_LOOP + + +DSTRM_LT_L4x1_SAVE: + + SOLVE_LT_1x4 + + addi CO, CO, 1*SIZE + + sub T3, K, KK + sub T4, K, KK + slwi T3, T3, 0+BASE_SHIFT + slwi T4, T4, 2+BASE_SHIFT + add AO, AO, T3 + add BO, BO, T4 + addi KK, KK, 1 + +DSTRM_LT_L4x1_END: + + slwi T1, K, 2+BASE_SHIFT + add B, B, T1 + + addic. 
J, J, -1 + bgt DSTRM_LT_L4_BEGIN + + andi. T2, N, 3 + ble L999 + +DSTRM_LT_L4_END: + + b DSTRM_LT_L2_BEGIN + +L999_H1: + + b L999 + + +DSTRM_LT_L2_BEGIN: + + andi. T1, N, 2 + ble DSTRM_LT_L2_END + + mr CO, C + mr AO, A + slwi T1, LDC , 1 + add C, C, T1 + + mr KK, OFFSET + srawi. I, M, 4 + ble DSTRM_LT_L2x16_END + + +DSTRM_LT_L2x16_BEGIN: + + mr BO, B + + +DSTRM_LT_L2x16_LOOP_START: + + + INIT_16x2 + + + addic. L, KK, 0 + ble DSTRM_LT_L2x16_SAVE + +DSTRM_LT_L2x16_LOOP: + + + KERNEL_16x2 + + addic. L, L, -1 + bgt DSTRM_LT_L2x16_LOOP + + +DSTRM_LT_L2x16_SAVE: + + SOLVE_LT_16x2 + + addi CO, CO, 16*SIZE + + sub T3, K, KK + sub T4, K, KK + slwi T3, T3, 4+BASE_SHIFT + slwi T4, T4, 1+BASE_SHIFT + add AO, AO, T3 + add BO, BO, T4 + addi KK, KK, 16 + + addic. I, I, -1 + bgt DSTRM_LT_L2x16_BEGIN + +DSTRM_LT_L2x16_END: + + +DSTRM_LT_L2x8_BEGIN: + + andi. T2, M, 15 + ble DSTRM_LT_L2x1_END + + andi. T1, M, 8 + ble DSTRM_LT_L2x8_END + + mr BO, B + + +DSTRM_LT_L2x8_LOOP_START: + + + INIT_8x2 + + + addic. L, KK, 0 + ble DSTRM_LT_L2x8_SAVE + +DSTRM_LT_L2x8_LOOP: + + + KERNEL_8x2 + + addic. L, L, -1 + bgt DSTRM_LT_L2x8_LOOP + + +DSTRM_LT_L2x8_SAVE: + + SOLVE_LT_8x2 + + addi CO, CO, 8*SIZE + + sub T3, K, KK + sub T4, K, KK + slwi T3, T3, 3+BASE_SHIFT + slwi T4, T4, 1+BASE_SHIFT + add AO, AO, T3 + add BO, BO, T4 + addi KK, KK, 8 + +DSTRM_LT_L2x8_END: + + +DSTRM_LT_L2x4_BEGIN: + + andi. T1, M, 4 + ble DSTRM_LT_L2x4_END + + mr BO, B + + +DSTRM_LT_L2x4_LOOP_START: + + + INIT_4x2 + + + addic. L, KK, 0 + ble DSTRM_LT_L2x4_SAVE + +DSTRM_LT_L2x4_LOOP: + + + KERNEL_4x2 + + addic. L, L, -1 + bgt DSTRM_LT_L2x4_LOOP + + +DSTRM_LT_L2x4_SAVE: + + SOLVE_LT_4x2 + + addi CO, CO, 4*SIZE + + sub T3, K, KK + sub T4, K, KK + slwi T3, T3, 2+BASE_SHIFT + slwi T4, T4, 1+BASE_SHIFT + add AO, AO, T3 + add BO, BO, T4 + addi KK, KK, 4 + +DSTRM_LT_L2x4_END: + + +DSTRM_LT_L2x2_BEGIN: + + andi. T1, M, 2 + ble DSTRM_LT_L2x2_END + + mr BO, B + + +DSTRM_LT_L2x2_LOOP_START: + + + INIT_2x2 + + + addic. 
L, KK, 0 + ble DSTRM_LT_L2x2_SAVE + +DSTRM_LT_L2x2_LOOP: + + + KERNEL_2x2 + + addic. L, L, -1 + bgt DSTRM_LT_L2x2_LOOP + + +DSTRM_LT_L2x2_SAVE: + + SOLVE_LT_2x2 + + addi CO, CO, 2*SIZE + + sub T3, K, KK + sub T4, K, KK + slwi T3, T3, 1+BASE_SHIFT + slwi T4, T4, 1+BASE_SHIFT + add AO, AO, T3 + add BO, BO, T4 + addi KK, KK, 2 + +DSTRM_LT_L2x2_END: + + +DSTRM_LT_L2x1_BEGIN: + + andi. T1, M, 1 + ble DSTRM_LT_L2x1_END + + mr BO, B + + +DSTRM_LT_L2x1_LOOP_START: + + + INIT_1x2 + + + addic. L, KK, 0 + ble DSTRM_LT_L2x1_SAVE + +DSTRM_LT_L2x1_LOOP: + + + KERNEL_1x2 + + addic. L, L, -1 + bgt DSTRM_LT_L2x1_LOOP + + +DSTRM_LT_L2x1_SAVE: + + SOLVE_LT_1x2 + + addi CO, CO, 1*SIZE + + sub T3, K, KK + sub T4, K, KK + slwi T3, T3, 0+BASE_SHIFT + slwi T4, T4, 1+BASE_SHIFT + add AO, AO, T3 + add BO, BO, T4 + addi KK, KK, 1 + +DSTRM_LT_L2x1_END: + + slwi T1, K, 1+BASE_SHIFT + add B, B, T1 + +DSTRM_LT_L2_END: + +DSTRM_LT_L1_BEGIN: + + andi. T1, N, 1 + ble DSTRM_LT_L1_END + + mr CO, C + mr AO, A + + mr KK, OFFSET + srawi. I, M, 4 + ble DSTRM_LT_L1x16_END + + +DSTRM_LT_L1x16_BEGIN: + + mr BO, B + + +DSTRM_LT_L1x16_LOOP_START: + + + INIT_16x1 + + + addic. L, KK, 0 + ble DSTRM_LT_L1x16_SAVE + +DSTRM_LT_L1x16_LOOP: + + + KERNEL_16x1 + + addic. L, L, -1 + bgt DSTRM_LT_L1x16_LOOP + + +DSTRM_LT_L1x16_SAVE: + + SOLVE_LT_16x1 + + addi CO, CO, 16*SIZE + + sub T3, K, KK + sub T4, K, KK + slwi T3, T3, 4+BASE_SHIFT + slwi T4, T4, 0+BASE_SHIFT + add AO, AO, T3 + add BO, BO, T4 + addi KK, KK, 16 + + addic. I, I, -1 + bgt DSTRM_LT_L1x16_BEGIN + +DSTRM_LT_L1x16_END: + + +DSTRM_LT_L1x8_BEGIN: + + andi. T1, M, 8 + ble DSTRM_LT_L1x8_END + + mr BO, B + + +DSTRM_LT_L1x8_LOOP_START: + + + INIT_8x1 + + + addic. L, KK, 0 + ble DSTRM_LT_L1x8_SAVE + +DSTRM_LT_L1x8_LOOP: + + + KERNEL_8x1 + + addic. 
L, L, -1 + bgt DSTRM_LT_L1x8_LOOP + + +DSTRM_LT_L1x8_SAVE: + + SOLVE_LT_8x1 + + addi CO, CO, 8*SIZE + + sub T3, K, KK + sub T4, K, KK + slwi T3, T3, 3+BASE_SHIFT + slwi T4, T4, 0+BASE_SHIFT + add AO, AO, T3 + add BO, BO, T4 + addi KK, KK, 8 + +DSTRM_LT_L1x8_END: + + +DSTRM_LT_L1x4_BEGIN: + + andi. T1, M, 4 + ble DSTRM_LT_L1x4_END + + mr BO, B + + +DSTRM_LT_L1x4_LOOP_START: + + + INIT_4x1 + + + addic. L, KK, 0 + ble DSTRM_LT_L1x4_SAVE + +DSTRM_LT_L1x4_LOOP: + + + KERNEL_4x1 + + addic. L, L, -1 + bgt DSTRM_LT_L1x4_LOOP + + +DSTRM_LT_L1x4_SAVE: + + SOLVE_LT_4x1 + + addi CO, CO, 4*SIZE + + sub T3, K, KK + sub T4, K, KK + slwi T3, T3, 2+BASE_SHIFT + slwi T4, T4, 0+BASE_SHIFT + add AO, AO, T3 + add BO, BO, T4 + addi KK, KK, 4 + +DSTRM_LT_L1x4_END: + + +DSTRM_LT_L1x2_BEGIN: + + andi. T1, M, 2 + ble DSTRM_LT_L1x2_END + + mr BO, B + + +DSTRM_LT_L1x2_LOOP_START: + + + INIT_2x1 + + + addic. L, KK, 0 + ble DSTRM_LT_L1x2_SAVE + +DSTRM_LT_L1x2_LOOP: + + + KERNEL_2x1 + + addic. L, L, -1 + bgt DSTRM_LT_L1x2_LOOP + + +DSTRM_LT_L1x2_SAVE: + + SOLVE_LT_2x1 + + addi CO, CO, 2*SIZE + + sub T3, K, KK + sub T4, K, KK + slwi T3, T3, 1+BASE_SHIFT + slwi T4, T4, 0+BASE_SHIFT + add AO, AO, T3 + add BO, BO, T4 + addi KK, KK, 2 + +DSTRM_LT_L1x2_END: + + +DSTRM_LT_L1x1_BEGIN: + + andi. T1, M, 1 + ble DSTRM_LT_L1x1_END + + mr BO, B + + +DSTRM_LT_L1x1_LOOP_START: + + + INIT_1x1 + + + addic. L, KK, 0 + ble DSTRM_LT_L1x1_SAVE + +DSTRM_LT_L1x1_LOOP: + + + KERNEL_1x1 + + addic. 
L, L, -1 + bgt DSTRM_LT_L1x1_LOOP + + +DSTRM_LT_L1x1_SAVE: + + SOLVE_LT_1x1 + + addi CO, CO, 1*SIZE + + sub T3, K, KK + sub T4, K, KK + slwi T3, T3, 0+BASE_SHIFT + slwi T4, T4, 0+BASE_SHIFT + add AO, AO, T3 + add BO, BO, T4 + addi KK, KK, 1 + +DSTRM_LT_L1x1_END: + +DSTRM_LT_L1_END: diff --git a/kernel/power/dtrsm_macros_LT_16x4_power8.S b/kernel/power/dtrsm_macros_LT_16x4_power8.S new file mode 100644 index 000000000..14e8402c9 --- /dev/null +++ b/kernel/power/dtrsm_macros_LT_16x4_power8.S @@ -0,0 +1,4623 @@ + +.macro INIT_16x4 + + + xxlxor vs0, vs0, vs0 + + xvmovdp vs32, vs0 + xvmovdp vs33, vs0 + xvmovdp vs34, vs0 + xvmovdp vs35, vs0 + xvmovdp vs36, vs0 + xvmovdp vs37, vs0 + xvmovdp vs38, vs0 + xvmovdp vs39, vs0 + xvmovdp vs40, vs0 + xvmovdp vs41, vs0 + xvmovdp vs42, vs0 + xvmovdp vs43, vs0 + xvmovdp vs44, vs0 + xvmovdp vs45, vs0 + xvmovdp vs46, vs0 + xvmovdp vs47, vs0 + xvmovdp vs48, vs0 + xvmovdp vs49, vs0 + xvmovdp vs50, vs0 + xvmovdp vs51, vs0 + xvmovdp vs52, vs0 + xvmovdp vs53, vs0 + xvmovdp vs54, vs0 + xvmovdp vs55, vs0 + xvmovdp vs56, vs0 + xvmovdp vs57, vs0 + xvmovdp vs58, vs0 + xvmovdp vs59, vs0 + xvmovdp vs60, vs0 + xvmovdp vs61, vs0 + xvmovdp vs62, vs0 + xvmovdp vs63, vs0 + +.endm + + +.macro KERNEL_16x4 + + + lxvd2x vs0, o0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + addi AO, AO, 64 + + lxvd2x vs4, o0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO + lxvdsx vs17, o8, BO + lxvdsx vs18, o16, BO + lxvdsx vs19, o24, BO + + addi BO, BO, 32 + + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs34, vs0, vs18 + xvmaddadp vs35, vs0, vs19 + xvmaddadp vs36, vs1, vs16 + xvmaddadp vs37, vs1, vs17 + xvmaddadp vs38, vs1, vs18 + xvmaddadp vs39, vs1, vs19 + xvmaddadp vs40, vs2, vs16 + xvmaddadp vs41, vs2, vs17 + xvmaddadp vs42, vs2, vs18 + xvmaddadp vs43, vs2, vs19 + xvmaddadp vs44, vs3, vs16 + xvmaddadp vs45, vs3, vs17 + xvmaddadp vs46, vs3, vs18 + 
xvmaddadp vs47, vs3, vs19 + xvmaddadp vs48, vs4, vs16 + xvmaddadp vs49, vs4, vs17 + xvmaddadp vs50, vs4, vs18 + xvmaddadp vs51, vs4, vs19 + xvmaddadp vs52, vs5, vs16 + xvmaddadp vs53, vs5, vs17 + xvmaddadp vs54, vs5, vs18 + xvmaddadp vs55, vs5, vs19 + xvmaddadp vs56, vs6, vs16 + xvmaddadp vs57, vs6, vs17 + xvmaddadp vs58, vs6, vs18 + xvmaddadp vs59, vs6, vs19 + xvmaddadp vs60, vs7, vs16 + xvmaddadp vs61, vs7, vs17 + xvmaddadp vs62, vs7, vs18 + xvmaddadp vs63, vs7, vs19 + + +.endm + + +.macro INIT_8x4 + + + xxlxor vs0, vs0, vs0 + + xvmovdp vs32, vs0 + xvmovdp vs33, vs0 + xvmovdp vs34, vs0 + xvmovdp vs35, vs0 + xvmovdp vs36, vs0 + xvmovdp vs37, vs0 + xvmovdp vs38, vs0 + xvmovdp vs39, vs0 + xvmovdp vs40, vs0 + xvmovdp vs41, vs0 + xvmovdp vs42, vs0 + xvmovdp vs43, vs0 + xvmovdp vs44, vs0 + xvmovdp vs45, vs0 + xvmovdp vs46, vs0 + xvmovdp vs47, vs0 + +.endm + + +.macro KERNEL_8x4 + + + lxvd2x vs0, o0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO + lxvdsx vs17, o8, BO + lxvdsx vs18, o16, BO + lxvdsx vs19, o24, BO + + addi BO, BO, 32 + + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs34, vs0, vs18 + xvmaddadp vs35, vs0, vs19 + xvmaddadp vs36, vs1, vs16 + xvmaddadp vs37, vs1, vs17 + xvmaddadp vs38, vs1, vs18 + xvmaddadp vs39, vs1, vs19 + xvmaddadp vs40, vs2, vs16 + xvmaddadp vs41, vs2, vs17 + xvmaddadp vs42, vs2, vs18 + xvmaddadp vs43, vs2, vs19 + xvmaddadp vs44, vs3, vs16 + xvmaddadp vs45, vs3, vs17 + xvmaddadp vs46, vs3, vs18 + xvmaddadp vs47, vs3, vs19 + + +.endm + + +.macro INIT_4x4 + + + xxlxor vs0, vs0, vs0 + + xvmovdp vs32, vs0 + xvmovdp vs33, vs0 + xvmovdp vs34, vs0 + xvmovdp vs35, vs0 + xvmovdp vs36, vs0 + xvmovdp vs37, vs0 + xvmovdp vs38, vs0 + xvmovdp vs39, vs0 + +.endm + + +.macro KERNEL_4x4 + + + lxvd2x vs0, o0, AO + lxvd2x vs1, o16, AO + + addi AO, AO, 32 + + lxvdsx vs16, o0, BO + lxvdsx vs17, o8, BO + lxvdsx vs18, o16, BO + lxvdsx vs19, o24, BO + + addi BO, BO, 32 + + 
xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs34, vs0, vs18 + xvmaddadp vs35, vs0, vs19 + xvmaddadp vs36, vs1, vs16 + xvmaddadp vs37, vs1, vs17 + xvmaddadp vs38, vs1, vs18 + xvmaddadp vs39, vs1, vs19 + + +.endm + + +.macro INIT_2x4 + + + xxlxor vs0, vs0, vs0 + + xvmovdp vs32, vs0 + xvmovdp vs33, vs0 + xvmovdp vs34, vs0 + xvmovdp vs35, vs0 + +.endm + + +.macro KERNEL_2x4 + + + lxvd2x vs0, o0, AO + + addi AO, AO, 16 + + lxvdsx vs16, o0, BO + lxvdsx vs17, o8, BO + lxvdsx vs18, o16, BO + lxvdsx vs19, o24, BO + + addi BO, BO, 32 + + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs34, vs0, vs18 + xvmaddadp vs35, vs0, vs19 + + +.endm + + +.macro INIT_1x4 + + + xxlxor vs0, vs0, vs0 + + xvmovdp vs32, vs0 + xvmovdp vs33, vs0 + xvmovdp vs34, vs0 + xvmovdp vs35, vs0 + +.endm + + +.macro KERNEL_1x4 + + + lxvdsx vs0, o0, AO + + addi AO, AO, 8 + + lxvdsx vs16, o0, BO + lxvdsx vs17, o8, BO + lxvdsx vs18, o16, BO + lxvdsx vs19, o24, BO + + addi BO, BO, 32 + + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs34, vs0, vs18 + xvmaddadp vs35, vs0, vs19 + + +.endm + + +/*########################################################################################## + SOLVE_LT 16x4 +##########################################################################################*/ + +.macro SOLVE_LT_16x4 + + xxpermdi vs0, vs32, vs33, 0 + xxpermdi vs1, vs34, vs35, 0 + xxpermdi vs2, vs32, vs33, 3 + xxpermdi vs3, vs34, vs35, 3 + + xxpermdi vs4, vs36, vs37, 0 + xxpermdi vs5, vs38, vs39, 0 + xxpermdi vs6, vs36, vs37, 3 + xxpermdi vs7, vs38, vs39, 3 + + xxpermdi vs8, vs40, vs41, 0 + xxpermdi vs9, vs42, vs43, 0 + xxpermdi vs10, vs40, vs41, 3 + xxpermdi vs11, vs42, vs43, 3 + + xxpermdi vs12, vs44, vs45, 0 + xxpermdi vs13, vs46, vs47, 0 + xxpermdi vs14, vs44, vs45, 3 + xxpermdi vs15, vs46, vs47, 3 + + xxpermdi vs16, vs48, vs49, 0 + xxpermdi vs17, vs50, vs51, 0 + xxpermdi vs18, vs48, vs49, 3 + xxpermdi vs19, vs50, vs51, 3 + + xxpermdi vs20, vs52, 
vs53, 0 + xxpermdi vs21, vs54, vs55, 0 + xxpermdi vs22, vs52, vs53, 3 + xxpermdi vs23, vs54, vs55, 3 + + xxpermdi vs24, vs56, vs57, 0 + xxpermdi vs25, vs58, vs59, 0 + xxpermdi vs26, vs56, vs57, 3 + xxpermdi vs27, vs58, vs59, 3 + + xxpermdi vs28, vs60, vs61, 0 + xxpermdi vs29, vs62, vs63, 0 + xxpermdi vs30, vs60, vs61, 3 + xxpermdi vs31, vs62, vs63, 3 + + +//############### LOAD B ####################### + + + mr T1, BO + + lxvd2x vs32, o0, T1 + lxvd2x vs33, o16, T1 + lxvd2x vs34, o32, T1 + lxvd2x vs35, o48, T1 + + addi T1, T1, 64 + + lxvd2x vs36, o0, T1 + lxvd2x vs37, o16, T1 + lxvd2x vs38, o32, T1 + lxvd2x vs39, o48, T1 + + addi T1, T1, 64 + + lxvd2x vs40, o0, T1 + lxvd2x vs41, o16, T1 + lxvd2x vs42, o32, T1 + lxvd2x vs43, o48, T1 + + addi T1, T1, 64 + + lxvd2x vs44, o0, T1 + lxvd2x vs45, o16, T1 + lxvd2x vs46, o32, T1 + lxvd2x vs47, o48, T1 + + addi T1, T1, 64 + + lxvd2x vs48, o0, T1 + lxvd2x vs49, o16, T1 + lxvd2x vs50, o32, T1 + lxvd2x vs51, o48, T1 + + addi T1, T1, 64 + + lxvd2x vs52, o0, T1 + lxvd2x vs53, o16, T1 + lxvd2x vs54, o32, T1 + lxvd2x vs55, o48, T1 + + addi T1, T1, 64 + + lxvd2x vs56, o0, T1 + lxvd2x vs57, o16, T1 + lxvd2x vs58, o32, T1 + lxvd2x vs59, o48, T1 + + addi T1, T1, 64 + + lxvd2x vs60, o0, T1 + lxvd2x vs61, o16, T1 + lxvd2x vs62, o32, T1 + lxvd2x vs63, o48, T1 + + xvsubdp vs32, vs32, vs0 + xvsubdp vs33, vs33, vs1 + xvsubdp vs34, vs34, vs2 + xvsubdp vs35, vs35, vs3 + xvsubdp vs36, vs36, vs4 + xvsubdp vs37, vs37, vs5 + xvsubdp vs38, vs38, vs6 + xvsubdp vs39, vs39, vs7 + xvsubdp vs40, vs40, vs8 + xvsubdp vs41, vs41, vs9 + xvsubdp vs42, vs42, vs10 + xvsubdp vs43, vs43, vs11 + xvsubdp vs44, vs44, vs12 + xvsubdp vs45, vs45, vs13 + xvsubdp vs46, vs46, vs14 + xvsubdp vs47, vs47, vs15 + xvsubdp vs48, vs48, vs16 + xvsubdp vs49, vs49, vs17 + xvsubdp vs50, vs50, vs18 + xvsubdp vs51, vs51, vs19 + xvsubdp vs52, vs52, vs20 + xvsubdp vs53, vs53, vs21 + xvsubdp vs54, vs54, vs22 + xvsubdp vs55, vs55, vs23 + xvsubdp vs56, vs56, vs24 + xvsubdp vs57, vs57, 
vs25 + xvsubdp vs58, vs58, vs26 + xvsubdp vs59, vs59, vs27 + xvsubdp vs60, vs60, vs28 + xvsubdp vs61, vs61, vs29 + xvsubdp vs62, vs62, vs30 + xvsubdp vs63, vs63, vs31 + + mr T1, AO + + +//############### OFFSET 0 ####################### + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs4, o0, T1 + lxvdsx vs5, o8, T1 + lxvdsx vs6, o16, T1 + lxvdsx vs7, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs8, o0, T1 + lxvdsx vs9, o8, T1 + lxvdsx vs10, o16, T1 + lxvdsx vs11, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs12, o0, T1 + lxvdsx vs13, o8, T1 + lxvdsx vs14, o16, T1 + lxvdsx vs15, o24, T1 + + addi T1, T1, 32 + + xvmuldp vs32, vs32, vs0 + xvmuldp vs33, vs33, vs0 + + xvnmsubadp vs34, vs32, vs1 + xvnmsubadp vs35, vs33, vs1 + xvnmsubadp vs36, vs32, vs2 + xvnmsubadp vs37, vs33, vs2 + xvnmsubadp vs38, vs32, vs3 + xvnmsubadp vs39, vs33, vs3 + xvnmsubadp vs40, vs32, vs4 + xvnmsubadp vs41, vs33, vs4 + xvnmsubadp vs42, vs32, vs5 + xvnmsubadp vs43, vs33, vs5 + xvnmsubadp vs44, vs32, vs6 + xvnmsubadp vs45, vs33, vs6 + xvnmsubadp vs46, vs32, vs7 + xvnmsubadp vs47, vs33, vs7 + xvnmsubadp vs48, vs32, vs8 + xvnmsubadp vs49, vs33, vs8 + xvnmsubadp vs50, vs32, vs9 + xvnmsubadp vs51, vs33, vs9 + xvnmsubadp vs52, vs32, vs10 + xvnmsubadp vs53, vs33, vs10 + xvnmsubadp vs54, vs32, vs11 + xvnmsubadp vs55, vs33, vs11 + xvnmsubadp vs56, vs32, vs12 + xvnmsubadp vs57, vs33, vs12 + xvnmsubadp vs58, vs32, vs13 + xvnmsubadp vs59, vs33, vs13 + xvnmsubadp vs60, vs32, vs14 + xvnmsubadp vs61, vs33, vs14 + xvnmsubadp vs62, vs32, vs15 + xvnmsubadp vs63, vs33, vs15 + +//############### OFFSET 1 ####################### + + addi T1, T1, 1*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs4, o0, T1 + lxvdsx vs5, o8, T1 + lxvdsx vs6, o16, T1 + lxvdsx vs7, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs8, o0, T1 + lxvdsx vs9, o8, T1 + lxvdsx vs10, o16, T1 + lxvdsx vs11, o24, T1 + 
+ addi T1, T1, 32 + + lxvdsx vs12, o0, T1 + lxvdsx vs13, o8, T1 + lxvdsx vs14, o16, T1 + + addi T1, T1, 24 + + xvmuldp vs34, vs34, vs0 + xvmuldp vs35, vs35, vs0 + + xvnmsubadp vs36, vs34, vs1 + xvnmsubadp vs37, vs35, vs1 + xvnmsubadp vs38, vs34, vs2 + xvnmsubadp vs39, vs35, vs2 + xvnmsubadp vs40, vs34, vs3 + xvnmsubadp vs41, vs35, vs3 + xvnmsubadp vs42, vs34, vs4 + xvnmsubadp vs43, vs35, vs4 + xvnmsubadp vs44, vs34, vs5 + xvnmsubadp vs45, vs35, vs5 + xvnmsubadp vs46, vs34, vs6 + xvnmsubadp vs47, vs35, vs6 + xvnmsubadp vs48, vs34, vs7 + xvnmsubadp vs49, vs35, vs7 + xvnmsubadp vs50, vs34, vs8 + xvnmsubadp vs51, vs35, vs8 + xvnmsubadp vs52, vs34, vs9 + xvnmsubadp vs53, vs35, vs9 + xvnmsubadp vs54, vs34, vs10 + xvnmsubadp vs55, vs35, vs10 + xvnmsubadp vs56, vs34, vs11 + xvnmsubadp vs57, vs35, vs11 + xvnmsubadp vs58, vs34, vs12 + xvnmsubadp vs59, vs35, vs12 + xvnmsubadp vs60, vs34, vs13 + xvnmsubadp vs61, vs35, vs13 + xvnmsubadp vs62, vs34, vs14 + xvnmsubadp vs63, vs35, vs14 + +//############### OFFSET 2 ####################### + + addi T1, T1, 2*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs4, o0, T1 + lxvdsx vs5, o8, T1 + lxvdsx vs6, o16, T1 + lxvdsx vs7, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs8, o0, T1 + lxvdsx vs9, o8, T1 + lxvdsx vs10, o16, T1 + lxvdsx vs11, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs12, o0, T1 + lxvdsx vs13, o8, T1 + + addi T1, T1, 16 + + xvmuldp vs36, vs36, vs0 + xvmuldp vs37, vs37, vs0 + + xvnmsubadp vs38, vs36, vs1 + xvnmsubadp vs39, vs37, vs1 + xvnmsubadp vs40, vs36, vs2 + xvnmsubadp vs41, vs37, vs2 + xvnmsubadp vs42, vs36, vs3 + xvnmsubadp vs43, vs37, vs3 + xvnmsubadp vs44, vs36, vs4 + xvnmsubadp vs45, vs37, vs4 + xvnmsubadp vs46, vs36, vs5 + xvnmsubadp vs47, vs37, vs5 + xvnmsubadp vs48, vs36, vs6 + xvnmsubadp vs49, vs37, vs6 + xvnmsubadp vs50, vs36, vs7 + xvnmsubadp vs51, vs37, vs7 + xvnmsubadp vs52, vs36, vs8 + xvnmsubadp vs53, vs37, vs8 + xvnmsubadp vs54, vs36, 
vs9 + xvnmsubadp vs55, vs37, vs9 + xvnmsubadp vs56, vs36, vs10 + xvnmsubadp vs57, vs37, vs10 + xvnmsubadp vs58, vs36, vs11 + xvnmsubadp vs59, vs37, vs11 + xvnmsubadp vs60, vs36, vs12 + xvnmsubadp vs61, vs37, vs12 + xvnmsubadp vs62, vs36, vs13 + xvnmsubadp vs63, vs37, vs13 + +//############### OFFSET 3 ####################### + + addi T1, T1, 3*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs4, o0, T1 + lxvdsx vs5, o8, T1 + lxvdsx vs6, o16, T1 + lxvdsx vs7, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs8, o0, T1 + lxvdsx vs9, o8, T1 + lxvdsx vs10, o16, T1 + lxvdsx vs11, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs12, o0, T1 + + addi T1, T1, 8 + + xvmuldp vs38, vs38, vs0 + xvmuldp vs39, vs39, vs0 + + xvnmsubadp vs40, vs38, vs1 + xvnmsubadp vs41, vs39, vs1 + xvnmsubadp vs42, vs38, vs2 + xvnmsubadp vs43, vs39, vs2 + xvnmsubadp vs44, vs38, vs3 + xvnmsubadp vs45, vs39, vs3 + xvnmsubadp vs46, vs38, vs4 + xvnmsubadp vs47, vs39, vs4 + xvnmsubadp vs48, vs38, vs5 + xvnmsubadp vs49, vs39, vs5 + xvnmsubadp vs50, vs38, vs6 + xvnmsubadp vs51, vs39, vs6 + xvnmsubadp vs52, vs38, vs7 + xvnmsubadp vs53, vs39, vs7 + xvnmsubadp vs54, vs38, vs8 + xvnmsubadp vs55, vs39, vs8 + xvnmsubadp vs56, vs38, vs9 + xvnmsubadp vs57, vs39, vs9 + xvnmsubadp vs58, vs38, vs10 + xvnmsubadp vs59, vs39, vs10 + xvnmsubadp vs60, vs38, vs11 + xvnmsubadp vs61, vs39, vs11 + xvnmsubadp vs62, vs38, vs12 + xvnmsubadp vs63, vs39, vs12 + +//############### OFFSET 4 ####################### + + addi T1, T1, 4*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs4, o0, T1 + lxvdsx vs5, o8, T1 + lxvdsx vs6, o16, T1 + lxvdsx vs7, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs8, o0, T1 + lxvdsx vs9, o8, T1 + lxvdsx vs10, o16, T1 + lxvdsx vs11, o24, T1 + + addi T1, T1, 32 + + xvmuldp vs40, vs40, vs0 + xvmuldp vs41, vs41, vs0 + + xvnmsubadp vs42, vs40, vs1 + xvnmsubadp vs43, vs41, vs1 + 
xvnmsubadp vs44, vs40, vs2 + xvnmsubadp vs45, vs41, vs2 + xvnmsubadp vs46, vs40, vs3 + xvnmsubadp vs47, vs41, vs3 + xvnmsubadp vs48, vs40, vs4 + xvnmsubadp vs49, vs41, vs4 + xvnmsubadp vs50, vs40, vs5 + xvnmsubadp vs51, vs41, vs5 + xvnmsubadp vs52, vs40, vs6 + xvnmsubadp vs53, vs41, vs6 + xvnmsubadp vs54, vs40, vs7 + xvnmsubadp vs55, vs41, vs7 + xvnmsubadp vs56, vs40, vs8 + xvnmsubadp vs57, vs41, vs8 + xvnmsubadp vs58, vs40, vs9 + xvnmsubadp vs59, vs41, vs9 + xvnmsubadp vs60, vs40, vs10 + xvnmsubadp vs61, vs41, vs10 + xvnmsubadp vs62, vs40, vs11 + xvnmsubadp vs63, vs41, vs11 + +//############### OFFSET 5 ####################### + + addi T1, T1, 5*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs4, o0, T1 + lxvdsx vs5, o8, T1 + lxvdsx vs6, o16, T1 + lxvdsx vs7, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs8, o0, T1 + lxvdsx vs9, o8, T1 + lxvdsx vs10, o16, T1 + + addi T1, T1, 24 + + xvmuldp vs42, vs42, vs0 + xvmuldp vs43, vs43, vs0 + + xvnmsubadp vs44, vs42, vs1 + xvnmsubadp vs45, vs43, vs1 + xvnmsubadp vs46, vs42, vs2 + xvnmsubadp vs47, vs43, vs2 + xvnmsubadp vs48, vs42, vs3 + xvnmsubadp vs49, vs43, vs3 + xvnmsubadp vs50, vs42, vs4 + xvnmsubadp vs51, vs43, vs4 + xvnmsubadp vs52, vs42, vs5 + xvnmsubadp vs53, vs43, vs5 + xvnmsubadp vs54, vs42, vs6 + xvnmsubadp vs55, vs43, vs6 + xvnmsubadp vs56, vs42, vs7 + xvnmsubadp vs57, vs43, vs7 + xvnmsubadp vs58, vs42, vs8 + xvnmsubadp vs59, vs43, vs8 + xvnmsubadp vs60, vs42, vs9 + xvnmsubadp vs61, vs43, vs9 + xvnmsubadp vs62, vs42, vs10 + xvnmsubadp vs63, vs43, vs10 + +//############### OFFSET 6 ####################### + + addi T1, T1, 6*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs4, o0, T1 + lxvdsx vs5, o8, T1 + lxvdsx vs6, o16, T1 + lxvdsx vs7, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs8, o0, T1 + lxvdsx vs9, o8, T1 + + addi T1, T1, 16 + + xvmuldp vs44, vs44, vs0 + xvmuldp 
vs45, vs45, vs0 + + xvnmsubadp vs46, vs44, vs1 + xvnmsubadp vs47, vs45, vs1 + xvnmsubadp vs48, vs44, vs2 + xvnmsubadp vs49, vs45, vs2 + xvnmsubadp vs50, vs44, vs3 + xvnmsubadp vs51, vs45, vs3 + xvnmsubadp vs52, vs44, vs4 + xvnmsubadp vs53, vs45, vs4 + xvnmsubadp vs54, vs44, vs5 + xvnmsubadp vs55, vs45, vs5 + xvnmsubadp vs56, vs44, vs6 + xvnmsubadp vs57, vs45, vs6 + xvnmsubadp vs58, vs44, vs7 + xvnmsubadp vs59, vs45, vs7 + xvnmsubadp vs60, vs44, vs8 + xvnmsubadp vs61, vs45, vs8 + xvnmsubadp vs62, vs44, vs9 + xvnmsubadp vs63, vs45, vs9 + +//############### OFFSET 7 ####################### + + addi T1, T1, 7*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs4, o0, T1 + lxvdsx vs5, o8, T1 + lxvdsx vs6, o16, T1 + lxvdsx vs7, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs8, o0, T1 + + addi T1, T1, 8 + + xvmuldp vs46, vs46, vs0 + xvmuldp vs47, vs47, vs0 + + xvnmsubadp vs48, vs46, vs1 + xvnmsubadp vs49, vs47, vs1 + xvnmsubadp vs50, vs46, vs2 + xvnmsubadp vs51, vs47, vs2 + xvnmsubadp vs52, vs46, vs3 + xvnmsubadp vs53, vs47, vs3 + xvnmsubadp vs54, vs46, vs4 + xvnmsubadp vs55, vs47, vs4 + xvnmsubadp vs56, vs46, vs5 + xvnmsubadp vs57, vs47, vs5 + xvnmsubadp vs58, vs46, vs6 + xvnmsubadp vs59, vs47, vs6 + xvnmsubadp vs60, vs46, vs7 + xvnmsubadp vs61, vs47, vs7 + xvnmsubadp vs62, vs46, vs8 + xvnmsubadp vs63, vs47, vs8 + +//############### OFFSET 8 ####################### + + addi T1, T1, 8*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs4, o0, T1 + lxvdsx vs5, o8, T1 + lxvdsx vs6, o16, T1 + lxvdsx vs7, o24, T1 + + addi T1, T1, 32 + + xvmuldp vs48, vs48, vs0 + xvmuldp vs49, vs49, vs0 + + xvnmsubadp vs50, vs48, vs1 + xvnmsubadp vs51, vs49, vs1 + xvnmsubadp vs52, vs48, vs2 + xvnmsubadp vs53, vs49, vs2 + xvnmsubadp vs54, vs48, vs3 + xvnmsubadp vs55, vs49, vs3 + xvnmsubadp vs56, vs48, vs4 + xvnmsubadp vs57, vs49, vs4 + xvnmsubadp vs58, 
vs48, vs5 + xvnmsubadp vs59, vs49, vs5 + xvnmsubadp vs60, vs48, vs6 + xvnmsubadp vs61, vs49, vs6 + xvnmsubadp vs62, vs48, vs7 + xvnmsubadp vs63, vs49, vs7 + +//############### OFFSET 9 ####################### + + addi T1, T1, 9*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs4, o0, T1 + lxvdsx vs5, o8, T1 + lxvdsx vs6, o16, T1 + + addi T1, T1, 24 + + xvmuldp vs50, vs50, vs0 + xvmuldp vs51, vs51, vs0 + + xvnmsubadp vs52, vs50, vs1 + xvnmsubadp vs53, vs51, vs1 + xvnmsubadp vs54, vs50, vs2 + xvnmsubadp vs55, vs51, vs2 + xvnmsubadp vs56, vs50, vs3 + xvnmsubadp vs57, vs51, vs3 + xvnmsubadp vs58, vs50, vs4 + xvnmsubadp vs59, vs51, vs4 + xvnmsubadp vs60, vs50, vs5 + xvnmsubadp vs61, vs51, vs5 + xvnmsubadp vs62, vs50, vs6 + xvnmsubadp vs63, vs51, vs6 + +//############### OFFSET 10 ####################### + + addi T1, T1, 10*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs4, o0, T1 + lxvdsx vs5, o8, T1 + + addi T1, T1, 16 + + xvmuldp vs52, vs52, vs0 + xvmuldp vs53, vs53, vs0 + + xvnmsubadp vs54, vs52, vs1 + xvnmsubadp vs55, vs53, vs1 + xvnmsubadp vs56, vs52, vs2 + xvnmsubadp vs57, vs53, vs2 + xvnmsubadp vs58, vs52, vs3 + xvnmsubadp vs59, vs53, vs3 + xvnmsubadp vs60, vs52, vs4 + xvnmsubadp vs61, vs53, vs4 + xvnmsubadp vs62, vs52, vs5 + xvnmsubadp vs63, vs53, vs5 + +//############### OFFSET 11 ####################### + + addi T1, T1, 11*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs4, o0, T1 + + addi T1, T1, 8 + + xvmuldp vs54, vs54, vs0 + xvmuldp vs55, vs55, vs0 + + xvnmsubadp vs56, vs54, vs1 + xvnmsubadp vs57, vs55, vs1 + xvnmsubadp vs58, vs54, vs2 + xvnmsubadp vs59, vs55, vs2 + xvnmsubadp vs60, vs54, vs3 + xvnmsubadp vs61, vs55, vs3 + xvnmsubadp vs62, vs54, vs4 + xvnmsubadp vs63, vs55, vs4 + +//############### OFFSET 12 
####################### + + addi T1, T1, 12*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + xvmuldp vs56, vs56, vs0 + xvmuldp vs57, vs57, vs0 + + xvnmsubadp vs58, vs56, vs1 + xvnmsubadp vs59, vs57, vs1 + xvnmsubadp vs60, vs56, vs2 + xvnmsubadp vs61, vs57, vs2 + xvnmsubadp vs62, vs56, vs3 + xvnmsubadp vs63, vs57, vs3 + +//############### OFFSET 13 ####################### + + addi T1, T1, 13*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + + addi T1, T1, 24 + + xvmuldp vs58, vs58, vs0 + xvmuldp vs59, vs59, vs0 + + xvnmsubadp vs60, vs58, vs1 + xvnmsubadp vs61, vs59, vs1 + xvnmsubadp vs62, vs58, vs2 + xvnmsubadp vs63, vs59, vs2 + +//############### OFFSET 14 ####################### + + addi T1, T1, 14*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + + addi T1, T1, 16 + + xvmuldp vs60, vs60, vs0 + xvmuldp vs61, vs61, vs0 + + xvnmsubadp vs62, vs60, vs1 + xvnmsubadp vs63, vs61, vs1 + +//############### OFFSET 15 ####################### + + addi T1, T1, 15*SIZE + + lxvdsx vs0, o0, T1 + + addi T1, T1, 8 + + xvmuldp vs62, vs62, vs0 + xvmuldp vs63, vs63, vs0 + + +//############### SAVE B ####################### + + + mr T1, BO + + + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 + + addi T1, T1, 64 + + stxvd2x vs36, o0, T1 + stxvd2x vs37, o16, T1 + stxvd2x vs38, o32, T1 + stxvd2x vs39, o48, T1 + + addi T1, T1, 64 + + stxvd2x vs40, o0, T1 + stxvd2x vs41, o16, T1 + stxvd2x vs42, o32, T1 + stxvd2x vs43, o48, T1 + + addi T1, T1, 64 + + stxvd2x vs44, o0, T1 + stxvd2x vs45, o16, T1 + stxvd2x vs46, o32, T1 + stxvd2x vs47, o48, T1 + + addi T1, T1, 64 + + stxvd2x vs48, o0, T1 + stxvd2x vs49, o16, T1 + stxvd2x vs50, o32, T1 + stxvd2x vs51, o48, T1 + + addi T1, T1, 64 + + stxvd2x vs52, o0, T1 + stxvd2x vs53, o16, T1 + stxvd2x vs54, o32, T1 + stxvd2x vs55, o48, T1 + + addi T1, T1, 64 + + stxvd2x vs56, o0, T1 + stxvd2x vs57, o16, T1 + stxvd2x vs58, 
o32, T1 + stxvd2x vs59, o48, T1 + + addi T1, T1, 64 + + stxvd2x vs60, o0, T1 + stxvd2x vs61, o16, T1 + stxvd2x vs62, o32, T1 + stxvd2x vs63, o48, T1 + +//############### SAVE C ####################### + + + mr T1, CO + add T2, CO, LDC + + + stxsdx vs32, o0, T1 + xxswapd vs32, vs32 + stxsdx vs34, o8, T1 + xxswapd vs34, vs34 + stxsdx vs36, o16, T1 + xxswapd vs36, vs36 + stxsdx vs38, o24, T1 + xxswapd vs38, vs38 + + addi T1, T1, 32 + + stxsdx vs40, o0, T1 + xxswapd vs40, vs40 + stxsdx vs42, o8, T1 + xxswapd vs42, vs42 + stxsdx vs44, o16, T1 + xxswapd vs44, vs44 + stxsdx vs46, o24, T1 + xxswapd vs46, vs46 + + addi T1, T1, 32 + + stxsdx vs48, o0, T1 + xxswapd vs48, vs48 + stxsdx vs50, o8, T1 + xxswapd vs50, vs50 + stxsdx vs52, o16, T1 + xxswapd vs52, vs52 + stxsdx vs54, o24, T1 + xxswapd vs54, vs54 + + addi T1, T1, 32 + + stxsdx vs56, o0, T1 + xxswapd vs56, vs56 + stxsdx vs58, o8, T1 + xxswapd vs58, vs58 + stxsdx vs60, o16, T1 + xxswapd vs60, vs60 + stxsdx vs62, o24, T1 + xxswapd vs62, vs62 + + stxsdx vs32, o0, T2 + stxsdx vs34, o8, T2 + stxsdx vs36, o16, T2 + stxsdx vs38, o24, T2 + + addi T2, T2, 32 + + stxsdx vs40, o0, T2 + stxsdx vs42, o8, T2 + stxsdx vs44, o16, T2 + stxsdx vs46, o24, T2 + + addi T2, T2, 32 + + stxsdx vs48, o0, T2 + stxsdx vs50, o8, T2 + stxsdx vs52, o16, T2 + stxsdx vs54, o24, T2 + + addi T2, T2, 32 + + stxsdx vs56, o0, T2 + stxsdx vs58, o8, T2 + stxsdx vs60, o16, T2 + stxsdx vs62, o24, T2 + + mr T1, CO + add T2, CO, LDC + + + add T1, T2, LDC + add T2, T1, LDC + + + stxsdx vs33, o0, T1 + xxswapd vs33, vs33 + stxsdx vs35, o8, T1 + xxswapd vs35, vs35 + stxsdx vs37, o16, T1 + xxswapd vs37, vs37 + stxsdx vs39, o24, T1 + xxswapd vs39, vs39 + + addi T1, T1, 32 + + stxsdx vs41, o0, T1 + xxswapd vs41, vs41 + stxsdx vs43, o8, T1 + xxswapd vs43, vs43 + stxsdx vs45, o16, T1 + xxswapd vs45, vs45 + stxsdx vs47, o24, T1 + xxswapd vs47, vs47 + + addi T1, T1, 32 + + stxsdx vs49, o0, T1 + xxswapd vs49, vs49 + stxsdx vs51, o8, T1 + xxswapd vs51, vs51 + stxsdx vs53, 
o16, T1 + xxswapd vs53, vs53 + stxsdx vs55, o24, T1 + xxswapd vs55, vs55 + + addi T1, T1, 32 + + stxsdx vs57, o0, T1 + xxswapd vs57, vs57 + stxsdx vs59, o8, T1 + xxswapd vs59, vs59 + stxsdx vs61, o16, T1 + xxswapd vs61, vs61 + stxsdx vs63, o24, T1 + xxswapd vs63, vs63 + + stxsdx vs33, o0, T2 + stxsdx vs35, o8, T2 + stxsdx vs37, o16, T2 + stxsdx vs39, o24, T2 + + addi T2, T2, 32 + + stxsdx vs41, o0, T2 + stxsdx vs43, o8, T2 + stxsdx vs45, o16, T2 + stxsdx vs47, o24, T2 + + addi T2, T2, 32 + + stxsdx vs49, o0, T2 + stxsdx vs51, o8, T2 + stxsdx vs53, o16, T2 + stxsdx vs55, o24, T2 + + addi T2, T2, 32 + + stxsdx vs57, o0, T2 + stxsdx vs59, o8, T2 + stxsdx vs61, o16, T2 + stxsdx vs63, o24, T2 + +.endm + + +/*########################################################################################## + SOLVE_LT 8x4 +##########################################################################################*/ + +.macro SOLVE_LT_8x4 + + xxpermdi vs0, vs32, vs33, 0 + xxpermdi vs1, vs34, vs35, 0 + xxpermdi vs2, vs32, vs33, 3 + xxpermdi vs3, vs34, vs35, 3 + + xxpermdi vs4, vs36, vs37, 0 + xxpermdi vs5, vs38, vs39, 0 + xxpermdi vs6, vs36, vs37, 3 + xxpermdi vs7, vs38, vs39, 3 + + xxpermdi vs8, vs40, vs41, 0 + xxpermdi vs9, vs42, vs43, 0 + xxpermdi vs10, vs40, vs41, 3 + xxpermdi vs11, vs42, vs43, 3 + + xxpermdi vs12, vs44, vs45, 0 + xxpermdi vs13, vs46, vs47, 0 + xxpermdi vs14, vs44, vs45, 3 + xxpermdi vs15, vs46, vs47, 3 + + +//############### LOAD B ####################### + + + mr T1, BO + + lxvd2x vs32, o0, T1 + lxvd2x vs33, o16, T1 + lxvd2x vs34, o32, T1 + lxvd2x vs35, o48, T1 + + addi T1, T1, 64 + + lxvd2x vs36, o0, T1 + lxvd2x vs37, o16, T1 + lxvd2x vs38, o32, T1 + lxvd2x vs39, o48, T1 + + addi T1, T1, 64 + + lxvd2x vs40, o0, T1 + lxvd2x vs41, o16, T1 + lxvd2x vs42, o32, T1 + lxvd2x vs43, o48, T1 + + addi T1, T1, 64 + + lxvd2x vs44, o0, T1 + lxvd2x vs45, o16, T1 + lxvd2x vs46, o32, T1 + lxvd2x vs47, o48, T1 + + xvsubdp vs32, vs32, vs0 + xvsubdp vs33, vs33, vs1 + xvsubdp 
vs34, vs34, vs2 + xvsubdp vs35, vs35, vs3 + xvsubdp vs36, vs36, vs4 + xvsubdp vs37, vs37, vs5 + xvsubdp vs38, vs38, vs6 + xvsubdp vs39, vs39, vs7 + xvsubdp vs40, vs40, vs8 + xvsubdp vs41, vs41, vs9 + xvsubdp vs42, vs42, vs10 + xvsubdp vs43, vs43, vs11 + xvsubdp vs44, vs44, vs12 + xvsubdp vs45, vs45, vs13 + xvsubdp vs46, vs46, vs14 + xvsubdp vs47, vs47, vs15 + + mr T1, AO + + +//############### OFFSET 0 ####################### + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs4, o0, T1 + lxvdsx vs5, o8, T1 + lxvdsx vs6, o16, T1 + lxvdsx vs7, o24, T1 + + addi T1, T1, 32 + + xvmuldp vs32, vs32, vs0 + xvmuldp vs33, vs33, vs0 + + xvnmsubadp vs34, vs32, vs1 + xvnmsubadp vs35, vs33, vs1 + xvnmsubadp vs36, vs32, vs2 + xvnmsubadp vs37, vs33, vs2 + xvnmsubadp vs38, vs32, vs3 + xvnmsubadp vs39, vs33, vs3 + xvnmsubadp vs40, vs32, vs4 + xvnmsubadp vs41, vs33, vs4 + xvnmsubadp vs42, vs32, vs5 + xvnmsubadp vs43, vs33, vs5 + xvnmsubadp vs44, vs32, vs6 + xvnmsubadp vs45, vs33, vs6 + xvnmsubadp vs46, vs32, vs7 + xvnmsubadp vs47, vs33, vs7 + +//############### OFFSET 1 ####################### + + addi T1, T1, 1*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs4, o0, T1 + lxvdsx vs5, o8, T1 + lxvdsx vs6, o16, T1 + + addi T1, T1, 24 + + xvmuldp vs34, vs34, vs0 + xvmuldp vs35, vs35, vs0 + + xvnmsubadp vs36, vs34, vs1 + xvnmsubadp vs37, vs35, vs1 + xvnmsubadp vs38, vs34, vs2 + xvnmsubadp vs39, vs35, vs2 + xvnmsubadp vs40, vs34, vs3 + xvnmsubadp vs41, vs35, vs3 + xvnmsubadp vs42, vs34, vs4 + xvnmsubadp vs43, vs35, vs4 + xvnmsubadp vs44, vs34, vs5 + xvnmsubadp vs45, vs35, vs5 + xvnmsubadp vs46, vs34, vs6 + xvnmsubadp vs47, vs35, vs6 + +//############### OFFSET 2 ####################### + + addi T1, T1, 2*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs4, o0, T1 + 
lxvdsx vs5, o8, T1 + + addi T1, T1, 16 + + xvmuldp vs36, vs36, vs0 + xvmuldp vs37, vs37, vs0 + + xvnmsubadp vs38, vs36, vs1 + xvnmsubadp vs39, vs37, vs1 + xvnmsubadp vs40, vs36, vs2 + xvnmsubadp vs41, vs37, vs2 + xvnmsubadp vs42, vs36, vs3 + xvnmsubadp vs43, vs37, vs3 + xvnmsubadp vs44, vs36, vs4 + xvnmsubadp vs45, vs37, vs4 + xvnmsubadp vs46, vs36, vs5 + xvnmsubadp vs47, vs37, vs5 + +//############### OFFSET 3 ####################### + + addi T1, T1, 3*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs4, o0, T1 + + addi T1, T1, 8 + + xvmuldp vs38, vs38, vs0 + xvmuldp vs39, vs39, vs0 + + xvnmsubadp vs40, vs38, vs1 + xvnmsubadp vs41, vs39, vs1 + xvnmsubadp vs42, vs38, vs2 + xvnmsubadp vs43, vs39, vs2 + xvnmsubadp vs44, vs38, vs3 + xvnmsubadp vs45, vs39, vs3 + xvnmsubadp vs46, vs38, vs4 + xvnmsubadp vs47, vs39, vs4 + +//############### OFFSET 4 ####################### + + addi T1, T1, 4*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + xvmuldp vs40, vs40, vs0 + xvmuldp vs41, vs41, vs0 + + xvnmsubadp vs42, vs40, vs1 + xvnmsubadp vs43, vs41, vs1 + xvnmsubadp vs44, vs40, vs2 + xvnmsubadp vs45, vs41, vs2 + xvnmsubadp vs46, vs40, vs3 + xvnmsubadp vs47, vs41, vs3 + +//############### OFFSET 5 ####################### + + addi T1, T1, 5*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + + addi T1, T1, 24 + + xvmuldp vs42, vs42, vs0 + xvmuldp vs43, vs43, vs0 + + xvnmsubadp vs44, vs42, vs1 + xvnmsubadp vs45, vs43, vs1 + xvnmsubadp vs46, vs42, vs2 + xvnmsubadp vs47, vs43, vs2 + +//############### OFFSET 6 ####################### + + addi T1, T1, 6*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + + addi T1, T1, 16 + + xvmuldp vs44, vs44, vs0 + xvmuldp vs45, vs45, vs0 + + xvnmsubadp vs46, vs44, vs1 + xvnmsubadp vs47, vs45, vs1 + +//############### OFFSET 7 ####################### + + addi T1, T1, 7*SIZE + + lxvdsx 
vs0, o0, T1 + + addi T1, T1, 8 + + xvmuldp vs46, vs46, vs0 + xvmuldp vs47, vs47, vs0 + + +//############### SAVE B ####################### + + + mr T1, BO + + + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 + + addi T1, T1, 64 + + stxvd2x vs36, o0, T1 + stxvd2x vs37, o16, T1 + stxvd2x vs38, o32, T1 + stxvd2x vs39, o48, T1 + + addi T1, T1, 64 + + stxvd2x vs40, o0, T1 + stxvd2x vs41, o16, T1 + stxvd2x vs42, o32, T1 + stxvd2x vs43, o48, T1 + + addi T1, T1, 64 + + stxvd2x vs44, o0, T1 + stxvd2x vs45, o16, T1 + stxvd2x vs46, o32, T1 + stxvd2x vs47, o48, T1 + +//############### SAVE C ####################### + + + mr T1, CO + add T2, CO, LDC + + + stxsdx vs32, o0, T1 + xxswapd vs32, vs32 + stxsdx vs34, o8, T1 + xxswapd vs34, vs34 + stxsdx vs36, o16, T1 + xxswapd vs36, vs36 + stxsdx vs38, o24, T1 + xxswapd vs38, vs38 + + addi T1, T1, 32 + + stxsdx vs40, o0, T1 + xxswapd vs40, vs40 + stxsdx vs42, o8, T1 + xxswapd vs42, vs42 + stxsdx vs44, o16, T1 + xxswapd vs44, vs44 + stxsdx vs46, o24, T1 + xxswapd vs46, vs46 + + stxsdx vs32, o0, T2 + stxsdx vs34, o8, T2 + stxsdx vs36, o16, T2 + stxsdx vs38, o24, T2 + + addi T2, T2, 32 + + stxsdx vs40, o0, T2 + stxsdx vs42, o8, T2 + stxsdx vs44, o16, T2 + stxsdx vs46, o24, T2 + + mr T1, CO + add T2, CO, LDC + + + add T1, T2, LDC + add T2, T1, LDC + + + stxsdx vs33, o0, T1 + xxswapd vs33, vs33 + stxsdx vs35, o8, T1 + xxswapd vs35, vs35 + stxsdx vs37, o16, T1 + xxswapd vs37, vs37 + stxsdx vs39, o24, T1 + xxswapd vs39, vs39 + + addi T1, T1, 32 + + stxsdx vs41, o0, T1 + xxswapd vs41, vs41 + stxsdx vs43, o8, T1 + xxswapd vs43, vs43 + stxsdx vs45, o16, T1 + xxswapd vs45, vs45 + stxsdx vs47, o24, T1 + xxswapd vs47, vs47 + + stxsdx vs33, o0, T2 + stxsdx vs35, o8, T2 + stxsdx vs37, o16, T2 + stxsdx vs39, o24, T2 + + addi T2, T2, 32 + + stxsdx vs41, o0, T2 + stxsdx vs43, o8, T2 + stxsdx vs45, o16, T2 + stxsdx vs47, o24, T2 + +.endm + + 
+/*########################################################################################## + SOLVE_LT 4x4 +##########################################################################################*/ + +.macro SOLVE_LT_4x4 + + xxpermdi vs0, vs32, vs33, 0 + xxpermdi vs1, vs34, vs35, 0 + xxpermdi vs2, vs32, vs33, 3 + xxpermdi vs3, vs34, vs35, 3 + + xxpermdi vs4, vs36, vs37, 0 + xxpermdi vs5, vs38, vs39, 0 + xxpermdi vs6, vs36, vs37, 3 + xxpermdi vs7, vs38, vs39, 3 + + +//############### LOAD B ####################### + + + mr T1, BO + + lxvd2x vs32, o0, T1 + lxvd2x vs33, o16, T1 + lxvd2x vs34, o32, T1 + lxvd2x vs35, o48, T1 + + addi T1, T1, 64 + + lxvd2x vs36, o0, T1 + lxvd2x vs37, o16, T1 + lxvd2x vs38, o32, T1 + lxvd2x vs39, o48, T1 + + xvsubdp vs32, vs32, vs0 + xvsubdp vs33, vs33, vs1 + xvsubdp vs34, vs34, vs2 + xvsubdp vs35, vs35, vs3 + xvsubdp vs36, vs36, vs4 + xvsubdp vs37, vs37, vs5 + xvsubdp vs38, vs38, vs6 + xvsubdp vs39, vs39, vs7 + + mr T1, AO + + +//############### OFFSET 0 ####################### + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + xvmuldp vs32, vs32, vs0 + xvmuldp vs33, vs33, vs0 + + xvnmsubadp vs34, vs32, vs1 + xvnmsubadp vs35, vs33, vs1 + xvnmsubadp vs36, vs32, vs2 + xvnmsubadp vs37, vs33, vs2 + xvnmsubadp vs38, vs32, vs3 + xvnmsubadp vs39, vs33, vs3 + +//############### OFFSET 1 ####################### + + addi T1, T1, 1*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + + addi T1, T1, 24 + + xvmuldp vs34, vs34, vs0 + xvmuldp vs35, vs35, vs0 + + xvnmsubadp vs36, vs34, vs1 + xvnmsubadp vs37, vs35, vs1 + xvnmsubadp vs38, vs34, vs2 + xvnmsubadp vs39, vs35, vs2 + +//############### OFFSET 2 ####################### + + addi T1, T1, 2*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + + addi T1, T1, 16 + + xvmuldp vs36, vs36, vs0 + xvmuldp vs37, vs37, vs0 + + xvnmsubadp vs38, vs36, vs1 + xvnmsubadp vs39, vs37, vs1 + +//############### OFFSET 3 
####################### + + addi T1, T1, 3*SIZE + + lxvdsx vs0, o0, T1 + + addi T1, T1, 8 + + xvmuldp vs38, vs38, vs0 + xvmuldp vs39, vs39, vs0 + + +//############### SAVE B ####################### + + + mr T1, BO + + + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 + + addi T1, T1, 64 + + stxvd2x vs36, o0, T1 + stxvd2x vs37, o16, T1 + stxvd2x vs38, o32, T1 + stxvd2x vs39, o48, T1 + +//############### SAVE C ####################### + + + mr T1, CO + add T2, CO, LDC + + + stxsdx vs32, o0, T1 + xxswapd vs32, vs32 + stxsdx vs34, o8, T1 + xxswapd vs34, vs34 + stxsdx vs36, o16, T1 + xxswapd vs36, vs36 + stxsdx vs38, o24, T1 + xxswapd vs38, vs38 + + stxsdx vs32, o0, T2 + stxsdx vs34, o8, T2 + stxsdx vs36, o16, T2 + stxsdx vs38, o24, T2 + + mr T1, CO + add T2, CO, LDC + + + add T1, T2, LDC + add T2, T1, LDC + + + stxsdx vs33, o0, T1 + xxswapd vs33, vs33 + stxsdx vs35, o8, T1 + xxswapd vs35, vs35 + stxsdx vs37, o16, T1 + xxswapd vs37, vs37 + stxsdx vs39, o24, T1 + xxswapd vs39, vs39 + + stxsdx vs33, o0, T2 + stxsdx vs35, o8, T2 + stxsdx vs37, o16, T2 + stxsdx vs39, o24, T2 + +.endm + + +/*########################################################################################## + SOLVE_LT 2x4 +##########################################################################################*/ + +.macro SOLVE_LT_2x4 + + xxpermdi vs0, vs32, vs33, 0 + xxpermdi vs1, vs34, vs35, 0 + xxpermdi vs2, vs32, vs33, 3 + xxpermdi vs3, vs34, vs35, 3 + + +//############### LOAD B ####################### + + + mr T1, BO + + lxvd2x vs32, o0, T1 + lxvd2x vs33, o16, T1 + lxvd2x vs34, o32, T1 + lxvd2x vs35, o48, T1 + + xvsubdp vs32, vs32, vs0 + xvsubdp vs33, vs33, vs1 + xvsubdp vs34, vs34, vs2 + xvsubdp vs35, vs35, vs3 + + mr T1, AO + + +//############### OFFSET 0 ####################### + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + + addi T1, T1, 16 + + xvmuldp vs32, vs32, vs0 + xvmuldp vs33, vs33, vs0 + + xvnmsubadp vs34, vs32, vs1 + xvnmsubadp vs35, 
vs33, vs1 + +//############### OFFSET 1 ####################### + + addi T1, T1, 1*SIZE + + lxvdsx vs0, o0, T1 + + addi T1, T1, 8 + + xvmuldp vs34, vs34, vs0 + xvmuldp vs35, vs35, vs0 + + +//############### SAVE B ####################### + + + mr T1, BO + + + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 + +//############### SAVE C ####################### + + + mr T1, CO + add T2, CO, LDC + + + stxsdx vs32, o0, T1 + xxswapd vs32, vs32 + stxsdx vs34, o8, T1 + xxswapd vs34, vs34 + + stxsdx vs32, o0, T2 + stxsdx vs34, o8, T2 + + mr T1, CO + add T2, CO, LDC + + + add T1, T2, LDC + add T2, T1, LDC + + + stxsdx vs33, o0, T1 + xxswapd vs33, vs33 + stxsdx vs35, o8, T1 + xxswapd vs35, vs35 + + stxsdx vs33, o0, T2 + stxsdx vs35, o8, T2 + +.endm + + +/*########################################################################################## + SOLVE_LT 1x4 +##########################################################################################*/ + +.macro SOLVE_LT_1x4 + + xxpermdi vs0, vs32, vs33, 0 + xxpermdi vs1, vs34, vs35, 0 + +//############### LOAD B ####################### + + + mr T1, BO + + lxvd2x vs32, o0, T1 + lxvd2x vs33, o16, T1 + + xvsubdp vs32, vs32, vs0 + xvsubdp vs33, vs33, vs1 + + mr T1, AO + + +//############### OFFSET 0 ####################### + + lxvdsx vs0, o0, T1 + + addi T1, T1, 8 + + xvmuldp vs32, vs32, vs0 + xvmuldp vs33, vs33, vs0 + + +//############### SAVE B ####################### + + + mr T1, BO + + + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + +//############### SAVE C ####################### + + + mr T1, CO + add T2, CO, LDC + + + stxsdx vs32, o0, T1 + xxswapd vs32, vs32 + + stxsdx vs32, o0, T2 + + mr T1, CO + add T2, CO, LDC + + + add T1, T2, LDC + add T2, T1, LDC + + + stxsdx vs33, o0, T1 + xxswapd vs33, vs33 + + stxsdx vs33, o0, T2 + +.endm + + +.macro INIT_16x2 + + + xxlxor vs0, vs0, vs0 + + xvmovdp vs32, vs0 + xvmovdp vs33, vs0 + xvmovdp vs34, vs0 + xvmovdp vs35, vs0 + xvmovdp vs36, vs0 
+ xvmovdp vs37, vs0 + xvmovdp vs38, vs0 + xvmovdp vs39, vs0 + xvmovdp vs40, vs0 + xvmovdp vs41, vs0 + xvmovdp vs42, vs0 + xvmovdp vs43, vs0 + xvmovdp vs44, vs0 + xvmovdp vs45, vs0 + xvmovdp vs46, vs0 + xvmovdp vs47, vs0 + +.endm + + +.macro KERNEL_16x2 + + + lxvd2x vs0, o0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + addi AO, AO, 64 + + lxvd2x vs4, o0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO + lxvdsx vs17, o8, BO + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs37, vs2, vs17 + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs39, vs3, vs17 + xvmaddadp vs40, vs4, vs16 + xvmaddadp vs41, vs4, vs17 + xvmaddadp vs42, vs5, vs16 + xvmaddadp vs43, vs5, vs17 + xvmaddadp vs44, vs6, vs16 + xvmaddadp vs45, vs6, vs17 + xvmaddadp vs46, vs7, vs16 + xvmaddadp vs47, vs7, vs17 + + +.endm + + +.macro INIT_8x2 + + + xxlxor vs0, vs0, vs0 + + xvmovdp vs32, vs0 + xvmovdp vs33, vs0 + xvmovdp vs34, vs0 + xvmovdp vs35, vs0 + xvmovdp vs36, vs0 + xvmovdp vs37, vs0 + xvmovdp vs38, vs0 + xvmovdp vs39, vs0 + +.endm + + +.macro KERNEL_8x2 + + + lxvd2x vs0, o0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO + lxvdsx vs17, o8, BO + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs37, vs2, vs17 + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs39, vs3, vs17 + + +.endm + + +.macro INIT_4x2 + + + xxlxor vs0, vs0, vs0 + + xvmovdp vs32, vs0 + xvmovdp vs33, vs0 + xvmovdp vs34, vs0 + xvmovdp vs35, vs0 + +.endm + + +.macro KERNEL_4x2 + + + lxvd2x vs0, o0, AO + lxvd2x vs1, o16, AO + + addi AO, AO, 32 + + lxvdsx vs16, o0, BO + lxvdsx vs17, o8, BO + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, 
vs0, vs17 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 + + +.endm + + +.macro INIT_2x2 + + + xxlxor vs0, vs0, vs0 + + xvmovdp vs32, vs0 + xvmovdp vs33, vs0 + +.endm + + +.macro KERNEL_2x2 + + + lxvd2x vs0, o0, AO + + addi AO, AO, 16 + + lxvdsx vs16, o0, BO + lxvdsx vs17, o8, BO + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + + +.endm + + +.macro INIT_1x2 + + + xxlxor vs0, vs0, vs0 + + xvmovdp vs32, vs0 + xvmovdp vs33, vs0 + +.endm + + +.macro KERNEL_1x2 + + + lxvdsx vs0, o0, AO + + addi AO, AO, 8 + + lxvdsx vs16, o0, BO + lxvdsx vs17, o8, BO + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + + +.endm + + +/*########################################################################################## + SOLVE_LT 16x2 +##########################################################################################*/ + +.macro SOLVE_LT_16x2 + + xxpermdi vs0, vs32, vs33, 0 + xxpermdi vs1, vs32, vs33, 3 + + xxpermdi vs2, vs34, vs35, 0 + xxpermdi vs3, vs34, vs35, 3 + + xxpermdi vs4, vs36, vs37, 0 + xxpermdi vs5, vs36, vs37, 3 + + xxpermdi vs6, vs38, vs39, 0 + xxpermdi vs7, vs38, vs39, 3 + + xxpermdi vs8, vs40, vs41, 0 + xxpermdi vs9, vs40, vs41, 3 + + xxpermdi vs10, vs42, vs43, 0 + xxpermdi vs11, vs42, vs43, 3 + + xxpermdi vs12, vs44, vs45, 0 + xxpermdi vs13, vs44, vs45, 3 + + xxpermdi vs14, vs46, vs47, 0 + xxpermdi vs15, vs46, vs47, 3 + + +//############### LOAD B ####################### + + + mr T1, BO + + lxvd2x vs32, o0, T1 + lxvd2x vs33, o16, T1 + lxvd2x vs34, o32, T1 + lxvd2x vs35, o48, T1 + + addi T1, T1, 64 + + lxvd2x vs36, o0, T1 + lxvd2x vs37, o16, T1 + lxvd2x vs38, o32, T1 + lxvd2x vs39, o48, T1 + + addi T1, T1, 64 + + lxvd2x vs40, o0, T1 + lxvd2x vs41, o16, T1 + lxvd2x vs42, o32, T1 + lxvd2x vs43, o48, T1 + + addi T1, T1, 64 + + lxvd2x vs44, o0, T1 + lxvd2x vs45, o16, T1 + lxvd2x vs46, o32, T1 + lxvd2x vs47, o48, T1 + + xvsubdp vs32, vs32, vs0 + xvsubdp vs33, vs33, vs1 + xvsubdp vs34, vs34, vs2 + 
xvsubdp vs35, vs35, vs3 + xvsubdp vs36, vs36, vs4 + xvsubdp vs37, vs37, vs5 + xvsubdp vs38, vs38, vs6 + xvsubdp vs39, vs39, vs7 + xvsubdp vs40, vs40, vs8 + xvsubdp vs41, vs41, vs9 + xvsubdp vs42, vs42, vs10 + xvsubdp vs43, vs43, vs11 + xvsubdp vs44, vs44, vs12 + xvsubdp vs45, vs45, vs13 + xvsubdp vs46, vs46, vs14 + xvsubdp vs47, vs47, vs15 + + mr T1, AO + + +//############### OFFSET 0 ####################### + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs4, o0, T1 + lxvdsx vs5, o8, T1 + lxvdsx vs6, o16, T1 + lxvdsx vs7, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs8, o0, T1 + lxvdsx vs9, o8, T1 + lxvdsx vs10, o16, T1 + lxvdsx vs11, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs12, o0, T1 + lxvdsx vs13, o8, T1 + lxvdsx vs14, o16, T1 + lxvdsx vs15, o24, T1 + + addi T1, T1, 32 + + xvmuldp vs32, vs32, vs0 + xvnmsubadp vs33, vs32, vs1 + xvnmsubadp vs34, vs32, vs2 + xvnmsubadp vs35, vs32, vs3 + xvnmsubadp vs36, vs32, vs4 + xvnmsubadp vs37, vs32, vs5 + xvnmsubadp vs38, vs32, vs6 + xvnmsubadp vs39, vs32, vs7 + xvnmsubadp vs40, vs32, vs8 + xvnmsubadp vs41, vs32, vs9 + xvnmsubadp vs42, vs32, vs10 + xvnmsubadp vs43, vs32, vs11 + xvnmsubadp vs44, vs32, vs12 + xvnmsubadp vs45, vs32, vs13 + xvnmsubadp vs46, vs32, vs14 + xvnmsubadp vs47, vs32, vs15 + +//############### OFFSET 1 ####################### + + addi T1, T1, 1*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs4, o0, T1 + lxvdsx vs5, o8, T1 + lxvdsx vs6, o16, T1 + lxvdsx vs7, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs8, o0, T1 + lxvdsx vs9, o8, T1 + lxvdsx vs10, o16, T1 + lxvdsx vs11, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs12, o0, T1 + lxvdsx vs13, o8, T1 + lxvdsx vs14, o16, T1 + + addi T1, T1, 24 + + xvmuldp vs33, vs33, vs0 + xvnmsubadp vs34, vs33, vs1 + xvnmsubadp vs35, vs33, vs2 + xvnmsubadp vs36, vs33, vs3 + xvnmsubadp vs37, vs33, vs4 + xvnmsubadp vs38, vs33, vs5 + xvnmsubadp 
vs39, vs33, vs6 + xvnmsubadp vs40, vs33, vs7 + xvnmsubadp vs41, vs33, vs8 + xvnmsubadp vs42, vs33, vs9 + xvnmsubadp vs43, vs33, vs10 + xvnmsubadp vs44, vs33, vs11 + xvnmsubadp vs45, vs33, vs12 + xvnmsubadp vs46, vs33, vs13 + xvnmsubadp vs47, vs33, vs14 + +//############### OFFSET 2 ####################### + + addi T1, T1, 2*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs4, o0, T1 + lxvdsx vs5, o8, T1 + lxvdsx vs6, o16, T1 + lxvdsx vs7, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs8, o0, T1 + lxvdsx vs9, o8, T1 + lxvdsx vs10, o16, T1 + lxvdsx vs11, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs12, o0, T1 + lxvdsx vs13, o8, T1 + + addi T1, T1, 16 + + xvmuldp vs34, vs34, vs0 + xvnmsubadp vs35, vs34, vs1 + xvnmsubadp vs36, vs34, vs2 + xvnmsubadp vs37, vs34, vs3 + xvnmsubadp vs38, vs34, vs4 + xvnmsubadp vs39, vs34, vs5 + xvnmsubadp vs40, vs34, vs6 + xvnmsubadp vs41, vs34, vs7 + xvnmsubadp vs42, vs34, vs8 + xvnmsubadp vs43, vs34, vs9 + xvnmsubadp vs44, vs34, vs10 + xvnmsubadp vs45, vs34, vs11 + xvnmsubadp vs46, vs34, vs12 + xvnmsubadp vs47, vs34, vs13 + +//############### OFFSET 3 ####################### + + addi T1, T1, 3*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs4, o0, T1 + lxvdsx vs5, o8, T1 + lxvdsx vs6, o16, T1 + lxvdsx vs7, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs8, o0, T1 + lxvdsx vs9, o8, T1 + lxvdsx vs10, o16, T1 + lxvdsx vs11, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs12, o0, T1 + + addi T1, T1, 8 + + xvmuldp vs35, vs35, vs0 + xvnmsubadp vs36, vs35, vs1 + xvnmsubadp vs37, vs35, vs2 + xvnmsubadp vs38, vs35, vs3 + xvnmsubadp vs39, vs35, vs4 + xvnmsubadp vs40, vs35, vs5 + xvnmsubadp vs41, vs35, vs6 + xvnmsubadp vs42, vs35, vs7 + xvnmsubadp vs43, vs35, vs8 + xvnmsubadp vs44, vs35, vs9 + xvnmsubadp vs45, vs35, vs10 + xvnmsubadp vs46, vs35, vs11 + xvnmsubadp vs47, vs35, vs12 + +//############### OFFSET 4 
####################### + + addi T1, T1, 4*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs4, o0, T1 + lxvdsx vs5, o8, T1 + lxvdsx vs6, o16, T1 + lxvdsx vs7, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs8, o0, T1 + lxvdsx vs9, o8, T1 + lxvdsx vs10, o16, T1 + lxvdsx vs11, o24, T1 + + addi T1, T1, 32 + + xvmuldp vs36, vs36, vs0 + xvnmsubadp vs37, vs36, vs1 + xvnmsubadp vs38, vs36, vs2 + xvnmsubadp vs39, vs36, vs3 + xvnmsubadp vs40, vs36, vs4 + xvnmsubadp vs41, vs36, vs5 + xvnmsubadp vs42, vs36, vs6 + xvnmsubadp vs43, vs36, vs7 + xvnmsubadp vs44, vs36, vs8 + xvnmsubadp vs45, vs36, vs9 + xvnmsubadp vs46, vs36, vs10 + xvnmsubadp vs47, vs36, vs11 + +//############### OFFSET 5 ####################### + + addi T1, T1, 5*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs4, o0, T1 + lxvdsx vs5, o8, T1 + lxvdsx vs6, o16, T1 + lxvdsx vs7, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs8, o0, T1 + lxvdsx vs9, o8, T1 + lxvdsx vs10, o16, T1 + + addi T1, T1, 24 + + xvmuldp vs37, vs37, vs0 + xvnmsubadp vs38, vs37, vs1 + xvnmsubadp vs39, vs37, vs2 + xvnmsubadp vs40, vs37, vs3 + xvnmsubadp vs41, vs37, vs4 + xvnmsubadp vs42, vs37, vs5 + xvnmsubadp vs43, vs37, vs6 + xvnmsubadp vs44, vs37, vs7 + xvnmsubadp vs45, vs37, vs8 + xvnmsubadp vs46, vs37, vs9 + xvnmsubadp vs47, vs37, vs10 + +//############### OFFSET 6 ####################### + + addi T1, T1, 6*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs4, o0, T1 + lxvdsx vs5, o8, T1 + lxvdsx vs6, o16, T1 + lxvdsx vs7, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs8, o0, T1 + lxvdsx vs9, o8, T1 + + addi T1, T1, 16 + + xvmuldp vs38, vs38, vs0 + xvnmsubadp vs39, vs38, vs1 + xvnmsubadp vs40, vs38, vs2 + xvnmsubadp vs41, vs38, vs3 + xvnmsubadp vs42, vs38, vs4 + xvnmsubadp vs43, vs38, vs5 + xvnmsubadp vs44, vs38, vs6 + xvnmsubadp 
vs45, vs38, vs7 + xvnmsubadp vs46, vs38, vs8 + xvnmsubadp vs47, vs38, vs9 + +//############### OFFSET 7 ####################### + + addi T1, T1, 7*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs4, o0, T1 + lxvdsx vs5, o8, T1 + lxvdsx vs6, o16, T1 + lxvdsx vs7, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs8, o0, T1 + + addi T1, T1, 8 + + xvmuldp vs39, vs39, vs0 + xvnmsubadp vs40, vs39, vs1 + xvnmsubadp vs41, vs39, vs2 + xvnmsubadp vs42, vs39, vs3 + xvnmsubadp vs43, vs39, vs4 + xvnmsubadp vs44, vs39, vs5 + xvnmsubadp vs45, vs39, vs6 + xvnmsubadp vs46, vs39, vs7 + xvnmsubadp vs47, vs39, vs8 + +//############### OFFSET 8 ####################### + + addi T1, T1, 8*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs4, o0, T1 + lxvdsx vs5, o8, T1 + lxvdsx vs6, o16, T1 + lxvdsx vs7, o24, T1 + + addi T1, T1, 32 + + xvmuldp vs40, vs40, vs0 + xvnmsubadp vs41, vs40, vs1 + xvnmsubadp vs42, vs40, vs2 + xvnmsubadp vs43, vs40, vs3 + xvnmsubadp vs44, vs40, vs4 + xvnmsubadp vs45, vs40, vs5 + xvnmsubadp vs46, vs40, vs6 + xvnmsubadp vs47, vs40, vs7 + +//############### OFFSET 9 ####################### + + addi T1, T1, 9*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs4, o0, T1 + lxvdsx vs5, o8, T1 + lxvdsx vs6, o16, T1 + + addi T1, T1, 24 + + xvmuldp vs41, vs41, vs0 + xvnmsubadp vs42, vs41, vs1 + xvnmsubadp vs43, vs41, vs2 + xvnmsubadp vs44, vs41, vs3 + xvnmsubadp vs45, vs41, vs4 + xvnmsubadp vs46, vs41, vs5 + xvnmsubadp vs47, vs41, vs6 + +//############### OFFSET 10 ####################### + + addi T1, T1, 10*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs4, o0, T1 + lxvdsx vs5, o8, T1 + + addi T1, T1, 16 + + xvmuldp vs42, vs42, vs0 + xvnmsubadp vs43, vs42, vs1 + xvnmsubadp vs44, 
vs42, vs2 + xvnmsubadp vs45, vs42, vs3 + xvnmsubadp vs46, vs42, vs4 + xvnmsubadp vs47, vs42, vs5 + +//############### OFFSET 11 ####################### + + addi T1, T1, 11*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs4, o0, T1 + + addi T1, T1, 8 + + xvmuldp vs43, vs43, vs0 + xvnmsubadp vs44, vs43, vs1 + xvnmsubadp vs45, vs43, vs2 + xvnmsubadp vs46, vs43, vs3 + xvnmsubadp vs47, vs43, vs4 + +//############### OFFSET 12 ####################### + + addi T1, T1, 12*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + xvmuldp vs44, vs44, vs0 + xvnmsubadp vs45, vs44, vs1 + xvnmsubadp vs46, vs44, vs2 + xvnmsubadp vs47, vs44, vs3 + +//############### OFFSET 13 ####################### + + addi T1, T1, 13*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + + addi T1, T1, 24 + + xvmuldp vs45, vs45, vs0 + xvnmsubadp vs46, vs45, vs1 + xvnmsubadp vs47, vs45, vs2 + +//############### OFFSET 14 ####################### + + addi T1, T1, 14*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + + addi T1, T1, 16 + + xvmuldp vs46, vs46, vs0 + xvnmsubadp vs47, vs46, vs1 + +//############### OFFSET 15 ####################### + + addi T1, T1, 15*SIZE + + lxvdsx vs0, o0, T1 + + addi T1, T1, 8 + + xvmuldp vs47, vs47, vs0 + +//############### SAVE B ####################### + + + mr T1, BO + + + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 + + addi T1, T1, 64 + + stxvd2x vs36, o0, T1 + stxvd2x vs37, o16, T1 + stxvd2x vs38, o32, T1 + stxvd2x vs39, o48, T1 + + addi T1, T1, 64 + + stxvd2x vs40, o0, T1 + stxvd2x vs41, o16, T1 + stxvd2x vs42, o32, T1 + stxvd2x vs43, o48, T1 + + addi T1, T1, 64 + + stxvd2x vs44, o0, T1 + stxvd2x vs45, o16, T1 + stxvd2x vs46, o32, T1 + stxvd2x vs47, o48, T1 + +//############### SAVE C ####################### + + + mr T1, CO + add T2, CO, LDC + + + stxsdx vs32, 
o0, T1 + xxswapd vs32, vs32 + stxsdx vs33, o8, T1 + xxswapd vs33, vs33 + stxsdx vs34, o16, T1 + xxswapd vs34, vs34 + stxsdx vs35, o24, T1 + xxswapd vs35, vs35 + + addi T1, T1, 32 + + stxsdx vs36, o0, T1 + xxswapd vs36, vs36 + stxsdx vs37, o8, T1 + xxswapd vs37, vs37 + stxsdx vs38, o16, T1 + xxswapd vs38, vs38 + stxsdx vs39, o24, T1 + xxswapd vs39, vs39 + + addi T1, T1, 32 + + stxsdx vs40, o0, T1 + xxswapd vs40, vs40 + stxsdx vs41, o8, T1 + xxswapd vs41, vs41 + stxsdx vs42, o16, T1 + xxswapd vs42, vs42 + stxsdx vs43, o24, T1 + xxswapd vs43, vs43 + + addi T1, T1, 32 + + stxsdx vs44, o0, T1 + xxswapd vs44, vs44 + stxsdx vs45, o8, T1 + xxswapd vs45, vs45 + stxsdx vs46, o16, T1 + xxswapd vs46, vs46 + stxsdx vs47, o24, T1 + xxswapd vs47, vs47 + + stxsdx vs32, o0, T2 + stxsdx vs33, o8, T2 + stxsdx vs34, o16, T2 + stxsdx vs35, o24, T2 + + addi T2, T2, 32 + + stxsdx vs36, o0, T2 + stxsdx vs37, o8, T2 + stxsdx vs38, o16, T2 + stxsdx vs39, o24, T2 + + addi T2, T2, 32 + + stxsdx vs40, o0, T2 + stxsdx vs41, o8, T2 + stxsdx vs42, o16, T2 + stxsdx vs43, o24, T2 + + addi T2, T2, 32 + + stxsdx vs44, o0, T2 + stxsdx vs45, o8, T2 + stxsdx vs46, o16, T2 + stxsdx vs47, o24, T2 + +.endm + + +/*########################################################################################## + SOLVE_LT 8x2 +##########################################################################################*/ + +.macro SOLVE_LT_8x2 + + xxpermdi vs0, vs32, vs33, 0 + xxpermdi vs1, vs32, vs33, 3 + + xxpermdi vs2, vs34, vs35, 0 + xxpermdi vs3, vs34, vs35, 3 + + xxpermdi vs4, vs36, vs37, 0 + xxpermdi vs5, vs36, vs37, 3 + + xxpermdi vs6, vs38, vs39, 0 + xxpermdi vs7, vs38, vs39, 3 + + +//############### LOAD B ####################### + + + mr T1, BO + + lxvd2x vs32, o0, T1 + lxvd2x vs33, o16, T1 + lxvd2x vs34, o32, T1 + lxvd2x vs35, o48, T1 + + addi T1, T1, 64 + + lxvd2x vs36, o0, T1 + lxvd2x vs37, o16, T1 + lxvd2x vs38, o32, T1 + lxvd2x vs39, o48, T1 + + xvsubdp vs32, vs32, vs0 + xvsubdp vs33, vs33, vs1 + 
xvsubdp vs34, vs34, vs2 + xvsubdp vs35, vs35, vs3 + xvsubdp vs36, vs36, vs4 + xvsubdp vs37, vs37, vs5 + xvsubdp vs38, vs38, vs6 + xvsubdp vs39, vs39, vs7 + + mr T1, AO + + +//############### OFFSET 0 ####################### + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs4, o0, T1 + lxvdsx vs5, o8, T1 + lxvdsx vs6, o16, T1 + lxvdsx vs7, o24, T1 + + addi T1, T1, 32 + + xvmuldp vs32, vs32, vs0 + xvnmsubadp vs33, vs32, vs1 + xvnmsubadp vs34, vs32, vs2 + xvnmsubadp vs35, vs32, vs3 + xvnmsubadp vs36, vs32, vs4 + xvnmsubadp vs37, vs32, vs5 + xvnmsubadp vs38, vs32, vs6 + xvnmsubadp vs39, vs32, vs7 + +//############### OFFSET 1 ####################### + + addi T1, T1, 1*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs4, o0, T1 + lxvdsx vs5, o8, T1 + lxvdsx vs6, o16, T1 + + addi T1, T1, 24 + + xvmuldp vs33, vs33, vs0 + xvnmsubadp vs34, vs33, vs1 + xvnmsubadp vs35, vs33, vs2 + xvnmsubadp vs36, vs33, vs3 + xvnmsubadp vs37, vs33, vs4 + xvnmsubadp vs38, vs33, vs5 + xvnmsubadp vs39, vs33, vs6 + +//############### OFFSET 2 ####################### + + addi T1, T1, 2*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs4, o0, T1 + lxvdsx vs5, o8, T1 + + addi T1, T1, 16 + + xvmuldp vs34, vs34, vs0 + xvnmsubadp vs35, vs34, vs1 + xvnmsubadp vs36, vs34, vs2 + xvnmsubadp vs37, vs34, vs3 + xvnmsubadp vs38, vs34, vs4 + xvnmsubadp vs39, vs34, vs5 + +//############### OFFSET 3 ####################### + + addi T1, T1, 3*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + lxvdsx vs4, o0, T1 + + addi T1, T1, 8 + + xvmuldp vs35, vs35, vs0 + xvnmsubadp vs36, vs35, vs1 + xvnmsubadp vs37, vs35, vs2 + xvnmsubadp vs38, vs35, vs3 + xvnmsubadp vs39, vs35, vs4 + +//############### OFFSET 4 ####################### + + 
addi T1, T1, 4*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + + xvmuldp vs36, vs36, vs0 + xvnmsubadp vs37, vs36, vs1 + xvnmsubadp vs38, vs36, vs2 + xvnmsubadp vs39, vs36, vs3 + +//############### OFFSET 5 ####################### + + addi T1, T1, 5*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + + addi T1, T1, 24 + + xvmuldp vs37, vs37, vs0 + xvnmsubadp vs38, vs37, vs1 + xvnmsubadp vs39, vs37, vs2 + +//############### OFFSET 6 ####################### + + addi T1, T1, 6*SIZE + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + + addi T1, T1, 16 + + xvmuldp vs38, vs38, vs0 + xvnmsubadp vs39, vs38, vs1 + +//############### OFFSET 7 ####################### + + addi T1, T1, 7*SIZE + + lxvdsx vs0, o0, T1 + + addi T1, T1, 8 + + xvmuldp vs39, vs39, vs0 + +//############### SAVE B ####################### + + + mr T1, BO + + + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 + + addi T1, T1, 64 + + stxvd2x vs36, o0, T1 + stxvd2x vs37, o16, T1 + stxvd2x vs38, o32, T1 + stxvd2x vs39, o48, T1 + +//############### SAVE C ####################### + + + mr T1, CO + add T2, CO, LDC + + + stxsdx vs32, o0, T1 + xxswapd vs32, vs32 + stxsdx vs33, o8, T1 + xxswapd vs33, vs33 + stxsdx vs34, o16, T1 + xxswapd vs34, vs34 + stxsdx vs35, o24, T1 + xxswapd vs35, vs35 + + addi T1, T1, 32 + + stxsdx vs36, o0, T1 + xxswapd vs36, vs36 + stxsdx vs37, o8, T1 + xxswapd vs37, vs37 + stxsdx vs38, o16, T1 + xxswapd vs38, vs38 + stxsdx vs39, o24, T1 + xxswapd vs39, vs39 + + stxsdx vs32, o0, T2 + stxsdx vs33, o8, T2 + stxsdx vs34, o16, T2 + stxsdx vs35, o24, T2 + + addi T2, T2, 32 + + stxsdx vs36, o0, T2 + stxsdx vs37, o8, T2 + stxsdx vs38, o16, T2 + stxsdx vs39, o24, T2 + +.endm + + +/*########################################################################################## + SOLVE_LT 4x2 
+##########################################################################################*/
+// Solve step for one tile of 4 rows x 2 columns: tile = (tile stored at BO) - (sums in vs32-vs35), swept against the factors at AO, then written to BO and to C.
+ .macro SOLVE_LT_4x2
+// Reads vs32-vs35 (presumably the sums left by KERNEL_4x2, which is outside this chunk - confirm); overwrites vs0-vs3, vs32-vs35, T1, T2; AO, BO, CO are read but not advanced.
+	xxpermdi	vs0, vs32, vs33, 0	// vs0 = { vs32.dw0, vs33.dw0 }
+	xxpermdi	vs1, vs32, vs33, 3	// vs1 = { vs32.dw1, vs33.dw1 }
+
+	xxpermdi	vs2, vs34, vs35, 0	// vs2 = { vs34.dw0, vs35.dw0 }
+	xxpermdi	vs3, vs34, vs35, 3	// vs3 = { vs34.dw1, vs35.dw1 }
+
+
+//############### LOAD B #######################
+// Load four 16-byte rows from BO (two doubles each) and subtract the regrouped sums from them.
+
+	mr	T1, BO
+
+	lxvd2x	vs32, o0, T1
+	lxvd2x	vs33, o16, T1
+	lxvd2x	vs34, o32, T1
+	lxvd2x	vs35, o48, T1
+
+	xvsubdp	vs32, vs32, vs0	// row 0 -= vs0
+	xvsubdp	vs33, vs33, vs1	// row 1 -= vs1
+	xvsubdp	vs34, vs34, vs2	// row 2 -= vs2
+	xvsubdp	vs35, vs35, vs3	// row 3 -= vs3
+
+	mr	T1, AO	// T1 now walks the factors stored at AO
+// Each OFFSET k section scales row k by its first factor, then subtracts (row k * factor) from every later row.
+
+//############### OFFSET 0 #######################
+// lxvdsx replicates one double into both halves of the register, so a factor applies to both columns of a row.
+	lxvdsx	vs0, o0, T1
+	lxvdsx	vs1, o8, T1
+	lxvdsx	vs2, o16, T1
+	lxvdsx	vs3, o24, T1
+
+	addi	T1, T1, 32	// step past the four doubles just read
+
+	xvmuldp	vs32, vs32, vs0	// row 0 *= vs0; a multiply, so this entry is presumably stored as a reciprocal - TODO confirm against the packing code
+	xvnmsubadp	vs33, vs32, vs1	// row 1 -= row 0 * vs1
+	xvnmsubadp	vs34, vs32, vs2	// row 2 -= row 0 * vs2
+	xvnmsubadp	vs35, vs32, vs3	// row 3 -= row 0 * vs3
+
+//############### OFFSET 1 #######################
+// Step over one entry, then read the three factors this row uses (SIZE is defined outside this chunk; 8 bytes would match the addi strides here - confirm).
+	addi	T1, T1, 1*SIZE
+
+	lxvdsx	vs0, o0, T1
+	lxvdsx	vs1, o8, T1
+	lxvdsx	vs2, o16, T1
+
+	addi	T1, T1, 24	// step past the three doubles just read
+
+	xvmuldp	vs33, vs33, vs0	// row 1 *= vs0
+	xvnmsubadp	vs34, vs33, vs1	// row 2 -= row 1 * vs1
+	xvnmsubadp	vs35, vs33, vs2	// row 3 -= row 1 * vs2
+
+//############### OFFSET 2 #######################
+
+	addi	T1, T1, 2*SIZE	// step over two entries
+
+	lxvdsx	vs0, o0, T1
+	lxvdsx	vs1, o8, T1
+
+	addi	T1, T1, 16	// step past the two doubles just read
+
+	xvmuldp	vs34, vs34, vs0	// row 2 *= vs0
+	xvnmsubadp	vs35, vs34, vs1	// row 3 -= row 2 * vs1
+
+//############### OFFSET 3 #######################
+
+	addi	T1, T1, 3*SIZE	// step over three entries
+
+	lxvdsx	vs0, o0, T1
+
+	addi	T1, T1, 8	// NOTE(review): T1 is reloaded from BO before its next use, so this advance has no visible effect
+
+	xvmuldp	vs35, vs35, vs0	// row 3 *= vs0
+
+//############### SAVE B #######################
+// Overwrite the tile at BO with the finished rows, at the same offsets they were loaded from.
+
+	mr	T1, BO
+
+
+	stxvd2x	vs32, o0, T1
+	stxvd2x	vs33, o16, T1
+	stxvd2x	vs34, o32, T1
+	stxvd2x	vs35, o48, T1
+
+//############### SAVE C #######################
+// Write the rows to C: doubleword 0 of each row goes to consecutive slots at CO, doubleword 1 (after xxswapd) to the same slots at CO + LDC.
+
+	mr	T1, CO
+	add	T2, CO, LDC	// T2 = second destination line
+
+
+	stxsdx	vs32, o0, T1	// store doubleword 0 of row 0
+	xxswapd	vs32, vs32	// exchange the halves so the other value is stored through T2 below
+	stxsdx	vs33, o8, T1
+	xxswapd	vs33, vs33
+	stxsdx	vs34, o16, T1
+	xxswapd	vs34, vs34
+	stxsdx	vs35, o24, T1
+	xxswapd	vs35, vs35
+
+	stxsdx	vs32, o0, T2
+	stxsdx	vs33, o8, T2
+	stxsdx	vs34, o16, T2
+	stxsdx	vs35, o24, T2
+
+.endm
+
+
+/*########################################################################################## + SOLVE_LT 2x2 +##########################################################################################*/ + +.macro SOLVE_LT_2x2 + + xxpermdi vs0, vs32, vs33, 0 + xxpermdi vs1, vs32, vs33, 3 + + +//############### LOAD B ####################### + + + mr T1, BO + + lxvd2x vs32, o0, T1 + lxvd2x vs33, o16, T1 + + xvsubdp vs32, vs32, vs0 + xvsubdp vs33, vs33, vs1 + + mr T1, AO + + +//############### OFFSET 0 ####################### + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + + addi T1, T1, 16 + + xvmuldp vs32, vs32, vs0 + xvnmsubadp vs33, vs32, vs1 + +//############### OFFSET 1 ####################### + + addi T1, T1, 1*SIZE + + lxvdsx vs0, o0, T1 + + addi T1, T1, 8 + + xvmuldp vs33, vs33, vs0 + +//############### SAVE B ####################### + + + mr T1, BO + + + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + +//############### SAVE C ####################### + + + mr T1, CO + add T2, CO, LDC + + + stxsdx vs32, o0, T1 + xxswapd vs32, vs32 + stxsdx vs33, o8, T1 + xxswapd vs33, vs33 + + stxsdx vs32, o0, T2 + stxsdx vs33, o8, T2 + +.endm + + +/*########################################################################################## + SOLVE_LT 1x2 +##########################################################################################*/ + +.macro SOLVE_LT_1x2 + + xxpermdi vs0, vs32, vs33, 0 + +//############### LOAD B ####################### + + + mr T1, BO + + lxvd2x vs32, o0, T1 + + xvsubdp vs32, vs32, vs0 + + mr T1, AO + + +//############### OFFSET 0 ####################### + + lxvdsx vs0, o0, T1 + + addi T1, T1, 8 + + xvmuldp vs32, vs32, vs0 + +//############### SAVE B ####################### + + + mr T1, BO + + + stxvd2x vs32, o0, T1 + +//############### SAVE C ####################### + + + mr T1, CO + add T2, CO, LDC + + + stxsdx vs32, o0, T1 + xxswapd vs32, vs32 + + stxsdx vs32, o0, T2 + +.endm + + +.macro INIT_16x1 + + + xxlxor vs0, vs0, vs0 + + xvmovdp vs32, 
vs0 + xvmovdp vs33, vs0 + xvmovdp vs34, vs0 + xvmovdp vs35, vs0 + xvmovdp vs36, vs0 + xvmovdp vs37, vs0 + xvmovdp vs38, vs0 + xvmovdp vs39, vs0 + xvmovdp vs40, vs0 + xvmovdp vs41, vs0 + xvmovdp vs42, vs0 + xvmovdp vs43, vs0 + xvmovdp vs44, vs0 + xvmovdp vs45, vs0 + xvmovdp vs46, vs0 + xvmovdp vs47, vs0 + +.endm + + +.macro KERNEL_16x1 + + + lxvdsx vs0, o0, AO + lxvdsx vs1, o8, AO + lxvdsx vs2, o16, AO + lxvdsx vs3, o24, AO + + addi AO, AO, 32 + + lxvdsx vs4, o0, AO + lxvdsx vs5, o8, AO + lxvdsx vs6, o16, AO + lxvdsx vs7, o24, AO + + addi AO, AO, 32 + + lxvdsx vs8, o0, AO + lxvdsx vs9, o8, AO + lxvdsx vs10, o16, AO + lxvdsx vs11, o24, AO + + addi AO, AO, 32 + + lxvdsx vs12, o0, AO + lxvdsx vs13, o8, AO + lxvdsx vs14, o16, AO + lxvdsx vs15, o24, AO + + addi AO, AO, 32 + + lxvdsx vs16, o0, BO + + addi BO, BO, 8 + + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs1, vs16 + xvmaddadp vs34, vs2, vs16 + xvmaddadp vs35, vs3, vs16 + xvmaddadp vs36, vs4, vs16 + xvmaddadp vs37, vs5, vs16 + xvmaddadp vs38, vs6, vs16 + xvmaddadp vs39, vs7, vs16 + xvmaddadp vs40, vs8, vs16 + xvmaddadp vs41, vs9, vs16 + xvmaddadp vs42, vs10, vs16 + xvmaddadp vs43, vs11, vs16 + xvmaddadp vs44, vs12, vs16 + xvmaddadp vs45, vs13, vs16 + xvmaddadp vs46, vs14, vs16 + xvmaddadp vs47, vs15, vs16 + + +.endm + + +.macro INIT_8x1 + + + xxlxor vs0, vs0, vs0 + + xvmovdp vs32, vs0 + xvmovdp vs33, vs0 + xvmovdp vs34, vs0 + xvmovdp vs35, vs0 + xvmovdp vs36, vs0 + xvmovdp vs37, vs0 + xvmovdp vs38, vs0 + xvmovdp vs39, vs0 + +.endm + + +.macro KERNEL_8x1 + + + lxvdsx vs0, o0, AO + lxvdsx vs1, o8, AO + lxvdsx vs2, o16, AO + lxvdsx vs3, o24, AO + + addi AO, AO, 32 + + lxvdsx vs4, o0, AO + lxvdsx vs5, o8, AO + lxvdsx vs6, o16, AO + lxvdsx vs7, o24, AO + + addi AO, AO, 32 + + lxvdsx vs16, o0, BO + + addi BO, BO, 8 + + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs1, vs16 + xvmaddadp vs34, vs2, vs16 + xvmaddadp vs35, vs3, vs16 + xvmaddadp vs36, vs4, vs16 + xvmaddadp vs37, vs5, vs16 + xvmaddadp vs38, vs6, vs16 + xvmaddadp 
vs39, vs7, vs16 + + +.endm + + +.macro INIT_4x1 + + + xxlxor vs0, vs0, vs0 + + xvmovdp vs32, vs0 + xvmovdp vs33, vs0 + xvmovdp vs34, vs0 + xvmovdp vs35, vs0 + +.endm + + +.macro KERNEL_4x1 + + + lxvdsx vs0, o0, AO + lxvdsx vs1, o8, AO + lxvdsx vs2, o16, AO + lxvdsx vs3, o24, AO + + addi AO, AO, 32 + + lxvdsx vs16, o0, BO + + addi BO, BO, 8 + + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs1, vs16 + xvmaddadp vs34, vs2, vs16 + xvmaddadp vs35, vs3, vs16 + + +.endm + + +.macro INIT_2x1 + + + xxlxor vs0, vs0, vs0 + + xvmovdp vs32, vs0 + xvmovdp vs33, vs0 + +.endm + + +.macro KERNEL_2x1 + + + lxvdsx vs0, o0, AO + lxvdsx vs1, o8, AO + + addi AO, AO, 16 + + lxvdsx vs16, o0, BO + + addi BO, BO, 8 + + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs1, vs16 + + +.endm + + +.macro INIT_1x1 + + + xxlxor vs0, vs0, vs0 + + xvmovdp vs32, vs0 + +.endm + + +.macro KERNEL_1x1 + + + lxvdsx vs0, o0, AO + + addi AO, AO, 8 + + lxvdsx vs16, o0, BO + + addi BO, BO, 8 + + xvmaddadp vs32, vs0, vs16 + + +.endm + + +/*########################################################################################## + SOLVE_LT 16x1 +##########################################################################################*/ + +.macro SOLVE_LT_16x1 + + xxswapd vs0, vs32 + xxswapd vs1, vs33 + xxswapd vs2, vs34 + xxswapd vs3, vs35 + xxswapd vs4, vs36 + xxswapd vs5, vs37 + xxswapd vs6, vs38 + xxswapd vs7, vs39 + xxswapd vs8, vs40 + xxswapd vs9, vs41 + xxswapd vs10, vs42 + xxswapd vs11, vs43 + xxswapd vs12, vs44 + xxswapd vs13, vs45 + xxswapd vs14, vs46 + xxswapd vs15, vs47 + +//############### LOAD B ####################### + + + mr T1, BO + + lxsdx vs32, o0, T1 + lxsdx vs33, o8, T1 + lxsdx vs34, o16, T1 + lxsdx vs35, o24, T1 + + addi T1, T1, 32 + + lxsdx vs36, o0, T1 + lxsdx vs37, o8, T1 + lxsdx vs38, o16, T1 + lxsdx vs39, o24, T1 + + addi T1, T1, 32 + + lxsdx vs40, o0, T1 + lxsdx vs41, o8, T1 + lxsdx vs42, o16, T1 + lxsdx vs43, o24, T1 + + addi T1, T1, 32 + + lxsdx vs44, o0, T1 + lxsdx vs45, o8, T1 + 
lxsdx vs46, o16, T1 + lxsdx vs47, o24, T1 + + xssubdp vs32, vs32, vs0 + xssubdp vs33, vs33, vs1 + xssubdp vs34, vs34, vs2 + xssubdp vs35, vs35, vs3 + xssubdp vs36, vs36, vs4 + xssubdp vs37, vs37, vs5 + xssubdp vs38, vs38, vs6 + xssubdp vs39, vs39, vs7 + xssubdp vs40, vs40, vs8 + xssubdp vs41, vs41, vs9 + xssubdp vs42, vs42, vs10 + xssubdp vs43, vs43, vs11 + xssubdp vs44, vs44, vs12 + xssubdp vs45, vs45, vs13 + xssubdp vs46, vs46, vs14 + xssubdp vs47, vs47, vs15 + + mr T1, AO + + +//############### OFFSET 0 ####################### + + lxsdx vs0, o0, T1 + lxsdx vs1, o8, T1 + lxsdx vs2, o16, T1 + lxsdx vs3, o24, T1 + + addi T1, T1, 32 + + lxsdx vs4, o0, T1 + lxsdx vs5, o8, T1 + lxsdx vs6, o16, T1 + lxsdx vs7, o24, T1 + + addi T1, T1, 32 + + lxsdx vs8, o0, T1 + lxsdx vs9, o8, T1 + lxsdx vs10, o16, T1 + lxsdx vs11, o24, T1 + + addi T1, T1, 32 + + lxsdx vs12, o0, T1 + lxsdx vs13, o8, T1 + lxsdx vs14, o16, T1 + lxsdx vs15, o24, T1 + + addi T1, T1, 32 + + xsmuldp vs32, vs32, vs0 + xsnmsubadp vs33, vs32, vs1 + xsnmsubadp vs34, vs32, vs2 + xsnmsubadp vs35, vs32, vs3 + xsnmsubadp vs36, vs32, vs4 + xsnmsubadp vs37, vs32, vs5 + xsnmsubadp vs38, vs32, vs6 + xsnmsubadp vs39, vs32, vs7 + xsnmsubadp vs40, vs32, vs8 + xsnmsubadp vs41, vs32, vs9 + xsnmsubadp vs42, vs32, vs10 + xsnmsubadp vs43, vs32, vs11 + xsnmsubadp vs44, vs32, vs12 + xsnmsubadp vs45, vs32, vs13 + xsnmsubadp vs46, vs32, vs14 + xsnmsubadp vs47, vs32, vs15 + +//############### OFFSET 1 ####################### + + addi T1, T1, 1*SIZE + + lxsdx vs0, o0, T1 + lxsdx vs1, o8, T1 + lxsdx vs2, o16, T1 + lxsdx vs3, o24, T1 + + addi T1, T1, 32 + + lxsdx vs4, o0, T1 + lxsdx vs5, o8, T1 + lxsdx vs6, o16, T1 + lxsdx vs7, o24, T1 + + addi T1, T1, 32 + + lxsdx vs8, o0, T1 + lxsdx vs9, o8, T1 + lxsdx vs10, o16, T1 + lxsdx vs11, o24, T1 + + addi T1, T1, 32 + + lxsdx vs12, o0, T1 + lxsdx vs13, o8, T1 + lxsdx vs14, o16, T1 + + addi T1, T1, 24 + + xsmuldp vs33, vs33, vs0 + xsnmsubadp vs34, vs33, vs1 + xsnmsubadp vs35, vs33, vs2 + 
xsnmsubadp vs36, vs33, vs3 + xsnmsubadp vs37, vs33, vs4 + xsnmsubadp vs38, vs33, vs5 + xsnmsubadp vs39, vs33, vs6 + xsnmsubadp vs40, vs33, vs7 + xsnmsubadp vs41, vs33, vs8 + xsnmsubadp vs42, vs33, vs9 + xsnmsubadp vs43, vs33, vs10 + xsnmsubadp vs44, vs33, vs11 + xsnmsubadp vs45, vs33, vs12 + xsnmsubadp vs46, vs33, vs13 + xsnmsubadp vs47, vs33, vs14 + +//############### OFFSET 2 ####################### + + addi T1, T1, 2*SIZE + + lxsdx vs0, o0, T1 + lxsdx vs1, o8, T1 + lxsdx vs2, o16, T1 + lxsdx vs3, o24, T1 + + addi T1, T1, 32 + + lxsdx vs4, o0, T1 + lxsdx vs5, o8, T1 + lxsdx vs6, o16, T1 + lxsdx vs7, o24, T1 + + addi T1, T1, 32 + + lxsdx vs8, o0, T1 + lxsdx vs9, o8, T1 + lxsdx vs10, o16, T1 + lxsdx vs11, o24, T1 + + addi T1, T1, 32 + + lxsdx vs12, o0, T1 + lxsdx vs13, o8, T1 + + addi T1, T1, 16 + + xsmuldp vs34, vs34, vs0 + xsnmsubadp vs35, vs34, vs1 + xsnmsubadp vs36, vs34, vs2 + xsnmsubadp vs37, vs34, vs3 + xsnmsubadp vs38, vs34, vs4 + xsnmsubadp vs39, vs34, vs5 + xsnmsubadp vs40, vs34, vs6 + xsnmsubadp vs41, vs34, vs7 + xsnmsubadp vs42, vs34, vs8 + xsnmsubadp vs43, vs34, vs9 + xsnmsubadp vs44, vs34, vs10 + xsnmsubadp vs45, vs34, vs11 + xsnmsubadp vs46, vs34, vs12 + xsnmsubadp vs47, vs34, vs13 + +//############### OFFSET 3 ####################### + + addi T1, T1, 3*SIZE + + lxsdx vs0, o0, T1 + lxsdx vs1, o8, T1 + lxsdx vs2, o16, T1 + lxsdx vs3, o24, T1 + + addi T1, T1, 32 + + lxsdx vs4, o0, T1 + lxsdx vs5, o8, T1 + lxsdx vs6, o16, T1 + lxsdx vs7, o24, T1 + + addi T1, T1, 32 + + lxsdx vs8, o0, T1 + lxsdx vs9, o8, T1 + lxsdx vs10, o16, T1 + lxsdx vs11, o24, T1 + + addi T1, T1, 32 + + lxsdx vs12, o0, T1 + + addi T1, T1, 8 + + xsmuldp vs35, vs35, vs0 + xsnmsubadp vs36, vs35, vs1 + xsnmsubadp vs37, vs35, vs2 + xsnmsubadp vs38, vs35, vs3 + xsnmsubadp vs39, vs35, vs4 + xsnmsubadp vs40, vs35, vs5 + xsnmsubadp vs41, vs35, vs6 + xsnmsubadp vs42, vs35, vs7 + xsnmsubadp vs43, vs35, vs8 + xsnmsubadp vs44, vs35, vs9 + xsnmsubadp vs45, vs35, vs10 + xsnmsubadp vs46, vs35, vs11 
+ xsnmsubadp vs47, vs35, vs12 + +//############### OFFSET 4 ####################### + + addi T1, T1, 4*SIZE + + lxsdx vs0, o0, T1 + lxsdx vs1, o8, T1 + lxsdx vs2, o16, T1 + lxsdx vs3, o24, T1 + + addi T1, T1, 32 + + lxsdx vs4, o0, T1 + lxsdx vs5, o8, T1 + lxsdx vs6, o16, T1 + lxsdx vs7, o24, T1 + + addi T1, T1, 32 + + lxsdx vs8, o0, T1 + lxsdx vs9, o8, T1 + lxsdx vs10, o16, T1 + lxsdx vs11, o24, T1 + + addi T1, T1, 32 + + xsmuldp vs36, vs36, vs0 + xsnmsubadp vs37, vs36, vs1 + xsnmsubadp vs38, vs36, vs2 + xsnmsubadp vs39, vs36, vs3 + xsnmsubadp vs40, vs36, vs4 + xsnmsubadp vs41, vs36, vs5 + xsnmsubadp vs42, vs36, vs6 + xsnmsubadp vs43, vs36, vs7 + xsnmsubadp vs44, vs36, vs8 + xsnmsubadp vs45, vs36, vs9 + xsnmsubadp vs46, vs36, vs10 + xsnmsubadp vs47, vs36, vs11 + +//############### OFFSET 5 ####################### + + addi T1, T1, 5*SIZE + + lxsdx vs0, o0, T1 + lxsdx vs1, o8, T1 + lxsdx vs2, o16, T1 + lxsdx vs3, o24, T1 + + addi T1, T1, 32 + + lxsdx vs4, o0, T1 + lxsdx vs5, o8, T1 + lxsdx vs6, o16, T1 + lxsdx vs7, o24, T1 + + addi T1, T1, 32 + + lxsdx vs8, o0, T1 + lxsdx vs9, o8, T1 + lxsdx vs10, o16, T1 + + addi T1, T1, 24 + + xsmuldp vs37, vs37, vs0 + xsnmsubadp vs38, vs37, vs1 + xsnmsubadp vs39, vs37, vs2 + xsnmsubadp vs40, vs37, vs3 + xsnmsubadp vs41, vs37, vs4 + xsnmsubadp vs42, vs37, vs5 + xsnmsubadp vs43, vs37, vs6 + xsnmsubadp vs44, vs37, vs7 + xsnmsubadp vs45, vs37, vs8 + xsnmsubadp vs46, vs37, vs9 + xsnmsubadp vs47, vs37, vs10 + +//############### OFFSET 6 ####################### + + addi T1, T1, 6*SIZE + + lxsdx vs0, o0, T1 + lxsdx vs1, o8, T1 + lxsdx vs2, o16, T1 + lxsdx vs3, o24, T1 + + addi T1, T1, 32 + + lxsdx vs4, o0, T1 + lxsdx vs5, o8, T1 + lxsdx vs6, o16, T1 + lxsdx vs7, o24, T1 + + addi T1, T1, 32 + + lxsdx vs8, o0, T1 + lxsdx vs9, o8, T1 + + addi T1, T1, 16 + + xsmuldp vs38, vs38, vs0 + xsnmsubadp vs39, vs38, vs1 + xsnmsubadp vs40, vs38, vs2 + xsnmsubadp vs41, vs38, vs3 + xsnmsubadp vs42, vs38, vs4 + xsnmsubadp vs43, vs38, vs5 + xsnmsubadp vs44, 
vs38, vs6 + xsnmsubadp vs45, vs38, vs7 + xsnmsubadp vs46, vs38, vs8 + xsnmsubadp vs47, vs38, vs9 + +//############### OFFSET 7 ####################### + + addi T1, T1, 7*SIZE + + lxsdx vs0, o0, T1 + lxsdx vs1, o8, T1 + lxsdx vs2, o16, T1 + lxsdx vs3, o24, T1 + + addi T1, T1, 32 + + lxsdx vs4, o0, T1 + lxsdx vs5, o8, T1 + lxsdx vs6, o16, T1 + lxsdx vs7, o24, T1 + + addi T1, T1, 32 + + lxsdx vs8, o0, T1 + + addi T1, T1, 8 + + xsmuldp vs39, vs39, vs0 + xsnmsubadp vs40, vs39, vs1 + xsnmsubadp vs41, vs39, vs2 + xsnmsubadp vs42, vs39, vs3 + xsnmsubadp vs43, vs39, vs4 + xsnmsubadp vs44, vs39, vs5 + xsnmsubadp vs45, vs39, vs6 + xsnmsubadp vs46, vs39, vs7 + xsnmsubadp vs47, vs39, vs8 + +//############### OFFSET 8 ####################### + + addi T1, T1, 8*SIZE + + lxsdx vs0, o0, T1 + lxsdx vs1, o8, T1 + lxsdx vs2, o16, T1 + lxsdx vs3, o24, T1 + + addi T1, T1, 32 + + lxsdx vs4, o0, T1 + lxsdx vs5, o8, T1 + lxsdx vs6, o16, T1 + lxsdx vs7, o24, T1 + + addi T1, T1, 32 + + xsmuldp vs40, vs40, vs0 + xsnmsubadp vs41, vs40, vs1 + xsnmsubadp vs42, vs40, vs2 + xsnmsubadp vs43, vs40, vs3 + xsnmsubadp vs44, vs40, vs4 + xsnmsubadp vs45, vs40, vs5 + xsnmsubadp vs46, vs40, vs6 + xsnmsubadp vs47, vs40, vs7 + +//############### OFFSET 9 ####################### + + addi T1, T1, 9*SIZE + + lxsdx vs0, o0, T1 + lxsdx vs1, o8, T1 + lxsdx vs2, o16, T1 + lxsdx vs3, o24, T1 + + addi T1, T1, 32 + + lxsdx vs4, o0, T1 + lxsdx vs5, o8, T1 + lxsdx vs6, o16, T1 + + addi T1, T1, 24 + + xsmuldp vs41, vs41, vs0 + xsnmsubadp vs42, vs41, vs1 + xsnmsubadp vs43, vs41, vs2 + xsnmsubadp vs44, vs41, vs3 + xsnmsubadp vs45, vs41, vs4 + xsnmsubadp vs46, vs41, vs5 + xsnmsubadp vs47, vs41, vs6 + +//############### OFFSET 10 ####################### + + addi T1, T1, 10*SIZE + + lxsdx vs0, o0, T1 + lxsdx vs1, o8, T1 + lxsdx vs2, o16, T1 + lxsdx vs3, o24, T1 + + addi T1, T1, 32 + + lxsdx vs4, o0, T1 + lxsdx vs5, o8, T1 + + addi T1, T1, 16 + + xsmuldp vs42, vs42, vs0 + xsnmsubadp vs43, vs42, vs1 + xsnmsubadp vs44, vs42, vs2 
+ xsnmsubadp vs45, vs42, vs3 + xsnmsubadp vs46, vs42, vs4 + xsnmsubadp vs47, vs42, vs5 + +//############### OFFSET 11 ####################### + + addi T1, T1, 11*SIZE + + lxsdx vs0, o0, T1 + lxsdx vs1, o8, T1 + lxsdx vs2, o16, T1 + lxsdx vs3, o24, T1 + + addi T1, T1, 32 + + lxsdx vs4, o0, T1 + + addi T1, T1, 8 + + xsmuldp vs43, vs43, vs0 + xsnmsubadp vs44, vs43, vs1 + xsnmsubadp vs45, vs43, vs2 + xsnmsubadp vs46, vs43, vs3 + xsnmsubadp vs47, vs43, vs4 + +//############### OFFSET 12 ####################### + + addi T1, T1, 12*SIZE + + lxsdx vs0, o0, T1 + lxsdx vs1, o8, T1 + lxsdx vs2, o16, T1 + lxsdx vs3, o24, T1 + + addi T1, T1, 32 + + xsmuldp vs44, vs44, vs0 + xsnmsubadp vs45, vs44, vs1 + xsnmsubadp vs46, vs44, vs2 + xsnmsubadp vs47, vs44, vs3 + +//############### OFFSET 13 ####################### + + addi T1, T1, 13*SIZE + + lxsdx vs0, o0, T1 + lxsdx vs1, o8, T1 + lxsdx vs2, o16, T1 + + addi T1, T1, 24 + + xsmuldp vs45, vs45, vs0 + xsnmsubadp vs46, vs45, vs1 + xsnmsubadp vs47, vs45, vs2 + +//############### OFFSET 14 ####################### + + addi T1, T1, 14*SIZE + + lxsdx vs0, o0, T1 + lxsdx vs1, o8, T1 + + addi T1, T1, 16 + + xsmuldp vs46, vs46, vs0 + xsnmsubadp vs47, vs46, vs1 + +//############### OFFSET 15 ####################### + + addi T1, T1, 15*SIZE + + lxsdx vs0, o0, T1 + + addi T1, T1, 8 + + xsmuldp vs47, vs47, vs0 + +//############### SAVE B ####################### + + + mr T1, BO + + + stxsdx vs32, o0, T1 + stxsdx vs33, o8, T1 + stxsdx vs34, o16, T1 + stxsdx vs35, o24, T1 + + addi T1, T1, 32 + + stxsdx vs36, o0, T1 + stxsdx vs37, o8, T1 + stxsdx vs38, o16, T1 + stxsdx vs39, o24, T1 + + addi T1, T1, 32 + + stxsdx vs40, o0, T1 + stxsdx vs41, o8, T1 + stxsdx vs42, o16, T1 + stxsdx vs43, o24, T1 + + addi T1, T1, 32 + + stxsdx vs44, o0, T1 + stxsdx vs45, o8, T1 + stxsdx vs46, o16, T1 + stxsdx vs47, o24, T1 + +//############### SAVE C ####################### + + + mr T1, CO + + stxsdx vs32, o0, T1 + stxsdx vs33, o8, T1 + stxsdx vs34, o16, T1 + stxsdx 
vs35, o24, T1 + + addi T1, T1, 32 + + stxsdx vs36, o0, T1 + stxsdx vs37, o8, T1 + stxsdx vs38, o16, T1 + stxsdx vs39, o24, T1 + + addi T1, T1, 32 + + stxsdx vs40, o0, T1 + stxsdx vs41, o8, T1 + stxsdx vs42, o16, T1 + stxsdx vs43, o24, T1 + + addi T1, T1, 32 + + stxsdx vs44, o0, T1 + stxsdx vs45, o8, T1 + stxsdx vs46, o16, T1 + stxsdx vs47, o24, T1 + +.endm + + +/*########################################################################################## + SOLVE_LT 8x1 +##########################################################################################*/ + +.macro SOLVE_LT_8x1 + + xxswapd vs0, vs32 + xxswapd vs1, vs33 + xxswapd vs2, vs34 + xxswapd vs3, vs35 + xxswapd vs4, vs36 + xxswapd vs5, vs37 + xxswapd vs6, vs38 + xxswapd vs7, vs39 + +//############### LOAD B ####################### + + + mr T1, BO + + lxsdx vs32, o0, T1 + lxsdx vs33, o8, T1 + lxsdx vs34, o16, T1 + lxsdx vs35, o24, T1 + + addi T1, T1, 32 + + lxsdx vs36, o0, T1 + lxsdx vs37, o8, T1 + lxsdx vs38, o16, T1 + lxsdx vs39, o24, T1 + + xssubdp vs32, vs32, vs0 + xssubdp vs33, vs33, vs1 + xssubdp vs34, vs34, vs2 + xssubdp vs35, vs35, vs3 + xssubdp vs36, vs36, vs4 + xssubdp vs37, vs37, vs5 + xssubdp vs38, vs38, vs6 + xssubdp vs39, vs39, vs7 + + mr T1, AO + + +//############### OFFSET 0 ####################### + + lxsdx vs0, o0, T1 + lxsdx vs1, o8, T1 + lxsdx vs2, o16, T1 + lxsdx vs3, o24, T1 + + addi T1, T1, 32 + + lxsdx vs4, o0, T1 + lxsdx vs5, o8, T1 + lxsdx vs6, o16, T1 + lxsdx vs7, o24, T1 + + addi T1, T1, 32 + + xsmuldp vs32, vs32, vs0 + xsnmsubadp vs33, vs32, vs1 + xsnmsubadp vs34, vs32, vs2 + xsnmsubadp vs35, vs32, vs3 + xsnmsubadp vs36, vs32, vs4 + xsnmsubadp vs37, vs32, vs5 + xsnmsubadp vs38, vs32, vs6 + xsnmsubadp vs39, vs32, vs7 + +//############### OFFSET 1 ####################### + + addi T1, T1, 1*SIZE + + lxsdx vs0, o0, T1 + lxsdx vs1, o8, T1 + lxsdx vs2, o16, T1 + lxsdx vs3, o24, T1 + + addi T1, T1, 32 + + lxsdx vs4, o0, T1 + lxsdx vs5, o8, T1 + lxsdx vs6, o16, T1 + + addi T1, T1, 24 
+ + xsmuldp vs33, vs33, vs0 + xsnmsubadp vs34, vs33, vs1 + xsnmsubadp vs35, vs33, vs2 + xsnmsubadp vs36, vs33, vs3 + xsnmsubadp vs37, vs33, vs4 + xsnmsubadp vs38, vs33, vs5 + xsnmsubadp vs39, vs33, vs6 + +//############### OFFSET 2 ####################### + + addi T1, T1, 2*SIZE + + lxsdx vs0, o0, T1 + lxsdx vs1, o8, T1 + lxsdx vs2, o16, T1 + lxsdx vs3, o24, T1 + + addi T1, T1, 32 + + lxsdx vs4, o0, T1 + lxsdx vs5, o8, T1 + + addi T1, T1, 16 + + xsmuldp vs34, vs34, vs0 + xsnmsubadp vs35, vs34, vs1 + xsnmsubadp vs36, vs34, vs2 + xsnmsubadp vs37, vs34, vs3 + xsnmsubadp vs38, vs34, vs4 + xsnmsubadp vs39, vs34, vs5 + +//############### OFFSET 3 ####################### + + addi T1, T1, 3*SIZE + + lxsdx vs0, o0, T1 + lxsdx vs1, o8, T1 + lxsdx vs2, o16, T1 + lxsdx vs3, o24, T1 + + addi T1, T1, 32 + + lxsdx vs4, o0, T1 + + addi T1, T1, 8 + + xsmuldp vs35, vs35, vs0 + xsnmsubadp vs36, vs35, vs1 + xsnmsubadp vs37, vs35, vs2 + xsnmsubadp vs38, vs35, vs3 + xsnmsubadp vs39, vs35, vs4 + +//############### OFFSET 4 ####################### + + addi T1, T1, 4*SIZE + + lxsdx vs0, o0, T1 + lxsdx vs1, o8, T1 + lxsdx vs2, o16, T1 + lxsdx vs3, o24, T1 + + addi T1, T1, 32 + + xsmuldp vs36, vs36, vs0 + xsnmsubadp vs37, vs36, vs1 + xsnmsubadp vs38, vs36, vs2 + xsnmsubadp vs39, vs36, vs3 + +//############### OFFSET 5 ####################### + + addi T1, T1, 5*SIZE + + lxsdx vs0, o0, T1 + lxsdx vs1, o8, T1 + lxsdx vs2, o16, T1 + + addi T1, T1, 24 + + xsmuldp vs37, vs37, vs0 + xsnmsubadp vs38, vs37, vs1 + xsnmsubadp vs39, vs37, vs2 + +//############### OFFSET 6 ####################### + + addi T1, T1, 6*SIZE + + lxsdx vs0, o0, T1 + lxsdx vs1, o8, T1 + + addi T1, T1, 16 + + xsmuldp vs38, vs38, vs0 + xsnmsubadp vs39, vs38, vs1 + +//############### OFFSET 7 ####################### + + addi T1, T1, 7*SIZE + + lxsdx vs0, o0, T1 + + addi T1, T1, 8 + + xsmuldp vs39, vs39, vs0 + +//############### SAVE B ####################### + + + mr T1, BO + + + stxsdx vs32, o0, T1 + stxsdx vs33, o8, T1 + stxsdx 
vs34, o16, T1 + stxsdx vs35, o24, T1 + + addi T1, T1, 32 + + stxsdx vs36, o0, T1 + stxsdx vs37, o8, T1 + stxsdx vs38, o16, T1 + stxsdx vs39, o24, T1 + +//############### SAVE C ####################### + + + mr T1, CO + + stxsdx vs32, o0, T1 + stxsdx vs33, o8, T1 + stxsdx vs34, o16, T1 + stxsdx vs35, o24, T1 + + addi T1, T1, 32 + + stxsdx vs36, o0, T1 + stxsdx vs37, o8, T1 + stxsdx vs38, o16, T1 + stxsdx vs39, o24, T1 + +.endm + + +/*########################################################################################## + SOLVE_LT 4x1 +##########################################################################################*/ + +.macro SOLVE_LT_4x1 + + xxswapd vs0, vs32 + xxswapd vs1, vs33 + xxswapd vs2, vs34 + xxswapd vs3, vs35 + +//############### LOAD B ####################### + + + mr T1, BO + + lxsdx vs32, o0, T1 + lxsdx vs33, o8, T1 + lxsdx vs34, o16, T1 + lxsdx vs35, o24, T1 + + xssubdp vs32, vs32, vs0 + xssubdp vs33, vs33, vs1 + xssubdp vs34, vs34, vs2 + xssubdp vs35, vs35, vs3 + + mr T1, AO + + +//############### OFFSET 0 ####################### + + lxsdx vs0, o0, T1 + lxsdx vs1, o8, T1 + lxsdx vs2, o16, T1 + lxsdx vs3, o24, T1 + + addi T1, T1, 32 + + xsmuldp vs32, vs32, vs0 + xsnmsubadp vs33, vs32, vs1 + xsnmsubadp vs34, vs32, vs2 + xsnmsubadp vs35, vs32, vs3 + +//############### OFFSET 1 ####################### + + addi T1, T1, 1*SIZE + + lxsdx vs0, o0, T1 + lxsdx vs1, o8, T1 + lxsdx vs2, o16, T1 + + addi T1, T1, 24 + + xsmuldp vs33, vs33, vs0 + xsnmsubadp vs34, vs33, vs1 + xsnmsubadp vs35, vs33, vs2 + +//############### OFFSET 2 ####################### + + addi T1, T1, 2*SIZE + + lxsdx vs0, o0, T1 + lxsdx vs1, o8, T1 + + addi T1, T1, 16 + + xsmuldp vs34, vs34, vs0 + xsnmsubadp vs35, vs34, vs1 + +//############### OFFSET 3 ####################### + + addi T1, T1, 3*SIZE + + lxsdx vs0, o0, T1 + + addi T1, T1, 8 + + xsmuldp vs35, vs35, vs0 + +//############### SAVE B ####################### + + + mr T1, BO + + + stxsdx vs32, o0, T1 + stxsdx vs33, o8, T1 
+ stxsdx vs34, o16, T1 + stxsdx vs35, o24, T1 + +//############### SAVE C ####################### + + + mr T1, CO + + stxsdx vs32, o0, T1 + stxsdx vs33, o8, T1 + stxsdx vs34, o16, T1 + stxsdx vs35, o24, T1 + +.endm + + +/*########################################################################################## + SOLVE_LT 2x1 +##########################################################################################*/ + +.macro SOLVE_LT_2x1 + + xxswapd vs0, vs32 + xxswapd vs1, vs33 + +//############### LOAD B ####################### + + + mr T1, BO + + lxsdx vs32, o0, T1 + lxsdx vs33, o8, T1 + + xssubdp vs32, vs32, vs0 + xssubdp vs33, vs33, vs1 + + mr T1, AO + + +//############### OFFSET 0 ####################### + + lxsdx vs0, o0, T1 + lxsdx vs1, o8, T1 + + addi T1, T1, 16 + + xsmuldp vs32, vs32, vs0 + xsnmsubadp vs33, vs32, vs1 + +//############### OFFSET 1 ####################### + + addi T1, T1, 1*SIZE + + lxsdx vs0, o0, T1 + + addi T1, T1, 8 + + xsmuldp vs33, vs33, vs0 + +//############### SAVE B ####################### + + + mr T1, BO + + + stxsdx vs32, o0, T1 + stxsdx vs33, o8, T1 + +//############### SAVE C ####################### + + + mr T1, CO + + stxsdx vs32, o0, T1 + stxsdx vs33, o8, T1 + +.endm + + +/*########################################################################################## + SOLVE_LT 1x1 +##########################################################################################*/ + +.macro SOLVE_LT_1x1 + + xxswapd vs0, vs32 + +//############### LOAD B ####################### + + + mr T1, BO + + lxsdx vs32, o0, T1 + + xssubdp vs32, vs32, vs0 + + mr T1, AO + + +//############### OFFSET 0 ####################### + + lxsdx vs0, o0, T1 + + addi T1, T1, 8 + + xsmuldp vs32, vs32, vs0 + +//############### SAVE B ####################### + + + mr T1, BO + + + stxsdx vs32, o0, T1 + +//############### SAVE C ####################### + + + mr T1, CO + + stxsdx vs32, o0, T1 + +.endm + From 318cad9c3725860f35ae302a334d4cf4531cfcc6 Mon Sep 17 
00:00:00 2001 From: Werner Saar Date: Sun, 22 May 2016 13:51:47 +0200 Subject: [PATCH 40/70] added trsm benchmarks for POWER8 to benchmark/Makefile --- benchmark/Makefile | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/benchmark/Makefile b/benchmark/Makefile index 38ccb8f44..e78750ec2 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -262,7 +262,8 @@ endif essl :: sgemm.essl strmm.essl dgemm.essl dtrmm.essl \ cgemm.essl ctrmm.essl zgemm.essl ztrmm.essl \ slinpack.essl clinpack.essl dlinpack.essl zlinpack.essl \ - scholesky.essl ccholesky.essl dcholesky.essl zcholesky.essl + scholesky.essl ccholesky.essl dcholesky.essl zcholesky.essl \ + strsm.essl dtrsm.essl ctrsm.essl ztrsm.essl veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ scholesky.veclib dcholesky.veclib ccholesky.veclib zcholesky.veclib \ @@ -696,6 +697,9 @@ strsm.mkl : strsm.$(SUFFIX) strsm.veclib : strsm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +strsm.essl : strsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Dtrsm #################################################### dtrsm.goto : dtrsm.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm @@ -712,6 +716,9 @@ dtrsm.mkl : dtrsm.$(SUFFIX) dtrsm.veclib : dtrsm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +dtrsm.essl : dtrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Ctrsm #################################################### ctrsm.goto : ctrsm.$(SUFFIX) ../$(LIBNAME) @@ -729,6 +736,9 @@ ctrsm.mkl : ctrsm.$(SUFFIX) ctrsm.veclib : ctrsm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +ctrsm.essl : ctrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^
$(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Ztrsm #################################################### ztrsm.goto : ztrsm.$(SUFFIX) ../$(LIBNAME) @@ -746,6 +756,9 @@ ztrsm.mkl : ztrsm.$(SUFFIX) ztrsm.veclib : ztrsm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +ztrsm.essl : ztrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Ssyrk #################################################### ssyrk.goto : ssyrk.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm From 8b140220c8dd4ac0b93204951486e1ef6d898efa Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Sun, 22 May 2016 15:20:04 +0200 Subject: [PATCH 41/70] optimized dtrsm_kernel_LT for POWER8 --- kernel/power/dtrsm_kernel_LT_16x4_power8.S | 1 + kernel/power/dtrsm_logic_LT_16x4_power8.S | 46 +++++++++++++++++++++- 2 files changed, 45 insertions(+), 2 deletions(-) diff --git a/kernel/power/dtrsm_kernel_LT_16x4_power8.S b/kernel/power/dtrsm_kernel_LT_16x4_power8.S index e1c6249f8..fdfc5ac70 100644 --- a/kernel/power/dtrsm_kernel_LT_16x4_power8.S +++ b/kernel/power/dtrsm_kernel_LT_16x4_power8.S @@ -219,6 +219,7 @@ li o24, 24 li o32, 32 li o48, 48 + li PRE, 384 mr KK, OFFSET diff --git a/kernel/power/dtrsm_logic_LT_16x4_power8.S b/kernel/power/dtrsm_logic_LT_16x4_power8.S index d5d34b422..540a64062 100644 --- a/kernel/power/dtrsm_logic_LT_16x4_power8.S +++ b/kernel/power/dtrsm_logic_LT_16x4_power8.S @@ -18,6 +18,33 @@ DSTRM_LT_L4x16_BEGIN: mr BO, B + li L, -128 + + mr T1, CO + add T2, T1, LDC + add T3, T2, LDC + add T4, T3, LDC + + and T1, T1, L + and T2, T2, L + and T3, T3, L + and T4, T4, L + + dcbt T1, r0 + dcbt T2, r0 + dcbt T3, r0 + dcbt T4, r0 + + addi T1, T1, 128 + addi T2, T2, 128 + addi T3, T3, 128 + addi T4, T4, 128 + + dcbt T1, r0 + dcbt T2, r0 + dcbt T3, r0 + dcbt T4, r0 + DSTRM_LT_L4x16_LOOP_START: @@ 
-26,15 +53,30 @@ DSTRM_LT_L4x16_LOOP_START: addic. L, KK, 0 - ble DSTRM_LT_L4x16_SAVE + ble- DSTRM_LT_L4x16_SAVE DSTRM_LT_L4x16_LOOP: + dcbt AO, PRE + dcbt BO, PRE + KERNEL_16x4 + addic. L, L, -1 + ble- DSTRM_LT_L4x16_SAVE + + dcbt AO, PRE + KERNEL_16x4 + addic. L, L, -1 + ble- DSTRM_LT_L4x16_SAVE + dcbt AO, PRE KERNEL_16x4 + addic. L, L, -1 + ble- DSTRM_LT_L4x16_SAVE + dcbt AO, PRE + KERNEL_16x4 addic. L, L, -1 - bgt DSTRM_LT_L4x16_LOOP + bgt+ DSTRM_LT_L4x16_LOOP DSTRM_LT_L4x16_SAVE: From 412bcd187abe59d16821afc6787be13faf243c18 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Mon, 23 May 2016 11:20:41 +0200 Subject: [PATCH 42/70] optimized dtrsm_logic_LT_16x4_power8.S and dtrsm_macros_LT_16x4_power8.S --- kernel/power/dtrsm_logic_LT_16x4_power8.S | 13 +- kernel/power/dtrsm_macros_LT_16x4_power8.S | 704 +++++++++++---------- 2 files changed, 375 insertions(+), 342 deletions(-) diff --git a/kernel/power/dtrsm_logic_LT_16x4_power8.S b/kernel/power/dtrsm_logic_LT_16x4_power8.S index 540a64062..04f5fdd90 100644 --- a/kernel/power/dtrsm_logic_LT_16x4_power8.S +++ b/kernel/power/dtrsm_logic_LT_16x4_power8.S @@ -54,29 +54,26 @@ DSTRM_LT_L4x16_LOOP_START: addic. L, KK, 0 ble- DSTRM_LT_L4x16_SAVE + mtctr L DSTRM_LT_L4x16_LOOP: dcbt AO, PRE dcbt BO, PRE KERNEL_16x4 - addic. L, L, -1 - ble- DSTRM_LT_L4x16_SAVE + bdz- DSTRM_LT_L4x16_SAVE dcbt AO, PRE KERNEL_16x4 - addic. L, L, -1 - ble- DSTRM_LT_L4x16_SAVE + bdz- DSTRM_LT_L4x16_SAVE dcbt AO, PRE KERNEL_16x4 - addic. L, L, -1 - ble- DSTRM_LT_L4x16_SAVE + bdz- DSTRM_LT_L4x16_SAVE dcbt AO, PRE KERNEL_16x4 - addic. 
L, L, -1 - bgt+ DSTRM_LT_L4x16_LOOP + bdnz+ DSTRM_LT_L4x16_LOOP DSTRM_LT_L4x16_SAVE: diff --git a/kernel/power/dtrsm_macros_LT_16x4_power8.S b/kernel/power/dtrsm_macros_LT_16x4_power8.S index 14e8402c9..dc47daa3a 100644 --- a/kernel/power/dtrsm_macros_LT_16x4_power8.S +++ b/kernel/power/dtrsm_macros_LT_16x4_power8.S @@ -44,10 +44,17 @@ lxvd2x vs0, o0, AO + + lxvdsx vs16, o0, BO + lxvdsx vs17, o8, BO + lxvdsx vs18, o16, BO + lxvdsx vs19, o24, BO + lxvd2x vs1, o16, AO lxvd2x vs2, o32, AO lxvd2x vs3, o48, AO + addi BO, BO, 32 addi AO, AO, 64 lxvd2x vs4, o0, AO @@ -57,13 +64,6 @@ addi AO, AO, 64 - lxvdsx vs16, o0, BO - lxvdsx vs17, o8, BO - lxvdsx vs18, o16, BO - lxvdsx vs19, o24, BO - - addi BO, BO, 32 - xvmaddadp vs32, vs0, vs16 xvmaddadp vs33, vs0, vs17 xvmaddadp vs34, vs0, vs18 @@ -287,52 +287,16 @@ .macro SOLVE_LT_16x4 +//############### LOAD B ####################### + + mr T1, BO + mr T4, BO + xxpermdi vs0, vs32, vs33, 0 xxpermdi vs1, vs34, vs35, 0 xxpermdi vs2, vs32, vs33, 3 xxpermdi vs3, vs34, vs35, 3 - xxpermdi vs4, vs36, vs37, 0 - xxpermdi vs5, vs38, vs39, 0 - xxpermdi vs6, vs36, vs37, 3 - xxpermdi vs7, vs38, vs39, 3 - - xxpermdi vs8, vs40, vs41, 0 - xxpermdi vs9, vs42, vs43, 0 - xxpermdi vs10, vs40, vs41, 3 - xxpermdi vs11, vs42, vs43, 3 - - xxpermdi vs12, vs44, vs45, 0 - xxpermdi vs13, vs46, vs47, 0 - xxpermdi vs14, vs44, vs45, 3 - xxpermdi vs15, vs46, vs47, 3 - - xxpermdi vs16, vs48, vs49, 0 - xxpermdi vs17, vs50, vs51, 0 - xxpermdi vs18, vs48, vs49, 3 - xxpermdi vs19, vs50, vs51, 3 - - xxpermdi vs20, vs52, vs53, 0 - xxpermdi vs21, vs54, vs55, 0 - xxpermdi vs22, vs52, vs53, 3 - xxpermdi vs23, vs54, vs55, 3 - - xxpermdi vs24, vs56, vs57, 0 - xxpermdi vs25, vs58, vs59, 0 - xxpermdi vs26, vs56, vs57, 3 - xxpermdi vs27, vs58, vs59, 3 - - xxpermdi vs28, vs60, vs61, 0 - xxpermdi vs29, vs62, vs63, 0 - xxpermdi vs30, vs60, vs61, 3 - xxpermdi vs31, vs62, vs63, 3 - - -//############### LOAD B ####################### - - - mr T1, BO - lxvd2x vs32, o0, T1 lxvd2x 
vs33, o16, T1 lxvd2x vs34, o32, T1 @@ -340,6 +304,11 @@ addi T1, T1, 64 + xxpermdi vs4, vs36, vs37, 0 + xxpermdi vs5, vs38, vs39, 0 + xxpermdi vs6, vs36, vs37, 3 + xxpermdi vs7, vs38, vs39, 3 + lxvd2x vs36, o0, T1 lxvd2x vs37, o16, T1 lxvd2x vs38, o32, T1 @@ -347,6 +316,11 @@ addi T1, T1, 64 + xxpermdi vs8, vs40, vs41, 0 + xxpermdi vs9, vs42, vs43, 0 + xxpermdi vs10, vs40, vs41, 3 + xxpermdi vs11, vs42, vs43, 3 + lxvd2x vs40, o0, T1 lxvd2x vs41, o16, T1 lxvd2x vs42, o32, T1 @@ -354,6 +328,11 @@ addi T1, T1, 64 + xxpermdi vs12, vs44, vs45, 0 + xxpermdi vs13, vs46, vs47, 0 + xxpermdi vs14, vs44, vs45, 3 + xxpermdi vs15, vs46, vs47, 3 + lxvd2x vs44, o0, T1 lxvd2x vs45, o16, T1 lxvd2x vs46, o32, T1 @@ -361,6 +340,11 @@ addi T1, T1, 64 + xxpermdi vs16, vs48, vs49, 0 + xxpermdi vs17, vs50, vs51, 0 + xxpermdi vs18, vs48, vs49, 3 + xxpermdi vs19, vs50, vs51, 3 + lxvd2x vs48, o0, T1 lxvd2x vs49, o16, T1 lxvd2x vs50, o32, T1 @@ -368,6 +352,11 @@ addi T1, T1, 64 + xxpermdi vs20, vs52, vs53, 0 + xxpermdi vs21, vs54, vs55, 0 + xxpermdi vs22, vs52, vs53, 3 + xxpermdi vs23, vs54, vs55, 3 + lxvd2x vs52, o0, T1 lxvd2x vs53, o16, T1 lxvd2x vs54, o32, T1 @@ -375,6 +364,11 @@ addi T1, T1, 64 + xxpermdi vs24, vs56, vs57, 0 + xxpermdi vs25, vs58, vs59, 0 + xxpermdi vs26, vs56, vs57, 3 + xxpermdi vs27, vs58, vs59, 3 + lxvd2x vs56, o0, T1 lxvd2x vs57, o16, T1 lxvd2x vs58, o32, T1 @@ -382,76 +376,94 @@ addi T1, T1, 64 + xxpermdi vs28, vs60, vs61, 0 + xxpermdi vs29, vs62, vs63, 0 + xxpermdi vs30, vs60, vs61, 3 + xxpermdi vs31, vs62, vs63, 3 + + + lxvd2x vs60, o0, T1 lxvd2x vs61, o16, T1 lxvd2x vs62, o32, T1 lxvd2x vs63, o48, T1 +//############### OFFSET 0 ####################### + + dcbt AO, PRE + mr T1, AO + xvsubdp vs32, vs32, vs0 xvsubdp vs33, vs33, vs1 xvsubdp vs34, vs34, vs2 xvsubdp vs35, vs35, vs3 + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + xvsubdp vs36, vs36, vs4 xvsubdp vs37, vs37, vs5 xvsubdp vs38, vs38, vs6 xvsubdp 
vs39, vs39, vs7 + + lxvdsx vs4, o0, T1 + lxvdsx vs5, o8, T1 + lxvdsx vs6, o16, T1 + lxvdsx vs7, o24, T1 + + addi T1, T1, 32 + xvsubdp vs40, vs40, vs8 xvsubdp vs41, vs41, vs9 xvsubdp vs42, vs42, vs10 xvsubdp vs43, vs43, vs11 + + lxvdsx vs8, o0, T1 + lxvdsx vs9, o8, T1 + lxvdsx vs10, o16, T1 + lxvdsx vs11, o24, T1 + + addi T1, T1, 32 + xvsubdp vs44, vs44, vs12 xvsubdp vs45, vs45, vs13 xvsubdp vs46, vs46, vs14 xvsubdp vs47, vs47, vs15 + + lxvdsx vs12, o0, T1 + lxvdsx vs13, o8, T1 + lxvdsx vs14, o16, T1 + lxvdsx vs15, o24, T1 + + addi T1, T1, 32 + xvsubdp vs48, vs48, vs16 xvsubdp vs49, vs49, vs17 xvsubdp vs50, vs50, vs18 xvsubdp vs51, vs51, vs19 + xvsubdp vs52, vs52, vs20 xvsubdp vs53, vs53, vs21 xvsubdp vs54, vs54, vs22 xvsubdp vs55, vs55, vs23 + xvsubdp vs56, vs56, vs24 xvsubdp vs57, vs57, vs25 xvsubdp vs58, vs58, vs26 xvsubdp vs59, vs59, vs27 + xvsubdp vs60, vs60, vs28 xvsubdp vs61, vs61, vs29 xvsubdp vs62, vs62, vs30 xvsubdp vs63, vs63, vs31 - mr T1, AO - - -//############### OFFSET 0 ####################### - - lxvdsx vs0, o0, T1 - lxvdsx vs1, o8, T1 - lxvdsx vs2, o16, T1 - lxvdsx vs3, o24, T1 - - addi T1, T1, 32 - - lxvdsx vs4, o0, T1 - lxvdsx vs5, o8, T1 - lxvdsx vs6, o16, T1 - lxvdsx vs7, o24, T1 - - addi T1, T1, 32 - - lxvdsx vs8, o0, T1 - lxvdsx vs9, o8, T1 - lxvdsx vs10, o16, T1 - lxvdsx vs11, o24, T1 - - addi T1, T1, 32 - - lxvdsx vs12, o0, T1 - lxvdsx vs13, o8, T1 - lxvdsx vs14, o16, T1 - lxvdsx vs15, o24, T1 +//############### OFFSET 1 ####################### - addi T1, T1, 32 + addi T1, T1, 1*SIZE xvmuldp vs32, vs32, vs0 xvmuldp vs33, vs33, vs0 @@ -459,9 +471,18 @@ xvnmsubadp vs34, vs32, vs1 xvnmsubadp vs35, vs33, vs1 xvnmsubadp vs36, vs32, vs2 + dcbt T1, PRE xvnmsubadp vs37, vs33, vs2 xvnmsubadp vs38, vs32, vs3 xvnmsubadp vs39, vs33, vs3 + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + xvnmsubadp vs40, vs32, vs4 xvnmsubadp vs41, vs33, vs4 xvnmsubadp vs42, vs32, vs5 @@ -470,6 +491,14 @@ 
xvnmsubadp vs45, vs33, vs6 xvnmsubadp vs46, vs32, vs7 xvnmsubadp vs47, vs33, vs7 + + lxvdsx vs4, o0, T1 + lxvdsx vs5, o8, T1 + lxvdsx vs6, o16, T1 + lxvdsx vs7, o24, T1 + + addi T1, T1, 32 + xvnmsubadp vs48, vs32, vs8 xvnmsubadp vs49, vs33, vs8 xvnmsubadp vs50, vs32, vs9 @@ -478,6 +507,14 @@ xvnmsubadp vs53, vs33, vs10 xvnmsubadp vs54, vs32, vs11 xvnmsubadp vs55, vs33, vs11 + + lxvdsx vs8, o0, T1 + lxvdsx vs9, o8, T1 + lxvdsx vs10, o16, T1 + lxvdsx vs11, o24, T1 + + addi T1, T1, 32 + xvnmsubadp vs56, vs32, vs12 xvnmsubadp vs57, vs33, vs12 xvnmsubadp vs58, vs32, vs13 @@ -487,30 +524,6 @@ xvnmsubadp vs62, vs32, vs15 xvnmsubadp vs63, vs33, vs15 -//############### OFFSET 1 ####################### - - addi T1, T1, 1*SIZE - - lxvdsx vs0, o0, T1 - lxvdsx vs1, o8, T1 - lxvdsx vs2, o16, T1 - lxvdsx vs3, o24, T1 - - addi T1, T1, 32 - - lxvdsx vs4, o0, T1 - lxvdsx vs5, o8, T1 - lxvdsx vs6, o16, T1 - lxvdsx vs7, o24, T1 - - addi T1, T1, 32 - - lxvdsx vs8, o0, T1 - lxvdsx vs9, o8, T1 - lxvdsx vs10, o16, T1 - lxvdsx vs11, o24, T1 - - addi T1, T1, 32 lxvdsx vs12, o0, T1 lxvdsx vs13, o8, T1 @@ -518,15 +531,28 @@ addi T1, T1, 24 +//############### OFFSET 2 ####################### + xvmuldp vs34, vs34, vs0 xvmuldp vs35, vs35, vs0 + addi T1, T1, 2*SIZE + xvnmsubadp vs36, vs34, vs1 xvnmsubadp vs37, vs35, vs1 xvnmsubadp vs38, vs34, vs2 + dcbt T1, PRE xvnmsubadp vs39, vs35, vs2 xvnmsubadp vs40, vs34, vs3 xvnmsubadp vs41, vs35, vs3 + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + xvnmsubadp vs42, vs34, vs4 xvnmsubadp vs43, vs35, vs4 xvnmsubadp vs44, vs34, vs5 @@ -535,6 +561,14 @@ xvnmsubadp vs47, vs35, vs6 xvnmsubadp vs48, vs34, vs7 xvnmsubadp vs49, vs35, vs7 + + lxvdsx vs4, o0, T1 + lxvdsx vs5, o8, T1 + lxvdsx vs6, o16, T1 + lxvdsx vs7, o24, T1 + + addi T1, T1, 32 + xvnmsubadp vs50, vs34, vs8 xvnmsubadp vs51, vs35, vs8 xvnmsubadp vs52, vs34, vs9 @@ -543,6 +577,15 @@ xvnmsubadp vs55, vs35, vs10 xvnmsubadp vs56, vs34, vs11 
xvnmsubadp vs57, vs35, vs11 + + lxvdsx vs8, o0, T1 + lxvdsx vs9, o8, T1 + lxvdsx vs10, o16, T1 + lxvdsx vs11, o24, T1 + + addi T1, T1, 32 + + xvnmsubadp vs58, vs34, vs12 xvnmsubadp vs59, vs35, vs12 xvnmsubadp vs60, vs34, vs13 @@ -550,45 +593,32 @@ xvnmsubadp vs62, vs34, vs14 xvnmsubadp vs63, vs35, vs14 -//############### OFFSET 2 ####################### - - addi T1, T1, 2*SIZE + lxvdsx vs12, o0, T1 + lxvdsx vs13, o8, T1 - lxvdsx vs0, o0, T1 - lxvdsx vs1, o8, T1 - lxvdsx vs2, o16, T1 - lxvdsx vs3, o24, T1 - - addi T1, T1, 32 - - lxvdsx vs4, o0, T1 - lxvdsx vs5, o8, T1 - lxvdsx vs6, o16, T1 - lxvdsx vs7, o24, T1 - - addi T1, T1, 32 - - lxvdsx vs8, o0, T1 - lxvdsx vs9, o8, T1 - lxvdsx vs10, o16, T1 - lxvdsx vs11, o24, T1 - - addi T1, T1, 32 - - lxvdsx vs12, o0, T1 - lxvdsx vs13, o8, T1 - - addi T1, T1, 16 + addi T1, T1, 16 +//############### OFFSET 3 ####################### xvmuldp vs36, vs36, vs0 xvmuldp vs37, vs37, vs0 + addi T1, T1, 3*SIZE + xvnmsubadp vs38, vs36, vs1 xvnmsubadp vs39, vs37, vs1 xvnmsubadp vs40, vs36, vs2 + dcbt T1, PRE xvnmsubadp vs41, vs37, vs2 xvnmsubadp vs42, vs36, vs3 xvnmsubadp vs43, vs37, vs3 + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + xvnmsubadp vs44, vs36, vs4 xvnmsubadp vs45, vs37, vs4 xvnmsubadp vs46, vs36, vs5 @@ -597,6 +627,14 @@ xvnmsubadp vs49, vs37, vs6 xvnmsubadp vs50, vs36, vs7 xvnmsubadp vs51, vs37, vs7 + + lxvdsx vs4, o0, T1 + lxvdsx vs5, o8, T1 + lxvdsx vs6, o16, T1 + lxvdsx vs7, o24, T1 + + addi T1, T1, 32 + xvnmsubadp vs52, vs36, vs8 xvnmsubadp vs53, vs37, vs8 xvnmsubadp vs54, vs36, vs9 @@ -605,28 +643,6 @@ xvnmsubadp vs57, vs37, vs10 xvnmsubadp vs58, vs36, vs11 xvnmsubadp vs59, vs37, vs11 - xvnmsubadp vs60, vs36, vs12 - xvnmsubadp vs61, vs37, vs12 - xvnmsubadp vs62, vs36, vs13 - xvnmsubadp vs63, vs37, vs13 - -//############### OFFSET 3 ####################### - - addi T1, T1, 3*SIZE - - lxvdsx vs0, o0, T1 - lxvdsx vs1, o8, T1 - lxvdsx vs2, o16, T1 - lxvdsx vs3, 
o24, T1 - - addi T1, T1, 32 - - lxvdsx vs4, o0, T1 - lxvdsx vs5, o8, T1 - lxvdsx vs6, o16, T1 - lxvdsx vs7, o24, T1 - - addi T1, T1, 32 lxvdsx vs8, o0, T1 lxvdsx vs9, o8, T1 @@ -635,19 +651,43 @@ addi T1, T1, 32 + xvnmsubadp vs60, vs36, vs12 + xvnmsubadp vs61, vs37, vs12 + xvnmsubadp vs62, vs36, vs13 + xvnmsubadp vs63, vs37, vs13 + lxvdsx vs12, o0, T1 + stxvd2x vs32, o0, T4 + stxvd2x vs33, o16, T4 + stxvd2x vs34, o32, T4 + stxvd2x vs35, o48, T4 + + addi T4, T4, 64 + addi T1, T1, 8 +//############### OFFSET 4 ####################### xvmuldp vs38, vs38, vs0 xvmuldp vs39, vs39, vs0 + addi T1, T1, 4*SIZE + xvnmsubadp vs40, vs38, vs1 xvnmsubadp vs41, vs39, vs1 xvnmsubadp vs42, vs38, vs2 + dcbt T1, PRE xvnmsubadp vs43, vs39, vs2 xvnmsubadp vs44, vs38, vs3 xvnmsubadp vs45, vs39, vs3 + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + xvnmsubadp vs46, vs38, vs4 xvnmsubadp vs47, vs39, vs4 xvnmsubadp vs48, vs38, vs5 @@ -656,6 +696,15 @@ xvnmsubadp vs51, vs39, vs6 xvnmsubadp vs52, vs38, vs7 xvnmsubadp vs53, vs39, vs7 + + lxvdsx vs4, o0, T1 + lxvdsx vs5, o8, T1 + lxvdsx vs6, o16, T1 + lxvdsx vs7, o24, T1 + + addi T1, T1, 32 + + xvnmsubadp vs54, vs38, vs8 xvnmsubadp vs55, vs39, vs8 xvnmsubadp vs56, vs38, vs9 @@ -664,26 +713,6 @@ xvnmsubadp vs59, vs39, vs10 xvnmsubadp vs60, vs38, vs11 xvnmsubadp vs61, vs39, vs11 - xvnmsubadp vs62, vs38, vs12 - xvnmsubadp vs63, vs39, vs12 - -//############### OFFSET 4 ####################### - - addi T1, T1, 4*SIZE - - lxvdsx vs0, o0, T1 - lxvdsx vs1, o8, T1 - lxvdsx vs2, o16, T1 - lxvdsx vs3, o24, T1 - - addi T1, T1, 32 - - lxvdsx vs4, o0, T1 - lxvdsx vs5, o8, T1 - lxvdsx vs6, o16, T1 - lxvdsx vs7, o24, T1 - - addi T1, T1, 32 lxvdsx vs8, o0, T1 lxvdsx vs9, o8, T1 @@ -692,15 +721,31 @@ addi T1, T1, 32 + xvnmsubadp vs62, vs38, vs12 + xvnmsubadp vs63, vs39, vs12 + + +//############### OFFSET 5 ####################### xvmuldp vs40, vs40, vs0 xvmuldp vs41, vs41, vs0 + addi T1, T1, 5*SIZE + 
xvnmsubadp vs42, vs40, vs1 xvnmsubadp vs43, vs41, vs1 xvnmsubadp vs44, vs40, vs2 + dcbt T1, PRE xvnmsubadp vs45, vs41, vs2 xvnmsubadp vs46, vs40, vs3 xvnmsubadp vs47, vs41, vs3 + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + xvnmsubadp vs48, vs40, vs4 xvnmsubadp vs49, vs41, vs4 xvnmsubadp vs50, vs40, vs5 @@ -709,6 +754,14 @@ xvnmsubadp vs53, vs41, vs6 xvnmsubadp vs54, vs40, vs7 xvnmsubadp vs55, vs41, vs7 + + lxvdsx vs4, o0, T1 + lxvdsx vs5, o8, T1 + lxvdsx vs6, o16, T1 + lxvdsx vs7, o24, T1 + + addi T1, T1, 32 + xvnmsubadp vs56, vs40, vs8 xvnmsubadp vs57, vs41, vs8 xvnmsubadp vs58, vs40, vs9 @@ -718,23 +771,6 @@ xvnmsubadp vs62, vs40, vs11 xvnmsubadp vs63, vs41, vs11 -//############### OFFSET 5 ####################### - - addi T1, T1, 5*SIZE - - lxvdsx vs0, o0, T1 - lxvdsx vs1, o8, T1 - lxvdsx vs2, o16, T1 - lxvdsx vs3, o24, T1 - - addi T1, T1, 32 - - lxvdsx vs4, o0, T1 - lxvdsx vs5, o8, T1 - lxvdsx vs6, o16, T1 - lxvdsx vs7, o24, T1 - - addi T1, T1, 32 lxvdsx vs8, o0, T1 lxvdsx vs9, o8, T1 @@ -742,15 +778,27 @@ addi T1, T1, 24 +//############### OFFSET 6 ####################### xvmuldp vs42, vs42, vs0 xvmuldp vs43, vs43, vs0 + addi T1, T1, 6*SIZE + xvnmsubadp vs44, vs42, vs1 xvnmsubadp vs45, vs43, vs1 xvnmsubadp vs46, vs42, vs2 + dcbt T1, PRE xvnmsubadp vs47, vs43, vs2 xvnmsubadp vs48, vs42, vs3 xvnmsubadp vs49, vs43, vs3 + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + xvnmsubadp vs50, vs42, vs4 xvnmsubadp vs51, vs43, vs4 xvnmsubadp vs52, vs42, vs5 @@ -759,23 +807,6 @@ xvnmsubadp vs55, vs43, vs6 xvnmsubadp vs56, vs42, vs7 xvnmsubadp vs57, vs43, vs7 - xvnmsubadp vs58, vs42, vs8 - xvnmsubadp vs59, vs43, vs8 - xvnmsubadp vs60, vs42, vs9 - xvnmsubadp vs61, vs43, vs9 - xvnmsubadp vs62, vs42, vs10 - xvnmsubadp vs63, vs43, vs10 - -//############### OFFSET 6 ####################### - - addi T1, T1, 6*SIZE - - lxvdsx vs0, o0, T1 - lxvdsx vs1, o8, T1 - 
lxvdsx vs2, o16, T1 - lxvdsx vs3, o24, T1 - - addi T1, T1, 32 lxvdsx vs4, o0, T1 lxvdsx vs5, o8, T1 @@ -784,20 +815,46 @@ addi T1, T1, 32 + xvnmsubadp vs58, vs42, vs8 + xvnmsubadp vs59, vs43, vs8 + xvnmsubadp vs60, vs42, vs9 + xvnmsubadp vs61, vs43, vs9 + xvnmsubadp vs62, vs42, vs10 + xvnmsubadp vs63, vs43, vs10 + lxvdsx vs8, o0, T1 lxvdsx vs9, o8, T1 addi T1, T1, 16 + stxvd2x vs36, o0, T4 + stxvd2x vs37, o16, T4 + stxvd2x vs38, o32, T4 + stxvd2x vs39, o48, T4 + + addi T4, T4, 64 + +//############### OFFSET 7 ####################### xvmuldp vs44, vs44, vs0 xvmuldp vs45, vs45, vs0 + addi T1, T1, 7*SIZE + xvnmsubadp vs46, vs44, vs1 xvnmsubadp vs47, vs45, vs1 xvnmsubadp vs48, vs44, vs2 + dcbt T1, PRE xvnmsubadp vs49, vs45, vs2 xvnmsubadp vs50, vs44, vs3 xvnmsubadp vs51, vs45, vs3 + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + xvnmsubadp vs52, vs44, vs4 xvnmsubadp vs53, vs45, vs4 xvnmsubadp vs54, vs44, vs5 @@ -806,21 +863,6 @@ xvnmsubadp vs57, vs45, vs6 xvnmsubadp vs58, vs44, vs7 xvnmsubadp vs59, vs45, vs7 - xvnmsubadp vs60, vs44, vs8 - xvnmsubadp vs61, vs45, vs8 - xvnmsubadp vs62, vs44, vs9 - xvnmsubadp vs63, vs45, vs9 - -//############### OFFSET 7 ####################### - - addi T1, T1, 7*SIZE - - lxvdsx vs0, o0, T1 - lxvdsx vs1, o8, T1 - lxvdsx vs2, o16, T1 - lxvdsx vs3, o24, T1 - - addi T1, T1, 32 lxvdsx vs4, o0, T1 lxvdsx vs5, o8, T1 @@ -829,19 +871,36 @@ addi T1, T1, 32 + xvnmsubadp vs60, vs44, vs8 + xvnmsubadp vs61, vs45, vs8 + xvnmsubadp vs62, vs44, vs9 + xvnmsubadp vs63, vs45, vs9 + lxvdsx vs8, o0, T1 addi T1, T1, 8 +//############### OFFSET 8 ####################### xvmuldp vs46, vs46, vs0 xvmuldp vs47, vs47, vs0 + addi T1, T1, 8*SIZE + xvnmsubadp vs48, vs46, vs1 xvnmsubadp vs49, vs47, vs1 xvnmsubadp vs50, vs46, vs2 + dcbt T1, PRE xvnmsubadp vs51, vs47, vs2 xvnmsubadp vs52, vs46, vs3 xvnmsubadp vs53, vs47, vs3 + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 
+ + addi T1, T1, 32 + xvnmsubadp vs54, vs46, vs4 xvnmsubadp vs55, vs47, vs4 xvnmsubadp vs56, vs46, vs5 @@ -850,19 +909,6 @@ xvnmsubadp vs59, vs47, vs6 xvnmsubadp vs60, vs46, vs7 xvnmsubadp vs61, vs47, vs7 - xvnmsubadp vs62, vs46, vs8 - xvnmsubadp vs63, vs47, vs8 - -//############### OFFSET 8 ####################### - - addi T1, T1, 8*SIZE - - lxvdsx vs0, o0, T1 - lxvdsx vs1, o8, T1 - lxvdsx vs2, o16, T1 - lxvdsx vs3, o24, T1 - - addi T1, T1, 32 lxvdsx vs4, o0, T1 lxvdsx vs5, o8, T1 @@ -871,15 +917,38 @@ addi T1, T1, 32 + stxvd2x vs40, o0, T4 + stxvd2x vs41, o16, T4 + stxvd2x vs42, o32, T4 + stxvd2x vs43, o48, T4 + + addi T4, T4, 64 + + xvnmsubadp vs62, vs46, vs8 + xvnmsubadp vs63, vs47, vs8 + + +//############### OFFSET 9 ####################### xvmuldp vs48, vs48, vs0 xvmuldp vs49, vs49, vs0 + addi T1, T1, 9*SIZE + xvnmsubadp vs50, vs48, vs1 xvnmsubadp vs51, vs49, vs1 xvnmsubadp vs52, vs48, vs2 + dcbt T1, PRE xvnmsubadp vs53, vs49, vs2 xvnmsubadp vs54, vs48, vs3 xvnmsubadp vs55, vs49, vs3 + + lxvdsx vs0, o0, T1 + lxvdsx vs1, o8, T1 + lxvdsx vs2, o16, T1 + lxvdsx vs3, o24, T1 + + addi T1, T1, 32 + xvnmsubadp vs56, vs48, vs4 xvnmsubadp vs57, vs49, vs4 xvnmsubadp vs58, vs48, vs5 @@ -889,42 +958,25 @@ xvnmsubadp vs62, vs48, vs7 xvnmsubadp vs63, vs49, vs7 -//############### OFFSET 9 ####################### - - addi T1, T1, 9*SIZE - - lxvdsx vs0, o0, T1 - lxvdsx vs1, o8, T1 - lxvdsx vs2, o16, T1 - lxvdsx vs3, o24, T1 - - addi T1, T1, 32 - lxvdsx vs4, o0, T1 lxvdsx vs5, o8, T1 lxvdsx vs6, o16, T1 addi T1, T1, 24 +//############### OFFSET 10 ####################### xvmuldp vs50, vs50, vs0 xvmuldp vs51, vs51, vs0 + addi T1, T1, 10*SIZE + xvnmsubadp vs52, vs50, vs1 xvnmsubadp vs53, vs51, vs1 xvnmsubadp vs54, vs50, vs2 + dcbt T1, PRE xvnmsubadp vs55, vs51, vs2 xvnmsubadp vs56, vs50, vs3 xvnmsubadp vs57, vs51, vs3 - xvnmsubadp vs58, vs50, vs4 - xvnmsubadp vs59, vs51, vs4 - xvnmsubadp vs60, vs50, vs5 - xvnmsubadp vs61, vs51, vs5 - xvnmsubadp vs62, vs50, vs6 - xvnmsubadp vs63, 
vs51, vs6 - -//############### OFFSET 10 ####################### - - addi T1, T1, 10*SIZE lxvdsx vs0, o0, T1 lxvdsx vs1, o8, T1 @@ -933,28 +985,38 @@ addi T1, T1, 32 + xvnmsubadp vs58, vs50, vs4 + xvnmsubadp vs59, vs51, vs4 + xvnmsubadp vs60, vs50, vs5 + xvnmsubadp vs61, vs51, vs5 + xvnmsubadp vs62, vs50, vs6 + xvnmsubadp vs63, vs51, vs6 + lxvdsx vs4, o0, T1 lxvdsx vs5, o8, T1 addi T1, T1, 16 + stxvd2x vs44, o0, T4 + stxvd2x vs45, o16, T4 + stxvd2x vs46, o32, T4 + stxvd2x vs47, o48, T4 + + addi T4, T4, 64 + +//############### OFFSET 11 ####################### xvmuldp vs52, vs52, vs0 xvmuldp vs53, vs53, vs0 + addi T1, T1, 11*SIZE + xvnmsubadp vs54, vs52, vs1 xvnmsubadp vs55, vs53, vs1 xvnmsubadp vs56, vs52, vs2 + dcbt T1, PRE xvnmsubadp vs57, vs53, vs2 xvnmsubadp vs58, vs52, vs3 xvnmsubadp vs59, vs53, vs3 - xvnmsubadp vs60, vs52, vs4 - xvnmsubadp vs61, vs53, vs4 - xvnmsubadp vs62, vs52, vs5 - xvnmsubadp vs63, vs53, vs5 - -//############### OFFSET 11 ####################### - - addi T1, T1, 11*SIZE lxvdsx vs0, o0, T1 lxvdsx vs1, o8, T1 @@ -963,25 +1025,28 @@ addi T1, T1, 32 + xvnmsubadp vs60, vs52, vs4 + xvnmsubadp vs61, vs53, vs4 + xvnmsubadp vs62, vs52, vs5 + xvnmsubadp vs63, vs53, vs5 + lxvdsx vs4, o0, T1 addi T1, T1, 8 +//############### OFFSET 12 ####################### xvmuldp vs54, vs54, vs0 xvmuldp vs55, vs55, vs0 + addi T1, T1, 12*SIZE + xvnmsubadp vs56, vs54, vs1 xvnmsubadp vs57, vs55, vs1 xvnmsubadp vs58, vs54, vs2 + dcbt T1, PRE xvnmsubadp vs59, vs55, vs2 xvnmsubadp vs60, vs54, vs3 xvnmsubadp vs61, vs55, vs3 - xvnmsubadp vs62, vs54, vs4 - xvnmsubadp vs63, vs55, vs4 - -//############### OFFSET 12 ####################### - - addi T1, T1, 12*SIZE lxvdsx vs0, o0, T1 lxvdsx vs1, o8, T1 @@ -990,9 +1055,23 @@ addi T1, T1, 32 + stxvd2x vs48, o0, T4 + stxvd2x vs49, o16, T4 + stxvd2x vs50, o32, T4 + stxvd2x vs51, o48, T4 + + addi T4, T4, 64 + + xvnmsubadp vs62, vs54, vs4 + xvnmsubadp vs63, vs55, vs4 + + +//############### OFFSET 13 ####################### xvmuldp 
vs56, vs56, vs0 xvmuldp vs57, vs57, vs0 + addi T1, T1, 13*SIZE + xvnmsubadp vs58, vs56, vs1 xvnmsubadp vs59, vs57, vs1 xvnmsubadp vs60, vs56, vs2 @@ -1000,43 +1079,44 @@ xvnmsubadp vs62, vs56, vs3 xvnmsubadp vs63, vs57, vs3 -//############### OFFSET 13 ####################### - - addi T1, T1, 13*SIZE - lxvdsx vs0, o0, T1 lxvdsx vs1, o8, T1 lxvdsx vs2, o16, T1 addi T1, T1, 24 +//############### OFFSET 14 ####################### xvmuldp vs58, vs58, vs0 xvmuldp vs59, vs59, vs0 + addi T1, T1, 14*SIZE + xvnmsubadp vs60, vs58, vs1 xvnmsubadp vs61, vs59, vs1 xvnmsubadp vs62, vs58, vs2 xvnmsubadp vs63, vs59, vs2 -//############### OFFSET 14 ####################### - - addi T1, T1, 14*SIZE lxvdsx vs0, o0, T1 lxvdsx vs1, o8, T1 addi T1, T1, 16 + stxvd2x vs52, o0, T4 + stxvd2x vs53, o16, T4 + stxvd2x vs54, o32, T4 + stxvd2x vs55, o48, T4 + + addi T4, T4, 64 +//############### OFFSET 15 ####################### xvmuldp vs60, vs60, vs0 xvmuldp vs61, vs61, vs0 + addi T1, T1, 15*SIZE + xvnmsubadp vs62, vs60, vs1 xvnmsubadp vs63, vs61, vs1 -//############### OFFSET 15 ####################### - - addi T1, T1, 15*SIZE - lxvdsx vs0, o0, T1 addi T1, T1, 8 @@ -1048,62 +1128,18 @@ //############### SAVE B ####################### - mr T1, BO - - - stxvd2x vs32, o0, T1 - stxvd2x vs33, o16, T1 - stxvd2x vs34, o32, T1 - stxvd2x vs35, o48, T1 - - addi T1, T1, 64 - stxvd2x vs36, o0, T1 - stxvd2x vs37, o16, T1 - stxvd2x vs38, o32, T1 - stxvd2x vs39, o48, T1 + stxvd2x vs56, o0, T4 + stxvd2x vs57, o16, T4 + stxvd2x vs58, o32, T4 + stxvd2x vs59, o48, T4 - addi T1, T1, 64 - - stxvd2x vs40, o0, T1 - stxvd2x vs41, o16, T1 - stxvd2x vs42, o32, T1 - stxvd2x vs43, o48, T1 - - addi T1, T1, 64 - - stxvd2x vs44, o0, T1 - stxvd2x vs45, o16, T1 - stxvd2x vs46, o32, T1 - stxvd2x vs47, o48, T1 - - addi T1, T1, 64 - - stxvd2x vs48, o0, T1 - stxvd2x vs49, o16, T1 - stxvd2x vs50, o32, T1 - stxvd2x vs51, o48, T1 - - addi T1, T1, 64 - - stxvd2x vs52, o0, T1 - stxvd2x vs53, o16, T1 - stxvd2x vs54, o32, T1 - stxvd2x 
vs55, o48, T1 - - addi T1, T1, 64 - - stxvd2x vs56, o0, T1 - stxvd2x vs57, o16, T1 - stxvd2x vs58, o32, T1 - stxvd2x vs59, o48, T1 - - addi T1, T1, 64 + addi T4, T4, 64 - stxvd2x vs60, o0, T1 - stxvd2x vs61, o16, T1 - stxvd2x vs62, o32, T1 - stxvd2x vs63, o48, T1 + stxvd2x vs60, o0, T4 + stxvd2x vs61, o16, T4 + stxvd2x vs62, o32, T4 + stxvd2x vs63, o48, T4 //############### SAVE C ####################### From fca66262c4896a4dc92f93470f14e80b3c1e3648 Mon Sep 17 00:00:00 2001 From: Aleksey Kuleshov Date: Mon, 23 May 2016 13:24:15 +0300 Subject: [PATCH 43/70] mips64/axpy: fix error when INCY == 0 --- kernel/mips64/axpy.S | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/kernel/mips64/axpy.S b/kernel/mips64/axpy.S index 32694a99d..5d9728a48 100644 --- a/kernel/mips64/axpy.S +++ b/kernel/mips64/axpy.S @@ -225,7 +225,9 @@ .align 3 .L20: + beqz INCY, .L27 dsra I, N, 3 + move YY, Y blez I, .L25 @@ -405,5 +407,19 @@ j $31 NOP + .align 3 + +.L27: + LD b1, 0 * SIZE(Y) + +.L28: + daddiu N, N, -1 + LD a1, 0 * SIZE(X) + daddu X, X, INCX + bgtz N, .L28 + MADD b1, b1, ALPHA, a1 + + j .L999 + ST b1, 0 * SIZE(Y) EPILOGUE From 053044ae4dcb3b6c84f6ff049f19de9160b63465 Mon Sep 17 00:00:00 2001 From: John Biddiscombe Date: Wed, 25 May 2016 09:13:28 +0200 Subject: [PATCH 44/70] Replace CMAKE_SOURCE_DIR/CMAKE_BINARY_DIR with PROJECT_SOURCE_DIR/PROJECT_BINARY_DIR If OpenBLAS is built using add_subdirectory(OpenBlas) as part of another project then the paths set by CMAKE_XXX_DIR are relative to the parent project and not the OpenBLAS project. 
--- CMakeLists.txt | 10 +++++----- cmake/export.cmake | 2 +- cmake/prebuild.cmake | 14 ++++++------- cmake/system.cmake | 20 +++++++++---------- ctest/CMakeLists.txt | 2 +- driver/level2/CMakeLists.txt | 2 +- driver/level3/CMakeLists.txt | 2 +- driver/others/CMakeLists.txt | 2 +- interface/CMakeLists.txt | 2 +- kernel/CMakeLists.txt | 4 ++-- .../CMAKE/CheckFortranTypeSizes.cmake | 4 ++-- lapack-netlib/CMAKE/CheckTimeFunction.cmake | 4 ++-- lapack-netlib/CMAKE/FortranMangling.cmake | 8 ++++---- lapack/CMakeLists.txt | 2 +- test/CMakeLists.txt | 2 +- utest/CMakeLists.txt | 2 +- 16 files changed, 41 insertions(+), 41 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f5dfb8187..d96140232 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -45,8 +45,8 @@ endif() message(WARNING "CMake support is experimental. This will not produce the same Makefiles that OpenBLAS ships with. Only x86 support is currently available.") -include("${CMAKE_SOURCE_DIR}/cmake/utils.cmake") -include("${CMAKE_SOURCE_DIR}/cmake/system.cmake") +include("${PROJECT_SOURCE_DIR}/cmake/utils.cmake") +include("${PROJECT_SOURCE_DIR}/cmake/system.cmake") set(BLASDIRS interface driver/level2 driver/level3 driver/others) @@ -123,9 +123,9 @@ endforeach () # Can't just use lapack-netlib's CMake files, since they are set up to search for BLAS, build and install a binary. We just want to build a couple of lib files out of lapack and lapacke. # Not using add_subdirectory here because lapack-netlib already has its own CMakeLists.txt. Instead include a cmake script with the sources we want. 
if (NOT NOFORTRAN AND NOT NO_LAPACK) - include("${CMAKE_SOURCE_DIR}/cmake/lapack.cmake") + include("${PROJECT_SOURCE_DIR}/cmake/lapack.cmake") if (NOT NO_LAPACKE) - include("${CMAKE_SOURCE_DIR}/cmake/lapacke.cmake") + include("${PROJECT_SOURCE_DIR}/cmake/lapacke.cmake") endif () endif () @@ -137,7 +137,7 @@ endif() # add objects to the openblas lib add_library(${OpenBLAS_LIBNAME} SHARED ${LA_SOURCES} ${LAPACKE_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE}) -include("${CMAKE_SOURCE_DIR}/cmake/export.cmake") +include("${PROJECT_SOURCE_DIR}/cmake/export.cmake") # Set output for libopenblas set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) diff --git a/cmake/export.cmake b/cmake/export.cmake index adf59101f..629f8fbc2 100644 --- a/cmake/export.cmake +++ b/cmake/export.cmake @@ -53,7 +53,7 @@ endif() add_custom_command( TARGET ${OpenBLAS_LIBNAME} PRE_LINK COMMAND perl - ARGS "${CMAKE_SOURCE_DIR}/exports/gensymbol" "win2k" "${ARCH_IN}" "dummy" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" "${SYMBOLPREFIX}" "${SYMBOLSUFFIX}" > "${PROJECT_BINARY_DIR}/openblas.def" + ARGS "${PROJECT_SOURCE_DIR}/exports/gensymbol" "win2k" "${ARCH_IN}" "dummy" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" "${SYMBOLPREFIX}" "${SYMBOLSUFFIX}" > "${PROJECT_BINARY_DIR}/openblas.def" COMMENT "Create openblas.def file" VERBATIM) diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index c3fa48655..471ce90e4 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -50,20 +50,20 @@ else() set(TARGET_CONF "config.h") endif () -include("${CMAKE_SOURCE_DIR}/cmake/c_check.cmake") +include("${PROJECT_SOURCE_DIR}/cmake/c_check.cmake") if (NOT NOFORTRAN) - include("${CMAKE_SOURCE_DIR}/cmake/f_check.cmake") + include("${PROJECT_SOURCE_DIR}/cmake/f_check.cmake") endif () # compile getarch 
set(GETARCH_SRC - ${CMAKE_SOURCE_DIR}/getarch.c + ${PROJECT_SOURCE_DIR}/getarch.c ${CPUIDEMO} ) if (NOT MSVC) - list(APPEND GETARCH_SRC ${CMAKE_SOURCE_DIR}/cpuid.S) + list(APPEND GETARCH_SRC ${PROJECT_SOURCE_DIR}/cpuid.S) endif () if (MSVC) @@ -76,7 +76,7 @@ set(GETARCH_BIN "getarch${CMAKE_EXECUTABLE_SUFFIX}") file(MAKE_DIRECTORY ${GETARCH_DIR}) try_compile(GETARCH_RESULT ${GETARCH_DIR} SOURCES ${GETARCH_SRC} - COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${CMAKE_SOURCE_DIR} + COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${PROJECT_SOURCE_DIR} OUTPUT_VARIABLE GETARCH_LOG COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH_BIN} ) @@ -97,8 +97,8 @@ set(GETARCH2_DIR "${PROJECT_BINARY_DIR}/getarch2_build") set(GETARCH2_BIN "getarch_2nd${CMAKE_EXECUTABLE_SUFFIX}") file(MAKE_DIRECTORY ${GETARCH2_DIR}) try_compile(GETARCH2_RESULT ${GETARCH2_DIR} - SOURCES ${CMAKE_SOURCE_DIR}/getarch_2nd.c - COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${CMAKE_SOURCE_DIR} + SOURCES ${PROJECT_SOURCE_DIR}/getarch_2nd.c + COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${PROJECT_SOURCE_DIR} OUTPUT_VARIABLE GETARCH2_LOG COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH2_BIN} ) diff --git a/cmake/system.cmake b/cmake/system.cmake index 134e9c12d..aa046a56a 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -3,7 +3,7 @@ ## Description: Ported from OpenBLAS/Makefile.system ## -set(NETLIB_LAPACK_DIR "${CMAKE_SOURCE_DIR}/lapack-netlib") +set(NETLIB_LAPACK_DIR "${PROJECT_SOURCE_DIR}/lapack-netlib") # TODO: Makefile.system detects Darwin (mac) and switches to clang here -hpa # http://stackoverflow.com/questions/714100/os-detecting-makefile @@ -78,7 +78,7 @@ else () set(ONLY_CBLAS 0) endif () -include("${CMAKE_SOURCE_DIR}/cmake/prebuild.cmake") +include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake") if (NOT DEFINED NUM_THREADS) set(NUM_THREADS ${NUM_CORES}) @@ -124,17 +124,17 @@ set(OBJCOPY "${CROSS_SUFFIX}objcopy") set(OBJCONV "${CROSS_SUFFIX}objconv") # OS 
dependent settings -include("${CMAKE_SOURCE_DIR}/cmake/os.cmake") +include("${PROJECT_SOURCE_DIR}/cmake/os.cmake") # Architecture dependent settings -include("${CMAKE_SOURCE_DIR}/cmake/arch.cmake") +include("${PROJECT_SOURCE_DIR}/cmake/arch.cmake") # C Compiler dependent settings -include("${CMAKE_SOURCE_DIR}/cmake/cc.cmake") +include("${PROJECT_SOURCE_DIR}/cmake/cc.cmake") if (NOT NOFORTRAN) # Fortran Compiler dependent settings - include("${CMAKE_SOURCE_DIR}/cmake/fc.cmake") + include("${PROJECT_SOURCE_DIR}/cmake/fc.cmake") endif () if (BINARY64) @@ -247,10 +247,10 @@ if (NOT DEFINED SYMBOLSUFFIX) set(SYMBOLSUFFIX "") endif () -set(KERNELDIR "${CMAKE_SOURCE_DIR}/kernel/${ARCH}") +set(KERNELDIR "${PROJECT_SOURCE_DIR}/kernel/${ARCH}") # TODO: nead to convert these Makefiles -# include ${CMAKE_SOURCE_DIR}/cmake/${ARCH}.cmake +# include ${PROJECT_SOURCE_DIR}/cmake/${ARCH}.cmake if (${CORE} STREQUAL "PPC440") set(CCOMMON_OPT "${CCOMMON_OPT} -DALLOC_QALLOC") @@ -410,8 +410,8 @@ set(LIBDEFNAME "${LIBNAME}.${LIBSUFFIX}.def") set(LIBEXPNAME "${LIBNAME}.${LIBSUFFIX}.exp") set(LIBZIPNAME "${LIBNAME}.${LIBSUFFIX}.zip") -set(LIBS "${CMAKE_SOURCE_DIR}/${LIBNAME}") -set(LIBS_P "${CMAKE_SOURCE_DIR}/${LIBNAME_P}") +set(LIBS "${PROJECT_SOURCE_DIR}/${LIBNAME}") +set(LIBS_P "${PROJECT_SOURCE_DIR}/${LIBNAME_P}") set(LIB_COMPONENTS BLAS) diff --git a/ctest/CMakeLists.txt b/ctest/CMakeLists.txt index dbe785bcb..addcffeac 100644 --- a/ctest/CMakeLists.txt +++ b/ctest/CMakeLists.txt @@ -1,4 +1,4 @@ -include_directories(${CMAKE_SOURCE_DIR}) +include_directories(${PROJECT_SOURCE_DIR}) enable_language(Fortran) diff --git a/driver/level2/CMakeLists.txt b/driver/level2/CMakeLists.txt index 696767486..f444469bd 100644 --- a/driver/level2/CMakeLists.txt +++ b/driver/level2/CMakeLists.txt @@ -1,5 +1,5 @@ -include_directories(${CMAKE_SOURCE_DIR}) +include_directories(${PROJECT_SOURCE_DIR}) # sources that need to be compiled twice, once with no flags and once with LOWER set(UL_SOURCES diff --git 
a/driver/level3/CMakeLists.txt b/driver/level3/CMakeLists.txt index 3d3303af2..36677a942 100644 --- a/driver/level3/CMakeLists.txt +++ b/driver/level3/CMakeLists.txt @@ -1,4 +1,4 @@ -include_directories(${CMAKE_SOURCE_DIR}) +include_directories(${PROJECT_SOURCE_DIR}) # N.B. In the original makefile there was a BLOCKS define used in the compilation of these files but I don't see any evidence of it being set anywhere. -hpa diff --git a/driver/others/CMakeLists.txt b/driver/others/CMakeLists.txt index b361f2a97..489d40c76 100644 --- a/driver/others/CMakeLists.txt +++ b/driver/others/CMakeLists.txt @@ -1,4 +1,4 @@ -include_directories(${CMAKE_SOURCE_DIR}) +include_directories(${PROJECT_SOURCE_DIR}) if (${CORE} STREQUAL "PPC440") set(MEMORY memory_qalloc.c) diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index 9ff924e5f..1722dc661 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -1,5 +1,5 @@ -include_directories(${CMAKE_SOURCE_DIR}) +include_directories(${PROJECT_SOURCE_DIR}) set(BLAS1_SOURCES diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index fc4c4028b..17c2b1b89 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -1,6 +1,6 @@ -include_directories(${CMAKE_SOURCE_DIR}) -include("${CMAKE_SOURCE_DIR}/cmake/kernel.cmake") +include_directories(${PROJECT_SOURCE_DIR}) +include("${PROJECT_SOURCE_DIR}/cmake/kernel.cmake") # Makefile diff --git a/lapack-netlib/CMAKE/CheckFortranTypeSizes.cmake b/lapack-netlib/CMAKE/CheckFortranTypeSizes.cmake index 9cc12ce17..1f410e310 100644 --- a/lapack-netlib/CMAKE/CheckFortranTypeSizes.cmake +++ b/lapack-netlib/CMAKE/CheckFortranTypeSizes.cmake @@ -18,7 +18,7 @@ macro( _CHECK_FORTRAN_TYPE_SIZE _TYPE_NAME _TEST_SIZES ) foreach( __TEST_SIZE ${_TEST_SIZES} ) - set( __TEST_FILE ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/testFortran${_TYPE_NAME}Size${__TEST_SIZE}.f90 ) + set( __TEST_FILE 
${PROJECT_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/testFortran${_TYPE_NAME}Size${__TEST_SIZE}.f90 ) file( WRITE ${__TEST_FILE} " PROGRAM check_size @@ -27,7 +27,7 @@ macro( _CHECK_FORTRAN_TYPE_SIZE _TYPE_NAME _TEST_SIZES ) pa => a END PROGRAM ") - try_compile( SIZEOF_${_TYPE_NAME} ${CMAKE_BINARY_DIR} ${__TEST_FILE} ) + try_compile( SIZEOF_${_TYPE_NAME} ${PROJECT_BINARY_DIR} ${__TEST_FILE} ) if( SIZEOF_${_TYPE_NAME} ) message( STATUS "Testing default ${_TYPE_NAME}*${__TEST_SIZE} - found" ) set( SIZEOF_${_TYPE_NAME} ${__TEST_SIZE} CACHE INTERNAL "Size of the default ${_TYPE_NAME} type" FORCE ) diff --git a/lapack-netlib/CMAKE/CheckTimeFunction.cmake b/lapack-netlib/CMAKE/CheckTimeFunction.cmake index 350a59132..1a65f242b 100644 --- a/lapack-netlib/CMAKE/CheckTimeFunction.cmake +++ b/lapack-netlib/CMAKE/CheckTimeFunction.cmake @@ -16,11 +16,11 @@ macro(CHECK_TIME_FUNCTION FUNCTION VARIABLE) if(RES) set(${VARIABLE} ${FUNCTION} CACHE INTERNAL "Have Fortran function ${FUNCTION}") message(STATUS "Looking for Fortran ${FUNCTION} - found") - file(APPEND ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeOutput.log + file(APPEND ${PROJECT_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeOutput.log "Fortran ${FUNCTION} exists. ${OUTPUT} \n\n") else(RES) message(STATUS "Looking for Fortran ${FUNCTION} - not found") - file(APPEND ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeError.log + file(APPEND ${PROJECT_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeError.log "Fortran ${FUNCTION} does not exist. 
\n ${OUTPUT} \n") endif(RES) endmacro(CHECK_TIME_FUNCTION) diff --git a/lapack-netlib/CMAKE/FortranMangling.cmake b/lapack-netlib/CMAKE/FortranMangling.cmake index 98b8443ef..538c80218 100644 --- a/lapack-netlib/CMAKE/FortranMangling.cmake +++ b/lapack-netlib/CMAKE/FortranMangling.cmake @@ -43,7 +43,7 @@ MESSAGE(STATUS "Testing FORTRAN_MANGLING") MESSAGE(STATUS "Compiling Finface.f...") execute_process ( COMMAND ${CMAKE_Fortran_COMPILER} ${F77_OPTION_COMPILE} ${PROJECT_SOURCE_DIR}/lapacke/mangling/Fintface.f - WORKING_DIRECTORY ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp + WORKING_DIRECTORY ${PROJECT_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp OUTPUT_VARIABLE OUTPUT RESULT_VARIABLE RESULT ERROR_VARIABLE ERROR) @@ -58,7 +58,7 @@ MESSAGE(STATUS "Compiling Finface.f...") MESSAGE(STATUS "Compiling Cintface.c...") execute_process ( COMMAND ${CMAKE_C_COMPILER} ${F77_OPTION_COMPILE} ${PROJECT_SOURCE_DIR}/lapacke/mangling/Cintface.c - WORKING_DIRECTORY ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp + WORKING_DIRECTORY ${PROJECT_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp OUTPUT_VARIABLE OUTPUT RESULT_VARIABLE RESULT ERROR_VARIABLE ERROR) @@ -73,7 +73,7 @@ MESSAGE(STATUS "Compiling Cintface.c...") MESSAGE(STATUS "Linking Finface.f and Cintface.c...") execute_process ( COMMAND ${CMAKE_Fortran_COMPILER} ${F77_OUTPUT_OBJ} xintface.exe Fintface.o Cintface.o - WORKING_DIRECTORY ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp + WORKING_DIRECTORY ${PROJECT_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp OUTPUT_VARIABLE OUTPUT RESULT_VARIABLE RESULT ERROR_VARIABLE ERROR) @@ -88,7 +88,7 @@ MESSAGE(STATUS "Linking Finface.f and Cintface.c...") MESSAGE(STATUS "Running ./xintface...") execute_process ( COMMAND ./xintface.exe - WORKING_DIRECTORY ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp + WORKING_DIRECTORY ${PROJECT_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp RESULT_VARIABLE xintface_RES OUTPUT_VARIABLE xintface_OUT ERROR_VARIABLE xintface_ERR) diff --git 
a/lapack/CMakeLists.txt b/lapack/CMakeLists.txt index de42e1ab6..afd583c11 100644 --- a/lapack/CMakeLists.txt +++ b/lapack/CMakeLists.txt @@ -1,5 +1,5 @@ -include_directories(${CMAKE_SOURCE_DIR}) +include_directories(${PROJECT_SOURCE_DIR}) set(LAPACK_SOURCES diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index cd4497117..5e9baf928 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -1,4 +1,4 @@ -include_directories(${CMAKE_SOURCE_DIR}) +include_directories(${PROJECT_SOURCE_DIR}) enable_language(Fortran) diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt index dfa42df67..f0ffee088 100644 --- a/utest/CMakeLists.txt +++ b/utest/CMakeLists.txt @@ -1,4 +1,4 @@ -include_directories(${CMAKE_SOURCE_DIR}) +include_directories(${PROJECT_SOURCE_DIR}) set(OpenBLAS_utest_src utest_main.c From c8a7860eb3ea70e4684d6ab82c2c2a432b33187d Mon Sep 17 00:00:00 2001 From: Kaustubh Raste Date: Mon, 30 May 2016 21:17:00 +0530 Subject: [PATCH 45/70] STRSM optimized Signed-off-by: Kaustubh Raste --- kernel/Makefile.L3 | 4 - kernel/mips/macros_msa.h | 6 +- kernel/mips/strsm_kernel_LN_8x8_msa.c | 997 +++++++-------------- kernel/mips/strsm_kernel_LT_8x8_msa.c | 985 ++++++--------------- kernel/mips/strsm_kernel_RN_8x8_msa.c | 1152 ++++++++---------------- kernel/mips/strsm_kernel_RT_8x8_msa.c | 1182 +++++++++---------------- 6 files changed, 1362 insertions(+), 2964 deletions(-) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 8e6827424..e55f153f5 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -12,10 +12,6 @@ ifeq ($(ARCH), ia64) USE_GEMM3M = 1 endif -ifeq ($(ARCH), MIPS) -USE_GEMM3M = 1 -endif - ifeq ($(ARCH), arm) USE_TRMM = 1 endif diff --git a/kernel/mips/macros_msa.h b/kernel/mips/macros_msa.h index ae85220c6..0efca7860 100644 --- a/kernel/mips/macros_msa.h +++ b/kernel/mips/macros_msa.h @@ -42,6 +42,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define ST_D(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) #define ST_DP(...) ST_D(v2f64, __VA_ARGS__) +#define COPY_FLOAT_TO_VECTOR(a, b) \ + b = __msa_cast_to_vector_float(a); \ + b = (v4f32) __msa_splati_w((v4i32) b, 0); + + /* Description : Load 2 vectors of single precision floating point elements with stride Arguments : Inputs - psrc, stride Outputs - out0, out1 @@ -178,7 +183,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. out2 = (RTYPE) __msa_ilvr_d((v2i64) s3_m, (v2i64) s1_m); \ out3 = (RTYPE) __msa_ilvl_d((v2i64) s3_m, (v2i64) s1_m); \ } - #define TRANSPOSE4x4_SP_SP(...) TRANSPOSE4x4_W(v4f32, __VA_ARGS__) #endif /* __MACROS_MSA_H__ */ diff --git a/kernel/mips/strsm_kernel_LN_8x8_msa.c b/kernel/mips/strsm_kernel_LN_8x8_msa.c index 3db7da3c4..516b9752f 100644 --- a/kernel/mips/strsm_kernel_LN_8x8_msa.c +++ b/kernel/mips/strsm_kernel_LN_8x8_msa.c @@ -30,6 +30,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static void ssolve_8x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; + FLOAT *aa = a, *bb = b; + v4f32 src_b, src_b0, src_b1, src_b2, src_b3, src_a1; v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; v4f32 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15; v4f32 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7; @@ -47,107 +50,43 @@ static void ssolve_8x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO FLOAT *c_nxt6line = c + 6 * ldc; FLOAT *c_nxt7line = c + 7 * ldc; - if (bk) - { - BLASLONG k; - FLOAT *aa = a, *bb = b; - v4f32 src_b, src_b0, src_b1, src_b2, src_b3, src_a1; - v4f32 res0, res1, res2, res3, res4, res5, res6, res7; - v4f32 res8, res9, res10, res11, res12, res13, res14, res15; + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + LD_SP2(c_nxt2line, 4, src_c4, src_c5); + LD_SP2(c_nxt3line, 4, src_c6, src_c7); + LD_SP2(c_nxt4line, 4, src_c8, src_c9); + LD_SP2(c_nxt5line, 4, 
src_c10, src_c11); + LD_SP2(c_nxt6line, 4, src_c12, src_c13); + LD_SP2(c_nxt7line, 4, src_c14, src_c15); + for (k = 0; k < bk; k++) + { LD_SP2(aa, 4, src_a0, src_a1); src_b = LD_SP(bb + 0); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 = src_a0 * src_b0; - res1 = src_a1 * src_b0; - res2 = src_a0 * src_b1; - res3 = src_a1 * src_b1; - res4 = src_a0 * src_b2; - res5 = src_a1 * src_b2; - res6 = src_a0 * src_b3; - res7 = src_a1 * src_b3; + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + src_c2 -= src_a0 * src_b1; + src_c3 -= src_a1 * src_b1; + src_c4 -= src_a0 * src_b2; + src_c5 -= src_a1 * src_b2; + src_c6 -= src_a0 * src_b3; + src_c7 -= src_a1 * src_b3; src_b = LD_SP(bb + 4); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res8 = src_a0 * src_b0; - res9 = src_a1 * src_b0; - res10 = src_a0 * src_b1; - res11 = src_a1 * src_b1; - res12 = src_a0 * src_b2; - res13 = src_a1 * src_b2; - res14 = src_a0 * src_b3; - res15 = src_a1 * src_b3; + src_c8 -= src_a0 * src_b0; + src_c9 -= src_a1 * src_b0; + src_c10 -= src_a0 * src_b1; + src_c11 -= src_a1 * src_b1; + src_c12 -= src_a0 * src_b2; + src_c13 -= src_a1 * src_b2; + src_c14 -= src_a0 * src_b3; + src_c15 -= src_a1 * src_b3; aa += 8; bb += 8; - - for (k = (bk - 1); k--;) - { - LD_SP2(aa, 4, src_a0, src_a1); - - src_b = LD_SP(bb + 0); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 += src_a0 * src_b0; - res1 += src_a1 * src_b0; - res2 += src_a0 * src_b1; - res3 += src_a1 * src_b1; - res4 += src_a0 * src_b2; - res5 += src_a1 * src_b2; - res6 += src_a0 * src_b3; - res7 += src_a1 * src_b3; - - src_b = LD_SP(bb + 4); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res8 += src_a0 * src_b0; - res9 += src_a1 * src_b0; - res10 += src_a0 * src_b1; - res11 += src_a1 * src_b1; - res12 += src_a0 * src_b2; - res13 += src_a1 * src_b2; - res14 += src_a0 * src_b3; - res15 += src_a1 * src_b3; - - aa += 8; - bb += 8; - } - - LD_SP2(c, 4, src_c0, src_c1); - LD_SP2(c_nxt1line, 4, src_c2, src_c3); 
- LD_SP2(c_nxt2line, 4, src_c4, src_c5); - LD_SP2(c_nxt3line, 4, src_c6, src_c7); - LD_SP2(c_nxt4line, 4, src_c8, src_c9); - LD_SP2(c_nxt5line, 4, src_c10, src_c11); - LD_SP2(c_nxt6line, 4, src_c12, src_c13); - LD_SP2(c_nxt7line, 4, src_c14, src_c15); - - src_c0 -= res0; - src_c1 -= res1; - src_c2 -= res2; - src_c3 -= res3; - src_c4 -= res4; - src_c5 -= res5; - src_c6 -= res6; - src_c7 -= res7; - src_c8 -= res8; - src_c9 -= res9; - src_c10 -= res10; - src_c11 -= res11; - src_c12 -= res12; - src_c13 -= res13; - src_c14 -= res14; - src_c15 -= res15; - } - else - { - LD_SP2(c, 4, src_c0, src_c1); - LD_SP2(c_nxt1line, 4, src_c2, src_c3); - LD_SP2(c_nxt2line, 4, src_c4, src_c5); - LD_SP2(c_nxt3line, 4, src_c6, src_c7); - LD_SP2(c_nxt4line, 4, src_c8, src_c9); - LD_SP2(c_nxt5line, 4, src_c10, src_c11); - LD_SP2(c_nxt6line, 4, src_c12, src_c13); - LD_SP2(c_nxt7line, 4, src_c14, src_c15); } a -= 64; @@ -169,25 +108,18 @@ static void ssolve_8x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c7 *= src_a63; res_c15 *= src_a63; - res_c6 -= res_c7 * src_a62; res_c14 -= res_c15 * src_a62; - res_c5 -= res_c7 * src_a61; res_c13 -= res_c15 * src_a61; - res_c4 -= res_c7 * src_a60; res_c12 -= res_c15 * src_a60; - res_c3 -= res_c7 * src_a59; res_c11 -= res_c15 * src_a59; - res_c2 -= res_c7 * src_a58; res_c10 -= res_c15 * src_a58; - res_c1 -= res_c7 * src_a57; res_c9 -= res_c15 * src_a57; - res_c0 -= res_c7 * src_a56; res_c8 -= res_c15 * src_a56; @@ -200,22 +132,16 @@ static void ssolve_8x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c6 *= src_a54; res_c14 *= src_a54; - res_c5 -= res_c6 * src_a53; res_c13 -= res_c14 * src_a53; - res_c4 -= res_c6 * src_a52; res_c12 -= res_c14 * src_a52; - res_c3 -= res_c6 * src_a51; res_c11 -= res_c14 * src_a51; - res_c2 -= res_c6 * src_a50; res_c10 -= res_c14 * src_a50; - res_c1 -= res_c6 * src_a49; res_c9 -= res_c14 * src_a49; - res_c0 -= res_c6 * src_a48; res_c8 -= res_c14 * src_a48; @@ -227,39 +153,29 @@ static void 
ssolve_8x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c5 *= src_a45; res_c13 *= src_a45; - res_c4 -= res_c5 * src_a44; res_c12 -= res_c13 * src_a44; - res_c3 -= res_c5 * src_a43; res_c11 -= res_c13 * src_a43; - res_c2 -= res_c5 * src_a42; res_c10 -= res_c13 * src_a42; - res_c1 -= res_c5 * src_a41; res_c9 -= res_c13 * src_a41; - res_c0 -= res_c5 * src_a40; res_c8 -= res_c13 * src_a40; src_a = LD_SP(a + 32); SPLATI_W4_SP(src_a, src_a32, src_a33, src_a34, src_a35); - src_a36 = __msa_cast_to_vector_float(*(a + 36)); - src_a36 = (v4f32) __msa_splati_w((v4i32) src_a36, 0); + COPY_FLOAT_TO_VECTOR(*(a + 36), src_a36); res_c4 *= src_a36; res_c12 *= src_a36; - res_c3 -= res_c4 * src_a35; res_c11 -= res_c12 * src_a35; - res_c2 -= res_c4 * src_a34; res_c10 -= res_c12 * src_a34; - res_c1 -= res_c4 * src_a33; res_c9 -= res_c12 * src_a33; - res_c0 -= res_c4 * src_a32; res_c8 -= res_c12 * src_a32; @@ -285,13 +201,10 @@ static void ssolve_8x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c3 *= src_a27; res_c11 *= src_a27; - res_c2 -= res_c3 * src_a26; res_c10 -= res_c11 * src_a26; - res_c1 -= res_c3 * src_a25; res_c9 -= res_c11 * src_a25; - res_c0 -= res_c3 * src_a24; res_c8 -= res_c11 * src_a24; @@ -302,23 +215,17 @@ static void ssolve_8x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c2 *= src_a18; res_c10 *= src_a18; - res_c1 -= res_c2 * src_a17; res_c9 -= res_c10 * src_a17; - res_c0 -= res_c2 * src_a16; res_c8 -= res_c10 * src_a16; - src_a9 = __msa_cast_to_vector_float(*(a + 9)); - src_a9 = (v4f32) __msa_splati_w((v4i32) src_a9, 0); - src_a8 = __msa_cast_to_vector_float(*(a + 8)); - src_a8 = (v4f32) __msa_splati_w((v4i32) src_a8, 0); - src_a0 = __msa_cast_to_vector_float(*(a + 0)); - src_a0 = (v4f32) __msa_splati_w((v4i32) src_a0, 0); + COPY_FLOAT_TO_VECTOR(*(a + 9), src_a9); + COPY_FLOAT_TO_VECTOR(*(a + 8), src_a8); + COPY_FLOAT_TO_VECTOR(*(a + 0), src_a0); res_c1 *= src_a9; res_c9 *= src_a9; - res_c0 -= res_c1 * src_a8; 
res_c8 -= res_c9 * src_a8; @@ -345,6 +252,9 @@ static void ssolve_8x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; + FLOAT *aa = a, *bb = b; + v4f32 src_b, src_b0, src_b1, src_b2, src_b3, src_a1; v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; v4f32 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7; v4f32 src_a, src_a0, src_a8, src_a9, src_a16, src_a17, src_a18, src_a24; @@ -356,65 +266,60 @@ static void ssolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO FLOAT *c_nxt2line = c + 2 * ldc; FLOAT *c_nxt3line = c + 3 * ldc; - if (bk > 0) + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + LD_SP2(c_nxt2line, 4, src_c4, src_c5); + LD_SP2(c_nxt3line, 4, src_c6, src_c7); + + for (k = 0; k < (bk >> 1); k++) { - BLASLONG k; - FLOAT *aa = a, *bb = b; - v4f32 src_b, src_b0, src_b1, src_b2, src_b3, src_a1; - v4f32 res0, res1, res2, res3, res4, res5, res6, res7; + LD_SP2(aa, 4, src_a0, src_a1); + + src_b = LD_SP(bb + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + src_c2 -= src_a0 * src_b1; + src_c3 -= src_a1 * src_b1; + src_c4 -= src_a0 * src_b2; + src_c5 -= src_a1 * src_b2; + src_c6 -= src_a0 * src_b3; + src_c7 -= src_a1 * src_b3; + + aa += 8; + bb += 4; LD_SP2(aa, 4, src_a0, src_a1); src_b = LD_SP(bb + 0); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 = src_a0 * src_b0; - res1 = src_a1 * src_b0; - res2 = src_a0 * src_b1; - res3 = src_a1 * src_b1; - res4 = src_a0 * src_b2; - res5 = src_a1 * src_b2; - res6 = src_a0 * src_b3; - res7 = src_a1 * src_b3; - - for (k = (bk - 1); k--;) - { - aa += 8; - bb += 4; - - LD_SP2(aa, 4, src_a0, src_a1); - - src_b = LD_SP(bb + 0); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 += src_a0 * src_b0; - res1 += src_a1 * src_b0; - res2 += src_a0 * src_b1; - res3 
+= src_a1 * src_b1; - res4 += src_a0 * src_b2; - res5 += src_a1 * src_b2; - res6 += src_a0 * src_b3; - res7 += src_a1 * src_b3; - } + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + src_c2 -= src_a0 * src_b1; + src_c3 -= src_a1 * src_b1; + src_c4 -= src_a0 * src_b2; + src_c5 -= src_a1 * src_b2; + src_c6 -= src_a0 * src_b3; + src_c7 -= src_a1 * src_b3; - LD_SP2(c, 4, src_c0, src_c1); - LD_SP2(c_nxt1line, 4, src_c2, src_c3); - LD_SP2(c_nxt2line, 4, src_c4, src_c5); - LD_SP2(c_nxt3line, 4, src_c6, src_c7); - - src_c0 -= res0; - src_c1 -= res1; - src_c2 -= res2; - src_c3 -= res3; - src_c4 -= res4; - src_c5 -= res5; - src_c6 -= res6; - src_c7 -= res7; + aa += 8; + bb += 4; } - else + + if (bk & 1) { - LD_SP2(c, 4, src_c0, src_c1); - LD_SP2(c_nxt1line, 4, src_c2, src_c3); - LD_SP2(c_nxt2line, 4, src_c4, src_c5); - LD_SP2(c_nxt3line, 4, src_c6, src_c7); + LD_SP2(aa, 4, src_a0, src_a1); + + src_b = LD_SP(bb + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + src_c2 -= src_a0 * src_b1; + src_c3 -= src_a1 * src_b1; + src_c4 -= src_a0 * src_b2; + src_c5 -= src_a1 * src_b2; + src_c6 -= src_a0 * src_b3; + src_c7 -= src_a1 * src_b3; } a -= 64; @@ -469,8 +374,7 @@ static void ssolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a = LD_SP(a + 32); SPLATI_W4_SP(src_a, src_a32, src_a33, src_a34, src_a35); - src_a36 = __msa_cast_to_vector_float(*(a + 36)); - src_a36 = (v4f32) __msa_splati_w((v4i32) src_a36, 0); + COPY_FLOAT_TO_VECTOR(*(a + 36), src_a36); res_c4 *= src_a36; res_c3 -= res_c4 * src_a35; @@ -495,12 +399,9 @@ static void ssolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c1 -= res_c2 * src_a17; res_c0 -= res_c2 * src_a16; - src_a9 = __msa_cast_to_vector_float(*(a + 9)); - src_a9 = (v4f32) __msa_splati_w((v4i32) src_a9, 0); - src_a8 = __msa_cast_to_vector_float(*(a + 8)); - src_a8 = (v4f32) __msa_splati_w((v4i32) src_a8, 0); - src_a0 = 
__msa_cast_to_vector_float(*(a + 0)); - src_a0 = (v4f32) __msa_splati_w((v4i32) src_a0, 0); + COPY_FLOAT_TO_VECTOR(*(a + 9), src_a9); + COPY_FLOAT_TO_VECTOR(*(a + 8), src_a8); + COPY_FLOAT_TO_VECTOR(*(a + 0), src_a0); res_c1 *= src_a9; res_c0 -= res_c1 * src_a8; @@ -523,6 +424,8 @@ static void ssolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; + FLOAT *aa = a, *bb = b; FLOAT a0, a8, a9, a16, a17, a18, a24, a25, a26, a27, a32, a33, a34, a35; FLOAT a36, a40, a41, a42, a43, a44, a45, a48, a49, a50, a51, a52, a53; FLOAT a54, a56, a57, a58, a59, a60, a61, a62, a63; @@ -546,69 +449,27 @@ static void ssolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO c6_nxt = *(c + 6 + ldc); c7_nxt = *(c + 7 + ldc); - if (bk > 0) + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT *aa = a, *bb = b; - FLOAT res[16]; - - res[0] = aa[0] * bb[0]; - res[1] = aa[1] * bb[0]; - res[2] = aa[2] * bb[0]; - res[3] = aa[3] * bb[0]; - res[4] = aa[4] * bb[0]; - res[5] = aa[5] * bb[0]; - res[6] = aa[6] * bb[0]; - res[7] = aa[7] * bb[0]; - res[8] = aa[0] * bb[1]; - res[9] = aa[1] * bb[1]; - res[10] = aa[2] * bb[1]; - res[11] = aa[3] * bb[1]; - res[12] = aa[4] * bb[1]; - res[13] = aa[5] * bb[1]; - res[14] = aa[6] * bb[1]; - res[15] = aa[7] * bb[1]; - - for (k = (bk - 1); k--;) - { - aa += 8; - bb += 2; - - res[0] += aa[0] * bb[0]; - res[1] += aa[1] * bb[0]; - res[2] += aa[2] * bb[0]; - res[3] += aa[3] * bb[0]; - res[4] += aa[4] * bb[0]; - res[5] += aa[5] * bb[0]; - res[6] += aa[6] * bb[0]; - res[7] += aa[7] * bb[0]; - res[8] += aa[0] * bb[1]; - res[9] += aa[1] * bb[1]; - res[10] += aa[2] * bb[1]; - res[11] += aa[3] * bb[1]; - res[12] += aa[4] * bb[1]; - res[13] += aa[5] * bb[1]; - res[14] += aa[6] * bb[1]; - res[15] += aa[7] * bb[1]; - } + c0 -= aa[0] * bb[0]; + c1 -= aa[1] * bb[0]; + c2 -= aa[2] * bb[0]; + c3 -= aa[3] * bb[0]; + c4 -= aa[4] * bb[0]; + c5 -= aa[5] * 
bb[0]; + c6 -= aa[6] * bb[0]; + c7 -= aa[7] * bb[0]; + c0_nxt -= aa[0] * bb[1]; + c1_nxt -= aa[1] * bb[1]; + c2_nxt -= aa[2] * bb[1]; + c3_nxt -= aa[3] * bb[1]; + c4_nxt -= aa[4] * bb[1]; + c5_nxt -= aa[5] * bb[1]; + c6_nxt -= aa[6] * bb[1]; + c7_nxt -= aa[7] * bb[1]; - c0 -= res[0]; - c1 -= res[1]; - c2 -= res[2]; - c3 -= res[3]; - c4 -= res[4]; - c5 -= res[5]; - c6 -= res[6]; - c7 -= res[7]; - - c0_nxt -= res[8]; - c1_nxt -= res[9]; - c2_nxt -= res[10]; - c3_nxt -= res[11]; - c4_nxt -= res[12]; - c5_nxt -= res[13]; - c6_nxt -= res[14]; - c7_nxt -= res[15]; + aa += 8; + bb += 2; } a -= 64; @@ -768,6 +629,8 @@ static void ssolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_8x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { + BLASLONG k; + FLOAT *aa = a, *bb = b; FLOAT a0, a8, a9, a16, a17, a18, a24, a25, a26, a27, a32, a33, a34, a35; FLOAT a36, a40, a41, a42, a43, a44, a45, a48, a49, a50, a51, a52, a53; FLOAT a54, a56, a57, a58, a59, a60, a61, a62, a63; @@ -782,44 +645,19 @@ static void ssolve_8x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) c6 = *(c + 6); c7 = *(c + 7); - if (bk > 0) + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT *aa = a, *bb = b; - FLOAT t0, t1, t2, t3, t4, t5, t6, t7; - - t0 = aa[0] * bb[0]; - t1 = aa[1] * bb[0]; - t2 = aa[2] * bb[0]; - t3 = aa[3] * bb[0]; - t4 = aa[4] * bb[0]; - t5 = aa[5] * bb[0]; - t6 = aa[6] * bb[0]; - t7 = aa[7] * bb[0]; - - for (k = (bk - 1); k--;) - { - aa += 8; - bb += 1; - - t0 += aa[0] * bb[0]; - t1 += aa[1] * bb[0]; - t2 += aa[2] * bb[0]; - t3 += aa[3] * bb[0]; - t4 += aa[4] * bb[0]; - t5 += aa[5] * bb[0]; - t6 += aa[6] * bb[0]; - t7 += aa[7] * bb[0]; - } + c0 -= aa[0] * bb[0]; + c1 -= aa[1] * bb[0]; + c2 -= aa[2] * bb[0]; + c3 -= aa[3] * bb[0]; + c4 -= aa[4] * bb[0]; + c5 -= aa[5] * bb[0]; + c6 -= aa[6] * bb[0]; + c7 -= aa[7] * bb[0]; - c0 -= t0; - c1 -= t1; - c2 -= t2; - c3 -= t3; - c4 -= t4; - c5 -= t5; - c6 -= t6; - c7 -= t7; + aa += 8; + bb += 1; } a -= 64; @@ 
-927,6 +765,9 @@ static void ssolve_8x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) static void ssolve_4x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; + FLOAT *aa = a, *bb = b; + v4f32 src_b, src_b0, src_b1, src_b2, src_b3; v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; v4f32 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7; v4f32 src_a, src_a0, src_a4, src_a5, src_a8, src_a9, src_a10, src_a12; @@ -939,79 +780,35 @@ static void ssolve_4x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO FLOAT *c_nxt6line = c + 6 * ldc; FLOAT *c_nxt7line = c + 7 * ldc; - if (bk > 0) - { - BLASLONG k; - FLOAT *aa = a, *bb = b; - v4f32 src_b, src_b0, src_b1, src_b2, src_b3; - v4f32 res0, res1, res2, res3, res4, res5, res6, res7; + src_c0 = LD_SP(c); + src_c1 = LD_SP(c_nxt1line); + src_c2 = LD_SP(c_nxt2line); + src_c3 = LD_SP(c_nxt3line); + src_c4 = LD_SP(c_nxt4line); + src_c5 = LD_SP(c_nxt5line); + src_c6 = LD_SP(c_nxt6line); + src_c7 = LD_SP(c_nxt7line); + for (k = 0; k < bk; k++) + { src_a0 = LD_SP(aa); src_b = LD_SP(bb); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 = src_a0 * src_b0; - res1 = src_a0 * src_b1; - res2 = src_a0 * src_b2; - res3 = src_a0 * src_b3; + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a0 * src_b1; + src_c2 -= src_a0 * src_b2; + src_c3 -= src_a0 * src_b3; src_b = LD_SP(bb + 4); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res4 = src_a0 * src_b0; - res5 = src_a0 * src_b1; - res6 = src_a0 * src_b2; - res7 = src_a0 * src_b3; - - for (k = (bk - 1); k--;) - { - aa += 4; - bb += 8; - - src_a0 = LD_SP(aa); - - src_b = LD_SP(bb); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 += src_a0 * src_b0; - res1 += src_a0 * src_b1; - res2 += src_a0 * src_b2; - res3 += src_a0 * src_b3; - - src_b = LD_SP(bb + 4); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res4 += src_a0 * src_b0; - res5 += src_a0 * src_b1; - res6 += src_a0 * src_b2; - res7 += 
src_a0 * src_b3; - } + src_c4 -= src_a0 * src_b0; + src_c5 -= src_a0 * src_b1; + src_c6 -= src_a0 * src_b2; + src_c7 -= src_a0 * src_b3; - src_c0 = LD_SP(c); - src_c1 = LD_SP(c_nxt1line); - src_c2 = LD_SP(c_nxt2line); - src_c3 = LD_SP(c_nxt3line); - src_c4 = LD_SP(c_nxt4line); - src_c5 = LD_SP(c_nxt5line); - src_c6 = LD_SP(c_nxt6line); - src_c7 = LD_SP(c_nxt7line); - - src_c0 -= res0; - src_c1 -= res1; - src_c2 -= res2; - src_c3 -= res3; - src_c4 -= res4; - src_c5 -= res5; - src_c6 -= res6; - src_c7 -= res7; - } - else - { - src_c0 = LD_SP(c); - src_c1 = LD_SP(c_nxt1line); - src_c2 = LD_SP(c_nxt2line); - src_c3 = LD_SP(c_nxt3line); - src_c4 = LD_SP(c_nxt4line); - src_c5 = LD_SP(c_nxt5line); - src_c6 = LD_SP(c_nxt6line); - src_c7 = LD_SP(c_nxt7line); + aa += 4; + bb += 8; } a -= 16; @@ -1028,12 +825,10 @@ static void ssolve_4x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a10 = (v4f32) __msa_splati_w((v4i32) src_a8, 2); src_a9 = (v4f32) __msa_splati_w((v4i32) src_a8, 1); src_a8 = (v4f32) __msa_splati_w((v4i32) src_a8, 0); - src_a5 = __msa_cast_to_vector_float(*(a + 5)); - src_a5 = (v4f32) __msa_splati_w((v4i32) src_a5, 0); - src_a4 = __msa_cast_to_vector_float(*(a + 4)); - src_a4 = (v4f32) __msa_splati_w((v4i32) src_a4, 0); - src_a0 = __msa_cast_to_vector_float(*(a + 0)); - src_a0 = (v4f32) __msa_splati_w((v4i32) src_a0, 0); + + COPY_FLOAT_TO_VECTOR(*(a + 5), src_a5); + COPY_FLOAT_TO_VECTOR(*(a + 4), src_a4); + COPY_FLOAT_TO_VECTOR(*(a + 0), src_a0); res_c3 *= src_a15; res_c7 *= src_a15; @@ -1079,6 +874,9 @@ static void ssolve_4x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_4x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; + FLOAT *aa = a, *bb = b; + v4f32 src_b, src_b0, src_b1, src_b2, src_b3; v4f32 src_c0, src_c1, src_c2, src_c3, res_c0, res_c1, res_c2, res_c3; v4f32 src_a, src_a0, src_a4, src_a5, src_a8, src_a9, src_a10, src_a12; v4f32 src_a13, src_a14, src_a15; @@ -1086,80 
+884,48 @@ static void ssolve_4x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO FLOAT *c_nxt2line = c + 2 * ldc; FLOAT *c_nxt3line = c + 3 * ldc; - if (bk > 0) - { - BLASLONG k; - FLOAT *aa = a, *bb = b; - v4f32 src_b, src_b0, src_b1, src_b2, src_b3; - v4f32 res0, res1, res2, res3; + src_c0 = LD_SP(c); + src_c1 = LD_SP(c_nxt1line); + src_c2 = LD_SP(c_nxt2line); + src_c3 = LD_SP(c_nxt3line); + for (k = 0; k < (bk >> 1); k++) + { src_a0 = LD_SP(aa); src_b = LD_SP(bb); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 = src_a0 * src_b0; - res1 = src_a0 * src_b1; - res2 = src_a0 * src_b2; - res3 = src_a0 * src_b3; - - for (k = ((bk - 1) >> 1); k--;) - { - aa += 4; - bb += 4; + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a0 * src_b1; + src_c2 -= src_a0 * src_b2; + src_c3 -= src_a0 * src_b3; - src_a0 = LD_SP(aa); + aa += 4; + bb += 4; - src_b = LD_SP(bb); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 += src_a0 * src_b0; - res1 += src_a0 * src_b1; - res2 += src_a0 * src_b2; - res3 += src_a0 * src_b3; - - aa += 4; - bb += 4; - - src_a0 = LD_SP(aa); - - src_b = LD_SP(bb); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 += src_a0 * src_b0; - res1 += src_a0 * src_b1; - res2 += src_a0 * src_b2; - res3 += src_a0 * src_b3; - } - - if ((bk - 1) & 1) - { - aa += 4; - bb += 4; - - src_a0 = LD_SP(aa); - - src_b = LD_SP(bb); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 += src_a0 * src_b0; - res1 += src_a0 * src_b1; - res2 += src_a0 * src_b2; - res3 += src_a0 * src_b3; - } + src_a0 = LD_SP(aa); - src_c0 = LD_SP(c); - src_c1 = LD_SP(c_nxt1line); - src_c2 = LD_SP(c_nxt2line); - src_c3 = LD_SP(c_nxt3line); + src_b = LD_SP(bb); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a0 * src_b1; + src_c2 -= src_a0 * src_b2; + src_c3 -= src_a0 * src_b3; - src_c0 -= res0; - src_c1 -= res1; - src_c2 -= res2; - src_c3 -= res3; + aa += 4; + bb += 4; } - else + + if (bk & 1) { - 
src_c0 = LD_SP(c); - src_c1 = LD_SP(c_nxt1line); - src_c2 = LD_SP(c_nxt2line); - src_c3 = LD_SP(c_nxt3line); + src_a0 = LD_SP(aa); + + src_b = LD_SP(bb); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a0 * src_b1; + src_c2 -= src_a0 * src_b2; + src_c3 -= src_a0 * src_b3; } a -= 16; @@ -1174,12 +940,9 @@ static void ssolve_4x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a10 = (v4f32) __msa_splati_w((v4i32) src_a8, 2); src_a9 = (v4f32) __msa_splati_w((v4i32) src_a8, 1); src_a8 = (v4f32) __msa_splati_w((v4i32) src_a8, 0); - src_a5 = __msa_cast_to_vector_float(*(a + 5)); - src_a5 = (v4f32) __msa_splati_w((v4i32) src_a5, 0); - src_a4 = __msa_cast_to_vector_float(*(a + 4)); - src_a4 = (v4f32) __msa_splati_w((v4i32) src_a4, 0); - src_a0 = __msa_cast_to_vector_float(*(a + 0)); - src_a0 = (v4f32) __msa_splati_w((v4i32) src_a0, 0); + COPY_FLOAT_TO_VECTOR(*(a + 5), src_a5); + COPY_FLOAT_TO_VECTOR(*(a + 4), src_a4); + COPY_FLOAT_TO_VECTOR(*(a + 0), src_a0); res_c3 *= src_a15; res_c2 -= res_c3 * src_a14; @@ -1208,6 +971,8 @@ static void ssolve_4x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_4x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; + FLOAT *aa = a, *bb = b; FLOAT a0, a4, a5, a8, a9, a10, a12, a13, a14, a15; FLOAT c0, c1, c2, c3, c0_nxt, c1_nxt, c2_nxt, c3_nxt; @@ -1220,44 +985,19 @@ static void ssolve_4x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO c2_nxt = *(c + 2 + ldc); c3_nxt = *(c + 3 + ldc); - if (bk > 0) + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT *aa = a, *bb = b; - FLOAT res[8]; - - res[0] = aa[0] * bb[0]; - res[1] = aa[1] * bb[0]; - res[2] = aa[2] * bb[0]; - res[3] = aa[3] * bb[0]; - res[4] = aa[0] * bb[1]; - res[5] = aa[1] * bb[1]; - res[6] = aa[2] * bb[1]; - res[7] = aa[3] * bb[1]; - - for (k = (bk - 1); k--;) - { - aa += 4; - bb += 2; - - res[0] += aa[0] * bb[0]; - res[1] += aa[1] * bb[0]; - res[2] += 
aa[2] * bb[0]; - res[3] += aa[3] * bb[0]; - res[4] += aa[0] * bb[1]; - res[5] += aa[1] * bb[1]; - res[6] += aa[2] * bb[1]; - res[7] += aa[3] * bb[1]; - } - - c0 -= res[0]; - c1 -= res[1]; - c2 -= res[2]; - c3 -= res[3]; - c0_nxt -= res[4]; - c1_nxt -= res[5]; - c2_nxt -= res[6]; - c3_nxt -= res[7]; + c0 -= aa[0] * bb[0]; + c1 -= aa[1] * bb[0]; + c2 -= aa[2] * bb[0]; + c3 -= aa[3] * bb[0]; + c0_nxt -= aa[0] * bb[1]; + c1_nxt -= aa[1] * bb[1]; + c2_nxt -= aa[2] * bb[1]; + c3_nxt -= aa[3] * bb[1]; + + aa += 4; + bb += 2; } a -= 16; @@ -1325,6 +1065,8 @@ static void ssolve_4x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_4x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { + BLASLONG k; + FLOAT *aa = a, *bb = b; FLOAT a0, a4, a5, a8, a9, a10, a12, a13, a14, a15, c0, c1, c2, c3; c0 = *(c + 0); @@ -1332,32 +1074,15 @@ static void ssolve_4x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) c2 = *(c + 2); c3 = *(c + 3); - if (bk > 0) + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT *aa = a, *bb = b; - FLOAT t0, t1, t2, t3; - - t0 = aa[0] * bb[0]; - t1 = aa[1] * bb[0]; - t2 = aa[2] * bb[0]; - t3 = aa[3] * bb[0]; - - for (k = (bk - 1); k--;) - { - aa += 4; - bb += 1; + c0 -= aa[0] * bb[0]; + c1 -= aa[1] * bb[0]; + c2 -= aa[2] * bb[0]; + c3 -= aa[3] * bb[0]; - t0 += aa[0] * bb[0]; - t1 += aa[1] * bb[0]; - t2 += aa[2] * bb[0]; - t3 += aa[3] * bb[0]; - } - - c0 -= t0; - c1 -= t1; - c2 -= t2; - c3 -= t3; + aa += 4; + bb += 1; } a -= 16; @@ -1401,6 +1126,8 @@ static void ssolve_4x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) static void ssolve_2x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; + FLOAT *aa = a, *bb = b; FLOAT a0, a2, a3, c0, c1, c0_nxt1, c1_nxt1, c0_nxt2, c1_nxt2, c0_nxt3; FLOAT c1_nxt3, c0_nxt4, c1_nxt4, c0_nxt5, c1_nxt5, c0_nxt6, c1_nxt6; FLOAT c0_nxt7, c1_nxt7; @@ -1422,68 +1149,27 @@ static void ssolve_2x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO c0_nxt7 = *(c + 0 + 7 
* ldc); c1_nxt7 = *(c + 1 + 7 * ldc); - if (bk > 0) + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT *aa = a, *bb = b; - FLOAT res[16]; - - res[0] = aa[0] * bb[0]; - res[1] = aa[1] * bb[0]; - res[2] = aa[0] * bb[1]; - res[3] = aa[1] * bb[1]; - res[4] = aa[0] * bb[2]; - res[5] = aa[1] * bb[2]; - res[6] = aa[0] * bb[3]; - res[7] = aa[1] * bb[3]; - res[8] = aa[0] * bb[4]; - res[9] = aa[1] * bb[4]; - res[10] = aa[0] * bb[5]; - res[11] = aa[1] * bb[5]; - res[12] = aa[0] * bb[6]; - res[13] = aa[1] * bb[6]; - res[14] = aa[0] * bb[7]; - res[15] = aa[1] * bb[7]; - - for (k = (bk - 1); k--;) - { - aa += 2; - bb += 8; - - res[0] += aa[0] * bb[0]; - res[1] += aa[1] * bb[0]; - res[2] += aa[0] * bb[1]; - res[3] += aa[1] * bb[1]; - res[4] += aa[0] * bb[2]; - res[5] += aa[1] * bb[2]; - res[6] += aa[0] * bb[3]; - res[7] += aa[1] * bb[3]; - res[8] += aa[0] * bb[4]; - res[9] += aa[1] * bb[4]; - res[10] += aa[0] * bb[5]; - res[11] += aa[1] * bb[5]; - res[12] += aa[0] * bb[6]; - res[13] += aa[1] * bb[6]; - res[14] += aa[0] * bb[7]; - res[15] += aa[1] * bb[7]; - } - - c0 -= res[0]; - c1 -= res[1]; - c0_nxt1 -= res[2]; - c1_nxt1 -= res[3]; - c0_nxt2 -= res[4]; - c1_nxt2 -= res[5]; - c0_nxt3 -= res[6]; - c1_nxt3 -= res[7]; - c0_nxt4 -= res[8]; - c1_nxt4 -= res[9]; - c0_nxt5 -= res[10]; - c1_nxt5 -= res[11]; - c0_nxt6 -= res[12]; - c1_nxt6 -= res[13]; - c0_nxt7 -= res[14]; - c1_nxt7 -= res[15]; + c0 -= aa[0] * bb[0]; + c1 -= aa[1] * bb[0]; + c0_nxt1 -= aa[0] * bb[1]; + c1_nxt1 -= aa[1] * bb[1]; + c0_nxt2 -= aa[0] * bb[2]; + c1_nxt2 -= aa[1] * bb[2]; + c0_nxt3 -= aa[0] * bb[3]; + c1_nxt3 -= aa[1] * bb[3]; + c0_nxt4 -= aa[0] * bb[4]; + c1_nxt4 -= aa[1] * bb[4]; + c0_nxt5 -= aa[0] * bb[5]; + c1_nxt5 -= aa[1] * bb[5]; + c0_nxt6 -= aa[0] * bb[6]; + c1_nxt6 -= aa[1] * bb[6]; + c0_nxt7 -= aa[0] * bb[7]; + c1_nxt7 -= aa[1] * bb[7]; + + aa += 2; + bb += 8; } a -= 4; @@ -1557,6 +1243,8 @@ static void ssolve_2x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void 
ssolve_2x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; + FLOAT *aa = a, *bb = b; FLOAT a0, a2, a3, c0, c1, c0_nxt1, c1_nxt1; FLOAT c0_nxt2, c1_nxt2, c0_nxt3, c1_nxt3; @@ -1569,44 +1257,19 @@ static void ssolve_2x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO c0_nxt3 = *(c + 0 + 3 * ldc); c1_nxt3 = *(c + 1 + 3 * ldc); - if (bk > 0) + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT *aa = a, *bb = b; - FLOAT res[8]; - - res[0] = aa[0] * bb[0]; - res[1] = aa[1] * bb[0]; - res[2] = aa[0] * bb[1]; - res[3] = aa[1] * bb[1]; - res[4] = aa[0] * bb[2]; - res[5] = aa[1] * bb[2]; - res[6] = aa[0] * bb[3]; - res[7] = aa[1] * bb[3]; - - for (k = (bk - 1); k--;) - { - aa += 2; - bb += 4; - - res[0] += aa[0] * bb[0]; - res[1] += aa[1] * bb[0]; - res[2] += aa[0] * bb[1]; - res[3] += aa[1] * bb[1]; - res[4] += aa[0] * bb[2]; - res[5] += aa[1] * bb[2]; - res[6] += aa[0] * bb[3]; - res[7] += aa[1] * bb[3]; - } - - c0 -= res[0]; - c1 -= res[1]; - c0_nxt1 -= res[2]; - c1_nxt1 -= res[3]; - c0_nxt2 -= res[4]; - c1_nxt2 -= res[5]; - c0_nxt3 -= res[6]; - c1_nxt3 -= res[7]; + c0 -= aa[0] * bb[0]; + c1 -= aa[1] * bb[0]; + c0_nxt1 -= aa[0] * bb[1]; + c1_nxt1 -= aa[1] * bb[1]; + c0_nxt2 -= aa[0] * bb[2]; + c1_nxt2 -= aa[1] * bb[2]; + c0_nxt3 -= aa[0] * bb[3]; + c1_nxt3 -= aa[1] * bb[3]; + + aa += 2; + bb += 4; } a -= 4; @@ -1652,6 +1315,8 @@ static void ssolve_2x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_2x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; + FLOAT *aa = a, *bb = b; FLOAT a0, a2, a3, c0, c1, c0_nxt, c1_nxt; c0 = *(c + 0); @@ -1659,32 +1324,15 @@ static void ssolve_2x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO c0_nxt = *(c + 0 + ldc); c1_nxt = *(c + 1 + ldc); - if (bk > 0) + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT *aa = a, *bb = b; - FLOAT res0, res1, res2, res3; - - res0 = aa[0] * bb[0]; - res1 = aa[1] * bb[0]; - res2 = aa[0] * bb[1]; - 
res3 = aa[1] * bb[1]; - - for (k = (bk - 1); k--;) - { - aa += 2; - bb += 2; - - res0 += aa[0] * bb[0]; - res1 += aa[1] * bb[0]; - res2 += aa[0] * bb[1]; - res3 += aa[1] * bb[1]; - } + c0 -= aa[0] * bb[0]; + c1 -= aa[1] * bb[0]; + c0_nxt -= aa[0] * bb[1]; + c1_nxt -= aa[1] * bb[1]; - c0 -= res0; - c1 -= res1; - c0_nxt -= res2; - c1_nxt -= res3; + aa += 2; + bb += 2; } a -= 4; @@ -1716,31 +1364,20 @@ static void ssolve_2x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_2x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { + BLASLONG k; + FLOAT *aa = a, *bb = b; FLOAT a0, a2, a3, c0, c1; c0 = *(c + 0); c1 = *(c + 1); - if (bk > 0) + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT *aa = a, *bb = b; - FLOAT res0, res1; - - res0 = aa[0] * bb[0]; - res1 = aa[1] * bb[0]; + c0 -= aa[0] * bb[0]; + c1 -= aa[1] * bb[0]; - for (k = (bk - 1); k--;) - { - aa += 2; - bb += 1; - - res0 += aa[0] * bb[0]; - res1 += aa[1] * bb[0]; - } - - c0 -= res0; - c1 -= res1; + aa += 2; + bb += 1; } a -= 4; @@ -1764,9 +1401,11 @@ static void ssolve_2x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) static void ssolve_1x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; + FLOAT *aa = a, *bb = b; FLOAT a0, c0, c1, c2, c3, c4, c5, c6, c7; - c0 = *(c + 0 * ldc); + c0 = *(c + 0); c1 = *(c + 1 * ldc); c2 = *(c + 2 * ldc); c3 = *(c + 3 * ldc); @@ -1775,44 +1414,19 @@ static void ssolve_1x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO c6 = *(c + 6 * ldc); c7 = *(c + 7 * ldc); - if (bk > 0) + for (k = 0; k < bk; k++) { - FLOAT *aa = a, *bb = b; - BLASLONG k; - FLOAT r0, r1, r2, r3, r4, r5, r6, r7; - - r0 = aa[0] * bb[0]; - r1 = aa[0] * bb[1]; - r2 = aa[0] * bb[2]; - r3 = aa[0] * bb[3]; - r4 = aa[0] * bb[4]; - r5 = aa[0] * bb[5]; - r6 = aa[0] * bb[6]; - r7 = aa[0] * bb[7]; - - for (k = (bk - 1); k--;) - { - aa += 1; - bb += 8; - - r0 += aa[0] * bb[0]; - r1 += aa[0] * bb[1]; - r2 += aa[0] * bb[2]; - r3 += aa[0] * bb[3]; - r4 += 
aa[0] * bb[4]; - r5 += aa[0] * bb[5]; - r6 += aa[0] * bb[6]; - r7 += aa[0] * bb[7]; - } - - c0 -= r0; - c1 -= r1; - c2 -= r2; - c3 -= r3; - c4 -= r4; - c5 -= r5; - c6 -= r6; - c7 -= r7; + c0 -= aa[0] * bb[0]; + c1 -= aa[0] * bb[1]; + c2 -= aa[0] * bb[2]; + c3 -= aa[0] * bb[3]; + c4 -= aa[0] * bb[4]; + c5 -= aa[0] * bb[5]; + c6 -= aa[0] * bb[6]; + c7 -= aa[0] * bb[7]; + + aa += 1; + bb += 8; } a0 = *(a - 1); @@ -1845,16 +1459,34 @@ static void ssolve_1x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO *(c + 7 * ldc) = c7; } -static void ssolve_1x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) +static void ssolve_1x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; + FLOAT *aa = a, *bb = b; FLOAT a0, c0, c1, c2, c3; + c0 = *(c + 0 * ldc); + c1 = *(c + 1 * ldc); + c2 = *(c + 2 * ldc); + c3 = *(c + 3 * ldc); + + for (k = 0; k < bk; k++) + { + c0 -= aa[0] * bb[0]; + c1 -= aa[0] * bb[1]; + c2 -= aa[0] * bb[2]; + c3 -= aa[0] * bb[3]; + + aa += 1; + bb += 4; + } + a0 = *(a - 1); - c0 = *(c + 0 * ldc) * a0; - c1 = *(c + 1 * ldc) * a0; - c2 = *(c + 2 * ldc) * a0; - c3 = *(c + 3 * ldc) * a0; + c0 *= a0; + c1 *= a0; + c2 *= a0; + c3 *= a0; *(b - 4) = c0; *(b - 3) = c1; @@ -1867,14 +1499,28 @@ static void ssolve_1x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) *(c + 3 * ldc) = c3; } -static void ssolve_1x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) +static void ssolve_1x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; + FLOAT *aa = a, *bb = b; FLOAT a0, c0, c1; + c0 = *c; + c1 = *(c + ldc); + + for (k = 0; k < bk; k++) + { + c0 -= aa[0] * bb[0]; + c1 -= aa[0] * bb[1]; + + aa += 1; + bb += 2; + } + a0 = *(a - 1); - c0 = *(c + 0 * ldc) * a0; - c1 = *(c + 1 * ldc) * a0; + c0 *= a0; + c1 *= a0; *(b - 2) = c0; *(b - 1) = c1; @@ -1883,8 +1529,15 @@ static void ssolve_1x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) *(c + 1 * ldc) = c1; } -static void ssolve_1x1_ln_msa(FLOAT *a, FLOAT 
*b, FLOAT *c) +static void ssolve_1x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { + BLASLONG k; + + for (k = 0; k < bk; k++) + { + *c -= a[k] * b[k]; + } + *c *= *(a - 1); *(b - 1) = *c; } @@ -1965,7 +1618,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, aa = a + (m - 1) * k + kk; cc = c + (m - 1); - ssolve_1x4_ln_msa(aa, b + 4 * kk, cc, ldc); + ssolve_1x4_ln_msa(aa, b + 4 * kk, cc, ldc, (k - kk)); kk -= 1; } @@ -2023,7 +1676,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, aa = a + (m - 1) * k + kk; cc = c + (m - 1); - ssolve_1x2_ln_msa(aa, b + 2 * kk, cc, ldc); + ssolve_1x2_ln_msa(aa, b + 2 * kk, cc, ldc, (k - kk)); kk -= 1; } @@ -2057,7 +1710,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, do { - ssolve_8x2_ln_msa(aa + 8 * kk, b + 2 * kk, cc, ldc, k -kk); + ssolve_8x2_ln_msa(aa + 8 * kk, b + 2 * kk, cc, ldc, (k - kk)); aa -= 8 * k; cc -= 8; @@ -2081,7 +1734,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, aa = a + (m - 1) * k + kk; cc = c + (m - 1); - ssolve_1x1_ln_msa(aa, b + kk, cc); + ssolve_1x1_ln_msa(aa, b + kk, cc, (k - kk)); kk -= 1; } diff --git a/kernel/mips/strsm_kernel_LT_8x8_msa.c b/kernel/mips/strsm_kernel_LT_8x8_msa.c index 0c61d3618..fbce812e6 100644 --- a/kernel/mips/strsm_kernel_LT_8x8_msa.c +++ b/kernel/mips/strsm_kernel_LT_8x8_msa.c @@ -30,6 +30,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static void ssolve_8x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; + v4f32 src_b, src_b0, src_b1, src_b2, src_b3; v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; v4f32 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15; v4f32 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7; @@ -47,106 +49,43 @@ static void ssolve_8x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO FLOAT *c_nxt6line = c + 6 * ldc; FLOAT *c_nxt7line = c + 7 * ldc; - if (bk) - { - BLASLONG k; - v4f32 src_b, src_b0, src_b1, src_b2, src_b3; - v4f32 res0, res1, res2, res3, res4, res5, res6, res7; - v4f32 res8, res9, res10, res11, res12, res13, res14, res15; + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + LD_SP2(c_nxt2line, 4, src_c4, src_c5); + LD_SP2(c_nxt3line, 4, src_c6, src_c7); + LD_SP2(c_nxt4line, 4, src_c8, src_c9); + LD_SP2(c_nxt5line, 4, src_c10, src_c11); + LD_SP2(c_nxt6line, 4, src_c12, src_c13); + LD_SP2(c_nxt7line, 4, src_c14, src_c15); + for (k = 0; k < bk; k++) + { LD_SP2(a, 4, src_a0, src_a1); src_b = LD_SP(b + 0); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 = src_a0 * src_b0; - res1 = src_a1 * src_b0; - res2 = src_a0 * src_b1; - res3 = src_a1 * src_b1; - res4 = src_a0 * src_b2; - res5 = src_a1 * src_b2; - res6 = src_a0 * src_b3; - res7 = src_a1 * src_b3; + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + src_c2 -= src_a0 * src_b1; + src_c3 -= src_a1 * src_b1; + src_c4 -= src_a0 * src_b2; + src_c5 -= src_a1 * src_b2; + src_c6 -= src_a0 * src_b3; + src_c7 -= src_a1 * src_b3; src_b = LD_SP(b + 4); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res8 = src_a0 * src_b0; - res9 = src_a1 * src_b0; - res10 = src_a0 * src_b1; - res11 = src_a1 * src_b1; - res12 = src_a0 * src_b2; - res13 = src_a1 * src_b2; - res14 = src_a0 * src_b3; - res15 = src_a1 * src_b3; + src_c8 -= src_a0 * src_b0; + src_c9 -= src_a1 * src_b0; + src_c10 -= src_a0 * 
src_b1; + src_c11 -= src_a1 * src_b1; + src_c12 -= src_a0 * src_b2; + src_c13 -= src_a1 * src_b2; + src_c14 -= src_a0 * src_b3; + src_c15 -= src_a1 * src_b3; a += 8; b += 8; - - for (k = (bk - 1); k--;) - { - LD_SP2(a, 4, src_a0, src_a1); - - src_b = LD_SP(b + 0); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 += src_a0 * src_b0; - res1 += src_a1 * src_b0; - res2 += src_a0 * src_b1; - res3 += src_a1 * src_b1; - res4 += src_a0 * src_b2; - res5 += src_a1 * src_b2; - res6 += src_a0 * src_b3; - res7 += src_a1 * src_b3; - - src_b = LD_SP(b + 4); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res8 += src_a0 * src_b0; - res9 += src_a1 * src_b0; - res10 += src_a0 * src_b1; - res11 += src_a1 * src_b1; - res12 += src_a0 * src_b2; - res13 += src_a1 * src_b2; - res14 += src_a0 * src_b3; - res15 += src_a1 * src_b3; - - a += 8; - b += 8; - } - - LD_SP2(c, 4, src_c0, src_c1); - LD_SP2(c_nxt1line, 4, src_c2, src_c3); - LD_SP2(c_nxt2line, 4, src_c4, src_c5); - LD_SP2(c_nxt3line, 4, src_c6, src_c7); - LD_SP2(c_nxt4line, 4, src_c8, src_c9); - LD_SP2(c_nxt5line, 4, src_c10, src_c11); - LD_SP2(c_nxt6line, 4, src_c12, src_c13); - LD_SP2(c_nxt7line, 4, src_c14, src_c15); - - src_c0 -= res0; - src_c1 -= res1; - src_c2 -= res2; - src_c3 -= res3; - src_c4 -= res4; - src_c5 -= res5; - src_c6 -= res6; - src_c7 -= res7; - src_c8 -= res8; - src_c9 -= res9; - src_c10 -= res10; - src_c11 -= res11; - src_c12 -= res12; - src_c13 -= res13; - src_c14 -= res14; - src_c15 -= res15; - } - else - { - LD_SP2(c, 4, src_c0, src_c1); - LD_SP2(c_nxt1line, 4, src_c2, src_c3); - LD_SP2(c_nxt2line, 4, src_c4, src_c5); - LD_SP2(c_nxt3line, 4, src_c6, src_c7); - LD_SP2(c_nxt4line, 4, src_c8, src_c9); - LD_SP2(c_nxt5line, 4, src_c10, src_c11); - LD_SP2(c_nxt6line, 4, src_c12, src_c13); - LD_SP2(c_nxt7line, 4, src_c14, src_c15); } TRANSPOSE4x4_SP_SP(src_c0, src_c2, src_c4, src_c6, @@ -223,8 +162,7 @@ static void ssolve_8x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a = 
LD_SP(a + 27); SPLATI_W4_SP(src_a, src_a27, src_a28, src_a29, src_a30); - src_a31 = __msa_cast_to_vector_float(*(a + 31)); - src_a31 = (v4f32) __msa_splati_w((v4i32) src_a31, 0); + COPY_FLOAT_TO_VECTOR(*(a + 31), src_a31); res_c3 *= src_a27; res_c11 *= src_a27; @@ -278,12 +216,9 @@ static void ssolve_8x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c7 -= res_c5 * src_a47; res_c15 -= res_c13 * src_a47; - src_a54 = __msa_cast_to_vector_float(*(a + 54)); - src_a54 = (v4f32) __msa_splati_w((v4i32) src_a54, 0); - src_a55 = __msa_cast_to_vector_float(*(a + 55)); - src_a55 = (v4f32) __msa_splati_w((v4i32) src_a55, 0); - src_a63 = __msa_cast_to_vector_float(*(a + 63)); - src_a63 = (v4f32) __msa_splati_w((v4i32) src_a63, 0); + COPY_FLOAT_TO_VECTOR(*(a + 54), src_a54); + COPY_FLOAT_TO_VECTOR(*(a + 55), src_a55); + COPY_FLOAT_TO_VECTOR(*(a + 63), src_a63); res_c6 *= src_a54; res_c14 *= src_a54; @@ -313,6 +248,8 @@ static void ssolve_8x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; + v4f32 src_b, src_b0, src_b1, src_b2, src_b3; v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; v4f32 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7; v4f32 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7; @@ -324,67 +261,28 @@ static void ssolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO FLOAT *c_nxt2line = c + 2 * ldc; FLOAT *c_nxt3line = c + 3 * ldc; - if (bk) - { - BLASLONG k; - v4f32 src_b, src_b0, src_b1, src_b2, src_b3; - v4f32 res0, res1, res2, res3, res4, res5, res6, res7; + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + LD_SP2(c_nxt2line, 4, src_c4, src_c5); + LD_SP2(c_nxt3line, 4, src_c6, src_c7); + for (k = 0; k < bk; k++) + { LD_SP2(a, 4, src_a0, src_a1); src_b = LD_SP(b + 0); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 = src_a0 * src_b0; - res1 
= src_a1 * src_b0; - res2 = src_a0 * src_b1; - res3 = src_a1 * src_b1; - res4 = src_a0 * src_b2; - res5 = src_a1 * src_b2; - res6 = src_a0 * src_b3; - res7 = src_a1 * src_b3; + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + src_c2 -= src_a0 * src_b1; + src_c3 -= src_a1 * src_b1; + src_c4 -= src_a0 * src_b2; + src_c5 -= src_a1 * src_b2; + src_c6 -= src_a0 * src_b3; + src_c7 -= src_a1 * src_b3; a += 8; b += 4; - - for (k = (bk - 1); k--;) - { - LD_SP2(a, 4, src_a0, src_a1); - - src_b = LD_SP(b + 0); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 += src_a0 * src_b0; - res1 += src_a1 * src_b0; - res2 += src_a0 * src_b1; - res3 += src_a1 * src_b1; - res4 += src_a0 * src_b2; - res5 += src_a1 * src_b2; - res6 += src_a0 * src_b3; - res7 += src_a1 * src_b3; - - a += 8; - b += 4; - } - - LD_SP2(c, 4, src_c0, src_c1); - LD_SP2(c_nxt1line, 4, src_c2, src_c3); - LD_SP2(c_nxt2line, 4, src_c4, src_c5); - LD_SP2(c_nxt3line, 4, src_c6, src_c7); - - src_c0 -= res0; - src_c1 -= res1; - src_c2 -= res2; - src_c3 -= res3; - src_c4 -= res4; - src_c5 -= res5; - src_c6 -= res6; - src_c7 -= res7; - } - else - { - LD_SP2(c, 4, src_c0, src_c1); - LD_SP2(c_nxt1line, 4, src_c2, src_c3); - LD_SP2(c_nxt2line, 4, src_c4, src_c5); - LD_SP2(c_nxt3line, 4, src_c6, src_c7); } TRANSPOSE4x4_SP_SP(src_c0, src_c2, src_c4, src_c6, @@ -436,8 +334,7 @@ static void ssolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a = LD_SP(a + 27); SPLATI_W4_SP(src_a, src_a27, src_a28, src_a29, src_a30); - src_a31 = __msa_cast_to_vector_float(*(a + 31)); - src_a31 = (v4f32) __msa_splati_w((v4i32) src_a31, 0); + COPY_FLOAT_TO_VECTOR(*(a + 31), src_a31); res_c3 *= src_a27; res_c4 -= res_c3 * src_a28; @@ -462,12 +359,9 @@ static void ssolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c6 -= res_c5 * src_a46; res_c7 -= res_c5 * src_a47; - src_a54 = __msa_cast_to_vector_float(*(a + 54)); - src_a54 = (v4f32) __msa_splati_w((v4i32) src_a54, 0); - src_a55 = 
__msa_cast_to_vector_float(*(a + 55)); - src_a55 = (v4f32) __msa_splati_w((v4i32) src_a55, 0); - src_a63 = __msa_cast_to_vector_float(*(a + 63)); - src_a63 = (v4f32) __msa_splati_w((v4i32) src_a63, 0); + COPY_FLOAT_TO_VECTOR(*(a + 54), src_a54); + COPY_FLOAT_TO_VECTOR(*(a + 55), src_a55); + COPY_FLOAT_TO_VECTOR(*(a + 63), src_a63); res_c6 *= src_a54; res_c7 -= res_c6 * src_a55; @@ -490,6 +384,7 @@ static void ssolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; FLOAT a0, a1, a2, a3, a4, a5, a6, a7, a9, a10, a11, a12, a13, a14, a15, a18; FLOAT a19, a20, a21, a22, a23, a27, a28, a29, a30, a31, a36, a37, a38, a39; FLOAT a45, a46, a47, a54, a55, a63; @@ -513,67 +408,24 @@ static void ssolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO c6_nxt = *(c + 6 + ldc); c7_nxt = *(c + 7 + ldc); - if (bk > 0) + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT res[16]; - - res[0] = a[0] * b[0]; - res[1] = a[1] * b[0]; - res[2] = a[2] * b[0]; - res[3] = a[3] * b[0]; - res[4] = a[4] * b[0]; - res[5] = a[5] * b[0]; - res[6] = a[6] * b[0]; - res[7] = a[7] * b[0]; - res[8] = a[0] * b[1]; - res[9] = a[1] * b[1]; - res[10] = a[2] * b[1]; - res[11] = a[3] * b[1]; - res[12] = a[4] * b[1]; - res[13] = a[5] * b[1]; - res[14] = a[6] * b[1]; - res[15] = a[7] * b[1]; - - for (k = (bk - 1); k--;) - { - a += 8; - b += 2; - - res[0] += a[0] * b[0]; - res[1] += a[1] * b[0]; - res[2] += a[2] * b[0]; - res[3] += a[3] * b[0]; - res[4] += a[4] * b[0]; - res[5] += a[5] * b[0]; - res[6] += a[6] * b[0]; - res[7] += a[7] * b[0]; - res[8] += a[0] * b[1]; - res[9] += a[1] * b[1]; - res[10] += a[2] * b[1]; - res[11] += a[3] * b[1]; - res[12] += a[4] * b[1]; - res[13] += a[5] * b[1]; - res[14] += a[6] * b[1]; - res[15] += a[7] * b[1]; - } - - c0 -= res[0]; - c1 -= res[1]; - c2 -= res[2]; - c3 -= res[3]; - c4 -= res[4]; - c5 -= res[5]; - c6 -= res[6]; - c7 -= res[7]; - c0_nxt -= 
res[8]; - c1_nxt -= res[9]; - c2_nxt -= res[10]; - c3_nxt -= res[11]; - c4_nxt -= res[12]; - c5_nxt -= res[13]; - c6_nxt -= res[14]; - c7_nxt -= res[15]; + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; + c2 -= a[2] * b[0]; + c3 -= a[3] * b[0]; + c4 -= a[4] * b[0]; + c5 -= a[5] * b[0]; + c6 -= a[6] * b[0]; + c7 -= a[7] * b[0]; + c0_nxt -= a[0] * b[1]; + c1_nxt -= a[1] * b[1]; + c2_nxt -= a[2] * b[1]; + c3_nxt -= a[3] * b[1]; + c4_nxt -= a[4] * b[1]; + c5_nxt -= a[5] * b[1]; + c6_nxt -= a[6] * b[1]; + c7_nxt -= a[7] * b[1]; a += 8; b += 2; @@ -733,6 +585,7 @@ static void ssolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_8x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { + BLASLONG k; FLOAT a0, a1, a2, a3, a4, a5, a6, a7, a9, a10, a11, a12, a13, a14, a15, a18; FLOAT a19, a20, a21, a22, a23, a27, a28, a29, a30, a31, a36, a37, a38, a39; FLOAT a45, a46, a47, a54, a55, a63, c0, c1, c2, c3, c4, c5, c6, c7; @@ -746,43 +599,16 @@ static void ssolve_8x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) c6 = *(c + 6); c7 = *(c + 7); - if (bk > 0) + for (k = 0; k < bk; k++) { - BLASLONG i; - FLOAT a0, a1, a2, a3, a4, a5, a6, a7; - - a0 = a[0] * b[0]; - a1 = a[1] * b[0]; - a2 = a[2] * b[0]; - a3 = a[3] * b[0]; - a4 = a[4] * b[0]; - a5 = a[5] * b[0]; - a6 = a[6] * b[0]; - a7 = a[7] * b[0]; - - for (i = (bk - 1); i--; ) - { - a += 8; - b += 1; - - a0 += a[0] * b[0]; - a1 += a[1] * b[0]; - a2 += a[2] * b[0]; - a3 += a[3] * b[0]; - a4 += a[4] * b[0]; - a5 += a[5] * b[0]; - a6 += a[6] * b[0]; - a7 += a[7] * b[0]; - } - - c0 -= a0; - c1 -= a1; - c2 -= a2; - c3 -= a3; - c4 -= a4; - c5 -= a5; - c6 -= a6; - c7 -= a7; + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; + c2 -= a[2] * b[0]; + c3 -= a[3] * b[0]; + c4 -= a[4] * b[0]; + c5 -= a[5] * b[0]; + c6 -= a[6] * b[0]; + c7 -= a[7] * b[0]; a += 8; b += 1; @@ -890,6 +716,8 @@ static void ssolve_8x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) static void ssolve_4x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, 
BLASLONG ldc, BLASLONG bk) { + BLASLONG k; + v4f32 src_b, src_b0, src_b1, src_b2, src_b3; v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; v4f32 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7; v4f32 src_a0, src_a1, src_a2, src_a3, src_a5, src_a6, src_a7; @@ -902,81 +730,76 @@ static void ssolve_4x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO FLOAT *c_nxt6line = c + 6 * ldc; FLOAT *c_nxt7line = c + 7 * ldc; - if (bk > 0) - { - BLASLONG k; - v4f32 src_b, src_b0, src_b1, src_b2, src_b3; - v4f32 res0, res1, res2, res3, res4, res5, res6, res7; + src_c0 = LD_SP(c); + src_c1 = LD_SP(c_nxt1line); + src_c2 = LD_SP(c_nxt2line); + src_c3 = LD_SP(c_nxt3line); + src_c4 = LD_SP(c_nxt4line); + src_c5 = LD_SP(c_nxt5line); + src_c6 = LD_SP(c_nxt6line); + src_c7 = LD_SP(c_nxt7line); + for (k = 0; k < (bk >> 1); k++) + { src_a0 = LD_SP(a); src_b = LD_SP(b + 0); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 = src_a0 * src_b0; - res1 = src_a0 * src_b1; - res2 = src_a0 * src_b2; - res3 = src_a0 * src_b3; + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a0 * src_b1; + src_c2 -= src_a0 * src_b2; + src_c3 -= src_a0 * src_b3; src_b = LD_SP(b + 4); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res4 = src_a0 * src_b0; - res5 = src_a0 * src_b1; - res6 = src_a0 * src_b2; - res7 = src_a0 * src_b3; + src_c4 -= src_a0 * src_b0; + src_c5 -= src_a0 * src_b1; + src_c6 -= src_a0 * src_b2; + src_c7 -= src_a0 * src_b3; a += 4; b += 8; - for (k = (bk - 1); k--;) - { - src_a0 = LD_SP(a); - - src_b = LD_SP(b + 0); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 += src_a0 * src_b0; - res1 += src_a0 * src_b1; - res2 += src_a0 * src_b2; - res3 += src_a0 * src_b3; - - src_b = LD_SP(b + 4); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res4 += src_a0 * src_b0; - res5 += src_a0 * src_b1; - res6 += src_a0 * src_b2; - res7 += src_a0 * src_b3; - - a += 4; - b += 8; - } + src_a0 = LD_SP(a); + + src_b = LD_SP(b + 0); + 
SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a0 * src_b1; + src_c2 -= src_a0 * src_b2; + src_c3 -= src_a0 * src_b3; + + src_b = LD_SP(b + 4); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c4 -= src_a0 * src_b0; + src_c5 -= src_a0 * src_b1; + src_c6 -= src_a0 * src_b2; + src_c7 -= src_a0 * src_b3; - src_c0 = LD_SP(c); - src_c1 = LD_SP(c_nxt1line); - src_c2 = LD_SP(c_nxt2line); - src_c3 = LD_SP(c_nxt3line); - src_c4 = LD_SP(c_nxt4line); - src_c5 = LD_SP(c_nxt5line); - src_c6 = LD_SP(c_nxt6line); - src_c7 = LD_SP(c_nxt7line); - - src_c0 -= res0; - src_c1 -= res1; - src_c2 -= res2; - src_c3 -= res3; - src_c4 -= res4; - src_c5 -= res5; - src_c6 -= res6; - src_c7 -= res7; + a += 4; + b += 8; } - else + + if (bk & 1) { - src_c0 = LD_SP(c); - src_c1 = LD_SP(c_nxt1line); - src_c2 = LD_SP(c_nxt2line); - src_c3 = LD_SP(c_nxt3line); - src_c4 = LD_SP(c_nxt4line); - src_c5 = LD_SP(c_nxt5line); - src_c6 = LD_SP(c_nxt6line); - src_c7 = LD_SP(c_nxt7line); + src_a0 = LD_SP(a); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a0 * src_b1; + src_c2 -= src_a0 * src_b2; + src_c3 -= src_a0 * src_b3; + + src_b = LD_SP(b + 4); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c4 -= src_a0 * src_b0; + src_c5 -= src_a0 * src_b1; + src_c6 -= src_a0 * src_b2; + src_c7 -= src_a0 * src_b3; + + a += 4; + b += 8; } TRANSPOSE4x4_SP_SP(src_c0, src_c1, src_c2, src_c3, @@ -990,12 +813,9 @@ static void ssolve_4x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a7 = (v4f32) __msa_splati_w((v4i32) src_a5, 2); src_a6 = (v4f32) __msa_splati_w((v4i32) src_a5, 1); src_a5 = (v4f32) __msa_splati_w((v4i32) src_a5, 0); - src_a10 = __msa_cast_to_vector_float(*(a + 10)); - src_a10 = (v4f32) __msa_splati_w((v4i32) src_a10, 0); - src_a11 = __msa_cast_to_vector_float(*(a + 11)); - src_a11 = (v4f32) __msa_splati_w((v4i32) src_a11, 0); - src_a15 = 
__msa_cast_to_vector_float(*(a + 15)); - src_a15 = (v4f32) __msa_splati_w((v4i32) src_a15, 0); + COPY_FLOAT_TO_VECTOR(*(a + 10), src_a10); + COPY_FLOAT_TO_VECTOR(*(a + 11), src_a11); + COPY_FLOAT_TO_VECTOR(*(a + 15), src_a15); res_c0 *= src_a0; res_c4 *= src_a0; @@ -1041,6 +861,8 @@ static void ssolve_4x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; + v4f32 src_b, src_b0, src_b1, src_b2, src_b3; v4f32 src_c0, src_c1, src_c2, src_c3, res_c0, res_c1, res_c2, res_c3; v4f32 src_a0, src_a1, src_a2, src_a3, src_a5, src_a6, src_a7; v4f32 src_a10, src_a11, src_a15, src_a; @@ -1048,82 +870,51 @@ static void ssolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO FLOAT *c_nxt2line = c + 2 * ldc; FLOAT *c_nxt3line = c + 3 * ldc; - if (bk > 0) - { - BLASLONG k; - v4f32 src_b, src_b0, src_b1, src_b2, src_b3; - v4f32 res0, res1, res2, res3; + src_c0 = LD_SP(c); + src_c1 = LD_SP(c_nxt1line); + src_c2 = LD_SP(c_nxt2line); + src_c3 = LD_SP(c_nxt3line); + for (k = 0; k < (bk >> 1); k++) + { src_a0 = LD_SP(a); src_b = LD_SP(b + 0); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 = src_a0 * src_b0; - res1 = src_a0 * src_b1; - res2 = src_a0 * src_b2; - res3 = src_a0 * src_b3; + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a0 * src_b1; + src_c2 -= src_a0 * src_b2; + src_c3 -= src_a0 * src_b3; a += 4; b += 4; - for (k = (bk - 1) >> 1; k--;) - { - src_a0 = LD_SP(a); - - src_b = LD_SP(b + 0); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 += src_a0 * src_b0; - res1 += src_a0 * src_b1; - res2 += src_a0 * src_b2; - res3 += src_a0 * src_b3; - - a += 4; - b += 4; - - src_a0 = LD_SP(a); - - src_b = LD_SP(b + 0); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 += src_a0 * src_b0; - res1 += src_a0 * src_b1; - res2 += src_a0 * src_b2; - res3 += src_a0 * src_b3; - - a += 4; - b += 4; - } + src_a0 = LD_SP(a); - if ((bk - 1) & 1) - { - 
src_a0 = LD_SP(a); + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a0 * src_b1; + src_c2 -= src_a0 * src_b2; + src_c3 -= src_a0 * src_b3; - src_b = LD_SP(b + 0); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 += src_a0 * src_b0; - res1 += src_a0 * src_b1; - res2 += src_a0 * src_b2; - res3 += src_a0 * src_b3; + a += 4; + b += 4; + } - a += 4; - b += 4; - } + if (bk & 1) + { + src_a0 = LD_SP(a); - src_c0 = LD_SP(c); - src_c1 = LD_SP(c_nxt1line); - src_c2 = LD_SP(c_nxt2line); - src_c3 = LD_SP(c_nxt3line); + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a0 * src_b1; + src_c2 -= src_a0 * src_b2; + src_c3 -= src_a0 * src_b3; - src_c0 -= res0; - src_c1 -= res1; - src_c2 -= res2; - src_c3 -= res3; - } - else - { - src_c0 = LD_SP(c); - src_c1 = LD_SP(c_nxt1line); - src_c2 = LD_SP(c_nxt2line); - src_c3 = LD_SP(c_nxt3line); + a += 4; + b += 4; } TRANSPOSE4x4_SP_SP(src_c0, src_c1, src_c2, src_c3, @@ -1135,12 +926,9 @@ static void ssolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a7 = (v4f32) __msa_splati_w((v4i32) src_a5, 2); src_a6 = (v4f32) __msa_splati_w((v4i32) src_a5, 1); src_a5 = (v4f32) __msa_splati_w((v4i32) src_a5, 0); - src_a10 = __msa_cast_to_vector_float(*(a + 10)); - src_a10 = (v4f32) __msa_splati_w((v4i32) src_a10, 0); - src_a11 = __msa_cast_to_vector_float(*(a + 11)); - src_a11 = (v4f32) __msa_splati_w((v4i32) src_a11, 0); - src_a15 = __msa_cast_to_vector_float(*(a + 15)); - src_a15 = (v4f32) __msa_splati_w((v4i32) src_a15, 0); + COPY_FLOAT_TO_VECTOR(*(a + 10), src_a10); + COPY_FLOAT_TO_VECTOR(*(a + 11), src_a11); + COPY_FLOAT_TO_VECTOR(*(a + 15), src_a15); res_c0 *= src_a0; res_c1 -= res_c0 * src_a1; @@ -1169,6 +957,7 @@ static void ssolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_4x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, 
BLASLONG bk) { + BLASLONG k; FLOAT c0, c1, c2, c3, c0_nxt, c1_nxt, c2_nxt, c3_nxt; FLOAT a0, a1, a2, a3, a5, a6, a7, a10, a11, a15; @@ -1181,43 +970,16 @@ static void ssolve_4x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO c2_nxt = *(c + 2 + ldc); c3_nxt = *(c + 3 + ldc); - if (bk > 0) + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT res[8]; - - res[0] = a[0] * b[0]; - res[1] = a[1] * b[0]; - res[2] = a[2] * b[0]; - res[3] = a[3] * b[0]; - res[4] = a[0] * b[1]; - res[5] = a[1] * b[1]; - res[6] = a[2] * b[1]; - res[7] = a[3] * b[1]; - - for (k = (bk - 1); k--;) - { - a += 4; - b += 2; - - res[0] += a[0] * b[0]; - res[1] += a[1] * b[0]; - res[2] += a[2] * b[0]; - res[3] += a[3] * b[0]; - res[4] += a[0] * b[1]; - res[5] += a[1] * b[1]; - res[6] += a[2] * b[1]; - res[7] += a[3] * b[1]; - } - - c0 -= res[0]; - c1 -= res[1]; - c2 -= res[2]; - c3 -= res[3]; - c0_nxt -= res[4]; - c1_nxt -= res[5]; - c2_nxt -= res[6]; - c3_nxt -= res[7]; + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; + c2 -= a[2] * b[0]; + c3 -= a[3] * b[0]; + c0_nxt -= a[0] * b[1]; + c1_nxt -= a[1] * b[1]; + c2_nxt -= a[2] * b[1]; + c3_nxt -= a[3] * b[1]; a += 4; b += 2; @@ -1285,6 +1047,7 @@ static void ssolve_4x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_4x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { + BLASLONG k; FLOAT a0, a1, a2, a3, a5, a6, a7, a10, a11, a15, c0, c1, c2, c3; c0 = *(c + 0); @@ -1292,31 +1055,12 @@ static void ssolve_4x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) c2 = *(c + 2); c3 = *(c + 3); - if (bk > 0) + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT t0, t1, t2, t3; - - t0 = a[0] * b[0]; - t1 = a[1] * b[0]; - t2 = a[2] * b[0]; - t3 = a[3] * b[0]; - - for (k = (bk - 1); k--;) - { - a += 4; - b += 1; - - t0 += a[0] * b[0]; - t1 += a[1] * b[0]; - t2 += a[2] * b[0]; - t3 += a[3] * b[0]; - } - - c0 -= t0; - c1 -= t1; - c2 -= t2; - c3 -= t3; + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; + c2 -= a[2] * b[0]; + c3 -= a[3] * b[0]; a 
+= 4; b += 1; @@ -1360,6 +1104,7 @@ static void ssolve_4x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) static void ssolve_2x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; FLOAT a0, a1, a3, c0, c1, c0_nxt1, c1_nxt1, c0_nxt2, c1_nxt2; FLOAT c0_nxt3, c1_nxt3, c0_nxt4, c1_nxt4, c0_nxt5, c1_nxt5; FLOAT c0_nxt6, c1_nxt6, c0_nxt7, c1_nxt7; @@ -1381,67 +1126,24 @@ static void ssolve_2x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO c0_nxt7 = *(c + 7 * ldc); c1_nxt7 = *(c + 1 + 7 * ldc); - if (bk > 0) + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT res[16]; - - res[0] = a[0] * b[0]; - res[1] = a[1] * b[0]; - res[2] = a[0] * b[1]; - res[3] = a[1] * b[1]; - res[4] = a[0] * b[2]; - res[5] = a[1] * b[2]; - res[6] = a[0] * b[3]; - res[7] = a[1] * b[3]; - res[8] = a[0] * b[4]; - res[9] = a[1] * b[4]; - res[10] = a[0] * b[5]; - res[11] = a[1] * b[5]; - res[12] = a[0] * b[6]; - res[13] = a[1] * b[6]; - res[14] = a[0] * b[7]; - res[15] = a[1] * b[7]; - - for (k = (bk - 1); k--;) - { - a += 2; - b += 8; - - res[0] += a[0] * b[0]; - res[1] += a[1] * b[0]; - res[2] += a[0] * b[1]; - res[3] += a[1] * b[1]; - res[4] += a[0] * b[2]; - res[5] += a[1] * b[2]; - res[6] += a[0] * b[3]; - res[7] += a[1] * b[3]; - res[8] += a[0] * b[4]; - res[9] += a[1] * b[4]; - res[10] += a[0] * b[5]; - res[11] += a[1] * b[5]; - res[12] += a[0] * b[6]; - res[13] += a[1] * b[6]; - res[14] += a[0] * b[7]; - res[15] += a[1] * b[7]; - } - - c0 -= res[0]; - c1 -= res[1]; - c0_nxt1 -= res[2]; - c1_nxt1 -= res[3]; - c0_nxt2 -= res[4]; - c1_nxt2 -= res[5]; - c0_nxt3 -= res[6]; - c1_nxt3 -= res[7]; - c0_nxt4 -= res[8]; - c1_nxt4 -= res[9]; - c0_nxt5 -= res[10]; - c1_nxt5 -= res[11]; - c0_nxt6 -= res[12]; - c1_nxt6 -= res[13]; - c0_nxt7 -= res[14]; - c1_nxt7 -= res[15]; + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; + c0_nxt1 -= a[0] * b[1]; + c1_nxt1 -= a[1] * b[1]; + c0_nxt2 -= a[0] * b[2]; + c1_nxt2 -= a[1] * b[2]; + c0_nxt3 -= a[0] * b[3]; + c1_nxt3 -= a[1] * b[3]; 
+ c0_nxt4 -= a[0] * b[4]; + c1_nxt4 -= a[1] * b[4]; + c0_nxt5 -= a[0] * b[5]; + c1_nxt5 -= a[1] * b[5]; + c0_nxt6 -= a[0] * b[6]; + c1_nxt6 -= a[1] * b[6]; + c0_nxt7 -= a[0] * b[7]; + c1_nxt7 -= a[1] * b[7]; a += 2; b += 8; @@ -1512,6 +1214,7 @@ static void ssolve_2x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_2x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; FLOAT a0, a1, a3, c0, c1, c0_nxt1, c1_nxt1; FLOAT c0_nxt2, c1_nxt2, c0_nxt3, c1_nxt3; @@ -1524,43 +1227,16 @@ static void ssolve_2x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO c0_nxt3 = *(c + 3 * ldc); c1_nxt3 = *(c + 1 + 3 * ldc); - if (bk > 0) + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT res[8]; - - res[0] = a[0] * b[0]; - res[1] = a[1] * b[0]; - res[2] = a[0] * b[1]; - res[3] = a[1] * b[1]; - res[4] = a[0] * b[2]; - res[5] = a[1] * b[2]; - res[6] = a[0] * b[3]; - res[7] = a[1] * b[3]; - - for (k = (bk - 1); k--;) - { - a += 2; - b += 4; - - res[0] += a[0] * b[0]; - res[1] += a[1] * b[0]; - res[2] += a[0] * b[1]; - res[3] += a[1] * b[1]; - res[4] += a[0] * b[2]; - res[5] += a[1] * b[2]; - res[6] += a[0] * b[3]; - res[7] += a[1] * b[3]; - } - - c0 -= res[0]; - c1 -= res[1]; - c0_nxt1 -= res[2]; - c1_nxt1 -= res[3]; - c0_nxt2 -= res[4]; - c1_nxt2 -= res[5]; - c0_nxt3 -= res[6]; - c1_nxt3 -= res[7]; + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; + c0_nxt1 -= a[0] * b[1]; + c1_nxt1 -= a[1] * b[1]; + c0_nxt2 -= a[0] * b[2]; + c1_nxt2 -= a[1] * b[2]; + c0_nxt3 -= a[0] * b[3]; + c1_nxt3 -= a[1] * b[3]; a += 2; b += 4; @@ -1605,6 +1281,7 @@ static void ssolve_2x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_2x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; FLOAT a0, a1, a3, c0, c1, c0_nxt, c1_nxt; c0 = *(c + 0); @@ -1612,32 +1289,12 @@ static void ssolve_2x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO c0_nxt = *(c + ldc); c1_nxt = *(c + 1 + ldc); - if (bk 
> 0) + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT res0, res1, res2, res3; - - res0 = a[0] * b[0]; - res1 = a[1] * b[0]; - res2 = a[0] * b[1]; - res3 = a[1] * b[1]; - - for (k = (bk - 1); k--;) - { - a += 2; - b += 2; - - res0 += a[0] * b[0]; - res1 += a[1] * b[0]; - res2 += a[0] * b[1]; - res3 += a[1] * b[1]; - } - - c0 -= res0; - c1 -= res1; - - c0_nxt -= res2; - c1_nxt -= res3; + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; + c0_nxt -= a[0] * b[1]; + c1_nxt -= a[1] * b[1]; a += 2; b += 2; @@ -1667,30 +1324,16 @@ static void ssolve_2x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_2x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { + BLASLONG k; FLOAT c0, c1; c0 = *(c + 0); c1 = *(c + 1); - if (bk > 0) + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT res0, res1; - - res0 = a[0] * b[0]; - res1 = a[1] * b[0]; - - for (k = (bk - 1); k--;) - { - a += 2; - b += 1; - - res0 += a[0] * b[0]; - res1 += a[1] * b[0]; - } - - c0 -= res0; - c1 -= res1; + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; a += 2; b += 1; @@ -1710,69 +1353,64 @@ static void ssolve_2x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) static void ssolve_1x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { - if (bk > 0) - { - BLASLONG k; - FLOAT c0, c1, c2, c3, c4, c5, c6, c7; - - c0 = a[0] * b[0]; - c1 = a[0] * b[1]; - c2 = a[0] * b[2]; - c3 = a[0] * b[3]; - c4 = a[0] * b[4]; - c5 = a[0] * b[5]; - c6 = a[0] * b[6]; - c7 = a[0] * b[7]; - - for (k = (bk - 1); k--;) - { - a += 1; - b += 8; - - c0 += a[0] * b[0]; - c1 += a[0] * b[1]; - c2 += a[0] * b[2]; - c3 += a[0] * b[3]; - c4 += a[0] * b[4]; - c5 += a[0] * b[5]; - c6 += a[0] * b[6]; - c7 += a[0] * b[7]; - } + BLASLONG k; + FLOAT c0, c1, c2, c3, c4, c5, c6, c7; - *(c + 0 * ldc) -= c0; - *(c + 1 * ldc) -= c1; - *(c + 2 * ldc) -= c2; - *(c + 3 * ldc) -= c3; - *(c + 4 * ldc) -= c4; - *(c + 5 * ldc) -= c5; - *(c + 6 * ldc) -= c6; - *(c + 7 * ldc) -= c7; + c0 = *(c + 0); + c1 = *(c + 1 * ldc); + c2 = *(c + 2 
* ldc); + c3 = *(c + 3 * ldc); + c4 = *(c + 4 * ldc); + c5 = *(c + 5 * ldc); + c6 = *(c + 6 * ldc); + c7 = *(c + 7 * ldc); + + for (k = 0; k < bk; k++) + { + c0 -= a[0] * b[0]; + c1 -= a[0] * b[1]; + c2 -= a[0] * b[2]; + c3 -= a[0] * b[3]; + c4 -= a[0] * b[4]; + c5 -= a[0] * b[5]; + c6 -= a[0] * b[6]; + c7 -= a[0] * b[7]; a += 1; b += 8; } - *c *= *a; - *(c + ldc) *= *a; - *(c + 2 * ldc) *= *a; - *(c + 3 * ldc) *= *a; - *(c + 4 * ldc) *= *a; - *(c + 5 * ldc) *= *a; - *(c + 6 * ldc) *= *a; - *(c + 7 * ldc) *= *a; + c0 *= *a; + c1 *= *a; + c2 *= *a; + c3 *= *a; + c4 *= *a; + c5 *= *a; + c6 *= *a; + c7 *= *a; - *b = *c; - *(b + 1) = *(c + ldc); - *(b + 2) = *(c + 2 * ldc); - *(b + 3) = *(c + 3 * ldc); - *(b + 4) = *(c + 4 * ldc); - *(b + 5) = *(c + 5 * ldc); - *(b + 6) = *(c + 6 * ldc); - *(b + 7) = *(c + 7 * ldc); + *(b + 0) = c0; + *(b + 1) = c1; + *(b + 2) = c2; + *(b + 3) = c3; + *(b + 4) = c4; + *(b + 5) = c5; + *(b + 6) = c6; + *(b + 7) = c7; + + *(c + 0) = c0; + *(c + 1 * ldc) = c1; + *(c + 2 * ldc) = c2; + *(c + 3 * ldc) = c3; + *(c + 4 * ldc) = c4; + *(c + 5 * ldc) = c5; + *(c + 6 * ldc) = c6; + *(c + 7 * ldc) = c7; } static void ssolve_1x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; FLOAT c0, c1, c2, c3; c0 = *(c + 0 * ldc); @@ -1780,31 +1418,13 @@ static void ssolve_1x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO c2 = *(c + 2 * ldc); c3 = *(c + 3 * ldc); - if (bk > 0) + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT res0, res1, res2, res3; + c0 -= a[0] * b[0]; + c1 -= a[0] * b[1]; + c2 -= a[0] * b[2]; + c3 -= a[0] * b[3]; - res0 = a[0] * b[0]; - res1 = a[0] * b[1]; - res2 = a[0] * b[2]; - res3 = a[0] * b[3]; - - for (k = (bk - 1); k--;) - { - a += 1; - b += 4; - - res0 += a[0] * b[0]; - res1 += a[0] * b[1]; - res2 += a[0] * b[2]; - res3 += a[0] * b[3]; - } - - c0 -= res0; - c1 -= res1; - c2 -= res2; - c3 -= res3; a += 1; b += 4; } @@ -1827,30 +1447,16 @@ static void ssolve_1x4_lt_msa(FLOAT *a, FLOAT *b, 
FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_1x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; FLOAT c0, c1; c0 = *c; c1 = *(c + ldc); - if (bk > 0) + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT res0, res1; - - res0 = a[0] * b[0]; - res1 = a[0] * b[1]; - - for (k = (bk - 1); k--;) - { - a += 1; - b += 2; - - res0 += a[0] * b[0]; - res1 += a[0] * b[1]; - } - - c0 -= res0; - c1 -= res1; + c0 -= a[0] * b[0]; + c1 -= a[0] * b[1]; a += 1; b += 2; @@ -1865,22 +1471,11 @@ static void ssolve_1x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_1x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { - if (bk) - { - BLASLONG k; - FLOAT res; + BLASLONG k; - res = a[0] * b[0]; - - for (k = (bk - 1); k--;) - { - a++; - b++; - - res += a[0] * b[0]; - } - - *c -= res; + for (k = 0; k < bk; k++) + { + *c -= a[0] * b[0]; a++; b++; diff --git a/kernel/mips/strsm_kernel_RN_8x8_msa.c b/kernel/mips/strsm_kernel_RN_8x8_msa.c index 04bca1b12..69d7b5f72 100644 --- a/kernel/mips/strsm_kernel_RN_8x8_msa.c +++ b/kernel/mips/strsm_kernel_RN_8x8_msa.c @@ -30,6 +30,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static void ssolve_8x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; + v4f32 src_a0, src_a1; v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; v4f32 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15; v4f32 src_b0, src_b1, src_b2, src_b3, src_b4, src_b5, src_b6, src_b7; @@ -45,105 +47,43 @@ static void ssolve_8x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO FLOAT *c_nxt6line = c + 6 * ldc; FLOAT *c_nxt7line = c + 7 * ldc; - if (bk > 0) - { - BLASLONG k; - v4f32 src_a0, src_a1, res0, res1, res2, res3, res4, res5, res6, res7; - v4f32 res8, res9, res10, res11, res12, res13, res14, res15; + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + LD_SP2(c_nxt2line, 4, src_c4, src_c5); + LD_SP2(c_nxt3line, 4, src_c6, src_c7); + LD_SP2(c_nxt4line, 4, src_c8, src_c9); + LD_SP2(c_nxt5line, 4, src_c10, src_c11); + LD_SP2(c_nxt6line, 4, src_c12, src_c13); + LD_SP2(c_nxt7line, 4, src_c14, src_c15); + for (k = 0; k < bk; k++) + { LD_SP2(a, 4, src_a0, src_a1); src_b = LD_SP(b + 0); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 = src_a0 * src_b0; - res1 = src_a1 * src_b0; - res2 = src_a0 * src_b1; - res3 = src_a1 * src_b1; - res4 = src_a0 * src_b2; - res5 = src_a1 * src_b2; - res6 = src_a0 * src_b3; - res7 = src_a1 * src_b3; + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + src_c2 -= src_a0 * src_b1; + src_c3 -= src_a1 * src_b1; + src_c4 -= src_a0 * src_b2; + src_c5 -= src_a1 * src_b2; + src_c6 -= src_a0 * src_b3; + src_c7 -= src_a1 * src_b3; src_b = LD_SP(b + 4); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res8 = src_a0 * src_b0; - res9 = src_a1 * src_b0; - res10 = src_a0 * src_b1; - res11 = src_a1 * src_b1; - res12 = src_a0 * src_b2; - res13 = src_a1 * src_b2; - res14 = src_a0 * src_b3; - res15 = src_a1 * src_b3; + src_c8 -= src_a0 * src_b0; + src_c9 -= src_a1 * src_b0; + src_c10 -= src_a0 * src_b1; + src_c11 -= src_a1 * src_b1; + src_c12 -= 
src_a0 * src_b2; + src_c13 -= src_a1 * src_b2; + src_c14 -= src_a0 * src_b3; + src_c15 -= src_a1 * src_b3; a += 8; b += 8; - - for (k = (bk - 1); k--;) - { - LD_SP2(a, 4, src_a0, src_a1); - - src_b = LD_SP(b + 0); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 += src_a0 * src_b0; - res1 += src_a1 * src_b0; - res2 += src_a0 * src_b1; - res3 += src_a1 * src_b1; - res4 += src_a0 * src_b2; - res5 += src_a1 * src_b2; - res6 += src_a0 * src_b3; - res7 += src_a1 * src_b3; - - src_b = LD_SP(b + 4); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res8 += src_a0 * src_b0; - res9 += src_a1 * src_b0; - res10 += src_a0 * src_b1; - res11 += src_a1 * src_b1; - res12 += src_a0 * src_b2; - res13 += src_a1 * src_b2; - res14 += src_a0 * src_b3; - res15 += src_a1 * src_b3; - - a += 8; - b += 8; - } - - LD_SP2(c, 4, src_c0, src_c1); - LD_SP2(c_nxt1line, 4, src_c2, src_c3); - LD_SP2(c_nxt2line, 4, src_c4, src_c5); - LD_SP2(c_nxt3line, 4, src_c6, src_c7); - LD_SP2(c_nxt4line, 4, src_c8, src_c9); - LD_SP2(c_nxt5line, 4, src_c10, src_c11); - LD_SP2(c_nxt6line, 4, src_c12, src_c13); - LD_SP2(c_nxt7line, 4, src_c14, src_c15); - - src_c0 -= res0; - src_c1 -= res1; - src_c2 -= res2; - src_c3 -= res3; - src_c4 -= res4; - src_c5 -= res5; - src_c6 -= res6; - src_c7 -= res7; - src_c8 -= res8; - src_c9 -= res9; - src_c10 -= res10; - src_c11 -= res11; - src_c12 -= res12; - src_c13 -= res13; - src_c14 -= res14; - src_c15 -= res15; - } - else - { - LD_SP2(c, 4, src_c0, src_c1); - LD_SP2(c_nxt1line, 4, src_c2, src_c3); - LD_SP2(c_nxt2line, 4, src_c4, src_c5); - LD_SP2(c_nxt3line, 4, src_c6, src_c7); - LD_SP2(c_nxt4line, 4, src_c8, src_c9); - LD_SP2(c_nxt5line, 4, src_c10, src_c11); - LD_SP2(c_nxt6line, 4, src_c12, src_c13); - LD_SP2(c_nxt7line, 4, src_c14, src_c15); } src_b = LD_SP(b + 0); @@ -204,8 +144,7 @@ static void ssolve_8x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b = LD_SP(b + 27); SPLATI_W4_SP(src_b, src_b27, src_b28, src_b29, src_b30); - src_b31 = 
__msa_cast_to_vector_float(*(b + 31)); - src_b31 = (v4f32) __msa_splati_w((v4i32) src_b31, 0); + COPY_FLOAT_TO_VECTOR(*(b + 31), src_b31); src_c4 *= src_b18; src_c5 *= src_b18; @@ -245,12 +184,9 @@ static void ssolve_8x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b46 = (v4f32) __msa_splati_w((v4i32) src_b45, 1); src_b45 = (v4f32) __msa_splati_w((v4i32) src_b45, 0); - src_b54 = __msa_cast_to_vector_float(*(b + 54)); - src_b54 = (v4f32) __msa_splati_w((v4i32) src_b54, 0); - src_b55 = __msa_cast_to_vector_float(*(b + 55)); - src_b55 = (v4f32) __msa_splati_w((v4i32) src_b55, 0); - src_b63 = __msa_cast_to_vector_float(*(b + 63)); - src_b63 = (v4f32) __msa_splati_w((v4i32) src_b63, 0); + COPY_FLOAT_TO_VECTOR(*(b + 54), src_b54); + COPY_FLOAT_TO_VECTOR(*(b + 55), src_b55); + COPY_FLOAT_TO_VECTOR(*(b + 63), src_b63); src_c8 *= src_b36; src_c9 *= src_b36; @@ -291,108 +227,71 @@ static void ssolve_8x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; v4f32 src_b0, src_b1, src_b2, src_b3, src_b5, src_b6, src_b7; - v4f32 src_b10, src_b11, src_b15, src_b; + v4f32 src_b10, src_b11, src_b15, src_b, src_a0, src_a1; FLOAT *c_nxt1line = c + ldc; FLOAT *c_nxt2line = c + 2 * ldc; FLOAT *c_nxt3line = c + 3 * ldc; - if (bk) - { - BLASLONG k; - v4f32 src_a0, src_a1, res0, res1, res2, res3, res4, res5, res6, res7; + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + LD_SP2(c_nxt2line, 4, src_c4, src_c5); + LD_SP2(c_nxt3line, 4, src_c6, src_c7); + for (k = 0; k < (bk >> 1); k++) + { LD_SP2(a, 4, src_a0, src_a1); src_b = LD_SP(b + 0); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 = src_a0 * src_b0; - res1 = src_a1 * src_b0; - res2 = src_a0 * src_b1; - res3 = src_a1 * src_b1; - res4 = src_a0 * src_b2; - res5 = src_a1 * src_b2; - res6 = src_a0 * src_b3; - res7 
= src_a1 * src_b3; + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + src_c2 -= src_a0 * src_b1; + src_c3 -= src_a1 * src_b1; + src_c4 -= src_a0 * src_b2; + src_c5 -= src_a1 * src_b2; + src_c6 -= src_a0 * src_b3; + src_c7 -= src_a1 * src_b3; a += 8; b += 4; - for (k = (bk - 1) / 2; k--;) - { - LD_SP2(a, 4, src_a0, src_a1); - - src_b = LD_SP(b + 0); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 += src_a0 * src_b0; - res1 += src_a1 * src_b0; - res2 += src_a0 * src_b1; - res3 += src_a1 * src_b1; - res4 += src_a0 * src_b2; - res5 += src_a1 * src_b2; - res6 += src_a0 * src_b3; - res7 += src_a1 * src_b3; - - a += 8; - b += 4; - - LD_SP2(a, 4, src_a0, src_a1); - - src_b = LD_SP(b + 0); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 += src_a0 * src_b0; - res1 += src_a1 * src_b0; - res2 += src_a0 * src_b1; - res3 += src_a1 * src_b1; - res4 += src_a0 * src_b2; - res5 += src_a1 * src_b2; - res6 += src_a0 * src_b3; - res7 += src_a1 * src_b3; + LD_SP2(a, 4, src_a0, src_a1); - a += 8; - b += 4; - } + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + src_c2 -= src_a0 * src_b1; + src_c3 -= src_a1 * src_b1; + src_c4 -= src_a0 * src_b2; + src_c5 -= src_a1 * src_b2; + src_c6 -= src_a0 * src_b3; + src_c7 -= src_a1 * src_b3; - if ((bk - 1) & 1) - { - LD_SP2(a, 4, src_a0, src_a1); + a += 8; + b += 4; + } - src_b = LD_SP(b + 0); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 += src_a0 * src_b0; - res1 += src_a1 * src_b0; - res2 += src_a0 * src_b1; - res3 += src_a1 * src_b1; - res4 += src_a0 * src_b2; - res5 += src_a1 * src_b2; - res6 += src_a0 * src_b3; - res7 += src_a1 * src_b3; + if (bk & 1) + { + LD_SP2(a, 4, src_a0, src_a1); - a += 8; - b += 4; - } + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + src_c2 -= src_a0 * src_b1; + src_c3 -= src_a1 * src_b1; + 
src_c4 -= src_a0 * src_b2; + src_c5 -= src_a1 * src_b2; + src_c6 -= src_a0 * src_b3; + src_c7 -= src_a1 * src_b3; - LD_SP2(c, 4, src_c0, src_c1); - LD_SP2(c_nxt1line, 4, src_c2, src_c3); - LD_SP2(c_nxt2line, 4, src_c4, src_c5); - LD_SP2(c_nxt3line, 4, src_c6, src_c7); - - src_c0 -= res0; - src_c1 -= res1; - src_c2 -= res2; - src_c3 -= res3; - src_c4 -= res4; - src_c5 -= res5; - src_c6 -= res6; - src_c7 -= res7; - } - else - { - LD_SP2(c, 4, src_c0, src_c1); - LD_SP2(c_nxt1line, 4, src_c2, src_c3); - LD_SP2(c_nxt2line, 4, src_c4, src_c5); - LD_SP2(c_nxt3line, 4, src_c6, src_c7); + a += 8; + b += 4; } src_b = LD_SP(b + 0); @@ -401,12 +300,9 @@ static void ssolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b7 = (v4f32) __msa_splati_w((v4i32) src_b5, 2); src_b6 = (v4f32) __msa_splati_w((v4i32) src_b5, 1); src_b5 = (v4f32) __msa_splati_w((v4i32) src_b5, 0); - src_b10 = __msa_cast_to_vector_float(*(b + 10)); - src_b10 = (v4f32) __msa_splati_w((v4i32) src_b10, 0); - src_b11 = __msa_cast_to_vector_float(*(b + 11)); - src_b11 = (v4f32) __msa_splati_w((v4i32) src_b11, 0); - src_b15 = __msa_cast_to_vector_float(*(b + 15)); - src_b15 = (v4f32) __msa_splati_w((v4i32) src_b15, 0); + COPY_FLOAT_TO_VECTOR(*(b + 10), src_b10); + COPY_FLOAT_TO_VECTOR(*(b + 11), src_b11); + COPY_FLOAT_TO_VECTOR(*(b + 15), src_b15); src_c0 *= src_b0; src_c1 *= src_b0; @@ -443,100 +339,62 @@ static void ssolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_8x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; + v4f32 src_a0, src_a1; v4f32 src_c0, src_c1, src_c2, src_c3, src_b0, src_b1, src_b3; FLOAT *c_nxt1line = c + ldc; - if (bk) - { - BLASLONG k; - v4f32 src_a0, src_a1, res0, res1, res2, res3; + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + for (k = 0; k < (bk >> 1); k++) + { LD_SP2(a, 4, src_a0, src_a1); - src_b0 = __msa_cast_to_vector_float(*b); - src_b0 = (v4f32) __msa_splati_w((v4i32) 
src_b0, 0); - src_b1 = __msa_cast_to_vector_float(*(b + 1)); - src_b1 = (v4f32) __msa_splati_w((v4i32) src_b1, 0); + COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); + COPY_FLOAT_TO_VECTOR(*(b + 1), src_b1); - res0 = src_a0 * src_b0; - res1 = src_a1 * src_b0; - res2 = src_a0 * src_b1; - res3 = src_a1 * src_b1; + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + src_c2 -= src_a0 * src_b1; + src_c3 -= src_a1 * src_b1; a += 8; b += 2; - for (k = (bk - 1) / 2; k--;) - { - LD_SP2(a, 4, src_a0, src_a1); - - src_b0 = __msa_cast_to_vector_float(*b); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); - src_b1 = __msa_cast_to_vector_float(*(b + 1)); - src_b1 = (v4f32) __msa_splati_w((v4i32) src_b1, 0); - - res0 += src_a0 * src_b0; - res1 += src_a1 * src_b0; - res2 += src_a0 * src_b1; - res3 += src_a1 * src_b1; - - a += 8; - b += 2; - - LD_SP2(a, 4, src_a0, src_a1); - - src_b0 = __msa_cast_to_vector_float(*b); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); - src_b1 = __msa_cast_to_vector_float(*(b + 1)); - src_b1 = (v4f32) __msa_splati_w((v4i32) src_b1, 0); - - res0 += src_a0 * src_b0; - res1 += src_a1 * src_b0; - res2 += src_a0 * src_b1; - res3 += src_a1 * src_b1; + LD_SP2(a, 4, src_a0, src_a1); - a += 8; - b += 2; - } + COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); + COPY_FLOAT_TO_VECTOR(*(b + 1), src_b1); - if ((bk - 1) & 1) - { - LD_SP2(a, 4, src_a0, src_a1); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + src_c2 -= src_a0 * src_b1; + src_c3 -= src_a1 * src_b1; - src_b0 = __msa_cast_to_vector_float(*b); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); - src_b1 = __msa_cast_to_vector_float(*(b + 1)); - src_b1 = (v4f32) __msa_splati_w((v4i32) src_b1, 0); + a += 8; + b += 2; + } - res0 += src_a0 * src_b0; - res1 += src_a1 * src_b0; - res2 += src_a0 * src_b1; - res3 += src_a1 * src_b1; + if (bk & 1) + { + LD_SP2(a, 4, src_a0, src_a1); - a += 8; - b += 2; - } + COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); + COPY_FLOAT_TO_VECTOR(*(b + 1), src_b1); - LD_SP2(c, 4, 
src_c0, src_c1); - LD_SP2(c_nxt1line, 4, src_c2, src_c3); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + src_c2 -= src_a0 * src_b1; + src_c3 -= src_a1 * src_b1; - src_c0 -= res0; - src_c1 -= res1; - src_c2 -= res2; - src_c3 -= res3; - } - else - { - LD_SP2(c, 4, src_c0, src_c1); - LD_SP2(c_nxt1line, 4, src_c2, src_c3); + a += 8; + b += 2; } - src_b0 = __msa_cast_to_vector_float(*(b + 0)); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); - src_b1 = __msa_cast_to_vector_float(*(b + 1)); - src_b1 = (v4f32) __msa_splati_w((v4i32) src_b1, 0); - src_b3 = __msa_cast_to_vector_float(*(b + 3)); - src_b3 = (v4f32) __msa_splati_w((v4i32) src_b3, 0); + COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); + COPY_FLOAT_TO_VECTOR(*(b + 1), src_b1); + COPY_FLOAT_TO_VECTOR(*(b + 3), src_b3); src_c0 *= src_b0; src_c1 *= src_b0; @@ -552,125 +410,94 @@ static void ssolve_8x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { - v4f32 src_c0, src_c1, src_b0; + BLASLONG k; + v4f32 src_a0, src_a1, src_c0, src_c1, src_b0; - if (bk) + LD_SP2(c, 4, src_c0, src_c1); + + for (k = 0; k < (bk >> 2); k++) { - BLASLONG k; - v4f32 src_a0, src_a1, res0, res1; + LD_SP2(a, 4, src_a0, src_a1); + + COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); + + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + + a += 8; + b += 1; LD_SP2(a, 4, src_a0, src_a1); - src_b0 = __msa_cast_to_vector_float(*b); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); - res0 = src_a0 * src_b0; - res1 = src_a1 * src_b0; + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; a += 8; b += 1; - for (k = (bk - 1) >> 2; k--;) - { - LD_SP2(a, 4, src_a0, src_a1); + LD_SP2(a, 4, src_a0, src_a1); - src_b0 = __msa_cast_to_vector_float(*b); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); - res0 += src_a0 * src_b0; - res1 += src_a1 * src_b0; + src_c0 
-= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; - a += 8; - b += 1; + a += 8; + b += 1; - LD_SP2(a, 4, src_a0, src_a1); + LD_SP2(a, 4, src_a0, src_a1); - src_b0 = __msa_cast_to_vector_float(*b); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); - res0 += src_a0 * src_b0; - res1 += src_a1 * src_b0; + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; - a += 8; - b += 1; + a += 8; + b += 1; + } + if (bk & 3) + { + if (bk & 2) + { LD_SP2(a, 4, src_a0, src_a1); - src_b0 = __msa_cast_to_vector_float(*b); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); - res0 += src_a0 * src_b0; - res1 += src_a1 * src_b0; + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; a += 8; b += 1; LD_SP2(a, 4, src_a0, src_a1); - src_b0 = __msa_cast_to_vector_float(*b); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); - res0 += src_a0 * src_b0; - res1 += src_a1 * src_b0; + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; a += 8; b += 1; } - if ((bk - 1) & 3) + if (bk & 1) { - if ((bk - 1) & 2) - { - LD_SP2(a, 4, src_a0, src_a1); - - src_b0 = __msa_cast_to_vector_float(*b); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); - - res0 += src_a0 * src_b0; - res1 += src_a1 * src_b0; - - a += 8; - b += 1; - - LD_SP2(a, 4, src_a0, src_a1); - - src_b0 = __msa_cast_to_vector_float(*b); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); - - res0 += src_a0 * src_b0; - res1 += src_a1 * src_b0; - - a += 8; - b += 1; - } - - if ((bk - 1) & 1) - { - LD_SP2(a, 4, src_a0, src_a1); + LD_SP2(a, 4, src_a0, src_a1); - src_b0 = __msa_cast_to_vector_float(*b); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); - res0 += src_a0 * src_b0; - res1 += src_a1 * src_b0; + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; - a += 8; - b += 1; - } + a += 8; + b += 1; } - - LD_SP2(c, 4, src_c0, src_c1); - - src_c0 -= 
res0; - src_c1 -= res1; - } - else - { - LD_SP2(c, 4, src_c0, src_c1); } - src_b0 = __msa_cast_to_vector_float(*(b + 0)); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); src_c0 *= src_b0; src_c1 *= src_b0; @@ -681,12 +508,13 @@ static void ssolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_4x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; v4f32 src_b0, src_b1, src_b2, src_b3, src_b4, src_b5, src_b6, src_b7; v4f32 src_b9, src_b10, src_b11, src_b12, src_b13, src_b14, src_b15, src_b18; v4f32 src_b19, src_b20, src_b21, src_b22, src_b23, src_b27, src_b28; v4f32 src_b29, src_b30, src_b31, src_b36, src_b37, src_b38, src_b39; - v4f32 src_b45, src_b46, src_b47, src_b54, src_b55, src_b63, src_b; + v4f32 src_b45, src_b46, src_b47, src_b54, src_b55, src_b63, src_b, src_a0; FLOAT *c_nxt1line = c + ldc; FLOAT *c_nxt2line = c + 2 * ldc; FLOAT *c_nxt3line = c + 3 * ldc; @@ -695,121 +523,35 @@ static void ssolve_4x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO FLOAT *c_nxt6line = c + 6 * ldc; FLOAT *c_nxt7line = c + 7 * ldc; - if (bk) - { - BLASLONG k; - v4f32 src_a0, res0, res1, res2, res3, res4, res5, res6, res7; + src_c0 = LD_SP(c); + src_c1 = LD_SP(c_nxt1line); + src_c2 = LD_SP(c_nxt2line); + src_c3 = LD_SP(c_nxt3line); + src_c4 = LD_SP(c_nxt4line); + src_c5 = LD_SP(c_nxt5line); + src_c6 = LD_SP(c_nxt6line); + src_c7 = LD_SP(c_nxt7line); + for (k = 0; k < bk; k++) + { src_a0 = LD_SP(a); src_b = LD_SP(b + 0); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 = src_a0 * src_b0; - res1 = src_a0 * src_b1; - res2 = src_a0 * src_b2; - res3 = src_a0 * src_b3; + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a0 * src_b1; + src_c2 -= src_a0 * src_b2; + src_c3 -= src_a0 * src_b3; src_b = LD_SP(b + 4); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res4 = src_a0 * src_b0; - 
res5 = src_a0 * src_b1; - res6 = src_a0 * src_b2; - res7 = src_a0 * src_b3; + src_c4 -= src_a0 * src_b0; + src_c5 -= src_a0 * src_b1; + src_c6 -= src_a0 * src_b2; + src_c7 -= src_a0 * src_b3; a += 4; b += 8; - - for (k = (bk - 1) / 2; k--;) - { - src_a0 = LD_SP(a); - - src_b = LD_SP(b + 0); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 += src_a0 * src_b0; - res1 += src_a0 * src_b1; - res2 += src_a0 * src_b2; - res3 += src_a0 * src_b3; - - src_b = LD_SP(b + 4); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res4 += src_a0 * src_b0; - res5 += src_a0 * src_b1; - res6 += src_a0 * src_b2; - res7 += src_a0 * src_b3; - - a += 4; - b += 8; - - src_a0 = LD_SP(a); - - src_b = LD_SP(b + 0); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 += src_a0 * src_b0; - res1 += src_a0 * src_b1; - res2 += src_a0 * src_b2; - res3 += src_a0 * src_b3; - - src_b = LD_SP(b + 4); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res4 += src_a0 * src_b0; - res5 += src_a0 * src_b1; - res6 += src_a0 * src_b2; - res7 += src_a0 * src_b3; - - a += 4; - b += 8; - } - - if ((bk - 1) & 1) - { - src_a0 = LD_SP(a); - - src_b = LD_SP(b + 0); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 += src_a0 * src_b0; - res1 += src_a0 * src_b1; - res2 += src_a0 * src_b2; - res3 += src_a0 * src_b3; - - src_b = LD_SP(b + 4); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res4 += src_a0 * src_b0; - res5 += src_a0 * src_b1; - res6 += src_a0 * src_b2; - res7 += src_a0 * src_b3; - - a += 4; - b += 8; - } - - src_c0 = LD_SP(c); - src_c1 = LD_SP(c_nxt1line); - src_c2 = LD_SP(c_nxt2line); - src_c3 = LD_SP(c_nxt3line); - src_c4 = LD_SP(c_nxt4line); - src_c5 = LD_SP(c_nxt5line); - src_c6 = LD_SP(c_nxt6line); - src_c7 = LD_SP(c_nxt7line); - - src_c0 -= res0; - src_c1 -= res1; - src_c2 -= res2; - src_c3 -= res3; - src_c4 -= res4; - src_c5 -= res5; - src_c6 -= res6; - src_c7 -= res7; - } - else - { - src_c0 = LD_SP(c); - src_c1 = LD_SP(c_nxt1line); - src_c2 = 
LD_SP(c_nxt2line); - src_c3 = LD_SP(c_nxt3line); - src_c4 = LD_SP(c_nxt4line); - src_c5 = LD_SP(c_nxt5line); - src_c6 = LD_SP(c_nxt6line); - src_c7 = LD_SP(c_nxt7line); } src_b = LD_SP(b + 0); @@ -832,8 +574,7 @@ static void ssolve_4x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b = LD_SP(b + 27); SPLATI_W4_SP(src_b, src_b27, src_b28, src_b29, src_b30); - src_b31 = __msa_cast_to_vector_float(*(b + 31)); - src_b31 = (v4f32) __msa_splati_w((v4i32) src_b31, 0); + COPY_FLOAT_TO_VECTOR(*(b + 31), src_b31); src_b = LD_SP(b + 36); SPLATI_W4_SP(src_b, src_b36, src_b37, src_b38, src_b39); @@ -843,12 +584,9 @@ static void ssolve_4x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b46 = (v4f32) __msa_splati_w((v4i32) src_b45, 1); src_b45 = (v4f32) __msa_splati_w((v4i32) src_b45, 0); - src_b54 = __msa_cast_to_vector_float(*(b + 54)); - src_b54 = (v4f32) __msa_splati_w((v4i32) src_b54, 0); - src_b55 = __msa_cast_to_vector_float(*(b + 55)); - src_b55 = (v4f32) __msa_splati_w((v4i32) src_b55, 0); - src_b63 = __msa_cast_to_vector_float(*(b + 63)); - src_b63 = (v4f32) __msa_splati_w((v4i32) src_b63, 0); + COPY_FLOAT_TO_VECTOR(*(b + 54), src_b54); + COPY_FLOAT_TO_VECTOR(*(b + 55), src_b55); + COPY_FLOAT_TO_VECTOR(*(b + 63), src_b63); src_c0 *= src_b0; src_c1 -= src_c0 * src_b1; @@ -909,87 +647,58 @@ static void ssolve_4x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_4x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; v4f32 src_c0, src_c1, src_c2, src_c3, src_b0, src_b1, src_b2, src_b3; - v4f32 src_b5, src_b6, src_b7, src_b10, src_b11, src_b15, src_b; + v4f32 src_b5, src_b6, src_b7, src_b10, src_b11, src_b15, src_b, src_a0; FLOAT *c_nxt1line = c + ldc; FLOAT *c_nxt2line = c + 2 * ldc; FLOAT *c_nxt3line = c + 3 * ldc; - if (bk) - { - BLASLONG k; - v4f32 src_a0, res0, res1, res2, res3; + src_c0 = LD_SP(c); + src_c1 = LD_SP(c_nxt1line); + src_c2 = LD_SP(c_nxt2line); + src_c3 = 
LD_SP(c_nxt3line); + for (k = 0; k < (bk >> 1); k++) + { src_a0 = LD_SP(a); src_b = LD_SP(b + 0); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 = src_a0 * src_b0; - res1 = src_a0 * src_b1; - res2 = src_a0 * src_b2; - res3 = src_a0 * src_b3; + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a0 * src_b1; + src_c2 -= src_a0 * src_b2; + src_c3 -= src_a0 * src_b3; a += 4; b += 4; - for (k = ((bk - 1) >> 1); k--;) - { - src_a0 = LD_SP(a); - - src_b = LD_SP(b + 0); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 += src_a0 * src_b0; - res1 += src_a0 * src_b1; - res2 += src_a0 * src_b2; - res3 += src_a0 * src_b3; - - a += 4; - b += 4; - - src_a0 = LD_SP(a); - - src_b = LD_SP(b + 0); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 += src_a0 * src_b0; - res1 += src_a0 * src_b1; - res2 += src_a0 * src_b2; - res3 += src_a0 * src_b3; - - a += 4; - b += 4; - } + src_a0 = LD_SP(a); - if ((bk - 1) & 1) - { - src_a0 = LD_SP(a); + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a0 * src_b1; + src_c2 -= src_a0 * src_b2; + src_c3 -= src_a0 * src_b3; - src_b = LD_SP(b + 0); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 += src_a0 * src_b0; - res1 += src_a0 * src_b1; - res2 += src_a0 * src_b2; - res3 += src_a0 * src_b3; + a += 4; + b += 4; + } - a += 4; - b += 4; - } + if (bk & 1) + { + src_a0 = LD_SP(a); - src_c0 = LD_SP(c); - src_c1 = LD_SP(c_nxt1line); - src_c2 = LD_SP(c_nxt2line); - src_c3 = LD_SP(c_nxt3line); + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a0 * src_b1; + src_c2 -= src_a0 * src_b2; + src_c3 -= src_a0 * src_b3; - src_c0 -= res0; - src_c1 -= res1; - src_c2 -= res2; - src_c3 -= res3; - } - else - { - src_c0 = LD_SP(c); - src_c1 = LD_SP(c_nxt1line); - src_c2 = LD_SP(c_nxt2line); - src_c3 = LD_SP(c_nxt3line); + a += 4; + b += 4; } src_b = LD_SP(b + 0); @@ -998,12 +707,9 @@ 
static void ssolve_4x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b7 = (v4f32) __msa_splati_w((v4i32) src_b5, 2); src_b6 = (v4f32) __msa_splati_w((v4i32) src_b5, 1); src_b5 = (v4f32) __msa_splati_w((v4i32) src_b5, 0); - src_b10 = __msa_cast_to_vector_float(*(b + 10)); - src_b10 = (v4f32) __msa_splati_w((v4i32) src_b10, 0); - src_b11 = __msa_cast_to_vector_float(*(b + 11)); - src_b11 = (v4f32) __msa_splati_w((v4i32) src_b11, 0); - src_b15 = __msa_cast_to_vector_float(*(b + 15)); - src_b15 = (v4f32) __msa_splati_w((v4i32) src_b15, 0); + COPY_FLOAT_TO_VECTOR(*(b + 10), src_b10); + COPY_FLOAT_TO_VECTOR(*(b + 11), src_b11); + COPY_FLOAT_TO_VECTOR(*(b + 15), src_b15); src_c0 *= src_b0; src_c1 -= src_c0 * src_b1; @@ -1029,34 +735,71 @@ static void ssolve_4x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_4x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { - v4f32 src_c0, src_c1, src_b0, src_b1, src_b3; + BLASLONG k; + v4f32 src_a, src_c0, src_c1, src_b0, src_b1, src_b3; FLOAT *c_nxt1line = c + ldc; - if (bk) + src_c0 = LD_SP(c); + src_c1 = LD_SP(c_nxt1line); + + for (k = 0; k < (bk >> 2); k++) { - BLASLONG k; - v4f32 src_a, res0, res1; + src_a = LD_SP(a); + src_b0 = LD_SP(b); + src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + src_c0 -= src_a * src_b0; + src_c1 -= src_a * src_b1; + + a += 4; + b += 2; + + src_a = LD_SP(a); + src_b0 = LD_SP(b); + src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + src_c0 -= src_a * src_b0; + src_c1 -= src_a * src_b1; + + a += 4; + b += 2; src_a = LD_SP(a); src_b0 = LD_SP(b); src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); - res0 = src_a * src_b0; - res1 = src_a * src_b1; + src_c0 -= src_a * src_b0; + src_c1 -= src_a * src_b1; a += 4; b += 2; - for (k = ((bk - 1) >> 1); k--;) + src_a = LD_SP(a); + src_b0 
= LD_SP(b); + src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + src_c0 -= src_a * src_b0; + src_c1 -= src_a * src_b1; + + a += 4; + b += 2; + } + + if (bk & 3) + { + if (bk & 2) { src_a = LD_SP(a); src_b0 = LD_SP(b); src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); - res0 += src_a * src_b0; - res1 += src_a * src_b1; + src_c0 -= src_a * src_b0; + src_c1 -= src_a * src_b1; a += 4; b += 2; @@ -1066,45 +809,31 @@ static void ssolve_4x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); - res0 += src_a * src_b0; - res1 += src_a * src_b1; + src_c0 -= src_a * src_b0; + src_c1 -= src_a * src_b1; a += 4; b += 2; } - if ((bk - 1) & 1) + if (bk & 1) { src_a = LD_SP(a); src_b0 = LD_SP(b); src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); - res0 += src_a * src_b0; - res1 += src_a * src_b1; + src_c0 -= src_a * src_b0; + src_c1 -= src_a * src_b1; a += 4; b += 2; } - - src_c0 = LD_SP(c); - src_c1 = LD_SP(c_nxt1line); - - src_c0 -= res0; - src_c1 -= res1; - } - else - { - src_c0 = LD_SP(c); - src_c1 = LD_SP(c_nxt1line); } - src_b0 = __msa_cast_to_vector_float(*(b + 0)); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); - src_b1 = __msa_cast_to_vector_float(*(b + 1)); - src_b1 = (v4f32) __msa_splati_w((v4i32) src_b1, 0); - src_b3 = __msa_cast_to_vector_float(*(b + 3)); - src_b3 = (v4f32) __msa_splati_w((v4i32) src_b3, 0); + COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); + COPY_FLOAT_TO_VECTOR(*(b + 1), src_b1); + COPY_FLOAT_TO_VECTOR(*(b + 3), src_b3); src_c0 *= src_b0; src_c1 -= src_c0 * src_b1; @@ -1118,6 +847,7 @@ static void ssolve_4x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_4x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; FLOAT b0, c0, 
c1, c2, c3; c0 = *(c + 0); @@ -1125,31 +855,12 @@ static void ssolve_4x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO c2 = *(c + 2); c3 = *(c + 3); - if (bk) + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT t0, t1, t2, t3; - - t0 = a[0] * b[0]; - t1 = a[1] * b[0]; - t2 = a[2] * b[0]; - t3 = a[3] * b[0]; - - for (k = (bk - 1); k--;) - { - a += 4; - b += 1; - - t0 += a[0] * b[0]; - t1 += a[1] * b[0]; - t2 += a[2] * b[0]; - t3 += a[3] * b[0]; - } - - c0 -= t0; - c1 -= t1; - c2 -= t2; - c3 -= t3; + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; + c2 -= a[2] * b[0]; + c3 -= a[3] * b[0]; a += 4; b += 1; @@ -1175,6 +886,7 @@ static void ssolve_4x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_2x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; FLOAT b0, b1, b2, b3, b4, b5, b6, b7, b9, b10, b11, b12, b13, b14, b15; FLOAT b18, b19, b20, b21, b22, b23, b27, b28, b29, b30, b31; FLOAT b36, b37, b38, b39, b45, b46, b47, b54, b55, b63; @@ -1199,67 +911,24 @@ static void ssolve_2x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO c0_nxt7 = *(c + 0 + 7 * ldc); c1_nxt7 = *(c + 1 + 7 * ldc); - if (bk) + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT res[16]; - - res[0] = a[0] * b[0]; - res[1] = a[1] * b[0]; - res[2] = a[0] * b[1]; - res[3] = a[1] * b[1]; - res[4] = a[0] * b[2]; - res[5] = a[1] * b[2]; - res[6] = a[0] * b[3]; - res[7] = a[1] * b[3]; - res[8] = a[0] * b[4]; - res[9] = a[1] * b[4]; - res[10] = a[0] * b[5]; - res[11] = a[1] * b[5]; - res[12] = a[0] * b[6]; - res[13] = a[1] * b[6]; - res[14] = a[0] * b[7]; - res[15] = a[1] * b[7]; - - for (k = (bk - 1); k--;) - { - a += 2; - b += 8; - - res[0] += a[0] * b[0]; - res[1] += a[1] * b[0]; - res[2] += a[0] * b[1]; - res[3] += a[1] * b[1]; - res[4] += a[0] * b[2]; - res[5] += a[1] * b[2]; - res[6] += a[0] * b[3]; - res[7] += a[1] * b[3]; - res[8] += a[0] * b[4]; - res[9] += a[1] * b[4]; - res[10] += a[0] * b[5]; - res[11] += a[1] * b[5]; - res[12] 
+= a[0] * b[6]; - res[13] += a[1] * b[6]; - res[14] += a[0] * b[7]; - res[15] += a[1] * b[7]; - } - - c0 -= res[0]; - c1 -= res[1]; - c0_nxt1 -= res[2]; - c1_nxt1 -= res[3]; - c0_nxt2 -= res[4]; - c1_nxt2 -= res[5]; - c0_nxt3 -= res[6]; - c1_nxt3 -= res[7]; - c0_nxt4 -= res[8]; - c1_nxt4 -= res[9]; - c0_nxt5 -= res[10]; - c1_nxt5 -= res[11]; - c0_nxt6 -= res[12]; - c1_nxt6 -= res[13]; - c0_nxt7 -= res[14]; - c1_nxt7 -= res[15]; + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; + c0_nxt1 -= a[0] * b[1]; + c1_nxt1 -= a[1] * b[1]; + c0_nxt2 -= a[0] * b[2]; + c1_nxt2 -= a[1] * b[2]; + c0_nxt3 -= a[0] * b[3]; + c1_nxt3 -= a[1] * b[3]; + c0_nxt4 -= a[0] * b[4]; + c1_nxt4 -= a[1] * b[4]; + c0_nxt5 -= a[0] * b[5]; + c1_nxt5 -= a[1] * b[5]; + c0_nxt6 -= a[0] * b[6]; + c1_nxt6 -= a[1] * b[6]; + c0_nxt7 -= a[0] * b[7]; + c1_nxt7 -= a[1] * b[7]; a += 2; b += 8; @@ -1447,6 +1116,7 @@ static void ssolve_2x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_2x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; FLOAT b0, b1, b2, b3, b5, b6, b7, b10, b11, b15, c0, c1; FLOAT c0_nxt1, c0_nxt2, c0_nxt3, c1_nxt1, c1_nxt2, c1_nxt3; @@ -1459,43 +1129,16 @@ static void ssolve_2x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO c0_nxt3 = *(c + 0 + 3 * ldc); c1_nxt3 = *(c + 1 + 3 * ldc); - if (bk) + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT res[8]; - - res[0] = a[0] * b[0]; - res[1] = a[1] * b[0]; - res[2] = a[0] * b[1]; - res[3] = a[1] * b[1]; - res[4] = a[0] * b[2]; - res[5] = a[1] * b[2]; - res[6] = a[0] * b[3]; - res[7] = a[1] * b[3]; - - for (k = (bk - 1); k--;) - { - a += 2; - b += 4; - - res[0] += a[0] * b[0]; - res[1] += a[1] * b[0]; - res[2] += a[0] * b[1]; - res[3] += a[1] * b[1]; - res[4] += a[0] * b[2]; - res[5] += a[1] * b[2]; - res[6] += a[0] * b[3]; - res[7] += a[1] * b[3]; - } - - c0 -= res[0]; - c1 -= res[1]; - c0_nxt1 -= res[2]; - c1_nxt1 -= res[3]; - c0_nxt2 -= res[4]; - c1_nxt2 -= res[5]; - c0_nxt3 -= 
res[6]; - c1_nxt3 -= res[7]; + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; + c0_nxt1 -= a[0] * b[1]; + c1_nxt1 -= a[1] * b[1]; + c0_nxt2 -= a[0] * b[2]; + c1_nxt2 -= a[1] * b[2]; + c0_nxt3 -= a[0] * b[3]; + c1_nxt3 -= a[1] * b[3]; a += 2; b += 4; @@ -1557,6 +1200,7 @@ static void ssolve_2x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_2x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; FLOAT b0, b1, b3, c0, c0_nxt, c1, c1_nxt; c0 = *(c + 0); @@ -1564,31 +1208,12 @@ static void ssolve_2x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO c0_nxt = *(c + 0 + ldc); c1_nxt = *(c + 1 + ldc); - if (bk) + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT res[4]; - - res[0] = a[0] * b[0]; - res[1] = a[1] * b[0]; - res[2] = a[0] * b[1]; - res[3] = a[1] * b[1]; - - for (k = (bk - 1); k--;) - { - a += 2; - b += 2; - - res[0] += a[0] * b[0]; - res[1] += a[1] * b[0]; - res[2] += a[0] * b[1]; - res[3] += a[1] * b[1]; - } - - c0 -= res[0]; - c1 -= res[1]; - c0_nxt -= res[2]; - c1_nxt -= res[3]; + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; + c0_nxt -= a[0] * b[1]; + c1_nxt -= a[1] * b[1]; a += 2; b += 2; @@ -1620,30 +1245,16 @@ static void ssolve_2x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_2x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; FLOAT b0, c0, c1; c0 = *(c + 0); c1 = *(c + 1); - if (bk) + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT res0, res1; - - res0 = a[0] * b[0]; - res1 = a[1] * b[0]; - - for (k = (bk - 1); k--;) - { - a += 2; - b += 1; - - res0 += a[0] * b[0]; - res1 += a[1] * b[0]; - } - - c0 -= res0; - c1 -= res1; + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; a += 2; b += 1; @@ -1663,6 +1274,7 @@ static void ssolve_2x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_1x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; FLOAT b0, b1, b2, b3, b4, b5, b6, b7, b9, b10, b11, b12, 
b13, b14, b15; FLOAT b18, b19, b20, b21, b22, b23, b27, b28, b29, b30, b31, b36, b37, b38; FLOAT b39, b45, b46, b47, b54, b55, b63, c0, c1, c2, c3, c4, c5, c6, c7; @@ -1676,43 +1288,16 @@ static void ssolve_1x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO c6 = *(c + 6 * ldc); c7 = *(c + 7 * ldc); - if (bk) + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT t0, t1, t2, t3, t4, t5, t6, t7; - - t0 = a[0] * b[0]; - t1 = a[0] * b[1]; - t2 = a[0] * b[2]; - t3 = a[0] * b[3]; - t4 = a[0] * b[4]; - t5 = a[0] * b[5]; - t6 = a[0] * b[6]; - t7 = a[0] * b[7]; - - for (k = (bk - 1); k--;) - { - a += 1; - b += 8; - - t0 += a[0] * b[0]; - t1 += a[0] * b[1]; - t2 += a[0] * b[2]; - t3 += a[0] * b[3]; - t4 += a[0] * b[4]; - t5 += a[0] * b[5]; - t6 += a[0] * b[6]; - t7 += a[0] * b[7]; - } - - c0 -= t0; - c1 -= t1; - c2 -= t2; - c3 -= t3; - c4 -= t4; - c5 -= t5; - c6 -= t6; - c7 -= t7; + c0 -= a[0] * b[0]; + c1 -= a[0] * b[1]; + c2 -= a[0] * b[2]; + c3 -= a[0] * b[3]; + c4 -= a[0] * b[4]; + c5 -= a[0] * b[5]; + c6 -= a[0] * b[6]; + c7 -= a[0] * b[7]; a += 1; b += 8; @@ -1820,6 +1405,7 @@ static void ssolve_1x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_1x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; FLOAT b0, b1, b2, b3, b5, b6, b7, b10, b11, b15, c0, c1, c2, c3; c0 = *(c + 0); @@ -1827,31 +1413,12 @@ static void ssolve_1x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO c2 = *(c + 2 * ldc); c3 = *(c + 3 * ldc); - if (bk) + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT res0, res1, res2, res3; - - res0 = a[0] * b[0]; - res1 = a[0] * b[1]; - res2 = a[0] * b[2]; - res3 = a[0] * b[3]; - - for (k = (bk - 1); k--;) - { - a += 1; - b += 4; - - res0 += a[0] * b[0]; - res1 += a[0] * b[1]; - res2 += a[0] * b[2]; - res3 += a[0] * b[3]; - } - - c0 -= res0; - c1 -= res1; - c2 -= res2; - c3 -= res3; + c0 -= a[0] * b[0]; + c1 -= a[0] * b[1]; + c2 -= a[0] * b[2]; + c3 -= a[0] * b[3]; a += 1; b += 4; @@ 
-1895,30 +1462,16 @@ static void ssolve_1x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_1x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; FLOAT b0, b1, b3, c0, c1; - c0 = *(c + 0); + c0 = *c; c1 = *(c + ldc); - if (bk) + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT res0, res1; - - res0 = a[0] * b[0]; - res1 = a[0] * b[1]; - - for (k = (bk - 1); k--;) - { - a += 1; - b += 2; - - res0 += a[0] * b[0]; - res1 += a[0] * b[1]; - } - - c0 -= res0; - c1 -= res1; + c0 -= a[0] * b[0]; + c1 -= a[0] * b[1]; a += 1; b += 2; @@ -1942,22 +1495,11 @@ static void ssolve_1x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_1x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { - if (bk) - { - BLASLONG k; - FLOAT res; - - res = a[0] * b[0]; - - for (k = (bk - 1); k--;) - { - a++; - b++; - - res += a[0] * b[0]; - } + BLASLONG k; - *c -= res; + for (k = 0; k < bk; k++) + { + *c -= a[0] * b[0]; a++; b++; diff --git a/kernel/mips/strsm_kernel_RT_8x8_msa.c b/kernel/mips/strsm_kernel_RT_8x8_msa.c index 25a8a0b6e..eefd3a665 100644 --- a/kernel/mips/strsm_kernel_RT_8x8_msa.c +++ b/kernel/mips/strsm_kernel_RT_8x8_msa.c @@ -30,6 +30,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static void ssolve_8x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; + FLOAT *aa = a, *bb = b; + v4f32 src_a0, src_a1, src_b1, src_b2, src_b3; v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; v4f32 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15; v4f32 src_b, src_b0, src_b8, src_b9, src_b16, src_b17, src_b18, src_b24; @@ -45,104 +48,43 @@ static void ssolve_8x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO FLOAT *c_nxt6line = c + 6 * ldc; FLOAT *c_nxt7line = c + 7 * ldc; - if (bk > 0) - { - BLASLONG k; - FLOAT *aa = a, *bb = b; - v4f32 src_a0, src_a1, src_b1, src_b2, src_b3; - v4f32 res0, res1, res2, res3, res4, res5, res6, res7; - v4f32 res8, res9, res10, res11, res12, res13, res14, res15; + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + LD_SP2(c_nxt2line, 4, src_c4, src_c5); + LD_SP2(c_nxt3line, 4, src_c6, src_c7); + LD_SP2(c_nxt4line, 4, src_c8, src_c9); + LD_SP2(c_nxt5line, 4, src_c10, src_c11); + LD_SP2(c_nxt6line, 4, src_c12, src_c13); + LD_SP2(c_nxt7line, 4, src_c14, src_c15); + for (k = 0; k < bk; k++) + { LD_SP2(aa, 4, src_a0, src_a1); src_b = LD_SP(bb + 0); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 = src_a0 * src_b0; - res1 = src_a1 * src_b0; - res2 = src_a0 * src_b1; - res3 = src_a1 * src_b1; - res4 = src_a0 * src_b2; - res5 = src_a1 * src_b2; - res6 = src_a0 * src_b3; - res7 = src_a1 * src_b3; + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + src_c2 -= src_a0 * src_b1; + src_c3 -= src_a1 * src_b1; + src_c4 -= src_a0 * src_b2; + src_c5 -= src_a1 * src_b2; + src_c6 -= src_a0 * src_b3; + src_c7 -= src_a1 * src_b3; src_b = LD_SP(bb + 4); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res8 = src_a0 * src_b0; - res9 = src_a1 * src_b0; - res10 = src_a0 * src_b1; - res11 = src_a1 * src_b1; - res12 = src_a0 * src_b2; - res13 = src_a1 * src_b2; - res14 = src_a0 * src_b3; - res15 = src_a1 * src_b3; - - for (k = (bk - 
1); k--;) - { - aa += 8; - bb += 8; - - LD_SP2(aa, 4, src_a0, src_a1); - - src_b = LD_SP(bb + 0); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 += src_a0 * src_b0; - res1 += src_a1 * src_b0; - res2 += src_a0 * src_b1; - res3 += src_a1 * src_b1; - res4 += src_a0 * src_b2; - res5 += src_a1 * src_b2; - res6 += src_a0 * src_b3; - res7 += src_a1 * src_b3; - - src_b = LD_SP(bb + 4); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res8 += src_a0 * src_b0; - res9 += src_a1 * src_b0; - res10 += src_a0 * src_b1; - res11 += src_a1 * src_b1; - res12 += src_a0 * src_b2; - res13 += src_a1 * src_b2; - res14 += src_a0 * src_b3; - res15 += src_a1 * src_b3; - } - - LD_SP2(c, 4, src_c0, src_c1); - LD_SP2(c_nxt1line, 4, src_c2, src_c3); - LD_SP2(c_nxt2line, 4, src_c4, src_c5); - LD_SP2(c_nxt3line, 4, src_c6, src_c7); - LD_SP2(c_nxt4line, 4, src_c8, src_c9); - LD_SP2(c_nxt5line, 4, src_c10, src_c11); - LD_SP2(c_nxt6line, 4, src_c12, src_c13); - LD_SP2(c_nxt7line, 4, src_c14, src_c15); - - src_c0 -= res0; - src_c1 -= res1; - src_c2 -= res2; - src_c3 -= res3; - src_c4 -= res4; - src_c5 -= res5; - src_c6 -= res6; - src_c7 -= res7; - src_c8 -= res8; - src_c9 -= res9; - src_c10 -= res10; - src_c11 -= res11; - src_c12 -= res12; - src_c13 -= res13; - src_c14 -= res14; - src_c15 -= res15; - } - else - { - LD_SP2(c, 4, src_c0, src_c1); - LD_SP2(c_nxt1line, 4, src_c2, src_c3); - LD_SP2(c_nxt2line, 4, src_c4, src_c5); - LD_SP2(c_nxt3line, 4, src_c6, src_c7); - LD_SP2(c_nxt4line, 4, src_c8, src_c9); - LD_SP2(c_nxt5line, 4, src_c10, src_c11); - LD_SP2(c_nxt6line, 4, src_c12, src_c13); - LD_SP2(c_nxt7line, 4, src_c14, src_c15); + src_c8 -= src_a0 * src_b0; + src_c9 -= src_a1 * src_b0; + src_c10 -= src_a0 * src_b1; + src_c11 -= src_a1 * src_b1; + src_c12 -= src_a0 * src_b2; + src_c13 -= src_a1 * src_b2; + src_c14 -= src_a0 * src_b3; + src_c15 -= src_a1 * src_b3; + + aa += 8; + bb += 8; } b -= 64; @@ -216,8 +158,7 @@ static void ssolve_8x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, 
BLASLONG ldc, BLASLO src_b = LD_SP(b + 32); SPLATI_W4_SP(src_b, src_b32, src_b33, src_b34, src_b35); - src_b36 = __msa_cast_to_vector_float(*(b + 36)); - src_b36 = (v4f32) __msa_splati_w((v4i32) src_b36, 0); + COPY_FLOAT_TO_VECTOR(*(b + 36), src_b36); src_c8 *= src_b36; src_c9 *= src_b36; @@ -262,12 +203,9 @@ static void ssolve_8x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO ST_SP2(src_c4, src_c5, c_nxt2line, 4); ST_SP2(src_c6, src_c7, c_nxt3line, 4); - src_b9 = __msa_cast_to_vector_float(*(b + 9)); - src_b9 = (v4f32) __msa_splati_w((v4i32) src_b9, 0); - src_b8 = __msa_cast_to_vector_float(*(b + 8)); - src_b8 = (v4f32) __msa_splati_w((v4i32) src_b8, 0); - src_b0 = __msa_cast_to_vector_float(*(b + 0)); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + COPY_FLOAT_TO_VECTOR(*(b + 9), src_b9); + COPY_FLOAT_TO_VECTOR(*(b + 8), src_b8); + COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); src_c2 *= src_b9; src_c3 *= src_b9; @@ -285,6 +223,9 @@ static void ssolve_8x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_8x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; + FLOAT *aa = a, *bb = b; + v4f32 src_a0, src_a1, src_b1, src_b2, src_b3; v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; v4f32 src_b, src_b0, src_b4, src_b5, src_b8, src_b9, src_b10, src_b12; v4f32 src_b13, src_b14, src_b15; @@ -292,103 +233,60 @@ static void ssolve_8x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO FLOAT *c_nxt2line = c + 2 * ldc; FLOAT *c_nxt3line = c + 3 * ldc; - if (bk > 0) - { - BLASLONG k; - FLOAT *aa = a, *bb = b; - v4f32 src_a0, src_a1, src_b1, src_b2, src_b3; - v4f32 res0, res1, res2, res3, res4, res5, res6, res7; + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + LD_SP2(c_nxt2line, 4, src_c4, src_c5); + LD_SP2(c_nxt3line, 4, src_c6, src_c7); + for (k = 0; k < (bk >> 1); k++) + { LD_SP2(aa, 4, src_a0, src_a1); src_b = LD_SP(bb + 0); SPLATI_W4_SP(src_b, src_b0, 
src_b1, src_b2, src_b3); - res0 = src_a0 * src_b0; - res1 = src_a1 * src_b0; - res2 = src_a0 * src_b1; - res3 = src_a1 * src_b1; - res4 = src_a0 * src_b2; - res5 = src_a1 * src_b2; - res6 = src_a0 * src_b3; - res7 = src_a1 * src_b3; - - for (k = (bk - 1) / 2; k--;) - { - aa += 8; - bb += 4; - - LD_SP2(aa, 4, src_a0, src_a1); - - src_b = LD_SP(bb + 0); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + src_c2 -= src_a0 * src_b1; + src_c3 -= src_a1 * src_b1; + src_c4 -= src_a0 * src_b2; + src_c5 -= src_a1 * src_b2; + src_c6 -= src_a0 * src_b3; + src_c7 -= src_a1 * src_b3; + + aa += 8; + bb += 4; - res0 += src_a0 * src_b0; - res1 += src_a1 * src_b0; - res2 += src_a0 * src_b1; - res3 += src_a1 * src_b1; - res4 += src_a0 * src_b2; - res5 += src_a1 * src_b2; - res6 += src_a0 * src_b3; - res7 += src_a1 * src_b3; - - aa += 8; - bb += 4; - - LD_SP2(aa, 4, src_a0, src_a1); - - src_b = LD_SP(bb + 0); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - - res0 += src_a0 * src_b0; - res1 += src_a1 * src_b0; - res2 += src_a0 * src_b1; - res3 += src_a1 * src_b1; - res4 += src_a0 * src_b2; - res5 += src_a1 * src_b2; - res6 += src_a0 * src_b3; - res7 += src_a1 * src_b3; - } - - if ((bk - 1) & 1) - { - aa += 8; - bb += 4; - - LD_SP2(aa, 4, src_a0, src_a1); - - src_b = LD_SP(bb + 0); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 += src_a0 * src_b0; - res1 += src_a1 * src_b0; - res2 += src_a0 * src_b1; - res3 += src_a1 * src_b1; - res4 += src_a0 * src_b2; - res5 += src_a1 * src_b2; - res6 += src_a0 * src_b3; - res7 += src_a1 * src_b3; - - } + LD_SP2(aa, 4, src_a0, src_a1); - LD_SP2(c, 4, src_c0, src_c1); - LD_SP2(c_nxt1line, 4, src_c2, src_c3); - LD_SP2(c_nxt2line, 4, src_c4, src_c5); - LD_SP2(c_nxt3line, 4, src_c6, src_c7); - - src_c0 -= res0; - src_c1 -= res1; - src_c2 -= res2; - src_c3 -= res3; - src_c4 -= res4; - src_c5 -= res5; - src_c6 -= res6; - src_c7 -= res7; + src_b = LD_SP(bb + 0); + 
SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + src_c2 -= src_a0 * src_b1; + src_c3 -= src_a1 * src_b1; + src_c4 -= src_a0 * src_b2; + src_c5 -= src_a1 * src_b2; + src_c6 -= src_a0 * src_b3; + src_c7 -= src_a1 * src_b3; + + aa += 8; + bb += 4; } - else + + if (bk & 1) { - LD_SP2(c, 4, src_c0, src_c1); - LD_SP2(c_nxt1line, 4, src_c2, src_c3); - LD_SP2(c_nxt2line, 4, src_c4, src_c5); - LD_SP2(c_nxt3line, 4, src_c6, src_c7); + LD_SP2(aa, 4, src_a0, src_a1); + + src_b = LD_SP(bb + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + src_c2 -= src_a0 * src_b1; + src_c3 -= src_a1 * src_b1; + src_c4 -= src_a0 * src_b2; + src_c5 -= src_a1 * src_b2; + src_c6 -= src_a0 * src_b3; + src_c7 -= src_a1 * src_b3; } a -= 32; @@ -400,12 +298,9 @@ static void ssolve_8x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b10 = (v4f32) __msa_splati_w((v4i32) src_b8, 2); src_b9 = (v4f32) __msa_splati_w((v4i32) src_b8, 1); src_b8 = (v4f32) __msa_splati_w((v4i32) src_b8, 0); - src_b5 = __msa_cast_to_vector_float(*(b + 5)); - src_b5 = (v4f32) __msa_splati_w((v4i32) src_b5, 0); - src_b4 = __msa_cast_to_vector_float(*(b + 4)); - src_b4 = (v4f32) __msa_splati_w((v4i32) src_b4, 0); - src_b0 = __msa_cast_to_vector_float(*(b + 0)); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + COPY_FLOAT_TO_VECTOR(*(b + 5), src_b5); + COPY_FLOAT_TO_VECTOR(*(b + 4), src_b4); + COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); src_c7 *= src_b15; src_c6 *= src_b15; @@ -442,101 +337,63 @@ static void ssolve_8x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; + FLOAT *aa = a, *bb = b; + v4f32 src_a0, src_a1, src_b1; v4f32 src_c0, src_c1, src_c2, src_c3, src_b0, src_b2, src_b3; FLOAT *c_nxt1line = c + ldc; - if (bk > 0) - { - BLASLONG k; - FLOAT *aa = a, *bb = b; - 
v4f32 src_a0, src_a1, src_b1, res0, res1, res2, res3; + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + for (k = 0; k < (bk >> 1); k++) + { LD_SP2(aa, 4, src_a0, src_a1); - src_b0 = __msa_cast_to_vector_float(*bb); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); - src_b1 = __msa_cast_to_vector_float(*(bb + 1)); - src_b1 = (v4f32) __msa_splati_w((v4i32) src_b1, 0); + COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0); + COPY_FLOAT_TO_VECTOR(*(bb + 1), src_b1); - res0 = src_a0 * src_b0; - res1 = src_a1 * src_b0; - res2 = src_a0 * src_b1; - res3 = src_a1 * src_b1; + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + src_c2 -= src_a0 * src_b1; + src_c3 -= src_a1 * src_b1; - for (k = (bk - 1) >> 1; k--;) - { - aa += 8; - bb += 2; - - LD_SP2(aa, 4, src_a0, src_a1); - - src_b0 = __msa_cast_to_vector_float(*bb); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); - src_b1 = __msa_cast_to_vector_float(*(bb + 1)); - src_b1 = (v4f32) __msa_splati_w((v4i32) src_b1, 0); - - res0 += src_a0 * src_b0; - res1 += src_a1 * src_b0; - res2 += src_a0 * src_b1; - res3 += src_a1 * src_b1; - - aa += 8; - bb += 2; - - LD_SP2(aa, 4, src_a0, src_a1); - - src_b0 = __msa_cast_to_vector_float(*bb); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); - src_b1 = __msa_cast_to_vector_float(*(bb + 1)); - src_b1 = (v4f32) __msa_splati_w((v4i32) src_b1, 0); + aa += 8; + bb += 2; - res0 += src_a0 * src_b0; - res1 += src_a1 * src_b0; - res2 += src_a0 * src_b1; - res3 += src_a1 * src_b1; - } + LD_SP2(aa, 4, src_a0, src_a1); - if ((bk - 1) & 1) - { - aa += 8; - bb += 2; + COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0); + COPY_FLOAT_TO_VECTOR(*(bb + 1), src_b1); - LD_SP2(aa, 4, src_a0, src_a1); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + src_c2 -= src_a0 * src_b1; + src_c3 -= src_a1 * src_b1; - src_b0 = __msa_cast_to_vector_float(*bb); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); - src_b1 = __msa_cast_to_vector_float(*(bb + 1)); - src_b1 = (v4f32) 
__msa_splati_w((v4i32) src_b1, 0); + aa += 8; + bb += 2; + } - res0 += src_a0 * src_b0; - res1 += src_a1 * src_b0; - res2 += src_a0 * src_b1; - res3 += src_a1 * src_b1; - } + if (bk & 1) + { + LD_SP2(aa, 4, src_a0, src_a1); - LD_SP2(c, 4, src_c0, src_c1); - LD_SP2(c_nxt1line, 4, src_c2, src_c3); + COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0); + COPY_FLOAT_TO_VECTOR(*(bb + 1), src_b1); - src_c0 -= res0; - src_c1 -= res1; - src_c2 -= res2; - src_c3 -= res3; - } - else - { - LD_SP2(c, 4, src_c0, src_c1); - LD_SP2(c_nxt1line, 4, src_c2, src_c3); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + src_c2 -= src_a0 * src_b1; + src_c3 -= src_a1 * src_b1; } a -= 16; b -= 4; - src_b0 = __msa_cast_to_vector_float(*(b + 0)); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); - src_b2 = __msa_cast_to_vector_float(*(b + 2)); - src_b2 = (v4f32) __msa_splati_w((v4i32) src_b2, 0); - src_b3 = __msa_cast_to_vector_float(*(b + 3)); - src_b3 = (v4f32) __msa_splati_w((v4i32) src_b3, 0); + COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); + COPY_FLOAT_TO_VECTOR(*(b + 2), src_b2); + COPY_FLOAT_TO_VECTOR(*(b + 3), src_b3); src_c2 *= src_b3; src_c3 *= src_b3; @@ -552,126 +409,95 @@ static void ssolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { - v4f32 src_c0, src_c1, src_b0; + BLASLONG k; + FLOAT *aa = a, *bb = b; + v4f32 src_a0, src_a1, src_c0, src_c1, src_b0; + + LD_SP2(c, 4, src_c0, src_c1); - if (bk > 0) + for (k = 0; k < (bk >> 2); k++) { - BLASLONG k; - FLOAT *aa = a, *bb = b; - v4f32 src_a0, src_a1, res0, res1; + LD_SP2(aa, 4, src_a0, src_a1); + + COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0); + + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + + aa += 8; + bb += 1; LD_SP2(aa, 4, src_a0, src_a1); - src_b0 = __msa_cast_to_vector_float(*bb); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0); - res0 = src_a0 * src_b0; - res1 = src_a1 * src_b0; + 
src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; - for (k = (bk - 1) >> 2; k--;) - { - aa += 8; - bb += 1; + aa += 8; + bb += 1; - LD_SP2(aa, 4, src_a0, src_a1); + LD_SP2(aa, 4, src_a0, src_a1); - src_b0 = __msa_cast_to_vector_float(*bb); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0); - res0 += src_a0 * src_b0; - res1 += src_a1 * src_b0; + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; - aa += 8; - bb += 1; + aa += 8; + bb += 1; - LD_SP2(aa, 4, src_a0, src_a1); + LD_SP2(aa, 4, src_a0, src_a1); - src_b0 = __msa_cast_to_vector_float(*bb); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0); - res0 += src_a0 * src_b0; - res1 += src_a1 * src_b0; + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; - aa += 8; - bb += 1; + aa += 8; + bb += 1; + } + if (bk & 3) + { + if (bk & 2) + { LD_SP2(aa, 4, src_a0, src_a1); - src_b0 = __msa_cast_to_vector_float(*bb); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0); - res0 += src_a0 * src_b0; - res1 += src_a1 * src_b0; + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; aa += 8; bb += 1; LD_SP2(aa, 4, src_a0, src_a1); - src_b0 = __msa_cast_to_vector_float(*bb); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0); - res0 += src_a0 * src_b0; - res1 += src_a1 * src_b0; + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + + aa += 8; + bb += 1; } - if ((bk - 1) & 3) + if (bk & 1) { - if ((bk - 1) & 2) - { - aa += 8; - bb += 1; - - LD_SP2(aa, 4, src_a0, src_a1); - - src_b0 = __msa_cast_to_vector_float(*bb); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); - - res0 += src_a0 * src_b0; - res1 += src_a1 * src_b0; - - aa += 8; - bb += 1; - - LD_SP2(aa, 4, src_a0, src_a1); - - src_b0 = __msa_cast_to_vector_float(*bb); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); - - res0 += src_a0 * src_b0; - res1 += src_a1 * 
src_b0; - } - - if ((bk - 1) & 1) - { - aa += 8; - bb += 1; - - LD_SP2(aa, 4, src_a0, src_a1); + LD_SP2(aa, 4, src_a0, src_a1); - src_b0 = __msa_cast_to_vector_float(*bb); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0); - res0 += src_a0 * src_b0; - res1 += src_a1 * src_b0; - } + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; } - - LD_SP2(c, 4, src_c0, src_c1); - - src_c0 -= res0; - src_c1 -= res1; - } - else - { - LD_SP2(c, 4, src_c0, src_c1); } a -= 8; b -= 1; - src_b0 = __msa_cast_to_vector_float(*(b + 0)); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); src_c0 *= src_b0; src_c1 *= src_b0; @@ -682,6 +508,9 @@ static void ssolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) static void ssolve_4x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; + FLOAT *aa = a, *bb = b; + v4f32 src_a0, src_b1, src_b2, src_b3; v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; v4f32 src_b, src_b0, src_b8, src_b9, src_b16, src_b17, src_b18, src_b24; v4f32 src_b25, src_b26, src_b27, src_b32, src_b33, src_b34, src_b35; @@ -696,79 +525,35 @@ static void ssolve_4x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO FLOAT *c_nxt6line = c + 6 * ldc; FLOAT *c_nxt7line = c + 7 * ldc; - if (bk > 0) - { - BLASLONG k; - FLOAT *aa = a, *bb = b; - v4f32 src_a0, src_b1, src_b2, src_b3; - v4f32 res0, res1, res2, res3, res4, res5, res6, res7; + src_c0 = LD_SP(c); + src_c1 = LD_SP(c_nxt1line); + src_c2 = LD_SP(c_nxt2line); + src_c3 = LD_SP(c_nxt3line); + src_c4 = LD_SP(c_nxt4line); + src_c5 = LD_SP(c_nxt5line); + src_c6 = LD_SP(c_nxt6line); + src_c7 = LD_SP(c_nxt7line); + for (k = 0; k < bk; k++) + { src_a0 = LD_SP(aa); src_b = LD_SP(bb + 0); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 = src_a0 * src_b0; - res1 = src_a0 * src_b1; - res2 = src_a0 * src_b2; - res3 = src_a0 * src_b3; + src_c0 -= src_a0 * src_b0; + 
src_c1 -= src_a0 * src_b1; + src_c2 -= src_a0 * src_b2; + src_c3 -= src_a0 * src_b3; src_b = LD_SP(bb + 4); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res4 = src_a0 * src_b0; - res5 = src_a0 * src_b1; - res6 = src_a0 * src_b2; - res7 = src_a0 * src_b3; - - for (k = (bk - 1); k--;) - { - aa += 4; - bb += 8; - - src_a0 = LD_SP(aa); - - src_b = LD_SP(bb + 0); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 += src_a0 * src_b0; - res1 += src_a0 * src_b1; - res2 += src_a0 * src_b2; - res3 += src_a0 * src_b3; - - src_b = LD_SP(bb + 4); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res4 += src_a0 * src_b0; - res5 += src_a0 * src_b1; - res6 += src_a0 * src_b2; - res7 += src_a0 * src_b3; - } + src_c4 -= src_a0 * src_b0; + src_c5 -= src_a0 * src_b1; + src_c6 -= src_a0 * src_b2; + src_c7 -= src_a0 * src_b3; - src_c0 = LD_SP(c); - src_c1 = LD_SP(c_nxt1line); - src_c2 = LD_SP(c_nxt2line); - src_c3 = LD_SP(c_nxt3line); - src_c4 = LD_SP(c_nxt4line); - src_c5 = LD_SP(c_nxt5line); - src_c6 = LD_SP(c_nxt6line); - src_c7 = LD_SP(c_nxt7line); - - src_c0 -= res0; - src_c1 -= res1; - src_c2 -= res2; - src_c3 -= res3; - src_c4 -= res4; - src_c5 -= res5; - src_c6 -= res6; - src_c7 -= res7; - } - else - { - src_c0 = LD_SP(c); - src_c1 = LD_SP(c_nxt1line); - src_c2 = LD_SP(c_nxt2line); - src_c3 = LD_SP(c_nxt3line); - src_c4 = LD_SP(c_nxt4line); - src_c5 = LD_SP(c_nxt5line); - src_c6 = LD_SP(c_nxt6line); - src_c7 = LD_SP(c_nxt7line); + aa += 4; + bb += 8; } a -= 32; @@ -794,8 +579,7 @@ static void ssolve_4x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b = LD_SP(b + 32); SPLATI_W4_SP(src_b, src_b32, src_b33, src_b34, src_b35); - src_b36 = __msa_cast_to_vector_float(*(b + 36)); - src_b36 = (v4f32) __msa_splati_w((v4i32) src_b36, 0); + COPY_FLOAT_TO_VECTOR(*(b + 36), src_b36); src_b = LD_SP(b + 24); SPLATI_W4_SP(src_b, src_b24, src_b25, src_b26, src_b27); @@ -805,12 +589,9 @@ static void ssolve_4x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, 
BLASLONG ldc, BLASLO src_b17 = (v4f32) __msa_splati_w((v4i32) src_b16, 1); src_b16 = (v4f32) __msa_splati_w((v4i32) src_b16, 0); - src_b9 = __msa_cast_to_vector_float(*(b + 9)); - src_b9 = (v4f32) __msa_splati_w((v4i32) src_b9, 0); - src_b8 = __msa_cast_to_vector_float(*(b + 8)); - src_b8 = (v4f32) __msa_splati_w((v4i32) src_b8, 0); - src_b0 = __msa_cast_to_vector_float(*(b + 0)); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + COPY_FLOAT_TO_VECTOR(*(b + 9), src_b9); + COPY_FLOAT_TO_VECTOR(*(b + 8), src_b8); + COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); src_c7 *= src_b63; src_c6 -= src_c7 * src_b62; @@ -871,89 +652,60 @@ static void ssolve_4x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_4x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; + FLOAT *aa = a, *bb = b; v4f32 src_c0, src_c1, src_c2, src_c3, src_b; v4f32 src_b0, src_b4, src_b5, src_b8, src_b9, src_b10, src_b12, src_b13; - v4f32 src_b14, src_b15; + v4f32 src_b14, src_b15, src_a, src_b1, src_b2, src_b3; FLOAT *c_nxt1line = c + ldc; FLOAT *c_nxt2line = c + 2 * ldc; FLOAT *c_nxt3line = c + 3 * ldc; - if (bk > 0) - { - BLASLONG k; - FLOAT *aa = a, *bb = b; - v4f32 src_a, src_b1, src_b2, src_b3, res0, res1, res2, res3; + src_c0 = LD_SP(c); + src_c1 = LD_SP(c_nxt1line); + src_c2 = LD_SP(c_nxt2line); + src_c3 = LD_SP(c_nxt3line); + for (k = 0; k < (bk >> 1); k++) + { src_a = LD_SP(aa); src_b = LD_SP(bb); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 = src_a * src_b0; - res1 = src_a * src_b1; - res2 = src_a * src_b2; - res3 = src_a * src_b3; - - for (k = ((bk - 1) >> 1); k--;) - { - aa += 4; - bb += 4; - - src_a = LD_SP(aa); - - src_b = LD_SP(bb); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - - res0 += src_a * src_b0; - res1 += src_a * src_b1; - res2 += src_a * src_b2; - res3 += src_a * src_b3; - - aa += 4; - bb += 4; - src_a = LD_SP(aa); + src_c0 -= src_a * src_b0; + src_c1 -= src_a * src_b1; + src_c2 -= src_a * src_b2; + 
src_c3 -= src_a * src_b3; - src_b = LD_SP(bb); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + aa += 4; + bb += 4; - res0 += src_a * src_b0; - res1 += src_a * src_b1; - res2 += src_a * src_b2; - res3 += src_a * src_b3; - } + src_a = LD_SP(aa); - if ((bk - 1) & 1) - { - aa += 4; - bb += 4; + src_b = LD_SP(bb); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - src_a = LD_SP(aa); + src_c0 -= src_a * src_b0; + src_c1 -= src_a * src_b1; + src_c2 -= src_a * src_b2; + src_c3 -= src_a * src_b3; - src_b = LD_SP(bb); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + aa += 4; + bb += 4; + } - res0 += src_a * src_b0; - res1 += src_a * src_b1; - res2 += src_a * src_b2; - res3 += src_a * src_b3; - } + if (bk & 1) + { + src_a = LD_SP(aa); - src_c0 = LD_SP(c); - src_c1 = LD_SP(c_nxt1line); - src_c2 = LD_SP(c_nxt2line); - src_c3 = LD_SP(c_nxt3line); + src_b = LD_SP(bb); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - src_c0 -= res0; - src_c1 -= res1; - src_c2 -= res2; - src_c3 -= res3; - } - else - { - src_c0 = LD_SP(c); - src_c1 = LD_SP(c_nxt1line); - src_c2 = LD_SP(c_nxt2line); - src_c3 = LD_SP(c_nxt3line); + src_c0 -= src_a * src_b0; + src_c1 -= src_a * src_b1; + src_c2 -= src_a * src_b2; + src_c3 -= src_a * src_b3; } a -= 16; @@ -965,12 +717,9 @@ static void ssolve_4x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b10 = (v4f32) __msa_splati_w((v4i32) src_b8, 2); src_b9 = (v4f32) __msa_splati_w((v4i32) src_b8, 1); src_b8 = (v4f32) __msa_splati_w((v4i32) src_b8, 0); - src_b5 = __msa_cast_to_vector_float(*(b + 5)); - src_b5 = (v4f32) __msa_splati_w((v4i32) src_b5, 0); - src_b4 = __msa_cast_to_vector_float(*(b + 4)); - src_b4 = (v4f32) __msa_splati_w((v4i32) src_b4, 0); - src_b0 = __msa_cast_to_vector_float(*(b + 0)); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + COPY_FLOAT_TO_VECTOR(*(b + 5), src_b5); + COPY_FLOAT_TO_VECTOR(*(b + 4), src_b4); + COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); src_c3 *= src_b15; src_c2 -= src_c3 * 
src_b14; @@ -996,35 +745,72 @@ static void ssolve_4x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_4x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { - v4f32 src_c0, src_c1, src_b0, src_b2, src_b3; + BLASLONG k; + FLOAT *aa = a, *bb = b; + v4f32 src_a, src_b1, src_c0, src_c1, src_b0, src_b2, src_b3; FLOAT *c_nxt1line = c + ldc; - if (bk > 0) + src_c0 = LD_SP(c); + src_c1 = LD_SP(c_nxt1line); + + for (k = 0; k < (bk >> 2); k++) { - BLASLONG k; - FLOAT *aa = a, *bb = b; - v4f32 src_a, src_b1, res0, res1; + src_a = LD_SP(aa); + src_b0 = LD_SP(bb); + src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + src_c0 -= src_a * src_b0; + src_c1 -= src_a * src_b1; + + aa += 4; + bb += 2; src_a = LD_SP(aa); src_b0 = LD_SP(bb); src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); - res0 = src_a * src_b0; - res1 = src_a * src_b1; + src_c0 -= src_a * src_b0; + src_c1 -= src_a * src_b1; - for (k = ((bk - 1) >> 1); k--;) - { - aa += 4; - bb += 2; + aa += 4; + bb += 2; + + src_a = LD_SP(aa); + src_b0 = LD_SP(bb); + src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + src_c0 -= src_a * src_b0; + src_c1 -= src_a * src_b1; + + aa += 4; + bb += 2; + + src_a = LD_SP(aa); + src_b0 = LD_SP(bb); + src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + src_c0 -= src_a * src_b0; + src_c1 -= src_a * src_b1; + aa += 4; + bb += 2; + } + + if (bk & 3) + { + if (bk & 2) + { src_a = LD_SP(aa); src_b0 = LD_SP(bb); src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); - res0 += src_a * src_b0; - res1 += src_a * src_b1; + src_c0 -= src_a * src_b0; + src_c1 -= src_a * src_b1; aa += 4; bb += 2; @@ -1034,45 +820,31 @@ static void ssolve_4x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, 
BLASLO src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); - res0 += src_a * src_b0; - res1 += src_a * src_b1; - } + src_c0 -= src_a * src_b0; + src_c1 -= src_a * src_b1; - if ((bk - 1) & 1) - { aa += 4; bb += 2; + } + if (bk & 1) + { src_a = LD_SP(aa); src_b0 = LD_SP(bb); src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); - res0 += src_a * src_b0; - res1 += src_a * src_b1; + src_c0 -= src_a * src_b0; + src_c1 -= src_a * src_b1; } - - src_c0 = LD_SP(c); - src_c1 = LD_SP(c_nxt1line); - - src_c0 -= res0; - src_c1 -= res1; - } - else - { - src_c0 = LD_SP(c); - src_c1 = LD_SP(c_nxt1line); } a -= 8; b -= 4; - src_b3 = __msa_cast_to_vector_float(*(b + 3)); - src_b3 = (v4f32) __msa_splati_w((v4i32) src_b3, 0); - src_b2 = __msa_cast_to_vector_float(*(b + 2)); - src_b2 = (v4f32) __msa_splati_w((v4i32) src_b2, 0); - src_b0 = __msa_cast_to_vector_float(*(b + 0)); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + COPY_FLOAT_TO_VECTOR(*(b + 3), src_b3); + COPY_FLOAT_TO_VECTOR(*(b + 2), src_b2); + COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); src_c1 *= src_b3; src_c0 -= src_c1 * src_b2; @@ -1086,6 +858,8 @@ static void ssolve_4x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_4x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { + BLASLONG k; + FLOAT *aa = a, *bb = b; FLOAT b0, c0, c1, c2, c3; c0 = *(c + 0); @@ -1093,32 +867,15 @@ static void ssolve_4x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) c2 = *(c + 2); c3 = *(c + 3); - if (bk > 0) + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT *aa = a, *bb = b; - FLOAT t0, t1, t2, t3; + c0 -= aa[0] * bb[0]; + c1 -= aa[1] * bb[0]; + c2 -= aa[2] * bb[0]; + c3 -= aa[3] * bb[0]; - t0 = aa[0] * bb[0]; - t1 = aa[1] * bb[0]; - t2 = aa[2] * bb[0]; - t3 = aa[3] * bb[0]; - - for (k = (bk - 1); k--;) - { - aa += 4; - bb += 1; - - t0 += aa[0] * bb[0]; - t1 += aa[1] * bb[0]; - t2 += aa[2] * bb[0]; - t3 += 
aa[3] * bb[0]; - } - - c0 -= t0; - c1 -= t1; - c2 -= t2; - c3 -= t3; + aa += 4; + bb += 1; } a -= 4; @@ -1144,6 +901,8 @@ static void ssolve_4x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) static void ssolve_2x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; + FLOAT *aa = a, *bb = b; FLOAT b0, b8, b9, b16, b17, b18, b24, b25, b26, b27, b32, b33, b34, b35; FLOAT b36, b40, b41, b42, b43, b44, b45, b48, b49, b50, b51, b52, b53, b54; FLOAT b56, b57, b58, b59, b60, b61, b62, b63, c0_nxt7, c1_nxt7; @@ -1167,68 +926,27 @@ static void ssolve_2x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO c0_nxt7 = *(c + 0 + 7 * ldc); c1_nxt7 = *(c + 1 + 7 * ldc); - if (bk > 0) + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT *aa = a, *bb = b; - FLOAT res[16]; - - res[0] = aa[0] * bb[0]; - res[1] = aa[1] * bb[0]; - res[2] = aa[0] * bb[1]; - res[3] = aa[1] * bb[1]; - res[4] = aa[0] * bb[2]; - res[5] = aa[1] * bb[2]; - res[6] = aa[0] * bb[3]; - res[7] = aa[1] * bb[3]; - res[8] = aa[0] * bb[4]; - res[9] = aa[1] * bb[4]; - res[10] = aa[0] * bb[5]; - res[11] = aa[1] * bb[5]; - res[12] = aa[0] * bb[6]; - res[13] = aa[1] * bb[6]; - res[14] = aa[0] * bb[7]; - res[15] = aa[1] * bb[7]; - - for (k = (bk - 1); k--;) - { - aa += 2; - bb += 8; - - res[0] += aa[0] * bb[0]; - res[1] += aa[1] * bb[0]; - res[2] += aa[0] * bb[1]; - res[3] += aa[1] * bb[1]; - res[4] += aa[0] * bb[2]; - res[5] += aa[1] * bb[2]; - res[6] += aa[0] * bb[3]; - res[7] += aa[1] * bb[3]; - res[8] += aa[0] * bb[4]; - res[9] += aa[1] * bb[4]; - res[10] += aa[0] * bb[5]; - res[11] += aa[1] * bb[5]; - res[12] += aa[0] * bb[6]; - res[13] += aa[1] * bb[6]; - res[14] += aa[0] * bb[7]; - res[15] += aa[1] * bb[7]; - } - - c0 -= res[0]; - c1 -= res[1]; - c0_nxt1 -= res[2]; - c1_nxt1 -= res[3]; - c0_nxt2 -= res[4]; - c1_nxt2 -= res[5]; - c0_nxt3 -= res[6]; - c1_nxt3 -= res[7]; - c0_nxt4 -= res[8]; - c1_nxt4 -= res[9]; - c0_nxt5 -= res[10]; - c1_nxt5 -= res[11]; - c0_nxt6 -= res[12]; - 
c1_nxt6 -= res[13]; - c0_nxt7 -= res[14]; - c1_nxt7 -= res[15]; + c0 -= aa[0] * bb[0]; + c1 -= aa[1] * bb[0]; + c0_nxt1 -= aa[0] * bb[1]; + c1_nxt1 -= aa[1] * bb[1]; + c0_nxt2 -= aa[0] * bb[2]; + c1_nxt2 -= aa[1] * bb[2]; + c0_nxt3 -= aa[0] * bb[3]; + c1_nxt3 -= aa[1] * bb[3]; + c0_nxt4 -= aa[0] * bb[4]; + c1_nxt4 -= aa[1] * bb[4]; + c0_nxt5 -= aa[0] * bb[5]; + c1_nxt5 -= aa[1] * bb[5]; + c0_nxt6 -= aa[0] * bb[6]; + c1_nxt6 -= aa[1] * bb[6]; + c0_nxt7 -= aa[0] * bb[7]; + c1_nxt7 -= aa[1] * bb[7]; + + aa += 2; + bb += 8; } a -= 16; @@ -1416,6 +1134,8 @@ static void ssolve_2x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_2x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; + FLOAT *aa = a, *bb = b; FLOAT b0, b4, b5, b8, b9, b10, b12, b13, b14, b15; FLOAT c0, c1, c0_nxt1, c1_nxt1, c0_nxt2, c1_nxt2, c0_nxt3, c1_nxt3; @@ -1428,44 +1148,19 @@ static void ssolve_2x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO c0_nxt3 = *(c + 0 + 3 * ldc); c1_nxt3 = *(c + 1 + 3 * ldc); - if (bk > 0) + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT *aa = a, *bb = b; - FLOAT res[8]; - - res[0] = aa[0] * bb[0]; - res[1] = aa[1] * bb[0]; - res[2] = aa[0] * bb[1]; - res[3] = aa[1] * bb[1]; - res[4] = aa[0] * bb[2]; - res[5] = aa[1] * bb[2]; - res[6] = aa[0] * bb[3]; - res[7] = aa[1] * bb[3]; - - for (k = (bk - 1); k--;) - { - aa += 2; - bb += 4; - - res[0] += aa[0] * bb[0]; - res[1] += aa[1] * bb[0]; - res[2] += aa[0] * bb[1]; - res[3] += aa[1] * bb[1]; - res[4] += aa[0] * bb[2]; - res[5] += aa[1] * bb[2]; - res[6] += aa[0] * bb[3]; - res[7] += aa[1] * bb[3]; - } - - c0 -= res[0]; - c1 -= res[1]; - c0_nxt1 -= res[2]; - c1_nxt1 -= res[3]; - c0_nxt2 -= res[4]; - c1_nxt2 -= res[5]; - c0_nxt3 -= res[6]; - c1_nxt3 -= res[7]; + c0 -= aa[0] * bb[0]; + c1 -= aa[1] * bb[0]; + c0_nxt1 -= aa[0] * bb[1]; + c1_nxt1 -= aa[1] * bb[1]; + c0_nxt2 -= aa[0] * bb[2]; + c1_nxt2 -= aa[1] * bb[2]; + c0_nxt3 -= aa[0] * bb[3]; + c1_nxt3 -= 
aa[1] * bb[3]; + + aa += 2; + bb += 4; } a -= 8; @@ -1515,6 +1210,8 @@ static void ssolve_2x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_2x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; + FLOAT *aa = a, *bb = b; FLOAT b0, b2, b3, c0, c1, c0_nxt, c1_nxt; c0 = *(c + 0); @@ -1522,32 +1219,15 @@ static void ssolve_2x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO c0_nxt = *(c + 0 + ldc); c1_nxt = *(c + 1 + ldc); - if (bk > 0) + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT *aa = a, *bb = b; - FLOAT res[4]; - - res[0] = aa[0] * bb[0]; - res[1] = aa[1] * bb[0]; - res[2] = aa[0] * bb[1]; - res[3] = aa[1] * bb[1]; - - for (k = (bk - 1); k--;) - { - aa += 2; - bb += 2; + c0 -= aa[0] * bb[0]; + c1 -= aa[1] * bb[0]; + c0_nxt -= aa[0] * bb[1]; + c1_nxt -= aa[1] * bb[1]; - res[0] += aa[0] * bb[0]; - res[1] += aa[1] * bb[0]; - res[2] += aa[0] * bb[1]; - res[3] += aa[1] * bb[1]; - } - - c0 -= res[0]; - c1 -= res[1]; - c0_nxt -= res[2]; - c1_nxt -= res[3]; + aa += 2; + bb += 2; } a -= 4; @@ -1579,31 +1259,20 @@ static void ssolve_2x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_2x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { + BLASLONG k; + FLOAT *aa = a, *bb = b; FLOAT b0, c0, c1; c0 = *(c + 0); c1 = *(c + 1); - if (bk > 0) + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT *aa = a, *bb = b; - FLOAT res0, res1; - - res0 = aa[0] * bb[0]; - res1 = aa[1] * bb[0]; - - for (k = (bk - 1); k--;) - { - aa += 2; - bb += 1; - - res0 += aa[0] * bb[0]; - res1 += aa[1] * bb[0]; - } + c0 -= aa[0] * bb[0]; + c1 -= aa[1] * bb[0]; - c0 -= res0; - c1 -= res1; + aa += 2; + bb += 1; } a -= 2; @@ -1623,6 +1292,8 @@ static void ssolve_2x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) static void ssolve_1x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; + FLOAT *aa = a, *bb = b; FLOAT b0, b8, b9, b16, b17, b18, b24, b25, b26, b27, b32, b33, 
b34, b35; FLOAT b36, b40, b41, b42, b43, b44, b45, b48, b49, b50, b51, b52, b53, b54; FLOAT b56, b57, b58, b59, b60, b61, b62, b63; @@ -1637,44 +1308,19 @@ static void ssolve_1x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO c6 = *(c + 6 * ldc); c7 = *(c + 7 * ldc); - if (bk > 0) + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT *aa = a, *bb = b; - FLOAT t0, t1, t2, t3, t4, t5, t6, t7; - - t0 = aa[0] * bb[0]; - t1 = aa[0] * bb[1]; - t2 = aa[0] * bb[2]; - t3 = aa[0] * bb[3]; - t4 = aa[0] * bb[4]; - t5 = aa[0] * bb[5]; - t6 = aa[0] * bb[6]; - t7 = aa[0] * bb[7]; - - for (k = (bk - 1); k--;) - { - aa += 1; - bb += 8; - - t0 += aa[0] * bb[0]; - t1 += aa[0] * bb[1]; - t2 += aa[0] * bb[2]; - t3 += aa[0] * bb[3]; - t4 += aa[0] * bb[4]; - t5 += aa[0] * bb[5]; - t6 += aa[0] * bb[6]; - t7 += aa[0] * bb[7]; - } - - c0 -= t0; - c1 -= t1; - c2 -= t2; - c3 -= t3; - c4 -= t4; - c5 -= t5; - c6 -= t6; - c7 -= t7; + c0 -= aa[0] * bb[0]; + c1 -= aa[0] * bb[1]; + c2 -= aa[0] * bb[2]; + c3 -= aa[0] * bb[3]; + c4 -= aa[0] * bb[4]; + c5 -= aa[0] * bb[5]; + c6 -= aa[0] * bb[6]; + c7 -= aa[0] * bb[7]; + + aa += 1; + bb += 8; } a -= 8; @@ -1782,6 +1428,8 @@ static void ssolve_1x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_1x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; + FLOAT *aa = a, *bb = b; FLOAT b0, b4, b5, b8, b9, b10, b12, b13, b14, b15; FLOAT c0, c1, c2, c3; @@ -1790,32 +1438,15 @@ static void ssolve_1x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO c2 = *(c + 2 * ldc); c3 = *(c + 3 * ldc); - if (bk > 0) + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT *aa = a, *bb = b; - FLOAT res0, res1, res2, res3; - - res0 = aa[0] * bb[0]; - res1 = aa[0] * bb[1]; - res2 = aa[0] * bb[2]; - res3 = aa[0] * bb[3]; + c0 -= aa[0] * bb[0]; + c1 -= aa[0] * bb[1]; + c2 -= aa[0] * bb[2]; + c3 -= aa[0] * bb[3]; - for (k = (bk - 1); k--;) - { - aa += 1; - bb += 4; - - res0 += aa[0] * bb[0]; - res1 += aa[0] * 
bb[1]; - res2 += aa[0] * bb[2]; - res3 += aa[0] * bb[3]; - } - - c0 -= res0; - c1 -= res1; - c2 -= res2; - c3 -= res3; + aa += 1; + bb += 4; } a -= 4; @@ -1850,31 +1481,20 @@ static void ssolve_1x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_1x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; + FLOAT *aa = a, *bb = b; FLOAT b0, b2, b3, c0, c1; c0 = *(c + 0); c1 = *(c + ldc); - if (bk > 0) + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT *aa = a, *bb = b; - FLOAT res0, res1; - - res0 = aa[0] * bb[0]; - res1 = aa[0] * bb[1]; - - for (k = (bk - 1); k--;) - { - aa += 1; - bb += 2; - - res0 += aa[0] * bb[0]; - res1 += aa[0] * bb[1]; - } + c0 -= aa[0] * bb[0]; + c1 -= aa[0] * bb[1]; - c0 -= res0; - c1 -= res1; + aa += 1; + bb += 2; } a -= 2; @@ -1898,23 +1518,11 @@ static void ssolve_1x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_1x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { - if (bk > 0) - { - BLASLONG k; - FLOAT *aa = a, *bb = b; - FLOAT res; - - res = *aa * *bb; - - for (k = (bk - 1); k--;) - { - aa++; - bb++; + BLASLONG k; - res += *aa * *bb; - } - - *c -= res; + for (k = 0; k < bk; k++) + { + *c -= a[k] * b[k]; } *c *= *(a - 1); From 011431b9d790704358bdb4f257c1cce07423544c Mon Sep 17 00:00:00 2001 From: Kaustubh Raste Date: Tue, 31 May 2016 10:17:23 +0530 Subject: [PATCH 46/70] STRSM optimized for MSA Signed-off-by: Kaustubh Raste --- kernel/mips/strsm_kernel_LT_8x8_msa.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/mips/strsm_kernel_LT_8x8_msa.c b/kernel/mips/strsm_kernel_LT_8x8_msa.c index fbce812e6..c087fdae5 100644 --- a/kernel/mips/strsm_kernel_LT_8x8_msa.c +++ b/kernel/mips/strsm_kernel_LT_8x8_msa.c @@ -601,14 +601,14 @@ static void ssolve_8x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) for (k = 0; k < bk; k++) { - a0 += a[0] * b[0]; - a1 += a[1] * b[0]; - a2 += a[2] * b[0]; - a3 += a[3] * b[0]; - a4 
+= a[4] * b[0]; - a5 += a[5] * b[0]; - a6 += a[6] * b[0]; - a7 += a[7] * b[0]; + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; + c2 -= a[2] * b[0]; + c3 -= a[3] * b[0]; + c4 -= a[4] * b[0]; + c5 -= a[5] * b[0]; + c6 -= a[6] * b[0]; + c7 -= a[7] * b[0]; a += 8; b += 1; From 41000c8443452b7cad0fa85898d44735aabd3cff Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Tue, 31 May 2016 12:53:07 +0200 Subject: [PATCH 47/70] added directory for optimized lapack fortan codes and added dlaqr5.f --- interface/lapack/fortran/dlaqr5.f | 1083 +++++++++++++++++++++++++++++ 1 file changed, 1083 insertions(+) create mode 100644 interface/lapack/fortran/dlaqr5.f diff --git a/interface/lapack/fortran/dlaqr5.f b/interface/lapack/fortran/dlaqr5.f new file mode 100644 index 000000000..a8fad0a79 --- /dev/null +++ b/interface/lapack/fortran/dlaqr5.f @@ -0,0 +1,1083 @@ +! Copyright (c) 2013-2016, The OpenBLAS Project +! All rights reserved. +! Redistribution and use in source and binary forms, with or without +! modification, are permitted provided that the following conditions are +! met: +! 1. Redistributions of source code must retain the above copyright +! notice, this list of conditions and the following disclaimer. +! 2. Redistributions in binary form must reproduce the above copyright +! notice, this list of conditions and the following disclaimer in +! the documentation and/or other materials provided with the +! distribution. +! 3. Neither the name of the OpenBLAS project nor the names of +! its contributors may be used to endorse or promote products +! derived from this software without specific prior written permission. +! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +! AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +! IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +! ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +! 
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +! DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +! SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +! OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +! USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*> \brief \b DLAQR5 performs a single small-bulge multi-shift QR sweep. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download DLAQR5 + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE DLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, +* SR, SI, H, LDH, ILOZ, IHIZ, Z, LDZ, V, LDV, U, +* LDU, NV, WV, LDWV, NH, WH, LDWH ) +* +* .. Scalar Arguments .. +* INTEGER IHIZ, ILOZ, KACC22, KBOT, KTOP, LDH, LDU, LDV, +* $ LDWH, LDWV, LDZ, N, NH, NSHFTS, NV +* LOGICAL WANTT, WANTZ +* .. +* .. Array Arguments .. +* DOUBLE PRECISION H( LDH, * ), SI( * ), SR( * ), U( LDU, * ), +* $ V( LDV, * ), WH( LDWH, * ), WV( LDWV, * ), +* $ Z( LDZ, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DLAQR5, called by DLAQR0, performs a +*> single small-bulge multi-shift QR sweep. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] WANTT +*> \verbatim +*> WANTT is logical scalar +*> WANTT = .true. if the quasi-triangular Schur factor +*> is being computed. WANTT is set to .false. otherwise. +*> \endverbatim +*> +*> \param[in] WANTZ +*> \verbatim +*> WANTZ is logical scalar +*> WANTZ = .true. if the orthogonal Schur factor is being +*> computed. WANTZ is set to .false. otherwise. +*> \endverbatim +*> +*> \param[in] KACC22 +*> \verbatim +*> KACC22 is integer with value 0, 1, or 2. 
+*> Specifies the computation mode of far-from-diagonal +*> orthogonal updates. +*> = 0: DLAQR5 does not accumulate reflections and does not +*> use matrix-matrix multiply to update far-from-diagonal +*> matrix entries. +*> = 1: DLAQR5 accumulates reflections and uses matrix-matrix +*> multiply to update the far-from-diagonal matrix entries. +*> = 2: DLAQR5 accumulates reflections, uses matrix-matrix +*> multiply to update the far-from-diagonal matrix entries, +*> and takes advantage of 2-by-2 block structure during +*> matrix multiplies. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is integer scalar +*> N is the order of the Hessenberg matrix H upon which this +*> subroutine operates. +*> \endverbatim +*> +*> \param[in] KTOP +*> \verbatim +*> KTOP is integer scalar +*> \endverbatim +*> +*> \param[in] KBOT +*> \verbatim +*> KBOT is integer scalar +*> These are the first and last rows and columns of an +*> isolated diagonal block upon which the QR sweep is to be +*> applied. It is assumed without a check that +*> either KTOP = 1 or H(KTOP,KTOP-1) = 0 +*> and +*> either KBOT = N or H(KBOT+1,KBOT) = 0. +*> \endverbatim +*> +*> \param[in] NSHFTS +*> \verbatim +*> NSHFTS is integer scalar +*> NSHFTS gives the number of simultaneous shifts. NSHFTS +*> must be positive and even. +*> \endverbatim +*> +*> \param[in,out] SR +*> \verbatim +*> SR is DOUBLE PRECISION array of size (NSHFTS) +*> \endverbatim +*> +*> \param[in,out] SI +*> \verbatim +*> SI is DOUBLE PRECISION array of size (NSHFTS) +*> SR contains the real parts and SI contains the imaginary +*> parts of the NSHFTS shifts of origin that define the +*> multi-shift QR sweep. On output SR and SI may be +*> reordered. +*> \endverbatim +*> +*> \param[in,out] H +*> \verbatim +*> H is DOUBLE PRECISION array of size (LDH,N) +*> On input H contains a Hessenberg matrix. 
On output a +*> multi-shift QR sweep with shifts SR(J)+i*SI(J) is applied +*> to the isolated diagonal block in rows and columns KTOP +*> through KBOT. +*> \endverbatim +*> +*> \param[in] LDH +*> \verbatim +*> LDH is integer scalar +*> LDH is the leading dimension of H just as declared in the +*> calling procedure. LDH.GE.MAX(1,N). +*> \endverbatim +*> +*> \param[in] ILOZ +*> \verbatim +*> ILOZ is INTEGER +*> \endverbatim +*> +*> \param[in] IHIZ +*> \verbatim +*> IHIZ is INTEGER +*> Specify the rows of Z to which transformations must be +*> applied if WANTZ is .TRUE.. 1 .LE. ILOZ .LE. IHIZ .LE. N +*> \endverbatim +*> +*> \param[in,out] Z +*> \verbatim +*> Z is DOUBLE PRECISION array of size (LDZ,IHI) +*> If WANTZ = .TRUE., then the QR Sweep orthogonal +*> similarity transformation is accumulated into +*> Z(ILOZ:IHIZ,ILO:IHI) from the right. +*> If WANTZ = .FALSE., then Z is unreferenced. +*> \endverbatim +*> +*> \param[in] LDZ +*> \verbatim +*> LDZ is integer scalar +*> LDZ is the leading dimension of Z just as declared in +*> the calling procedure. LDZ.GE.N. +*> \endverbatim +*> +*> \param[out] V +*> \verbatim +*> V is DOUBLE PRECISION array of size (LDV,NSHFTS/2) +*> \endverbatim +*> +*> \param[in] LDV +*> \verbatim +*> LDV is integer scalar +*> LDV is the leading dimension of V as declared in the +*> calling procedure. LDV.GE.3. +*> \endverbatim +*> +*> \param[out] U +*> \verbatim +*> U is DOUBLE PRECISION array of size +*> (LDU,3*NSHFTS-3) +*> \endverbatim +*> +*> \param[in] LDU +*> \verbatim +*> LDU is integer scalar +*> LDU is the leading dimension of U just as declared in the +*> calling subroutine. LDU.GE.3*NSHFTS-3. +*> \endverbatim +*> +*> \param[in] NH +*> \verbatim +*> NH is integer scalar +*> NH is the number of columns in array WH available for +*> workspace. NH.GE.1.
+*> \endverbatim +*> +*> \param[out] WH +*> \verbatim +*> WH is DOUBLE PRECISION array of size (LDWH,NH) +*> \endverbatim +*> +*> \param[in] LDWH +*> \verbatim +*> LDWH is integer scalar +*> Leading dimension of WH just as declared in the +*> calling procedure. LDWH.GE.3*NSHFTS-3. +*> \endverbatim +*> +*> \param[in] NV +*> \verbatim +*> NV is integer scalar +*> NV is the number of rows in WV available for workspace. +*> NV.GE.1. +*> \endverbatim +*> +*> \param[out] WV +*> \verbatim +*> WV is DOUBLE PRECISION array of size +*> (LDWV,3*NSHFTS-3) +*> \endverbatim +*> +*> \param[in] LDWV +*> \verbatim +*> LDWV is integer scalar +*> LDWV is the leading dimension of WV as declared in the +*> calling subroutine. LDWV.GE.NV. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date September 2012 +* +*> \ingroup doubleOTHERauxiliary +* +*> \par Contributors: +* ================== +*> +*> Karen Braman and Ralph Byers, Department of Mathematics, +*> University of Kansas, USA +* +*> \par References: +* ================ +*> +*> K. Braman, R. Byers and R. Mathias, The Multi-Shift QR +*> Algorithm Part I: Maintaining Well Focused Shifts, and Level 3 +*> Performance, SIAM Journal of Matrix Analysis, volume 23, pages +*> 929--947, 2002. +*> +* ===================================================================== + SUBROUTINE DLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, + $ SR, SI, H, LDH, ILOZ, IHIZ, Z, LDZ, V, LDV, U, + $ LDU, NV, WV, LDWV, NH, WH, LDWH ) +* +* -- LAPACK auxiliary routine (version 3.4.2) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* September 2012 +* +* .. Scalar Arguments .. + INTEGER IHIZ, ILOZ, KACC22, KBOT, KTOP, LDH, LDU, LDV, + $ LDWH, LDWV, LDZ, N, NH, NSHFTS, NV + LOGICAL WANTT, WANTZ +* .. +* ..
Array Arguments .. + DOUBLE PRECISION H( LDH, * ), SI( * ), SR( * ), U( LDU, * ), + $ V( LDV, * ), WH( LDWH, * ), WV( LDWV, * ), + $ Z( LDZ, * ) +* .. +* +* ================================================================ +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0d0, ONE = 1.0d0 ) +* .. +* .. Local Scalars .. + DOUBLE PRECISION ALPHA, BETA, H11, H12, H21, H22, REFSUM, + $ SAFMAX, SAFMIN, SCL, SMLNUM, SWAP, TST1, TST2, + $ ULP + INTEGER I, I2, I4, INCOL, J, J2, J4, JBOT, JCOL, JLEN, + $ JROW, JTOP, K, K1, KDU, KMS, KNZ, KRCOL, KZS, + $ M, M22, MBOT, MEND, MSTART, MTOP, NBMPS, NDCOL, + $ NS, NU + LOGICAL ACCUM, BLK22, BMP22 +* .. +* .. External Functions .. + DOUBLE PRECISION DLAMCH + EXTERNAL DLAMCH +* .. +* .. Intrinsic Functions .. +* + INTRINSIC ABS, DBLE, MAX, MIN, MOD +* .. +* .. Local Arrays .. + DOUBLE PRECISION VT( 3 ) +* temp scalars + DOUBLE PRECISION tempv1, tempv2, tempv3, + $ tempv4, tempv5, tempv6, + $ temph1, temph2, temph3, + $ temph4, temph5, temph6, + $ tempz1, tempz2, tempz3, + $ tempz4, tempz5, tempz6, + $ tempu1, tempu2, tempu3, + $ tempu4, tempu5, tempu6, + $ REFSU1 + INTEGER JBEGIN, M1 +* .. +* .. External Subroutines .. + EXTERNAL DGEMM, DLABAD, DLACPY, DLAQR1, DLARFG, DLASET, + $ DTRMM +* .. +* .. Executable Statements .. +* +* ==== If there are no shifts, then there is nothing to do. ==== +* + IF( NSHFTS.LT.2 ) + $ RETURN +* +* ==== If the active block is empty or 1-by-1, then there +* . is nothing to do. ==== +* + IF( KTOP.GE.KBOT ) + $ RETURN +* +* ==== Shuffle shifts into pairs of real shifts and pairs +* . of complex conjugate shifts assuming complex +* . conjugate shifts are already adjacent to one +* . another. 
==== +* + DO 10 I = 1, NSHFTS - 2, 2 + IF( SI( I ).NE.-SI( I+1 ) ) THEN +* + SWAP = SR( I ) + SR( I ) = SR( I+1 ) + SR( I+1 ) = SR( I+2 ) + SR( I+2 ) = SWAP +* + SWAP = SI( I ) + SI( I ) = SI( I+1 ) + SI( I+1 ) = SI( I+2 ) + SI( I+2 ) = SWAP + END IF + 10 CONTINUE +* +* ==== NSHFTS is supposed to be even, but if it is odd, +* . then simply reduce it by one. The shuffle above +* . ensures that the dropped shift is real and that +* . the remaining shifts are paired. ==== +* + NS = NSHFTS - MOD( NSHFTS, 2 ) +* +* ==== Machine constants for deflation ==== +* + SAFMIN = DLAMCH( 'SAFE MINIMUM' ) + SAFMAX = ONE / SAFMIN + CALL DLABAD( SAFMIN, SAFMAX ) + ULP = DLAMCH( 'PRECISION' ) + SMLNUM = SAFMIN*( DBLE( N ) / ULP ) +* +* ==== Use accumulated reflections to update far-from-diagonal +* . entries ? ==== +* + ACCUM = ( KACC22.EQ.1 ) .OR. ( KACC22.EQ.2 ) +* +* ==== If so, exploit the 2-by-2 block structure? ==== +* + BLK22 = ( NS.GT.2 ) .AND. ( KACC22.EQ.2 ) +* +* ==== clear trash ==== +* + IF( KTOP+2.LE.KBOT ) + $ H( KTOP+2, KTOP ) = ZERO +* +* ==== NBMPS = number of 2-shift bulges in the chain ==== +* + NBMPS = NS / 2 +* +* ==== KDU = width of slab ==== +* + KDU = 6*NBMPS - 3 +* +* ==== Create and chase chains of NBMPS bulges ==== +* + DO 220 INCOL = 3*( 1-NBMPS ) + KTOP - 1, KBOT - 2, 3*NBMPS - 2 + NDCOL = INCOL + KDU + IF( ACCUM ) + $ CALL DLASET( 'ALL', KDU, KDU, ZERO, ONE, U, LDU ) +* +* ==== Near-the-diagonal bulge chase. The following loop +* . performs the near-the-diagonal part of a small bulge +* . multi-shift QR sweep. Each 6*NBMPS-2 column diagonal +* . chunk extends from column INCOL to column NDCOL +* . (including both column INCOL and column NDCOL). The +* . following loop chases a 3*NBMPS column long chain of +* . NBMPS bulges 3*NBMPS-2 columns to the right. (INCOL +* . may be less than KTOP and NDCOL may be greater than +* . KBOT indicating phantom columns from which to chase +* . bulges before they are actually introduced or to which +* .
to chase bulges beyond column KBOT.) ==== +* + DO 150 KRCOL = INCOL, MIN( INCOL+3*NBMPS-3, KBOT-2 ) +* +* ==== Bulges number MTOP to MBOT are active double implicit +* . shift bulges. There may or may not also be small +* . 2-by-2 bulge, if there is room. The inactive bulges +* . (if any) must wait until the active bulges have moved +* . down the diagonal to make room. The phantom matrix +* . paradigm described above helps keep track. ==== +* + MTOP = MAX( 1, ( ( KTOP-1 )-KRCOL+2 ) / 3+1 ) + MBOT = MIN( NBMPS, ( KBOT-KRCOL ) / 3 ) + M22 = MBOT + 1 + BMP22 = ( MBOT.LT.NBMPS ) .AND. ( KRCOL+3*( M22-1 ) ).EQ. + $ ( KBOT-2 ) +* +* ==== Generate reflections to chase the chain right +* . one column. (The minimum value of K is KTOP-1.) ==== +* + DO 20 M = MTOP, MBOT + K = KRCOL + 3*( M-1 ) + IF( K.EQ.KTOP-1 ) THEN + CALL DLAQR1( 3, H( KTOP, KTOP ), LDH, SR( 2*M-1 ), + $ SI( 2*M-1 ), SR( 2*M ), SI( 2*M ), + $ V( 1, M ) ) + ALPHA = V( 1, M ) + CALL DLARFG( 3, ALPHA, V( 2, M ), 1, V( 1, M ) ) + ELSE + BETA = H( K+1, K ) + V( 2, M ) = H( K+2, K ) + V( 3, M ) = H( K+3, K ) + CALL DLARFG( 3, BETA, V( 2, M ), 1, V( 1, M ) ) +* +* ==== A Bulge may collapse because of vigilant +* . deflation or destructive underflow. In the +* . underflow case, try the two-small-subdiagonals +* . trick to try to reinflate the bulge. ==== +* + IF( H( K+3, K ).NE.ZERO .OR. H( K+3, K+1 ).NE. + $ ZERO .OR. H( K+3, K+2 ).EQ.ZERO ) THEN +* +* ==== Typical case: not collapsed (yet). ==== +* + H( K+1, K ) = BETA + H( K+2, K ) = ZERO + H( K+3, K ) = ZERO + ELSE +* +* ==== Atypical case: collapsed. Attempt to +* . reintroduce ignoring H(K+1,K) and H(K+2,K). +* . If the fill resulting from the new +* . reflector is too large, then abandon it. +* . Otherwise, use the new one. 
==== +* + CALL DLAQR1( 3, H( K+1, K+1 ), LDH, SR( 2*M-1 ), + $ SI( 2*M-1 ), SR( 2*M ), SI( 2*M ), + $ VT ) + ALPHA = VT( 1 ) + CALL DLARFG( 3, ALPHA, VT( 2 ), 1, VT( 1 ) ) + REFSUM = VT( 1 )*( H( K+1, K )+VT( 2 )* + $ H( K+2, K ) ) +* + IF( ABS( H( K+2, K )-REFSUM*VT( 2 ) )+ + $ ABS( REFSUM*VT( 3 ) ).GT.ULP* + $ ( ABS( H( K, K ) )+ABS( H( K+1, + $ K+1 ) )+ABS( H( K+2, K+2 ) ) ) ) THEN +* +* ==== Starting a new bulge here would +* . create non-negligible fill. Use +* . the old one with trepidation. ==== +* + H( K+1, K ) = BETA + H( K+2, K ) = ZERO + H( K+3, K ) = ZERO + ELSE +* +* ==== Stating a new bulge here would +* . create only negligible fill. +* . Replace the old reflector with +* . the new one. ==== +* + H( K+1, K ) = H( K+1, K ) - REFSUM + H( K+2, K ) = ZERO + H( K+3, K ) = ZERO + V( 1, M ) = VT( 1 ) + V( 2, M ) = VT( 2 ) + V( 3, M ) = VT( 3 ) + END IF + END IF + END IF + 20 CONTINUE +* +* ==== Generate a 2-by-2 reflection, if needed. ==== +* + K = KRCOL + 3*( M22-1 ) + IF( BMP22 ) THEN + IF( K.EQ.KTOP-1 ) THEN + CALL DLAQR1( 2, H( K+1, K+1 ), LDH, SR( 2*M22-1 ), + $ SI( 2*M22-1 ), SR( 2*M22 ), SI( 2*M22 ), + $ V( 1, M22 ) ) + BETA = V( 1, M22 ) + CALL DLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) + ELSE + BETA = H( K+1, K ) + V( 2, M22 ) = H( K+2, K ) + CALL DLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) + H( K+1, K ) = BETA + H( K+2, K ) = ZERO + END IF + END IF +* +* ==== Multiply H by reflections from the left ==== +* + IF( ACCUM ) THEN + JBOT = MIN( NDCOL, KBOT ) + ELSE IF( WANTT ) THEN + JBOT = N + ELSE + JBOT = KBOT + END IF + DO 40 J = MAX( KTOP, KRCOL ), JBOT + MEND = MIN( MBOT, ( J-KRCOL+2 ) / 3 ) + + DO 30 M = MTOP, MEND + + M1 = M -1 + + tempv1 = V( 1, M ) + K = KRCOL + 2*M1 + tempv2 = V( 2, M ) + K = K + M1 + tempv3 = V( 3, M ) + temph1 = H( K+1, J ) + temph2 = H( K+2, J ) + temph3 = H( K+3, J ) + + REFSUM = tempv1*( temph1+tempv2* + $ temph2+tempv3*temph3 ) + + + H( K+1, J ) = temph1 - REFSUM + H( K+2, J ) = temph2 - REFSUM*tempv2 + H( K+3, J 
) = temph3 - REFSUM*tempv3 + + 30 CONTINUE + + 40 CONTINUE + IF( BMP22 ) THEN + K = KRCOL + 3*( M22-1 ) + DO 50 J = MAX( K+1, KTOP ), JBOT + REFSUM = V( 1, M22 )*( H( K+1, J )+V( 2, M22 )* + $ H( K+2, J ) ) + H( K+1, J ) = H( K+1, J ) - REFSUM + H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) + 50 CONTINUE + END IF +* +* ==== Multiply H by reflections from the right. +* . Delay filling in the last row until the +* . vigilant deflation check is complete. ==== +* + IF( ACCUM ) THEN + JTOP = MAX( KTOP, INCOL ) + ELSE IF( WANTT ) THEN + JTOP = 1 + ELSE + JTOP = KTOP + END IF + DO 90 M = MTOP, MBOT + IF( V( 1, M ).NE.ZERO ) THEN + tempv1 = V( 1, M ) + tempv2 = V( 2, M ) + tempv3 = V( 3, M ) + K = KRCOL + 3*( M-1 ) + JBEGIN = JTOP + + IF ( MOD( MIN( KBOT, K+3 )-JTOP+1, 2).GT.0 ) THEN + J = JBEGIN + + temph1 = H( J, K+1 ) + temph2 = H( J, K+2 ) + temph3 = H( J, K+3 ) + REFSUM = tempv1* ( temph1+tempv2*temph2+ + $ tempv3*temph3 ) + H( J, K+1 ) = temph1 - REFSUM + H( J, K+2 ) = temph2 - REFSUM*tempv2 + H( J, K+3 ) = temph3 - REFSUM*tempv3 + + JBEGIN = JBEGIN + 1 + + END IF + + + DO 60 J = JBEGIN, MIN( KBOT, K+3 ), 2 + + temph1 = H( J, K+1 ) + temph4 = H( J+1, K+1 ) + temph2 = H( J, K+2 ) + temph5 = H( J+1, K+2 ) + temph3 = H( J, K+3 ) + temph6 = H( J+1, K+3 ) + + REFSUM = tempv1* ( temph1+tempv2*temph2+ + $ tempv3*temph3 ) + + REFSU1 = tempv1* ( temph4+tempv2*temph5+ + $ tempv3*temph6 ) + + H( J, K+1 ) = temph1 - REFSUM + H( J+1, K+1 ) = temph4 - REFSU1 + H( J, K+2 ) = temph2 - REFSUM*tempv2 + H( J+1, K+2 ) = temph5 - REFSU1*tempv2 + H( J, K+3 ) = temph3 - REFSUM*tempv3 + H( J+1, K+3 ) = temph6 - REFSU1*tempv3 + + 60 CONTINUE +* + IF( ACCUM ) THEN +* +* ==== Accumulate U. (If necessary, update Z later +* . with with an efficient matrix-matrix +* . multiply.) 
==== +* + KMS = K - INCOL + JBEGIN=MAX( 1, KTOP-INCOL ) + + IF ( MOD(KDU-JBEGIN+1,2).GT.0 ) THEN + J = JBEGIN + tempu1 = U( J, KMS+1 ) + tempu2 = U( J, KMS+2 ) + tempu3 = U( J, KMS+3 ) + REFSUM = tempv1* ( tempu1+tempv2*tempu2+ + $ tempv3*tempu3 ) + U( J, KMS+1 ) = tempu1 - REFSUM + U( J, KMS+2 ) = tempu2 - REFSUM*tempv2 + U( J, KMS+3 ) = tempu3 - REFSUM*tempv3 + JBEGIN = JBEGIN + 1 + + END IF + + + DO 70 J = JBEGIN, KDU , 2 + + tempu1 = U( J, KMS+1 ) + tempu4 = U( J+1, KMS+1 ) + tempu2 = U( J, KMS+2 ) + tempu5 = U( J+1, KMS+2 ) + tempu3 = U( J, KMS+3 ) + tempu6 = U( J+1, KMS+3 ) + REFSUM = tempv1* ( tempu1+tempv2*tempu2+ + $ tempv3*tempu3 ) + + REFSU1 = tempv1* ( tempu4+tempv2*tempu5+ + $ tempv3*tempu6 ) + + U( J, KMS+1 ) = tempu1 - REFSUM + U( J+1, KMS+1 ) = tempu4 - REFSU1 + U( J, KMS+2 ) = tempu2 - REFSUM*tempv2 + U( J+1, KMS+2 ) = tempu5 - REFSU1*tempv2 + U( J, KMS+3 ) = tempu3 - REFSUM*tempv3 + U( J+1, KMS+3 ) = tempu6 - REFSU1*tempv3 + + 70 CONTINUE + + + ELSE IF( WANTZ ) THEN +* +* ==== U is not accumulated, so update Z +* . now by multiplying by reflections +* . from the right. 
==== +* + JBEGIN = ILOZ + + IF ( MOD(IHIZ-ILOZ+1,2).GT.0 ) THEN + J = JBEGIN + + tempz1 = Z( J, K+1 ) + tempz2 = Z( J, K+2 ) + tempz3 = Z( J, K+3 ) + REFSUM = tempv1* ( tempz1+tempv2*tempz2+ + $ tempv3*tempz3 ) + Z( J, K+1 ) = tempz1 - REFSUM + Z( J, K+2 ) = tempz2 - REFSUM*tempv2 + Z( J, K+3 ) = tempz3 - REFSUM*tempv3 + + JBEGIN = JBEGIN + 1 + + END IF + + DO 80 J = JBEGIN, IHIZ, 2 + + tempz1 = Z( J, K+1 ) + tempz4 = Z( J+1, K+1 ) + tempz2 = Z( J, K+2 ) + tempz5 = Z( J+1, K+2 ) + tempz3 = Z( J, K+3 ) + tempz6 = Z( J+1, K+3 ) + + REFSUM = tempv1* ( tempz1+tempv2*tempz2+ + $ tempv3*tempz3 ) + + REFSU1 = tempv1* ( tempz4+tempv2*tempz5+ + $ tempv3*tempz6 ) + + Z( J, K+1 ) = tempz1 - REFSUM + Z( J, K+2 ) = tempz2 - REFSUM*tempv2 + Z( J, K+3 ) = tempz3 - REFSUM*tempv3 + + + Z( J+1, K+1 ) = tempz4 - REFSU1 + Z( J+1, K+2 ) = tempz5 - REFSU1*tempv2 + Z( J+1, K+3 ) = tempz6 - REFSU1*tempv3 + + + 80 CONTINUE + + END IF + END IF + 90 CONTINUE +* +* ==== Special case: 2-by-2 reflection (if needed) ==== +* + K = KRCOL + 3*( M22-1 ) + IF( BMP22 ) THEN + IF ( V( 1, M22 ).NE.ZERO ) THEN + DO 100 J = JTOP, MIN( KBOT, K+3 ) + REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* + $ H( J, K+2 ) ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM + H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M22 ) + 100 CONTINUE +* + IF( ACCUM ) THEN + KMS = K - INCOL + DO 110 J = MAX( 1, KTOP-INCOL ), KDU + REFSUM = V( 1, M22 )*( U( J, KMS+1 )+ + $ V( 2, M22 )*U( J, KMS+2 ) ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM + U( J, KMS+2 ) = U( J, KMS+2 ) - + $ REFSUM*V( 2, M22 ) + 110 CONTINUE + ELSE IF( WANTZ ) THEN + DO 120 J = ILOZ, IHIZ + REFSUM = V( 1, M22 )*( Z( J, K+1 )+V( 2, M22 )* + $ Z( J, K+2 ) ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM + Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M22 ) + 120 CONTINUE + END IF + END IF + END IF +* +* ==== Vigilant deflation check ==== +* + MSTART = MTOP + IF( KRCOL+3*( MSTART-1 ).LT.KTOP ) + $ MSTART = MSTART + 1 + MEND = MBOT + IF( BMP22 ) + $ MEND = MEND + 1 + IF( KRCOL.EQ.KBOT-2 ) + $ 
MEND = MEND + 1 + DO 130 M = MSTART, MEND + K = MIN( KBOT-1, KRCOL+3*( M-1 ) ) +* +* ==== The following convergence test requires that +* . the tradition small-compared-to-nearby-diagonals +* . criterion and the Ahues & Tisseur (LAWN 122, 1997) +* . criteria both be satisfied. The latter improves +* . accuracy in some examples. Falling back on an +* . alternate convergence criterion when TST1 or TST2 +* . is zero (as done here) is traditional but probably +* . unnecessary. ==== +* + IF( H( K+1, K ).NE.ZERO ) THEN + TST1 = ABS( H( K, K ) ) + ABS( H( K+1, K+1 ) ) + IF( TST1.EQ.ZERO ) THEN + IF( K.GE.KTOP+1 ) + $ TST1 = TST1 + ABS( H( K, K-1 ) ) + IF( K.GE.KTOP+2 ) + $ TST1 = TST1 + ABS( H( K, K-2 ) ) + IF( K.GE.KTOP+3 ) + $ TST1 = TST1 + ABS( H( K, K-3 ) ) + IF( K.LE.KBOT-2 ) + $ TST1 = TST1 + ABS( H( K+2, K+1 ) ) + IF( K.LE.KBOT-3 ) + $ TST1 = TST1 + ABS( H( K+3, K+1 ) ) + IF( K.LE.KBOT-4 ) + $ TST1 = TST1 + ABS( H( K+4, K+1 ) ) + END IF + IF( ABS( H( K+1, K ) ).LE.MAX( SMLNUM, ULP*TST1 ) ) + $ THEN + H12 = MAX( ABS( H( K+1, K ) ), ABS( H( K, K+1 ) ) ) + H21 = MIN( ABS( H( K+1, K ) ), ABS( H( K, K+1 ) ) ) + H11 = MAX( ABS( H( K+1, K+1 ) ), + $ ABS( H( K, K )-H( K+1, K+1 ) ) ) + H22 = MIN( ABS( H( K+1, K+1 ) ), + $ ABS( H( K, K )-H( K+1, K+1 ) ) ) + SCL = H11 + H12 + TST2 = H22*( H11 / SCL ) +* + IF( TST2.EQ.ZERO .OR. H21*( H12 / SCL ).LE. + $ MAX( SMLNUM, ULP*TST2 ) )H( K+1, K ) = ZERO + END IF + END IF + 130 CONTINUE +* +* ==== Fill in the last row of each bulge. ==== +* + MEND = MIN( NBMPS, ( KBOT-KRCOL-1 ) / 3 ) + DO 140 M = MTOP, MEND + K = KRCOL + 3*( M-1 ) + REFSUM = V( 1, M )*V( 3, M )*H( K+4, K+3 ) + H( K+4, K+1 ) = -REFSUM + H( K+4, K+2 ) = -REFSUM*V( 2, M ) + H( K+4, K+3 ) = H( K+4, K+3 ) - REFSUM*V( 3, M ) + 140 CONTINUE +* +* ==== End of near-the-diagonal bulge chase. ==== +* + 150 CONTINUE +* +* ==== Use U (if accumulated) to update far-from-diagonal +* . entries in H. If required, use U to update Z as +* . well. 
==== +* + IF( ACCUM ) THEN + IF( WANTT ) THEN + JTOP = 1 + JBOT = N + ELSE + JTOP = KTOP + JBOT = KBOT + END IF + IF( ( .NOT.BLK22 ) .OR. ( INCOL.LT.KTOP ) .OR. + $ ( NDCOL.GT.KBOT ) .OR. ( NS.LE.2 ) ) THEN +* +* ==== Updates not exploiting the 2-by-2 block +* . structure of U. K1 and NU keep track of +* . the location and size of U in the special +* . cases of introducing bulges and chasing +* . bulges off the bottom. In these special +* . cases and in case the number of shifts +* . is NS = 2, there is no 2-by-2 block +* . structure to exploit. ==== +* + K1 = MAX( 1, KTOP-INCOL ) + NU = ( KDU-MAX( 0, NDCOL-KBOT ) ) - K1 + 1 +* +* ==== Horizontal Multiply ==== +* + DO 160 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH + JLEN = MIN( NH, JBOT-JCOL+1 ) + CALL DGEMM( 'C', 'N', NU, JLEN, NU, ONE, U( K1, K1 ), + $ LDU, H( INCOL+K1, JCOL ), LDH, ZERO, WH, + $ LDWH ) + CALL DLACPY( 'ALL', NU, JLEN, WH, LDWH, + $ H( INCOL+K1, JCOL ), LDH ) + 160 CONTINUE +* +* ==== Vertical multiply ==== +* + DO 170 JROW = JTOP, MAX( KTOP, INCOL ) - 1, NV + JLEN = MIN( NV, MAX( KTOP, INCOL )-JROW ) + CALL DGEMM( 'N', 'N', JLEN, NU, NU, ONE, + $ H( JROW, INCOL+K1 ), LDH, U( K1, K1 ), + $ LDU, ZERO, WV, LDWV ) + CALL DLACPY( 'ALL', JLEN, NU, WV, LDWV, + $ H( JROW, INCOL+K1 ), LDH ) + 170 CONTINUE +* +* ==== Z multiply (also vertical) ==== +* + IF( WANTZ ) THEN + DO 180 JROW = ILOZ, IHIZ, NV + JLEN = MIN( NV, IHIZ-JROW+1 ) + CALL DGEMM( 'N', 'N', JLEN, NU, NU, ONE, + $ Z( JROW, INCOL+K1 ), LDZ, U( K1, K1 ), + $ LDU, ZERO, WV, LDWV ) + CALL DLACPY( 'ALL', JLEN, NU, WV, LDWV, + $ Z( JROW, INCOL+K1 ), LDZ ) + 180 CONTINUE + END IF + ELSE +* +* ==== Updates exploiting U's 2-by-2 block structure. +* . (I2, I4, J2, J4 are the last rows and columns +* . of the blocks.) ==== +* + I2 = ( KDU+1 ) / 2 + I4 = KDU + J2 = I4 - I2 + J4 = KDU +* +* ==== KZS and KNZ deal with the band of zeros +* . along the diagonal of one of the triangular +* . blocks. 
==== +* + KZS = ( J4-J2 ) - ( NS+1 ) + KNZ = NS + 1 +* +* ==== Horizontal multiply ==== +* + DO 190 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH + JLEN = MIN( NH, JBOT-JCOL+1 ) +* +* ==== Copy bottom of H to top+KZS of scratch ==== +* (The first KZS rows get multiplied by zero.) ==== +* + CALL DLACPY( 'ALL', KNZ, JLEN, H( INCOL+1+J2, JCOL ), + $ LDH, WH( KZS+1, 1 ), LDWH ) +* +* ==== Multiply by U21**T ==== +* + CALL DLASET( 'ALL', KZS, JLEN, ZERO, ZERO, WH, LDWH ) + CALL DTRMM( 'L', 'U', 'C', 'N', KNZ, JLEN, ONE, + $ U( J2+1, 1+KZS ), LDU, WH( KZS+1, 1 ), + $ LDWH ) +* +* ==== Multiply top of H by U11**T ==== +* + CALL DGEMM( 'C', 'N', I2, JLEN, J2, ONE, U, LDU, + $ H( INCOL+1, JCOL ), LDH, ONE, WH, LDWH ) +* +* ==== Copy top of H to bottom of WH ==== +* + CALL DLACPY( 'ALL', J2, JLEN, H( INCOL+1, JCOL ), LDH, + $ WH( I2+1, 1 ), LDWH ) +* +* ==== Multiply by U21**T ==== +* + CALL DTRMM( 'L', 'L', 'C', 'N', J2, JLEN, ONE, + $ U( 1, I2+1 ), LDU, WH( I2+1, 1 ), LDWH ) +* +* ==== Multiply by U22 ==== +* + CALL DGEMM( 'C', 'N', I4-I2, JLEN, J4-J2, ONE, + $ U( J2+1, I2+1 ), LDU, + $ H( INCOL+1+J2, JCOL ), LDH, ONE, + $ WH( I2+1, 1 ), LDWH ) +* +* ==== Copy it back ==== +* + CALL DLACPY( 'ALL', KDU, JLEN, WH, LDWH, + $ H( INCOL+1, JCOL ), LDH ) + 190 CONTINUE +* +* ==== Vertical multiply ==== +* + DO 200 JROW = JTOP, MAX( INCOL, KTOP ) - 1, NV + JLEN = MIN( NV, MAX( INCOL, KTOP )-JROW ) +* +* ==== Copy right of H to scratch (the first KZS +* . 
columns get multiplied by zero) ==== +* + CALL DLACPY( 'ALL', JLEN, KNZ, H( JROW, INCOL+1+J2 ), + $ LDH, WV( 1, 1+KZS ), LDWV ) +* +* ==== Multiply by U21 ==== +* + CALL DLASET( 'ALL', JLEN, KZS, ZERO, ZERO, WV, LDWV ) + CALL DTRMM( 'R', 'U', 'N', 'N', JLEN, KNZ, ONE, + $ U( J2+1, 1+KZS ), LDU, WV( 1, 1+KZS ), + $ LDWV ) +* +* ==== Multiply by U11 ==== +* + CALL DGEMM( 'N', 'N', JLEN, I2, J2, ONE, + $ H( JROW, INCOL+1 ), LDH, U, LDU, ONE, WV, + $ LDWV ) +* +* ==== Copy left of H to right of scratch ==== +* + CALL DLACPY( 'ALL', JLEN, J2, H( JROW, INCOL+1 ), LDH, + $ WV( 1, 1+I2 ), LDWV ) +* +* ==== Multiply by U21 ==== +* + CALL DTRMM( 'R', 'L', 'N', 'N', JLEN, I4-I2, ONE, + $ U( 1, I2+1 ), LDU, WV( 1, 1+I2 ), LDWV ) +* +* ==== Multiply by U22 ==== +* + CALL DGEMM( 'N', 'N', JLEN, I4-I2, J4-J2, ONE, + $ H( JROW, INCOL+1+J2 ), LDH, + $ U( J2+1, I2+1 ), LDU, ONE, WV( 1, 1+I2 ), + $ LDWV ) +* +* ==== Copy it back ==== +* + CALL DLACPY( 'ALL', JLEN, KDU, WV, LDWV, + $ H( JROW, INCOL+1 ), LDH ) + 200 CONTINUE +* +* ==== Multiply Z (also vertical) ==== +* + IF( WANTZ ) THEN + DO 210 JROW = ILOZ, IHIZ, NV + JLEN = MIN( NV, IHIZ-JROW+1 ) +* +* ==== Copy right of Z to left of scratch (first +* . 
KZS columns get multiplied by zero) ==== +* + CALL DLACPY( 'ALL', JLEN, KNZ, + $ Z( JROW, INCOL+1+J2 ), LDZ, + $ WV( 1, 1+KZS ), LDWV ) +* +* ==== Multiply by U12 ==== +* + CALL DLASET( 'ALL', JLEN, KZS, ZERO, ZERO, WV, + $ LDWV ) + CALL DTRMM( 'R', 'U', 'N', 'N', JLEN, KNZ, ONE, + $ U( J2+1, 1+KZS ), LDU, WV( 1, 1+KZS ), + $ LDWV ) +* +* ==== Multiply by U11 ==== +* + CALL DGEMM( 'N', 'N', JLEN, I2, J2, ONE, + $ Z( JROW, INCOL+1 ), LDZ, U, LDU, ONE, + $ WV, LDWV ) +* +* ==== Copy left of Z to right of scratch ==== +* + CALL DLACPY( 'ALL', JLEN, J2, Z( JROW, INCOL+1 ), + $ LDZ, WV( 1, 1+I2 ), LDWV ) +* +* ==== Multiply by U21 ==== +* + CALL DTRMM( 'R', 'L', 'N', 'N', JLEN, I4-I2, ONE, + $ U( 1, I2+1 ), LDU, WV( 1, 1+I2 ), + $ LDWV ) +* +* ==== Multiply by U22 ==== +* + CALL DGEMM( 'N', 'N', JLEN, I4-I2, J4-J2, ONE, + $ Z( JROW, INCOL+1+J2 ), LDZ, + $ U( J2+1, I2+1 ), LDU, ONE, + $ WV( 1, 1+I2 ), LDWV ) +* +* ==== Copy the result back to Z ==== +* + CALL DLACPY( 'ALL', JLEN, KDU, WV, LDWV, + $ Z( JROW, INCOL+1 ), LDZ ) + 210 CONTINUE + END IF + END IF + END IF + 220 CONTINUE +* +* ==== End of DLAQR5 ==== +* + END From d346c533b1724c6fb2c58388d8a7977eaacbe2d5 Mon Sep 17 00:00:00 2001 From: Jerome Robert Date: Tue, 7 Jun 2016 16:11:09 +0200 Subject: [PATCH 48/70] Fix z/ctrmv stack allocation on AMD bulldozer and barcelona target * Hopefully, because this was found by error and trial (dark magic) * Ref #786 --- interface/ztrmv.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/interface/ztrmv.c b/interface/ztrmv.c index 2be915c32..1721afc1c 100644 --- a/interface/ztrmv.c +++ b/interface/ztrmv.c @@ -243,6 +243,8 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, #endif { buffer_size = ((n - 1) / DTB_ENTRIES) * 2 * DTB_ENTRIES + 32 / sizeof(FLOAT); + // It seems to be required for some K8 or Barcelona CPU + buffer_size += 8; if(incx != 1) buffer_size += n * 2; } From a94f2b78482aea0fe826e61349bd857e3797f89f Mon Sep 17 00:00:00 2001 From: Daniel Patrick Foose 
Date: Tue, 14 Jun 2016 14:37:28 -0400 Subject: [PATCH 49/70] Change to allow compiling with USE_OPENMP on MSVC MSVC treats the declaration of omp_in_parallel and omp_get_num_procs without the modifiers __declspec(dllimport) and __cdecl as a redefinition. --- common.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/common.h b/common.h index a7342db2c..480174c11 100644 --- a/common.h +++ b/common.h @@ -626,9 +626,14 @@ void gotoblas_profile_init(void); void gotoblas_profile_quit(void); #ifdef USE_OPENMP +#ifndef C_MSVC int omp_in_parallel(void); int omp_get_num_procs(void); #else +__declspec(dllimport) int __cdecl omp_in_parallel(void); +__declspec(dllimport) int __cdecl omp_get_num_procs(void); +#endif +#else #ifdef __ELF__ int omp_in_parallel (void) __attribute__ ((weak)); int omp_get_num_procs(void) __attribute__ ((weak)); From 57df7956ee2bb0b7a56ea0953c6ac72a91fd6f78 Mon Sep 17 00:00:00 2001 From: Shivraj Patil Date: Tue, 28 Jun 2016 17:51:10 +0530 Subject: [PATCH 50/70] Added CGEMM, ZGEMM, STRMM, DTRMM, CTRMM, ZTRMM. Updated macros in SGEMM, DGEMM, STRMM. 
Signed-off-by: Shivraj Patil --- kernel/mips/KERNEL.P5600 | 21 +- kernel/mips/cgemm_kernel_8x4_msa.c | 2154 +++++++++++++++++++++++++ kernel/mips/cgemm_ncopy_4_msa.c | 195 +++ kernel/mips/cgemm_ncopy_8_msa.c | 310 ++++ kernel/mips/cgemm_tcopy_4_msa.c | 125 ++ kernel/mips/cgemm_tcopy_8_msa.c | 214 +++ kernel/mips/dgemm_kernel_8x4_msa.c | 850 +++++++--- kernel/mips/dgemm_ncopy_4_msa.c | 49 +- kernel/mips/dgemm_ncopy_8_msa.c | 162 +- kernel/mips/dgemm_tcopy_4_msa.c | 27 +- kernel/mips/dgemm_tcopy_8_msa.c | 123 +- kernel/mips/macros_msa.h | 587 ++++++- kernel/mips/sgemm_kernel_8x8_msa.c | 992 ++++++++++-- kernel/mips/sgemm_ncopy_8_msa.c | 55 +- kernel/mips/sgemm_tcopy_8_msa.c | 65 +- kernel/mips/strsm_kernel_LN_8x8_msa.c | 32 +- kernel/mips/strsm_kernel_LT_8x8_msa.c | 32 +- kernel/mips/strsm_kernel_RN_8x8_msa.c | 78 +- kernel/mips/strsm_kernel_RT_8x8_msa.c | 78 +- kernel/mips/zgemm_kernel_4x4_msa.c | 1589 ++++++++++++++++++ kernel/mips/zgemm_ncopy_4_msa.c | 144 ++ kernel/mips/zgemm_tcopy_4_msa.c | 161 ++ param.h | 18 +- 23 files changed, 7249 insertions(+), 812 deletions(-) create mode 100644 kernel/mips/cgemm_kernel_8x4_msa.c create mode 100644 kernel/mips/cgemm_ncopy_4_msa.c create mode 100644 kernel/mips/cgemm_ncopy_8_msa.c create mode 100644 kernel/mips/cgemm_tcopy_4_msa.c create mode 100644 kernel/mips/cgemm_tcopy_8_msa.c create mode 100644 kernel/mips/zgemm_kernel_4x4_msa.c create mode 100644 kernel/mips/zgemm_ncopy_4_msa.c create mode 100644 kernel/mips/zgemm_tcopy_4_msa.c diff --git a/kernel/mips/KERNEL.P5600 b/kernel/mips/KERNEL.P5600 index 802f0e0e5..5d8bcb9ec 100644 --- a/kernel/mips/KERNEL.P5600 +++ b/kernel/mips/KERNEL.P5600 @@ -80,11 +80,6 @@ DGEMVTKERNEL = ../mips/gemv_t.c CGEMVTKERNEL = ../mips/zgemv_t.c ZGEMVTKERNEL = ../mips/zgemv_t.c -STRMMKERNEL = ../generic/trmmkernel_2x2.c -DTRMMKERNEL = ../generic/trmmkernel_2x2.c -CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c -ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c - SGEMMKERNEL = ../mips/sgemm_kernel_8x8_msa.c 
SGEMMONCOPY = ../mips/sgemm_ncopy_8_msa.c SGEMMOTCOPY = ../mips/sgemm_tcopy_8_msa.c @@ -101,15 +96,19 @@ DGEMMITCOPYOBJ = dgemm_itcopy.o DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o -CGEMMKERNEL = ../generic/zgemmkernel_2x2.c -CGEMMONCOPY = ../generic/zgemm_ncopy_2.c -CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMKERNEL = ../mips/cgemm_kernel_8x4_msa.c +CGEMMINCOPY = ../mips/cgemm_ncopy_8_msa.c +CGEMMITCOPY = ../mips/cgemm_tcopy_8_msa.c +CGEMMONCOPY = ../mips/cgemm_ncopy_4_msa.c +CGEMMOTCOPY = ../mips/cgemm_tcopy_4_msa.c +CGEMMINCOPYOBJ = cgemm_incopy.o +CGEMMITCOPYOBJ = cgemm_itcopy.o CGEMMONCOPYOBJ = cgemm_oncopy.o CGEMMOTCOPYOBJ = cgemm_otcopy.o -ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c -ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMKERNEL = ../mips/zgemm_kernel_4x4_msa.c +ZGEMMONCOPY = ../mips/zgemm_ncopy_4_msa.c +ZGEMMOTCOPY = ../mips/zgemm_tcopy_4_msa.c ZGEMMONCOPYOBJ = zgemm_oncopy.o ZGEMMOTCOPYOBJ = zgemm_otcopy.o diff --git a/kernel/mips/cgemm_kernel_8x4_msa.c b/kernel/mips/cgemm_kernel_8x4_msa.c new file mode 100644 index 000000000..cd1fa45b3 --- /dev/null +++ b/kernel/mips/cgemm_kernel_8x4_msa.c @@ -0,0 +1,2154 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +#define CGEMM_KERNEL_8X4_MSA(OP0, OP1, OP2, OP3, OP4) \ +{ \ + LD_SP4_INC(pa0, 4, src_a0, src_a1, src_a2, src_a3); \ + LD_SP2_INC(pb0, 4, src_b0, src_b1); \ + \ + PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \ + PCKEVOD_W2_SP(src_a3, src_a2, src_a1r, src_a1i); \ + \ + /* 0th col */ \ + SPLATI_W2_SP(src_b0, 0, src_br, src_bi); \ + res0_r OP0## = src_a0r * src_br; \ + res0_r OP1## = src_a0i * src_bi; \ + res0_i OP2## = (OP4 src_a0r) * src_bi; \ + res0_i OP3## = src_a0i * src_br; \ + \ + res1_r OP0## = src_a1r * src_br; \ + res1_r OP1## = src_a1i * src_bi; \ + res1_i OP2## = (OP4 src_a1r) * src_bi; \ + res1_i OP3## = src_a1i * src_br; \ + \ + /* 1st col */ \ + SPLATI_W2_SP(src_b0, 2, src_br, src_bi); \ + res2_r OP0## = src_a0r * src_br; \ + res2_r OP1## = src_a0i * src_bi; \ + res2_i OP2## = (OP4 src_a0r) * src_bi; \ + res2_i OP3## = src_a0i * src_br; \ + \ + res3_r OP0## = src_a1r * src_br; \ + 
res3_r OP1## = src_a1i * src_bi; \ + res3_i OP2## = (OP4 src_a1r) * src_bi; \ + res3_i OP3## = src_a1i * src_br; \ + \ + /* 2nd col */ \ + SPLATI_W2_SP(src_b1, 0, src_br, src_bi); \ + res4_r OP0## = src_a0r * src_br; \ + res4_r OP1## = src_a0i * src_bi; \ + res4_i OP2## = (OP4 src_a0r) * src_bi; \ + res4_i OP3## = src_a0i * src_br; \ + \ + res5_r OP0## = src_a1r * src_br; \ + res5_r OP1## = src_a1i * src_bi; \ + res5_i OP2## = (OP4 src_a1r) * src_bi; \ + res5_i OP3## = src_a1i * src_br; \ + \ + /* 3rd col */ \ + SPLATI_W2_SP(src_b1, 2, src_br, src_bi); \ + res6_r OP0## = src_a0r * src_br; \ + res6_r OP1## = src_a0i * src_bi; \ + res6_i OP2## = (OP4 src_a0r) * src_bi; \ + res6_i OP3## = src_a0i * src_br; \ + \ + res7_r OP0## = src_a1r * src_br; \ + res7_r OP1## = src_a1i * src_bi; \ + res7_i OP2## = (OP4 src_a1r) * src_bi; \ + res7_i OP3## = src_a1i * src_br; \ +} + +#define CGEMM_KERNEL_8X2_MSA(OP0, OP1, OP2, OP3, OP4) \ +{ \ + LD_SP4_INC(pa0, 4, src_a0, src_a1, src_a2, src_a3); \ + src_b0 = LD_SP(pb0); \ + \ + PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \ + PCKEVOD_W2_SP(src_a3, src_a2, src_a1r, src_a1i); \ + \ + /* 0th col */ \ + SPLATI_W2_SP(src_b0, 0, src_br, src_bi); \ + res0_r OP0## = src_a0r * src_br; \ + res0_r OP1## = src_a0i * src_bi; \ + res0_i OP2## = (OP4 src_a0r) * src_bi; \ + res0_i OP3## = src_a0i * src_br; \ + \ + res1_r OP0## = src_a1r * src_br; \ + res1_r OP1## = src_a1i * src_bi; \ + res1_i OP2## = (OP4 src_a1r) * src_bi; \ + res1_i OP3## = src_a1i * src_br; \ + \ + /* 1st col */ \ + SPLATI_W2_SP(src_b0, 2, src_br, src_bi); \ + res2_r OP0## = src_a0r * src_br; \ + res2_r OP1## = src_a0i * src_bi; \ + res2_i OP2## = (OP4 src_a0r) * src_bi; \ + res2_i OP3## = src_a0i * src_br; \ + \ + res3_r OP0## = src_a1r * src_br; \ + res3_r OP1## = src_a1i * src_bi; \ + res3_i OP2## = (OP4 src_a1r) * src_bi; \ + res3_i OP3## = src_a1i * src_br; \ +} + +#define CGEMM_KERNEL_8X1_MSA(OP0, OP1, OP2, OP3, OP4) \ +{ \ + LD_SP4_INC(pa0, 4, src_a0, src_a1, 
src_a2, src_a3); \ + src_bi = (v4f32) __msa_cast_to_vector_double(*((double *) pb0)); \ + SPLATI_W2_SP(src_bi, 0, src_br, src_bi); \ + \ + PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \ + PCKEVOD_W2_SP(src_a3, src_a2, src_a1r, src_a1i); \ + \ + /* 0th col */ \ + res0_r OP0## = src_a0r * src_br; \ + res0_r OP1## = src_a0i * src_bi; \ + res0_i OP2## = (OP4 src_a0r) * src_bi; \ + res0_i OP3## = src_a0i * src_br; \ + \ + res1_r OP0## = src_a1r * src_br; \ + res1_r OP1## = src_a1i * src_bi; \ + res1_i OP2## = (OP4 src_a1r) * src_bi; \ + res1_i OP3## = src_a1i * src_br; \ +} + +#define CGEMM_KERNEL_4X4_MSA(OP0, OP1, OP2, OP3, OP4) \ +{ \ + LD_SP2_INC(pa0, 4, src_a0, src_a1); \ + LD_SP2_INC(pb0, 4, src_b0, src_b1); \ + \ + PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \ + \ + /* 0th col */ \ + SPLATI_W2_SP(src_b0, 0, src_br, src_bi); \ + res0_r OP0## = src_a0r * src_br; \ + res0_r OP1## = src_a0i * src_bi; \ + res0_i OP2## = OP4 src_a0r * src_bi; \ + res0_i OP3## = src_a0i * src_br; \ + \ + /* 1st col */ \ + SPLATI_W2_SP(src_b0, 2, src_br, src_bi); \ + res2_r OP0## = src_a0r * src_br; \ + res2_r OP1## = src_a0i * src_bi; \ + res2_i OP2## = OP4 src_a0r * src_bi; \ + res2_i OP3## = src_a0i * src_br; \ + \ + /* 2nd col */ \ + SPLATI_W2_SP(src_b1, 0, src_br, src_bi); \ + res4_r OP0## = src_a0r * src_br; \ + res4_r OP1## = src_a0i * src_bi; \ + res4_i OP2## = OP4 src_a0r * src_bi; \ + res4_i OP3## = src_a0i * src_br; \ + \ + /* 3rd col */ \ + SPLATI_W2_SP(src_b1, 2, src_br, src_bi); \ + res6_r OP0## = src_a0r * src_br; \ + res6_r OP1## = src_a0i * src_bi; \ + res6_i OP2## = OP4 src_a0r * src_bi; \ + res6_i OP3## = src_a0i * src_br; \ +} + +#define CGEMM_KERNEL_4X2_MSA(OP0, OP1, OP2, OP3, OP4) \ +{ \ + LD_SP2_INC(pa0, 4, src_a0, src_a1); \ + src_b0 = LD_SP(pb0); \ + \ + PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \ + \ + /* 0th col */ \ + SPLATI_W2_SP(src_b0, 0, src_br, src_bi); \ + res0_r OP0## = src_a0r * src_br; \ + res0_r OP1## = src_a0i * src_bi; \ + res0_i 
OP2## = OP4 src_a0r * src_bi; \ + res0_i OP3## = src_a0i * src_br; \ + \ + /* 1st col */ \ + SPLATI_W2_SP(src_b0, 2, src_br, src_bi); \ + res2_r OP0## = src_a0r * src_br; \ + res2_r OP1## = src_a0i * src_bi; \ + res2_i OP2## = OP4 src_a0r * src_bi; \ + res2_i OP3## = src_a0i * src_br; \ +} + +#define CGEMM_KERNEL_4X1_MSA(OP0, OP1, OP2, OP3, OP4) \ +{ \ + LD_SP2_INC(pa0, 4, src_a0, src_a1); \ + src_bi = (v4f32) __msa_cast_to_vector_double(*((double *) pb0)); \ + SPLATI_W2_SP(src_bi, 0, src_br, src_bi); \ + \ + PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \ + \ + /* 0th col */ \ + res0_r OP0## = src_a0r * src_br; \ + res0_r OP1## = src_a0i * src_bi; \ + res0_i OP2## = OP4 src_a0r * src_bi; \ + res0_i OP3## = src_a0i * src_br; \ +} + +#define CGEMM_KERNEL_2X4(OP0, OP1, OP2, OP3, OP4) \ +{ \ + a0_r = pa0[0]; \ + a0_i = pa0[1]; \ + b0_r = pb0[0]; \ + b0_i = pb0[1]; \ + \ + res0 OP0## = a0_r * b0_r; \ + res0 OP1## = a0_i * b0_i; \ + res1 OP2## = OP4 a0_r * b0_i; \ + res1 OP3## = a0_i * b0_r; \ + \ + a1_r = pa0[2]; \ + a1_i = pa0[3]; \ + res2 OP0## = a1_r * b0_r; \ + res2 OP1## = a1_i * b0_i; \ + res3 OP2## = OP4 a1_r * b0_i; \ + res3 OP3## = a1_i * b0_r; \ + \ + /* 1st col */ \ + b1_r = pb0[2]; \ + b1_i = pb0[3]; \ + res4 OP0## = a0_r * b1_r; \ + res4 OP1## = a0_i * b1_i; \ + res5 OP2## = OP4 a0_r * b1_i; \ + res5 OP3## = a0_i * b1_r; \ + \ + res6 OP0## = a1_r * b1_r; \ + res6 OP1## = a1_i * b1_i; \ + res7 OP2## = OP4 a1_r * b1_i; \ + res7 OP3## = a1_i * b1_r; \ + \ + /* 2nd col */ \ + b2_r = pb0[4]; \ + b2_i = pb0[5]; \ + res8 OP0## = a0_r * b2_r; \ + res8 OP1## = a0_i * b2_i; \ + res9 OP2## = OP4 a0_r * b2_i; \ + res9 OP3## = a0_i * b2_r; \ + \ + res10 OP0## = a1_r * b2_r; \ + res10 OP1## = a1_i * b2_i; \ + res11 OP2## = OP4 a1_r * b2_i; \ + res11 OP3## = a1_i * b2_r; \ + \ + /* 3rd col */ \ + b3_r = pb0[6]; \ + b3_i = pb0[7]; \ + res12 OP0## = a0_r * b3_r; \ + res12 OP1## = a0_i * b3_i; \ + res13 OP2## = OP4 a0_r * b3_i; \ + res13 OP3## = a0_i * b3_r; \ + \ + 
res14 OP0## = a1_r * b3_r; \ + res14 OP1## = a1_i * b3_i; \ + res15 OP2## = OP4 a1_r * b3_i; \ + res15 OP3## = a1_i * b3_r; \ +} + +#define CGEMM_KERNEL_2X2(OP0, OP1, OP2, OP3, OP4) \ +{ \ + /* 0th col */ \ + a0_r = pa0[0]; \ + a0_i = pa0[1]; \ + b0_r = pb0[0]; \ + b0_i = pb0[1]; \ + \ + res0 OP0## = a0_r * b0_r; \ + res0 OP1## = a0_i * b0_i; \ + res1 OP2## = OP4 a0_r * b0_i; \ + res1 OP3## = a0_i * b0_r; \ + \ + a1_r = pa0[2]; \ + a1_i = pa0[3]; \ + res2 OP0## = a1_r * b0_r; \ + res2 OP1## = a1_i * b0_i; \ + res3 OP2## = OP4 a1_r * b0_i; \ + res3 OP3## = a1_i * b0_r; \ + \ + /* 1st col */ \ + b1_r = pb0[2]; \ + b1_i = pb0[3]; \ + res4 OP0## = a0_r * b1_r; \ + res4 OP1## = a0_i * b1_i; \ + res5 OP2## = OP4 a0_r * b1_i; \ + res5 OP3## = a0_i * b1_r; \ + \ + res6 OP0## = a1_r * b1_r; \ + res6 OP1## = a1_i * b1_i; \ + res7 OP2## = OP4 a1_r * b1_i; \ + res7 OP3## = a1_i * b1_r; \ +} + +#define CGEMM_KERNEL_2X1(OP0, OP1, OP2, OP3, OP4) \ +{ \ + /* 0th col */ \ + a0_r = pa0[0]; \ + a0_i = pa0[1]; \ + b0_r = pb0[0]; \ + b0_i = pb0[1]; \ + \ + res0 OP0## = a0_r * b0_r; \ + res0 OP1## = a0_i * b0_i; \ + res1 OP2## = OP4 a0_r * b0_i; \ + res1 OP3## = a0_i * b0_r; \ + \ + a1_r = pa0[2]; \ + a1_i = pa0[3]; \ + res2 OP0## = a1_r * b0_r; \ + res2 OP1## = a1_i * b0_i; \ + res3 OP2## = OP4 a1_r * b0_i; \ + res3 OP3## = a1_i * b0_r; \ +} + +#define CGEMM_KERNEL_1X4(OP0, OP1, OP2, OP3, OP4) \ +{ \ + /* 0th col */ \ + a0_r = pa0[0]; \ + a0_i = pa0[1]; \ + b0_r = pb0[0]; \ + b0_i = pb0[1]; \ + \ + res0 OP0## = a0_r * b0_r; \ + res0 OP1## = a0_i * b0_i; \ + res1 OP2## = OP4 a0_r * b0_i; \ + res1 OP3## = a0_i * b0_r; \ + \ + /* 1st col */ \ + b1_r = pb0[2]; \ + b1_i = pb0[3]; \ + res2 OP0## = a0_r * b1_r; \ + res2 OP1## = a0_i * b1_i; \ + res3 OP2## = OP4 a0_r * b1_i; \ + res3 OP3## = a0_i * b1_r; \ + \ + /* 2nd col */ \ + b2_r = pb0[4]; \ + b2_i = pb0[5]; \ + res4 OP0## = a0_r * b2_r; \ + res4 OP1## = a0_i * b2_i; \ + res5 OP2## = OP4 a0_r * b2_i; \ + res5 OP3## = a0_i * b2_r; \ + \ + 
/* 3rd col */ \ + b3_r = pb0[6]; \ + b3_i = pb0[7]; \ + res6 OP0## = a0_r * b3_r; \ + res6 OP1## = a0_i * b3_i; \ + res7 OP2## = OP4 a0_r * b3_i; \ + res7 OP3## = a0_i * b3_r; \ +} + +#define CGEMM_KERNEL_1X2(OP0, OP1, OP2, OP3, OP4) \ +{ \ + /* 0th col */ \ + a0_r = pa0[0]; \ + a0_i = pa0[1]; \ + b0_r = pb0[0]; \ + b0_i = pb0[1]; \ + \ + res0 OP0## = a0_r * b0_r; \ + res0 OP1## = a0_i * b0_i; \ + res1 OP2## = OP4 a0_r * b0_i; \ + res1 OP3## = a0_i * b0_r; \ + \ + /* 1st col */ \ + b1_r = pb0[2]; \ + b1_i = pb0[3]; \ + res2 OP0## = a0_r * b1_r; \ + res2 OP1## = a0_i * b1_i; \ + res3 OP2## = OP4 a0_r * b1_i; \ + res3 OP3## = a0_i * b1_r; \ +} + +#define CGEMM_KERNEL_1X1(OP0, OP1, OP2, OP3, OP4) \ +{ \ + /* 0th col */ \ + a0_r = pa0[0]; \ + a0_i = pa0[1]; \ + b0_r = pb0[0]; \ + b0_i = pb0[1]; \ + \ + res0 OP0## = a0_r * b0_r; \ + res0 OP1## = a0_i * b0_i; \ + res1 OP2## = OP4 a0_r * b0_i; \ + res1 OP3## = a0_i * b0_r; \ +} + +#define CGEMM_SCALE_8X4_MSA \ +{ \ + LD_SP4(pc0, 4, dst0, dst1, dst2, dst3); \ + \ + PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \ + PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i); \ + \ + dst0_r += alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i += alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + dst1_r += alpha_r * res1_r; \ + dst1_r -= alpha_i * res1_i; \ + dst1_i += alpha_r * res1_i; \ + dst1_i += alpha_i * res1_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \ + \ + ST_SP4_INC(dst0, dst1, dst2, dst3, pc0, 4); \ + \ + LD_SP4(pc1, 4, dst0, dst1, dst2, dst3); \ + \ + PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \ + PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i); \ + \ + dst0_r += alpha_r * res2_r; \ + dst0_r -= alpha_i * res2_i; \ + dst0_i += alpha_r * res2_i; \ + dst0_i += alpha_i * res2_r; \ + \ + dst1_r += alpha_r * res3_r; \ + dst1_r -= alpha_i * res3_i; \ + dst1_i += alpha_r * res3_i; \ + dst1_i += alpha_i * res3_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + 
ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \ + \ + ST_SP4_INC(dst0, dst1, dst2, dst3, pc1, 4); \ + \ + LD_SP4(pc2, 4, dst0, dst1, dst2, dst3); \ + \ + PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \ + PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i); \ + \ + dst0_r += alpha_r * res4_r; \ + dst0_r -= alpha_i * res4_i; \ + dst0_i += alpha_r * res4_i; \ + dst0_i += alpha_i * res4_r; \ + \ + dst1_r += alpha_r * res5_r; \ + dst1_r -= alpha_i * res5_i; \ + dst1_i += alpha_r * res5_i; \ + dst1_i += alpha_i * res5_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \ + \ + ST_SP4_INC(dst0, dst1, dst2, dst3, pc2, 4); \ + \ + LD_SP4(pc3, 4, dst0, dst1, dst2, dst3); \ + \ + PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \ + PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i); \ + \ + dst0_r += alpha_r * res6_r; \ + dst0_r -= alpha_i * res6_i; \ + dst0_i += alpha_r * res6_i; \ + dst0_i += alpha_i * res6_r; \ + \ + dst1_r += alpha_r * res7_r; \ + dst1_r -= alpha_i * res7_i; \ + dst1_i += alpha_r * res7_i; \ + dst1_i += alpha_i * res7_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \ + \ + ST_SP4_INC(dst0, dst1, dst2, dst3, pc3, 4); \ +} + +/* Accumulates alpha * res into an 8x2 complex tile of C. Each column pointer (pc0, pc1) is handled the same way: LD_SP4 loads four vectors, PCKEVOD_W2_SP fills dst0_r/dst0_i and dst1_r/dst1_i, the real operands gain alpha_r*res_r minus alpha_i*res_i and the imaginary operands gain alpha_r*res_i plus alpha_i*res_r, then ILVRL_W2_SP and ST_SP4_INC write the vectors back. NOTE(review): the helper macros are defined outside this chunk; presumably PCKEVOD_W2_SP splits interleaved real/imaginary lanes and ST_SP4_INC also advances the pointer - confirm. */ +#define CGEMM_SCALE_8X2_MSA \ +{ \ + LD_SP4(pc0, 4, dst0, dst1, dst2, dst3); \ + \ + PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \ + PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i); \ + \ + dst0_r += alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i += alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + dst1_r += alpha_r * res1_r; \ + dst1_r -= alpha_i * res1_i; \ + dst1_i += alpha_r * res1_i; \ + dst1_i += alpha_i * res1_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \ + \ + ST_SP4_INC(dst0, dst1, dst2, dst3, pc0, 4); \ + \ + LD_SP4(pc1, 4, dst0, dst1, dst2, dst3); \ + \ + PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \ + PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i); \ + \ + dst0_r += alpha_r * res2_r; \ + dst0_r -=
alpha_i * res2_i; \ + dst0_i += alpha_r * res2_i; \ + dst0_i += alpha_i * res2_r; \ + \ + dst1_r += alpha_r * res3_r; \ + dst1_r -= alpha_i * res3_i; \ + dst1_i += alpha_r * res3_i; \ + dst1_i += alpha_i * res3_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \ + \ + ST_SP4_INC(dst0, dst1, dst2, dst3, pc1, 4); \ +} + +/* Accumulates alpha * res into an 8x1 complex tile of C at pc0: loads four vectors, adds alpha_r*res_r minus alpha_i*res_i to the real operands and alpha_r*res_i plus alpha_i*res_r to the imaginary operands, then stores the four vectors. NOTE(review): LD_SP4, PCKEVOD_W2_SP, ILVRL_W2_SP and ST_SP4_INC are defined outside this chunk; presumably ST_SP4_INC also advances pc0 - confirm. */ +#define CGEMM_SCALE_8X1_MSA \ +{ \ + LD_SP4(pc0, 4, dst0, dst1, dst2, dst3); \ + \ + PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \ + PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i); \ + \ + dst0_r += alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i += alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + dst1_r += alpha_r * res1_r; \ + dst1_r -= alpha_i * res1_i; \ + dst1_i += alpha_r * res1_i; \ + dst1_i += alpha_i * res1_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \ + \ + ST_SP4_INC(dst0, dst1, dst2, dst3, pc0, 4); \ +} + +/* Accumulates alpha * res into a 4x4 complex tile of C: two vectors per column pointer (pc0..pc3), using the accumulator pairs res0, res2, res4 and res6 for columns 0..3. NOTE(review): helper macros are defined outside this chunk; presumably ST_SP2_INC also advances the pointer - confirm. */ +#define CGEMM_SCALE_4X4_MSA \ +{ \ + LD_SP2(pc0, 4, dst0, dst1); \ + \ + PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \ + \ + dst0_r += alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i += alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + \ + ST_SP2_INC(dst0, dst1, pc0, 4); \ + \ + LD_SP2(pc1, 4, dst0, dst1); \ + \ + PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \ + \ + dst0_r += alpha_r * res2_r; \ + dst0_r -= alpha_i * res2_i; \ + dst0_i += alpha_r * res2_i; \ + dst0_i += alpha_i * res2_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + \ + ST_SP2_INC(dst0, dst1, pc1, 4); \ + \ + LD_SP2(pc2, 4, dst0, dst1); \ + \ + PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \ + \ + dst0_r += alpha_r * res4_r; \ + dst0_r -= alpha_i * res4_i; \ + dst0_i += alpha_r * res4_i; \ + dst0_i += alpha_i * res4_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + \ + ST_SP2_INC(dst0, dst1, pc2, 4); \ + \ + LD_SP2(pc3, 4, dst0, dst1); \ + \ + PCKEVOD_W2_SP(dst1, dst0, dst0_r,
dst0_i); \ + \ + dst0_r += alpha_r * res6_r; \ + dst0_r -= alpha_i * res6_i; \ + dst0_i += alpha_r * res6_i; \ + dst0_i += alpha_i * res6_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + \ + ST_SP2_INC(dst0, dst1, pc3, 4); \ +} + +/* Accumulates alpha * res into a 4x2 complex tile of C: two vectors per column pointer (pc0, pc1), using the accumulator pairs res0 and res2. NOTE(review): helper macros are defined outside this chunk; presumably ST_SP2_INC also advances the pointer - confirm. */ +#define CGEMM_SCALE_4X2_MSA \ +{ \ + LD_SP2(pc0, 4, dst0, dst1); \ + \ + PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \ + \ + dst0_r += alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i += alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + \ + ST_SP2_INC(dst0, dst1, pc0, 4); \ + \ + LD_SP2(pc1, 4, dst0, dst1); \ + \ + PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \ + \ + dst0_r += alpha_r * res2_r; \ + dst0_r -= alpha_i * res2_i; \ + dst0_i += alpha_r * res2_i; \ + dst0_i += alpha_i * res2_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + \ + ST_SP2_INC(dst0, dst1, pc1, 4); \ +} + +/* Accumulates alpha * res0 into a 4x1 complex tile of C at pc0 (two vectors). */ +#define CGEMM_SCALE_4X1_MSA \ +{ \ + LD_SP2(pc0, 4, dst0, dst1); \ + \ + PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \ + \ + dst0_r += alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i += alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + \ + ST_SP2_INC(dst0, dst1, pc0, 4); \ +} + +/* Scalar accumulate of alpha * res into a 2x4 complex tile of C: in each column pointer pcJ, elements [0]/[1] are the real/imaginary parts of row 0 and [2]/[3] those of row 1. The pointers are not advanced by the macro. */ +#define CGEMM_SCALE_2X4 \ +{ \ + /* 0th col */ \ + pc0[0] += alphar * res0; \ + pc0[0] -= alphai * res1; \ + pc0[1] += alphar * res1; \ + pc0[1] += alphai * res0; \ + pc0[2] += alphar * res2; \ + pc0[2] -= alphai * res3; \ + pc0[3] += alphar * res3; \ + pc0[3] += alphai * res2; \ + \ + /* 1st col */ \ + pc1[0] += alphar * res4; \ + pc1[0] -= alphai * res5; \ + pc1[1] += alphar * res5; \ + pc1[1] += alphai * res4; \ + pc1[2] += alphar * res6; \ + pc1[2] -= alphai * res7; \ + pc1[3] += alphar * res7; \ + pc1[3] += alphai * res6; \ + \ + /* 2nd col */ \ + pc2[0] += alphar * res8; \ + pc2[0] -= alphai * res9; \ + pc2[1] += alphar * res9; \ + pc2[1] += alphai * res8; \ + pc2[2] += alphar * res10; \ + pc2[2] -= alphai * res11; \ + pc2[3] += alphar * res11; \
+ pc2[3] += alphai * res10; \ + \ + /* 3rd col */ \ + pc3[0] += alphar * res12; \ + pc3[0] -= alphai * res13; \ + pc3[1] += alphar * res13; \ + pc3[1] += alphai * res12; \ + pc3[2] += alphar * res14; \ + pc3[2] -= alphai * res15; \ + pc3[3] += alphar * res15; \ + pc3[3] += alphai * res14; \ +} + +/* Scalar accumulate of alpha * res into a 2x2 complex tile of C: in each column pointer (pc0, pc1), elements [0]/[1] are the real/imaginary parts of row 0 and [2]/[3] those of row 1. */ +#define CGEMM_SCALE_2X2 \ +{ \ + /* 0th col */ \ + pc0[0] += alphar * res0; \ + pc0[0] -= alphai * res1; \ + pc0[1] += alphar * res1; \ + pc0[1] += alphai * res0; \ + pc0[2] += alphar * res2; \ + pc0[2] -= alphai * res3; \ + pc0[3] += alphar * res3; \ + pc0[3] += alphai * res2; \ + \ + /* 1st col */ \ + pc1[0] += alphar * res4; \ + pc1[0] -= alphai * res5; \ + pc1[1] += alphar * res5; \ + pc1[1] += alphai * res4; \ + pc1[2] += alphar * res6; \ + pc1[2] -= alphai * res7; \ + pc1[3] += alphar * res7; \ + pc1[3] += alphai * res6; \ +} + +/* Scalar accumulate of alpha * res into a 2x1 complex tile of C: rows 0 and 1 of the single column at pc0. */ +#define CGEMM_SCALE_2X1 \ +{ \ + pc0[0] += alphar * res0; \ + pc0[0] -= alphai * res1; \ + pc0[1] += alphar * res1; \ + pc0[1] += alphai * res0; \ + \ + pc0[2] += alphar * res2; \ + pc0[2] -= alphai * res3; \ + pc0[3] += alphar * res3; \ + pc0[3] += alphai * res2; \ +} + +/* Scalar accumulate of alpha * res into a 1x4 complex tile of C: the single row lives at elements [0]/[1] of each column pointer pc0..pc3. */ +#define CGEMM_SCALE_1X4 \ +{ \ + pc0[0] += alphar * res0; \ + pc0[0] -= alphai * res1; \ + pc0[1] += alphar * res1; \ + pc0[1] += alphai * res0; \ + \ + pc1[0] += alphar * res2; \ + pc1[0] -= alphai * res3; \ + pc1[1] += alphar * res3; \ + pc1[1] += alphai * res2; \ + \ + pc2[0] += alphar * res4; \ + pc2[0] -= alphai * res5; \ + pc2[1] += alphar * res5; \ + pc2[1] += alphai * res4; \ + \ + pc3[0] += alphar * res6; \ + pc3[0] -= alphai * res7; \ + pc3[1] += alphar * res7; \ + pc3[1] += alphai * res6; \ +} + +/* Scalar accumulate of alpha * res into a 1x2 complex tile of C: res0/res1 update the single row of column 0 (pc0) and res2/res3 the single row of column 1 (pc1). */ +#define CGEMM_SCALE_1X2 \ +{ \ + pc0[0] += alphar * res0; \ + pc0[0] -= alphai * res1; \ + pc0[1] += alphar * res1; \ + pc0[1] += alphai * res0; \ + \ + /* the tile has one row, so column 1 is written at [0]/[1] exactly as in CGEMM_SCALE_1X4; [2]/[3] would be the next row */ \ + pc1[0] += alphar * res2; \ + pc1[0] -= alphai * res3; \ + pc1[1] += alphar * res3; \ + pc1[1] += alphai * res2; \ +} + +/* Scalar accumulate of alpha * res into the single complex element at pc0. */ +#define CGEMM_SCALE_1X1 \ +{ \ + pc0[0] += alphar * res0; \ + pc0[0] -= alphai * res1; \ + pc0[1] +=
alphar * res1; \ + pc0[1] += alphai * res0; \ +} + +/* TRMM variant of the 8x4 store: writes alpha * res to pc0..pc3 without reading the previous contents of C (the first statement of each real/imaginary pair is a plain assignment and there is no load). The real operands are alpha_r*res_r minus alpha_i*res_i, the imaginary operands alpha_r*res_i plus alpha_i*res_r. NOTE(review): ILVRL_W2_SP and ST_SP4_INC are defined outside this chunk; presumably they interleave real/imaginary lanes and store four vectors while advancing the pointer - confirm. */ +#define CGEMM_TRMM_SCALE_8X4_MSA \ +{ \ + dst0_r = alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i = alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + dst1_r = alpha_r * res1_r; \ + dst1_r -= alpha_i * res1_i; \ + dst1_i = alpha_r * res1_i; \ + dst1_i += alpha_i * res1_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \ + \ + ST_SP4_INC(dst0, dst1, dst2, dst3, pc0, 4); \ + \ + dst0_r = alpha_r * res2_r; \ + dst0_r -= alpha_i * res2_i; \ + dst0_i = alpha_r * res2_i; \ + dst0_i += alpha_i * res2_r; \ + \ + dst1_r = alpha_r * res3_r; \ + dst1_r -= alpha_i * res3_i; \ + dst1_i = alpha_r * res3_i; \ + dst1_i += alpha_i * res3_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \ + \ + ST_SP4_INC(dst0, dst1, dst2, dst3, pc1, 4); \ + \ + dst0_r = alpha_r * res4_r; \ + dst0_r -= alpha_i * res4_i; \ + dst0_i = alpha_r * res4_i; \ + dst0_i += alpha_i * res4_r; \ + \ + dst1_r = alpha_r * res5_r; \ + dst1_r -= alpha_i * res5_i; \ + dst1_i = alpha_r * res5_i; \ + dst1_i += alpha_i * res5_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \ + \ + ST_SP4_INC(dst0, dst1, dst2, dst3, pc2, 4); \ + \ + dst0_r = alpha_r * res6_r; \ + dst0_r -= alpha_i * res6_i; \ + dst0_i = alpha_r * res6_i; \ + dst0_i += alpha_i * res6_r; \ + \ + dst1_r = alpha_r * res7_r; \ + dst1_r -= alpha_i * res7_i; \ + dst1_i = alpha_r * res7_i; \ + dst1_i += alpha_i * res7_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \ + \ + ST_SP4_INC(dst0, dst1, dst2, dst3, pc3, 4); \ +} + +/* TRMM variant of the 8x2 store: writes alpha * res to pc0 and pc1 without reading the previous contents of C. */ +#define CGEMM_TRMM_SCALE_8X2_MSA \ +{ \ + dst0_r = alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i = alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + dst1_r = alpha_r * res1_r; \ + dst1_r -= alpha_i * res1_i; \ + dst1_i = alpha_r * res1_i; \ + dst1_i +=
alpha_i * res1_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \ + \ + ST_SP4_INC(dst0, dst1, dst2, dst3, pc0, 4); \ + \ + dst0_r = alpha_r * res2_r; \ + dst0_r -= alpha_i * res2_i; \ + dst0_i = alpha_r * res2_i; \ + dst0_i += alpha_i * res2_r; \ + \ + dst1_r = alpha_r * res3_r; \ + dst1_r -= alpha_i * res3_i; \ + dst1_i = alpha_r * res3_i; \ + dst1_i += alpha_i * res3_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \ + \ + ST_SP4_INC(dst0, dst1, dst2, dst3, pc1, 4); \ +} + +/* TRMM variant of the 8x1 store: writes alpha * res (res0 and res1 pairs) to pc0 without reading the previous contents of C; the first statement of each real/imaginary pair is a plain assignment. NOTE(review): ILVRL_W2_SP and ST_SP4_INC are defined outside this chunk; presumably ST_SP4_INC also advances pc0 - confirm. */ +#define CGEMM_TRMM_SCALE_8X1_MSA \ +{ \ + dst0_r = alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i = alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + dst1_r = alpha_r * res1_r; \ + dst1_r -= alpha_i * res1_i; \ + dst1_i = alpha_r * res1_i; \ + dst1_i += alpha_i * res1_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \ + \ + ST_SP4_INC(dst0, dst1, dst2, dst3, pc0, 4); \ +} + +/* TRMM variant of the 4x4 store: writes alpha * res to pc0..pc3 (two vectors each, accumulator pairs res0, res2, res4, res6) without reading the previous contents of C. */ +#define CGEMM_TRMM_SCALE_4X4_MSA \ +{ \ + dst0_r = alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i = alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + \ + ST_SP2_INC(dst0, dst1, pc0, 4); \ + \ + dst0_r = alpha_r * res2_r; \ + dst0_r -= alpha_i * res2_i; \ + dst0_i = alpha_r * res2_i; \ + dst0_i += alpha_i * res2_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + \ + ST_SP2_INC(dst0, dst1, pc1, 4); \ + \ + dst0_r = alpha_r * res4_r; \ + dst0_r -= alpha_i * res4_i; \ + dst0_i = alpha_r * res4_i; \ + dst0_i += alpha_i * res4_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + \ + ST_SP2_INC(dst0, dst1, pc2, 4); \ + \ + dst0_r = alpha_r * res6_r; \ + dst0_r -= alpha_i * res6_i; \ + dst0_i = alpha_r * res6_i; \ + dst0_i += alpha_i * res6_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + \ + ST_SP2_INC(dst0, dst1, pc3, 4); \ +} + +/* TRMM variant of the 4x2 store: writes alpha * res to pc0 and pc1 (two vectors each) without reading the previous contents of C. */ +#define CGEMM_TRMM_SCALE_4X2_MSA \ +{ \ + dst0_r =
alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i = alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + \ + ST_SP2_INC(dst0, dst1, pc0, 4); \ + \ + dst0_r = alpha_r * res2_r; \ + dst0_r -= alpha_i * res2_i; \ + dst0_i = alpha_r * res2_i; \ + dst0_i += alpha_i * res2_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + \ + ST_SP2_INC(dst0, dst1, pc1, 4); \ +} + +/* TRMM variant of the 4x1 store: writes alpha * res0 to pc0 (two vectors) without reading the previous contents of C. NOTE(review): ILVRL_W2_SP and ST_SP2_INC are defined outside this chunk; presumably ST_SP2_INC also advances pc0 - confirm. */ +#define CGEMM_TRMM_SCALE_4X1_MSA \ +{ \ + dst0_r = alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i = alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ + \ + ST_SP2_INC(dst0, dst1, pc0, 4); \ +} + +/* TRMM variant of the scalar 2x4 store: overwrites the tile with alpha * res. In each column pointer pcJ, elements [0]/[1] are the real/imaginary parts of row 0 and [2]/[3] those of row 1; the first statement for each element is a plain assignment, so the previous contents of C are not used. */ +#define CGEMM_TRMM_SCALE_2X4 \ +{ \ + /* 0th col */ \ + pc0[0] = alphar * res0; \ + pc0[0] -= alphai * res1; \ + pc0[1] = alphar * res1; \ + pc0[1] += alphai * res0; \ + pc0[2] = alphar * res2; \ + pc0[2] -= alphai * res3; \ + pc0[3] = alphar * res3; \ + pc0[3] += alphai * res2; \ + \ + /* 1st col */ \ + pc1[0] = alphar * res4; \ + pc1[0] -= alphai * res5; \ + pc1[1] = alphar * res5; \ + pc1[1] += alphai * res4; \ + pc1[2] = alphar * res6; \ + pc1[2] -= alphai * res7; \ + pc1[3] = alphar * res7; \ + pc1[3] += alphai * res6; \ + \ + /* 2nd col */ \ + pc2[0] = alphar * res8; \ + pc2[0] -= alphai * res9; \ + pc2[1] = alphar * res9; \ + pc2[1] += alphai * res8; \ + pc2[2] = alphar * res10; \ + pc2[2] -= alphai * res11; \ + pc2[3] = alphar * res11; \ + pc2[3] += alphai * res10; \ + \ + /* 3rd col */ \ + pc3[0] = alphar * res12; \ + pc3[0] -= alphai * res13; \ + pc3[1] = alphar * res13; \ + pc3[1] += alphai * res12; \ + pc3[2] = alphar * res14; \ + pc3[2] -= alphai * res15; \ + pc3[3] = alphar * res15; \ + pc3[3] += alphai * res14; \ +} + +/* TRMM variant of the scalar 2x2 store: overwrites rows 0 and 1 of the columns at pc0 and pc1 with alpha * res. */ +#define CGEMM_TRMM_SCALE_2X2 \ +{ \ + /* 0th col */ \ + pc0[0] = alphar * res0; \ + pc0[0] -= alphai * res1; \ + pc0[1] = alphar * res1; \ + pc0[1] += alphai * res0; \ + pc0[2] = alphar * res2; \ + pc0[2] -= alphai * res3; \ + pc0[3] = alphar * res3; \ +
pc0[3] += alphai * res2; \ + \ + /* 1st col */ \ + pc1[0] = alphar * res4; \ + pc1[0] -= alphai * res5; \ + pc1[1] = alphar * res5; \ + pc1[1] += alphai * res4; \ + pc1[2] = alphar * res6; \ + pc1[2] -= alphai * res7; \ + pc1[3] = alphar * res7; \ + pc1[3] += alphai * res6; \ +} + +/* TRMM variant of the scalar 2x1 store: overwrites rows 0 and 1 of the single column at pc0 with alpha * res; the previous contents of C are not used. */ +#define CGEMM_TRMM_SCALE_2X1 \ +{ \ + pc0[0] = alphar * res0; \ + pc0[0] -= alphai * res1; \ + pc0[1] = alphar * res1; \ + pc0[1] += alphai * res0; \ + \ + pc0[2] = alphar * res2; \ + pc0[2] -= alphai * res3; \ + pc0[3] = alphar * res3; \ + pc0[3] += alphai * res2; \ +} + +/* TRMM variant of the scalar 1x4 store: overwrites the single row, at elements [0]/[1] of each column pointer pc0..pc3, with alpha * res. */ +#define CGEMM_TRMM_SCALE_1X4 \ +{ \ + pc0[0] = alphar * res0; \ + pc0[0] -= alphai * res1; \ + pc0[1] = alphar * res1; \ + pc0[1] += alphai * res0; \ + \ + pc1[0] = alphar * res2; \ + pc1[0] -= alphai * res3; \ + pc1[1] = alphar * res3; \ + pc1[1] += alphai * res2; \ + \ + pc2[0] = alphar * res4; \ + pc2[0] -= alphai * res5; \ + pc2[1] = alphar * res5; \ + pc2[1] += alphai * res4; \ + \ + pc3[0] = alphar * res6; \ + pc3[0] -= alphai * res7; \ + pc3[1] = alphar * res7; \ + pc3[1] += alphai * res6; \ +} + +/* TRMM variant of the scalar 1x2 store: res0/res1 overwrite the single row of column 0 (pc0) and res2/res3 the single row of column 1 (pc1). */ +#define CGEMM_TRMM_SCALE_1X2 \ +{ \ + pc0[0] = alphar * res0; \ + pc0[0] -= alphai * res1; \ + pc0[1] = alphar * res1; \ + pc0[1] += alphai * res0; \ + \ + /* the tile has one row, so column 1 is written at [0]/[1] exactly as in CGEMM_TRMM_SCALE_1X4; [2]/[3] would be the next row */ \ + pc1[0] = alphar * res2; \ + pc1[0] -= alphai * res3; \ + pc1[1] = alphar * res3; \ + pc1[1] += alphai * res2; \ +} + +/* TRMM variant of the scalar 1x1 store: overwrites the single complex element at pc0 with alpha * res. */ +#define CGEMM_TRMM_SCALE_1X1 \ +{ \ + pc0[0] = alphar * res0; \ + pc0[0] -= alphai * res1; \ + pc0[1] = alphar * res1; \ + pc0[1] += alphai * res0; \ +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai, + FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc +#ifdef TRMMKERNEL + , BLASLONG offset +#endif + ) +{ + BLASLONG i, j, l, temp; +#if defined(TRMMKERNEL) + BLASLONG off; +#endif + FLOAT *pc0, *pc1, *pc2, *pc3; + FLOAT *pa0, *pb0; + FLOAT res0, res1, res2, res3, res4, res5, res6, res7; + FLOAT res8, res9, res10, res11, res12, res13, res14, res15; + FLOAT a0_r, a1_r; + FLOAT a0_i, a1_i; + FLOAT b0_r, b1_r, b2_r, b3_r; + FLOAT
b0_i, b1_i, b2_i, b3_i; + v4f32 src_a0, src_a1, src_a2, src_a3, src_b0, src_b1; + v4f32 src_a0r, src_a0i, src_a1r, src_a1i, src_br, src_bi; + v4f32 dst0, dst1, dst2, dst3; + v4f32 alpha_r, alpha_i; + v4f32 res0_r, res0_i, res1_r, res1_i, res2_r, res2_i, res3_r, res3_i; + v4f32 res4_r, res4_i, res5_r, res5_i, res6_r, res6_i, res7_r, res7_i; + v4f32 dst0_r, dst0_i, dst1_r, dst1_i; + + alpha_r = COPY_FLOAT_TO_VECTOR(alphar); + alpha_i = COPY_FLOAT_TO_VECTOR(alphai); + +#if defined(TRMMKERNEL) && !defined(LEFT) + off = -offset; +#endif + + for (j = (n >> 2); j--;) + { + pc0 = C; + pc1 = pc0 + 2 * ldc; + pc2 = pc1 + 2 * ldc; + pc3 = pc2 + 2 * ldc; + + pa0 = A; + +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + + for (i = (m >> 3); i--;) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2 * 8; + pb0 = B + off * 2 * 4; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 8; // number of values in A +#else + temp = off + 4; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_8X4_MSA(, -, , +, +); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_8X4_MSA(, +, , +, -); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_8X4_MSA(, +, , -, +); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_8X4_MSA(, -, , -, -); +#endif + + for (l = (temp - 1); l--;) + { +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_8X4_MSA(+, -, +, +,); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_8X4_MSA(+, +, -, +,); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_8X4_MSA(+, +, +, -,); +#endif +#if 
defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_8X4_MSA(+, -, -, -,); +#endif + } + +#if defined(TRMMKERNEL) + CGEMM_TRMM_SCALE_8X4_MSA +#else + CGEMM_SCALE_8X4_MSA +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 8; // number of values in A +#else + temp -= 4; // number of values in B +#endif + pa0 += temp * 2 * 8; + pb0 += temp * 2 * 4; +#endif + +#ifdef LEFT + off += 8; // number of values in A +#endif +#endif + } + + if (m & 4) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2 * 4; + pb0 = B + off * 2 * 4; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 4; // number of values in A +#else + temp = off + 4; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_4X4_MSA(, -, , +, +); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_4X4_MSA(, +, , +, -); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_4X4_MSA(, +, , -, +); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_4X4_MSA(, -, , -, -); +#endif + + for (l = (temp - 1); l--;) + { +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_4X4_MSA(+, -, +, +,); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_4X4_MSA(+, +, -, +,); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_4X4_MSA(+, +, +, -,); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_4X4_MSA(+, -, -, -,); +#endif + } + +#if defined(TRMMKERNEL) + CGEMM_TRMM_SCALE_4X4_MSA +#else + CGEMM_SCALE_4X4_MSA +#endif + 
+#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 4; // number of values in A +#else + temp -= 4; // number of values in B +#endif + pa0 += temp * 2 * 4; + pb0 += temp * 2 * 4; +#endif + +#ifdef LEFT + off += 4; // number of values in A +#endif +#endif + } + + if (m & 2) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2 * 2; + pb0 = B + off * 2 * 4; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 2; // number of values in A +#else + temp = off + 4; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_2X4(, -, , +, +); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_2X4(, +, , +, -); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_2X4(, +, , -, +); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_2X4(, -, , -, -); +#endif + + pa0 += 4; + pb0 += 8; + + for (l = (temp - 1); l--;) + { +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_2X4(+, -, +, +,); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_2X4(+, +, -, +,); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_2X4(+, +, +, -,); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_2X4(+, -, -, -,); +#endif + + pa0 += 4; + pb0 += 8; + } + +#if defined(TRMMKERNEL) + CGEMM_TRMM_SCALE_2X4 +#else + CGEMM_SCALE_2X4 +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 2; // number of values in A +#else + temp 
-= 4; // number of values in B +#endif + pa0 += temp * 2 * 2; + pb0 += temp * 2 * 4; +#endif + +#ifdef LEFT + off += 2; // number of values in A +#endif +#endif + + pc0 += 4; + pc1 += 4; + pc2 += 4; + pc3 += 4; + } + + if (m & 1) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2 * 1; + pb0 = B + off * 2 * 4; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 1; // number of values in A +#else + temp = off + 4; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_1X4(, -, , +, +); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_1X4(, +, , +, -); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_1X4(, +, , -, +); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_1X4(, -, , -, -); +#endif + + pa0 += 2; + pb0 += 8; + + for (l = (temp - 1); l--;) + { +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_1X4(+, -, +, +,); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_1X4(+, +, -, +,); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_1X4(+, +, +, -,); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_1X4(+, -, -, -,); +#endif + + pa0 += 2; + pb0 += 8; + } + +#if defined(TRMMKERNEL) + CGEMM_TRMM_SCALE_1X4 +#else + CGEMM_SCALE_1X4 +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 1; // number of values in A +#else + temp -= 4; // number of values in B +#endif + pa0 += temp * 2 * 1; + pb0 += temp * 2 * 4; +#endif + +#ifdef LEFT + off += 1; // number of 
values in A +#endif +#endif + + pc0 += 2; + pc1 += 2; + pc2 += 2; + pc3 += 2; + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 4; // number of values in A +#endif + + l = k << 3; + B = B + l; + i = ldc << 3; + C = C + i; + } + + if (n & 2) + { + pc0 = C; + pc1 = pc0 + 2 * ldc; + + pa0 = A; + +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + + for (i = (m >> 3); i--;) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2 * 8; + pb0 = B + off * 2 * 2; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 8; // number of values in A +#else + temp = off + 2; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_8X2_MSA(, -, , +, +); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_8X2_MSA(, +, , +, -); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_8X2_MSA(, +, , -, +); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_8X2_MSA(, -, , -, -); +#endif + + pb0 += 4; + + for (l = (temp - 1); l--;) + { +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_8X2_MSA(+, -, +, +,); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_8X2_MSA(+, +, -, +,); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_8X2_MSA(+, +, +, -,); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_8X2_MSA(+, -, -, -,); +#endif + + pb0 += 4; + } + +#if defined(TRMMKERNEL) + CGEMM_TRMM_SCALE_8X2_MSA +#else + CGEMM_SCALE_8X2_MSA +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp 
-= 8; // number of values in A +#else + temp -= 2; // number of values in B +#endif + pa0 += temp * 2 * 8; + pb0 += temp * 2 * 2; +#endif + +#ifdef LEFT + off += 8; // number of values in A +#endif +#endif + } + + if (m & 4) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2 * 4; + pb0 = B + off * 2 * 2; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 4; // number of values in A +#else + temp = off + 2; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_4X2_MSA(, -, , +, +); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_4X2_MSA(, +, , +, -); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_4X2_MSA(, +, , -, +); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_4X2_MSA(, -, , -, -); +#endif + + pb0 += 4; + + for (l = (temp - 1); l--;) + { +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_4X2_MSA(+, -, +, +,); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_4X2_MSA(+, +, -, +,); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_4X2_MSA(+, +, +, -,); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_4X2_MSA(+, -, -, -,); +#endif + + pb0 += 4; + } + +#if defined(TRMMKERNEL) + CGEMM_TRMM_SCALE_4X2_MSA +#else + CGEMM_SCALE_4X2_MSA +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 4; // number of values in A +#else + temp -= 2; // number of values in B +#endif + pa0 += temp * 2 * 4; + pb0 += temp * 2 * 2; +#endif + +#ifdef LEFT + off += 4; // 
number of values in A +#endif +#endif + } + + if (m & 2) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2 * 2; + pb0 = B + off * 2 * 2; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 2; // number of values in A +#else + temp = off + 2; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_2X2(, -, , +, +); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_2X2(, +, , +, -); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_2X2(, +, , -, +); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_2X2(, -, , -, -); +#endif + + pa0 += 4; + pb0 += 4; + + for (l = (temp - 1); l--;) + { +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_2X2(+, -, +, +,); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_2X2(+, +, -, +,); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_2X2(+, +, +, -,); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_2X2(+, -, -, -,); +#endif + + pa0 += 4; + pb0 += 4; + } + +#if defined(TRMMKERNEL) + CGEMM_TRMM_SCALE_2X2 +#else + CGEMM_SCALE_2X2 +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 2; // number of values in A +#else + temp -= 2; // number of values in B +#endif + pa0 += temp * 2 * 2; + pb0 += temp * 2 * 2; +#endif + +#ifdef LEFT + off += 2; // number of values in A +#endif +#endif + + pc0 += 4; + pc1 += 4; + } + + if (m & 1) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && 
!defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2 * 1; + pb0 = B + off * 2 * 2; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 1; // number of values in A +#else + temp = off + 2; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_1X2(, -, , +, +); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_1X2(, +, , +, -); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_1X2(, +, , -, +); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_1X2(, -, , -, -); +#endif + + pa0 += 2; + pb0 += 4; + + for (l = (temp - 1); l--;) + { +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_1X2(+, -, +, +,); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_1X2(+, +, -, +,); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_1X2(+, +, +, -,); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_1X2(+, -, -, -,); +#endif + + pa0 += 2; + pb0 += 4; + } + +#if defined(TRMMKERNEL) + CGEMM_TRMM_SCALE_1X2 +#else + CGEMM_SCALE_1X2 +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 1; // number of values in A +#else + temp -= 2; // number of values in B +#endif + pa0 += temp * 2 * 1; + pb0 += temp * 2 * 2; +#endif + +#ifdef LEFT + off += 1; // number of values in A +#endif +#endif + + pc0 += 2; + pc1 += 2; + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 2; // number of values in A +#endif + + l = k << 2; + B = B + l; + i = ldc << 2; + C = C + i; + } + + if (n & 1) + { + pc0 = C; + pa0 = A; + +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + + 
for (i = (m >> 3); i--;) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2 * 8; + pb0 = B + off * 2 * 1; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 8; // number of values in A +#else + temp = off + 1; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_8X1_MSA(, -, , +, +); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_8X1_MSA(, +, , +, -); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_8X1_MSA(, +, , -, +); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_8X1_MSA(, -, , -, -); +#endif + + pb0 += 2; + + for (l = (temp - 1); l--;) + { +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_8X1_MSA(+, -, +, +,); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_8X1_MSA(+, +, -, +,); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_8X1_MSA(+, +, +, -,); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_8X1_MSA(+, -, -, -,); +#endif + + pb0 += 2; + } + +#if defined(TRMMKERNEL) + CGEMM_TRMM_SCALE_8X1_MSA +#else + CGEMM_SCALE_8X1_MSA +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 8; // number of values in A +#else + temp -= 1; // number of values in B +#endif + pa0 += temp * 2 * 8; + pb0 += temp * 2 * 1; +#endif + +#ifdef LEFT + off += 8; // number of values in A +#endif +#endif + } + + if (m & 4) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2 * 
4; + pb0 = B + off * 2 * 1; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 4; // number of values in A +#else + temp = off + 1; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_4X1_MSA(, -, , +, +); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_4X1_MSA(, +, , +, -); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_4X1_MSA(, +, , -, +); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_4X1_MSA(, -, , -, -); +#endif + + pb0 += 2; + + for (l = (temp - 1); l--;) + { +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_4X1_MSA(+, -, +, +,); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_4X1_MSA(+, +, -, +,); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_4X1_MSA(+, +, +, -,); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_4X1_MSA(+, -, -, -,); +#endif + + pb0 += 2; + } + +#if defined(TRMMKERNEL) + CGEMM_TRMM_SCALE_4X1_MSA +#else + CGEMM_SCALE_4X1_MSA +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 4; // number of values in A +#else + temp -= 1; // number of values in B +#endif + pa0 += temp * 2 * 4; + pb0 += temp * 2 * 1; +#endif + +#ifdef LEFT + off += 4; // number of values in A +#endif +#endif + } + + if (m & 2) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2 * 2; + pb0 = B + off * 2 * 1; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 
2; // number of values in A +#else + temp = off + 1; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_2X1(, -, , +, +); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_2X1(, +, , +, -); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_2X1(, +, , -, +); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_2X1(, -, , -, -); +#endif + + pa0 += 4; + pb0 += 2; + + for (l = (temp - 1); l--;) + { +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_2X1(+, -, +, +,); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_2X1(+, +, -, +,); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_2X1(+, +, +, -,); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_2X1(+, -, -, -,); +#endif + + pa0 += 4; + pb0 += 2; + } + +#if defined(TRMMKERNEL) + CGEMM_TRMM_SCALE_2X1 +#else + CGEMM_SCALE_2X1 +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 2; // number of values in A +#else + temp -= 1; // number of values in B +#endif + pa0 += temp * 2 * 2; + pb0 += temp * 2 * 1; +#endif + +#ifdef LEFT + off += 2; // number of values in A +#endif +#endif + + pc0 += 4; + } + + if (m & 1) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2 * 1; + pb0 = B + off * 2 * 1; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 1; // number of values in A +#else + temp = off + 1; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + +#if defined(NN) || defined(NT) || defined(TN) 
|| defined(TT) + CGEMM_KERNEL_1X1(, -, , +, +); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_1X1(, +, , +, -); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_1X1(, +, , -, +); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_1X1(, -, , -, -); +#endif + + pa0 += 2; + pb0 += 2; + + for (l = (temp - 1); l--;) + { +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + CGEMM_KERNEL_1X1(+, -, +, +,); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + CGEMM_KERNEL_1X1(+, +, -, +,); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + CGEMM_KERNEL_1X1(+, +, +, -,); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + CGEMM_KERNEL_1X1(+, -, -, -,); +#endif + + pa0 += 2; + pb0 += 2; + } + +#if defined(TRMMKERNEL) + CGEMM_TRMM_SCALE_1X1 +#else + CGEMM_SCALE_1X1 +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 1; // number of values in A +#else + temp -= 1; // number of values in B +#endif + pa0 += temp * 2 * 1; + pb0 += temp * 2 * 1; +#endif + +#ifdef LEFT + off += 1; // number of values in A +#endif +#endif + + pc0 += 2; + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 1; // number of values in A +#endif + + l = k << 1; + B = B + l; + i = ldc << 1; + C = C + i; + } + + return 0; +} diff --git a/kernel/mips/cgemm_ncopy_4_msa.c b/kernel/mips/cgemm_ncopy_4_msa.c new file mode 100644 index 000000000..b38290b3d --- /dev/null +++ b/kernel/mips/cgemm_ncopy_4_msa.c @@ -0,0 +1,195 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h"
+/* Pack routine: walks n in groups of 4, 2 and 1 source lines spaced lda elements apart and, for each position along m, writes one FLOAT pair from every line of the group to consecutive dst slots. Always returns 0. */
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst)
+{
+    BLASLONG i, j;
+    FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *pdst; /* psrc0: start of the next group; psrc1..psrc4: lines of the current group */
+    FLOAT ctemp01, ctemp02, ctemp03, ctemp04; /* scalar temporaries for the m & 1 tails */
+    FLOAT ctemp05, ctemp06, ctemp07, ctemp08;
+    v4f32 src0, src1, src2, src3, src4, src5, src6, src7;
+    v4f32 dst0, dst1, dst4, dst5;
+
+    psrc0 = src;
+    pdst = dst;
+    lda *= 2; /* stride in FLOATs: every element is a pair of FLOATs (presumably real and imaginary part - confirm) */
+
+    for (j = (n >> 2); j--;) /* full groups of 4 lines */
+    {
+        psrc1 = psrc0;
+        psrc2 = psrc1 + lda;
+        psrc3 = psrc2 + lda;
+        psrc4 = psrc3 + lda;
+        psrc0 += 4 * lda; /* start of the following group */
+
+        for (i = (m >> 2); i--;) /* 4 elements (8 FLOATs) of each line per pass */
+        {
+            LD_SP2_INC(psrc1, 4, src0, src1); /* macro from macros_msa.h; presumably loads two vectors and advances the pointer - confirm */
+            LD_SP2_INC(psrc2, 4, src2, src3);
+            LD_SP2_INC(psrc3, 4, src4, src5);
+            LD_SP2_INC(psrc4, 4, src6, src7);
+
+            ILVRL_D2_SP(src2, src0, dst0, dst4); /* NOTE(review): presumably interleaves 64-bit halves, pairing the same element of two lines - confirm against macros_msa.h */
+            ILVRL_D2_SP(src6, src4, dst1, dst5);
+
+            ST_SP4_INC(dst0, dst1, dst4, dst5, pdst, 4); /* output built from the first vector of each line */
+
+            ILVRL_D2_SP(src3, src1, dst0, dst4);
+            ILVRL_D2_SP(src7, src5, dst1, dst5);
+
+            ST_SP4_INC(dst0, dst1, dst4, dst5, pdst, 4); /* output built from the second vector of each line */
+        }
+
+        if (m & 2) /* two leftover elements: one vector per line */
+        {
+            src0 = LD_SP(psrc1);
+            src2 = LD_SP(psrc2);
+            src4 = LD_SP(psrc3);
+            src6 = LD_SP(psrc4);
+            psrc1 += 4;
+            psrc2 += 4;
+            psrc3 += 4;
+            psrc4 += 4;
+
+            ILVRL_D2_SP(src2, src0, dst0, dst4);
+            ILVRL_D2_SP(src6, src4, dst1, dst5);
+
+            ST_SP4_INC(dst0, dst1, dst4, dst5, pdst, 4);
+        }
+
+        if (m & 1) /* last element: scalar copy of one FLOAT pair from each of the 4 lines */
+        {
+            ctemp01 = *(psrc1 + 0);
+            ctemp02 = *(psrc1 + 1);
+            ctemp03 = *(psrc2 + 0);
+            ctemp04 = *(psrc2 + 1);
+            ctemp05 = *(psrc3 + 0);
+            ctemp06 = *(psrc3 + 1);
+            ctemp07 = *(psrc4 + 0);
+            ctemp08 = *(psrc4 + 1);
+            psrc1 += 2;
+            psrc2 += 2;
+            psrc3 += 2;
+            psrc4 += 2;
+
+            *(pdst + 0) = ctemp01;
+            *(pdst + 1) = ctemp02;
+            *(pdst + 2) = ctemp03;
+            *(pdst + 3) = ctemp04;
+            *(pdst + 4) = ctemp05;
+            *(pdst + 5) = ctemp06;
+            *(pdst + 6) = ctemp07;
+            *(pdst + 7) = ctemp08;
+            pdst += 8; /* 4 lines x 2 FLOATs */
+        }
+    }
+
+    if (n & 2) /* leftover group of 2 lines */
+    {
+        psrc1 = psrc0;
+        psrc2 = psrc1 + lda;
+        psrc0 += 2 * lda;
+
+        for (i = (m >> 2); i--;)
+        {
+            LD_SP2_INC(psrc1, 4, src0, src1);
+            LD_SP2_INC(psrc2, 4, src2, src3);
+
+            ILVRL_D2_SP(src2, src0, dst0, dst4);
+
+            ST_SP2_INC(dst0, dst4, pdst, 4);
+
+            ILVRL_D2_SP(src3, src1, dst0, dst4);
+
+            ST_SP2_INC(dst0, dst4, pdst, 4);
+        }
+
+        if (m & 2)
+        {
+            src0 = LD_SP(psrc1);
+            src2 = LD_SP(psrc2);
+            psrc1 += 4;
+            psrc2 += 4;
+
+            ILVRL_D2_SP(src2, src0, dst0, dst4);
+
+            ST_SP2_INC(dst0, dst4, pdst, 4);
+        }
+
+        if (m & 1) /* one FLOAT pair from each of the 2 lines */
+        {
+            ctemp01 = *(psrc1 + 0);
+            ctemp02 = *(psrc1 + 1);
+            ctemp03 = *(psrc2 + 0);
+            ctemp04 = *(psrc2 + 1);
+            psrc1 += 2;
+            psrc2 += 2;
+
+            *(pdst + 0) = ctemp01;
+            *(pdst + 1) = ctemp02;
+            *(pdst + 2) = ctemp03;
+            *(pdst + 3) = ctemp04;
+            pdst += 4;
+        }
+    }
+
+    if (n & 1) /* single leftover line: no interleave, straight copy */
+    {
+        psrc1 = psrc0;
+
+        for (i = (m >> 2); i--;)
+        {
+            LD_SP2_INC(psrc1, 4, src0, src1);
+            ST_SP2_INC(src0, src1, pdst, 4);
+        }
+
+        if (m & 2)
+        {
+            src0 = LD_SP(psrc1);
+            psrc1 += 4;
+
+            ST_SP(src0, pdst);
+            pdst += 4;
+        }
+
+        if (m & 1)
+        {
+            ctemp01 = *(psrc1 + 0);
+            ctemp02 = *(psrc1 + 1);
+            psrc1 += 2;
+
+            *(pdst + 0) = ctemp01;
+            *(pdst + 1) = ctemp02;
+            pdst += 2;
+        }
+    }
+
+    return 0; /* no failure path: the result is always 0 */
+}
diff --git a/kernel/mips/cgemm_ncopy_8_msa.c b/kernel/mips/cgemm_ncopy_8_msa.c new file mode 100644 index 000000000..9ea749069 --- /dev/null +++ b/kernel/mips/cgemm_ncopy_8_msa.c @@ -0,0 +1,310 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3.
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h"
+/* Pack routine: walks n in groups of 8, 4, 2 and 1 source lines spaced lda elements apart and, for each position along m, writes one FLOAT pair from every line of the group to consecutive dst slots. Always returns 0. */
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst)
+{
+    BLASLONG i, j;
+    FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *psrc5, *psrc6, *psrc7; /* psrc0: start of the next group; psrc1..psrc8: lines of the current group */
+    FLOAT *psrc8, *pdst;
+    FLOAT ctemp01, ctemp02, ctemp03, ctemp04, ctemp05, ctemp06, ctemp07; /* scalar temporaries for the m & 1 tails */
+    FLOAT ctemp08, ctemp09, ctemp10, ctemp11, ctemp12, ctemp13, ctemp14;
+    FLOAT ctemp15, ctemp16;
+    v4f32 src0, src1, src2, src3, src4, src5, src6, src7;
+    v4f32 src8, src9, src10, src11, src12, src13, src14, src15;
+    v4f32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+
+    psrc0 = src;
+    pdst = dst;
+    lda *= 2; /* stride in FLOATs: every element is a pair of FLOATs (presumably real and imaginary part - confirm) */
+
+    for (j = (n >> 3); j--;) /* full groups of 8 lines */
+    {
+        psrc1 = psrc0;
+        psrc2 = psrc1 + lda;
+        psrc3 = psrc2 + lda;
+        psrc4 = psrc3 + lda;
+        psrc5 = psrc4 + lda;
+        psrc6 = psrc5 + lda;
+        psrc7 = psrc6 + lda;
+        psrc8 = psrc7 + lda;
+        psrc0 += 8 * lda; /* start of the following group */
+
+        for (i = (m >> 2); i--;) /* 4 elements (8 FLOATs) of each line per pass */
+        {
+            LD_SP2_INC(psrc1, 4, src0, src1); /* macro from macros_msa.h; presumably loads two vectors and advances the pointer - confirm */
+            LD_SP2_INC(psrc2, 4, src2, src3);
+            LD_SP2_INC(psrc3, 4, src4, src5);
+            LD_SP2_INC(psrc4, 4, src6, src7);
+            LD_SP2_INC(psrc5, 4, src8, src9);
+            LD_SP2_INC(psrc6, 4, src10, src11);
+            LD_SP2_INC(psrc7, 4, src12, src13);
+            LD_SP2_INC(psrc8, 4, src14, src15);
+
+            ILVRL_D2_SP(src2, src0, dst0, dst4); /* NOTE(review): presumably interleaves 64-bit halves, pairing the same element of two lines - confirm against macros_msa.h */
+            ILVRL_D2_SP(src6, src4, dst1, dst5);
+            ILVRL_D2_SP(src10, src8, dst2, dst6);
+            ILVRL_D2_SP(src14, src12, dst3, dst7);
+
+            ST_SP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 4); /* output built from the first vector of each line */
+
+            ILVRL_D2_SP(src3, src1, dst0, dst4);
+            ILVRL_D2_SP(src7, src5, dst1, dst5);
+            ILVRL_D2_SP(src11, src9, dst2, dst6);
+            ILVRL_D2_SP(src15, src13, dst3, dst7);
+
+            ST_SP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 4); /* output built from the second vector of each line */
+        }
+
+        if (m & 2) /* two leftover elements: one vector per line */
+        {
+            src0 = LD_SP(psrc1);
+            src2 = LD_SP(psrc2);
+            src4 = LD_SP(psrc3);
+            src6 = LD_SP(psrc4);
+            src8 = LD_SP(psrc5);
+            src10 = LD_SP(psrc6);
+            src12 = LD_SP(psrc7);
+            src14 = LD_SP(psrc8);
+            psrc1 += 4;
+            psrc2 += 4;
+            psrc3 += 4;
+            psrc4 += 4;
+            psrc5 += 4;
+            psrc6 += 4;
+            psrc7 += 4;
+            psrc8 += 4;
+
+            ILVRL_D2_SP(src2, src0, dst0, dst4);
+            ILVRL_D2_SP(src6, src4, dst1, dst5);
+            ILVRL_D2_SP(src10, src8, dst2, dst6);
+            ILVRL_D2_SP(src14, src12, dst3, dst7);
+
+            ST_SP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 4);
+        }
+
+        if (m & 1) /* last element: scalar copy of one FLOAT pair from each of the 8 lines */
+        {
+            ctemp01 = *(psrc1 + 0);
+            ctemp02 = *(psrc1 + 1);
+            ctemp03 = *(psrc2 + 0);
+            ctemp04 = *(psrc2 + 1);
+            ctemp05 = *(psrc3 + 0);
+            ctemp06 = *(psrc3 + 1);
+            ctemp07 = *(psrc4 + 0);
+            ctemp08 = *(psrc4 + 1);
+            ctemp09 = *(psrc5 + 0);
+            ctemp10 = *(psrc5 + 1);
+            ctemp11 = *(psrc6 + 0);
+            ctemp12 = *(psrc6 + 1);
+            ctemp13 = *(psrc7 + 0);
+            ctemp14 = *(psrc7 + 1);
+            ctemp15 = *(psrc8 + 0);
+            ctemp16 = *(psrc8 + 1);
+            psrc1 += 2;
+            psrc2 += 2;
+            psrc3 += 2;
+            psrc4 += 2;
+            psrc5 += 2;
+            psrc6 += 2;
+            psrc7 += 2;
+            psrc8 += 2;
+
+            *(pdst + 0) = ctemp01;
+            *(pdst + 1) = ctemp02;
+            *(pdst + 2) = ctemp03;
+            *(pdst + 3) = ctemp04;
+            *(pdst + 4) = ctemp05;
+            *(pdst + 5) = ctemp06;
+            *(pdst + 6) = ctemp07;
+            *(pdst + 7) = ctemp08;
+            *(pdst + 8) = ctemp09;
+            *(pdst + 9) = ctemp10;
+            *(pdst + 10) = ctemp11;
+            *(pdst + 11) = ctemp12;
+            *(pdst + 12) = ctemp13;
+            *(pdst + 13) = ctemp14;
+            *(pdst + 14) = ctemp15;
+            *(pdst + 15) = ctemp16;
+            pdst += 16; /* 8 lines x 2 FLOATs */
+        }
+    }
+
+    if (n & 4) /* leftover group of 4 lines */
+    {
+        psrc1 = psrc0;
+        psrc2 = psrc1 + lda;
+        psrc3 = psrc2 + lda;
+        psrc4 = psrc3 + lda;
+        psrc0 += 4 * lda;
+
+        for (i = (m >> 2); i--;)
+        {
+            LD_SP2_INC(psrc1, 4, src0, src1);
+            LD_SP2_INC(psrc2, 4, src2, src3);
+            LD_SP2_INC(psrc3, 4, src4, src5);
+            LD_SP2_INC(psrc4, 4, src6, src7);
+
+            ILVRL_D2_SP(src2, src0, dst0, dst4);
+            ILVRL_D2_SP(src6, src4, dst1, dst5);
+
+            ST_SP4_INC(dst0, dst1, dst4, dst5, pdst, 4);
+
+            ILVRL_D2_SP(src3, src1, dst0, dst4);
+            ILVRL_D2_SP(src7, src5, dst1, dst5);
+
+            ST_SP4_INC(dst0, dst1, dst4, dst5, pdst, 4);
+        }
+
+        if (m & 2)
+        {
+            src0 = LD_SP(psrc1);
+            src2 = LD_SP(psrc2);
+            src4 = LD_SP(psrc3);
+            src6 = LD_SP(psrc4);
+            psrc1 += 4;
+            psrc2 += 4;
+            psrc3 += 4;
+            psrc4 += 4;
+
+            ILVRL_D2_SP(src2, src0, dst0, dst4);
+            ILVRL_D2_SP(src6, src4, dst1, dst5);
+
+            ST_SP4_INC(dst0, dst1, dst4, dst5, pdst, 4);
+        }
+
+        if (m & 1) /* one FLOAT pair from each of the 4 lines */
+        {
+            ctemp01 = *(psrc1 + 0);
+            ctemp02 = *(psrc1 + 1);
+            ctemp03 = *(psrc2 + 0);
+            ctemp04 = *(psrc2 + 1);
+            ctemp05 = *(psrc3 + 0);
+            ctemp06 = *(psrc3 + 1);
+            ctemp07 = *(psrc4 + 0);
+            ctemp08 = *(psrc4 + 1);
+            psrc1 += 2;
+            psrc2 += 2;
+            psrc3 += 2;
+            psrc4 += 2;
+
+            *(pdst + 0) = ctemp01;
+            *(pdst + 1) = ctemp02;
+            *(pdst + 2) = ctemp03;
+            *(pdst + 3) = ctemp04;
+            *(pdst + 4) = ctemp05;
+            *(pdst + 5) = ctemp06;
+            *(pdst + 6) = ctemp07;
+            *(pdst + 7) = ctemp08;
+            pdst += 8;
+        }
+    }
+
+    if (n & 2) /* leftover group of 2 lines */
+    {
+        psrc1 = psrc0;
+        psrc2 = psrc1 + lda;
+        psrc0 += 2 * lda;
+
+        for (i = (m >> 2); i--;)
+        {
+            LD_SP2_INC(psrc1, 4, src0, src1);
+            LD_SP2_INC(psrc2, 4, src2, src3);
+
+            ILVRL_D2_SP(src2, src0, dst0, dst4);
+
+            ST_SP2_INC(dst0, dst4, pdst, 4);
+
+            ILVRL_D2_SP(src3, src1, dst0, dst4);
+
+            ST_SP2_INC(dst0, dst4, pdst, 4);
+        }
+
+        if (m & 2)
+        {
+            src0 = LD_SP(psrc1);
+            src2 = LD_SP(psrc2);
+            psrc1 += 4;
+            psrc2 += 4;
+
+            ILVRL_D2_SP(src2, src0, dst0, dst4);
+
+            ST_SP2_INC(dst0, dst4, pdst, 4);
+        }
+
+        if (m & 1) /* one FLOAT pair from each of the 2 lines */
+        {
+            ctemp01 = *(psrc1 + 0);
+            ctemp02 = *(psrc1 + 1);
+            ctemp03 = *(psrc2 + 0);
+            ctemp04 = *(psrc2 + 1);
+            psrc1 += 2;
+            psrc2 += 2;
+
+            *(pdst + 0) = ctemp01;
+            *(pdst + 1) = ctemp02;
+            *(pdst + 2) = ctemp03;
+            *(pdst + 3) = ctemp04;
+            pdst += 4;
+        }
+    }
+
+    if (n & 1) /* single leftover line: no interleave, straight copy */
+    {
+        psrc1 = psrc0;
+
+        for (i = (m >> 2); i--;)
+        {
+            LD_SP2_INC(psrc1, 4, src0, src1);
+            ST_SP2_INC(src0, src1, pdst, 4);
+        }
+
+        if (m & 2)
+        {
+            src0 = LD_SP(psrc1);
+            psrc1 += 4;
+
+            ST_SP(src0, pdst);
+            pdst += 4;
+        }
+
+        if (m & 1)
+        {
+            ctemp01 = *(psrc1 + 0);
+            ctemp02 = *(psrc1 + 1);
+            psrc1 += 2;
+
+            *(pdst + 0) = ctemp01;
+            *(pdst + 1) = ctemp02;
+            pdst += 2;
+        }
+    }
+
+    return 0; /* no failure path: the result is always 0 */
+}
diff --git a/kernel/mips/cgemm_tcopy_4_msa.c b/kernel/mips/cgemm_tcopy_4_msa.c new file mode 100644 index 000000000..12aaa979e --- /dev/null +++ b/kernel/mips/cgemm_tcopy_4_msa.c @@ -0,0 +1,125 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h"
+/* Pack routine: walks n in slices of 4, 2 and 1 adjacent elements (8, 4 and 2 FLOATs) and, for every one of the m lines spaced lda elements apart, appends that slice of the line to dst. Always returns 0. */
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst)
+{
+    BLASLONG i, j;
+    FLOAT *psrc0; /* start of the current slice in the first line */
+    FLOAT *psrc1, *psrc2; /* two neighbouring lines, lda apart */
+    FLOAT *pdst0;
+    FLOAT ctemp01, ctemp02, ctemp03, ctemp04; /* scalar temporaries for the 1-element slice */
+    v4f32 src0, src1, src2, src3;
+
+    psrc0 = src;
+    pdst0 = dst;
+    lda *= 2; /* stride in FLOATs: every element is a pair of FLOATs (presumably real and imaginary part - confirm) */
+
+    for (j = (n >> 2); j--;) /* slices of 4 elements */
+    {
+        psrc1 = psrc0;
+        psrc2 = psrc0 + lda;
+        psrc0 += 8; /* next slice: 4 elements x 2 FLOATs */
+
+        for (i = (m >> 1); i--;) /* two lines per pass */
+        {
+            LD_SP2(psrc1, 4, src0, src1); /* macro from macros_msa.h; presumably loads two vectors without moving the pointer - confirm */
+            LD_SP2(psrc2, 4, src2, src3);
+            ST_SP4_INC(src0, src1, src2, src3, pdst0, 4); /* line psrc1 first, then line psrc2 */
+            psrc1 += 2 * lda;
+            psrc2 += 2 * lda;
+        }
+
+        if (m & 1) /* odd last line; psrc1 is reassigned before its next use, so it is not advanced */
+        {
+            LD_SP2(psrc1, 4, src0, src1);
+            ST_SP2_INC(src0, src1, pdst0, 4);
+        }
+    }
+
+    if (n & 2) /* leftover slice of 2 elements: one vector per line */
+    {
+        psrc1 = psrc0;
+        psrc2 = psrc0 + lda;
+        psrc0 += 4;
+
+        for (i = (m >> 1); i--;)
+        {
+            src0 = LD_SP(psrc1);
+            src1 = LD_SP(psrc2);
+            ST_SP2_INC(src0, src1, pdst0, 4);
+
+            psrc1 += 2 * lda;
+            psrc2 += 2 * lda;
+        }
+
+        if (m & 1)
+        {
+            src0 = LD_SP(psrc1);
+            ST_SP(src0, pdst0);
+            pdst0 += 4;
+        }
+    }
+
+    if (n & 1) /* leftover slice of 1 element: scalar copy of one FLOAT pair per line */
+    {
+        psrc1 = psrc0;
+        psrc2 = psrc0 + lda;
+        psrc0 += 2; /* psrc0 is not read again after this point */
+
+        for (i = (m >> 1); i--;)
+        {
+            ctemp01 = *(psrc1 + 0);
+            ctemp02 = *(psrc1 + 1);
+            ctemp03 = *(psrc2 + 0);
+            ctemp04 = *(psrc2 + 1);
+
+            *(pdst0 + 0) = ctemp01;
+            *(pdst0 + 1) = ctemp02;
+            *(pdst0 + 2) = ctemp03;
+            *(pdst0 + 3) = ctemp04;
+
+            psrc1 += 2 * lda;
+            psrc2 += 2 * lda;
+            pdst0 += 4;
+        }
+
+        if (m & 1)
+        {
+            ctemp01 = *(psrc1 + 0);
+            ctemp02 = *(psrc1 + 1);
+
+            *(pdst0 + 0) = ctemp01;
+            *(pdst0 + 1) = ctemp02;
+            pdst0 += 2;
+        }
+    }
+
+    return 0; /* no failure path: the result is always 0 */
+}
diff --git a/kernel/mips/cgemm_tcopy_8_msa.c b/kernel/mips/cgemm_tcopy_8_msa.c new file mode 100644 index 000000000..9f78fa73a --- /dev/null +++ b/kernel/mips/cgemm_tcopy_8_msa.c @@ -0,0 +1,214 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED.
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h"
+/* Pack routine: walks n in slices of 8, 4, 2 and 1 adjacent elements (16, 8, 4 and 2 FLOATs) and, for every one of the m lines spaced lda elements apart, appends that slice of the line to dst. Always returns 0. */
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst)
+{
+    BLASLONG i, j;
+    FLOAT *psrc0, *psrc1, *psrc2, *pdst0; /* psrc0: start of the current slice; psrc1 and psrc2: two neighbouring lines, lda apart */
+    FLOAT ctemp01, ctemp02, ctemp03, ctemp04; /* scalar temporaries for the 1-element slice */
+    v4f32 src0, src1, src2, src3, src4, src5, src6, src7;
+    v4f32 src8, src9, src10, src11, src12, src13, src14, src15;
+
+    psrc0 = src;
+    pdst0 = dst;
+    lda *= 2; /* stride in FLOATs: every element is a pair of FLOATs (presumably real and imaginary part - confirm) */
+
+    for (j = (n >> 3); j--;) /* slices of 8 elements */
+    {
+        psrc1 = psrc0;
+        psrc2 = psrc0 + lda;
+        psrc0 += 16; /* next slice: 8 elements x 2 FLOATs */
+
+        for (i = (m >> 2); i--;) /* four lines per pass */
+        {
+            LD_SP4(psrc1, 4, src0, src1, src2, src3); /* macro from macros_msa.h; presumably loads four vectors without moving the pointer - confirm */
+            LD_SP4(psrc2, 4, src4, src5, src6, src7);
+            LD_SP4(psrc1 + 2 * lda, 4, src8, src9, src10, src11); /* third line of the pass */
+            LD_SP4(psrc2 + 2 * lda, 4, src12, src13, src14, src15); /* fourth line of the pass */
+            ST_SP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst0, 4);
+            ST_SP8_INC(src8, src9, src10, src11, src12, src13, src14, src15, pdst0, 4);
+            psrc1 += 4 * lda;
+            psrc2 += 4 * lda;
+        }
+
+        if (m & 2) /* two leftover lines */
+        {
+            LD_SP4(psrc1, 4, src0, src1, src2, src3);
+            LD_SP4(psrc2, 4, src4, src5, src6, src7);
+            ST_SP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst0, 4);
+            psrc1 += 2 * lda;
+            psrc2 += 2 * lda;
+        }
+
+        if (m & 1) /* odd last line; psrc1 is reassigned before its next use, so it is not advanced */
+        {
+            LD_SP4(psrc1, 4, src0, src1, src2, src3);
+            ST_SP4_INC(src0, src1, src2, src3, pdst0, 4);
+        }
+    }
+
+    if (n & 4) /* leftover slice of 4 elements: two vectors per line */
+    {
+        psrc1 = psrc0;
+        psrc2 = psrc0 + lda;
+        psrc0 += 8;
+
+        for (i = (m >> 2); i--;)
+        {
+            LD_SP2(psrc1, 4, src0, src1);
+            LD_SP2(psrc2, 4, src2, src3);
+            LD_SP2(psrc1 + 2 * lda, 4, src4, src5);
+            LD_SP2(psrc2 + 2 * lda, 4, src6, src7);
+
+            ST_SP4_INC(src0, src1, src2, src3, pdst0, 4);
+            ST_SP4_INC(src4, src5, src6, src7, pdst0, 4);
+            psrc1 += 4 * lda;
+            psrc2 += 4 * lda;
+        }
+
+        if (m & 2)
+        {
+            LD_SP2(psrc1, 4, src0, src1);
+            LD_SP2(psrc2, 4, src2, src3);
+            ST_SP4_INC(src0, src1, src2, src3, pdst0, 4);
+            psrc1 += 2 * lda;
+            psrc2 += 2 * lda;
+        }
+
+        if (m & 1)
+        {
+            LD_SP2(psrc1, 4, src0, src1);
+            ST_SP2_INC(src0, src1, pdst0, 4);
+        }
+    }
+
+    if (n & 2) /* leftover slice of 2 elements: one vector per line */
+    {
+        psrc1 = psrc0;
+        psrc2 = psrc0 + lda;
+        psrc0 += 4;
+
+        for (i = (m >> 2); i--;)
+        {
+            src0 = LD_SP(psrc1);
+            src1 = LD_SP(psrc2);
+            src2 = LD_SP(psrc1 + 2 * lda);
+            src3 = LD_SP(psrc2 + 2 * lda);
+            ST_SP4_INC(src0, src1, src2, src3, pdst0, 4);
+
+            psrc1 += 4 * lda;
+            psrc2 += 4 * lda;
+        }
+
+        if (m & 2)
+        {
+            src0 = LD_SP(psrc1);
+            src1 = LD_SP(psrc2);
+            ST_SP2_INC(src0, src1, pdst0, 4);
+
+            psrc1 += 2 * lda;
+            psrc2 += 2 * lda;
+        }
+
+        if (m & 1)
+        {
+            src0 = LD_SP(psrc1);
+            ST_SP(src0, pdst0);
+            pdst0 += 4;
+        }
+    }
+
+    if (n & 1) /* leftover slice of 1 element: scalar copy of one FLOAT pair per line */
+    {
+        psrc1 = psrc0;
+        psrc2 = psrc0 + lda;
+        psrc0 += 2; /* psrc0 is not read again after this point */
+
+        for (i = (m >> 2); i--;) /* four lines per pass, handled as two pairs */
+        {
+            ctemp01 = *(psrc1 + 0);
+            ctemp02 = *(psrc1 + 1);
+            ctemp03 = *(psrc2 + 0);
+            ctemp04 = *(psrc2 + 1);
+
+            *(pdst0 + 0) = ctemp01;
+            *(pdst0 + 1) = ctemp02;
+            *(pdst0 + 2) = ctemp03;
+            *(pdst0 + 3) = ctemp04;
+
+            psrc1 += 2 * lda;
+            psrc2 += 2 * lda;
+            pdst0 += 4;
+
+            ctemp01 = *(psrc1 + 0);
+            ctemp02 = *(psrc1 + 1);
+            ctemp03 = *(psrc2 + 0);
+            ctemp04 = *(psrc2 + 1);
+
+            *(pdst0 + 0) = ctemp01;
+            *(pdst0 + 1) = ctemp02;
+            *(pdst0 + 2) = ctemp03;
+            *(pdst0 + 3) = ctemp04;
+
+            psrc1 += 2 * lda;
+            psrc2 += 2 * lda;
+            pdst0 += 4;
+        }
+
+        if (m & 2)
+        {
+            ctemp01 = *(psrc1 + 0);
+            ctemp02 = *(psrc1 + 1);
+            ctemp03 = *(psrc2 + 0);
+            ctemp04 = *(psrc2 + 1);
+
+            *(pdst0 + 0) = ctemp01;
+            *(pdst0 + 1) = ctemp02;
+            *(pdst0 + 2) = ctemp03;
+            *(pdst0 + 3) = ctemp04;
+
+            psrc1 += 2 * lda;
+            psrc2 += 2 * lda;
+            pdst0 += 4;
+        }
+
+        if (m & 1)
+        {
+            ctemp01 = *(psrc1 + 0);
+            ctemp02 = *(psrc1 + 1);
+
+            *(pdst0 + 0) = ctemp01;
+            *(pdst0 + 1) = ctemp02;
+            pdst0 += 2;
+        }
+    }
+
+    return 0; /* no failure path: the result is always 0 */
+}
diff --git a/kernel/mips/dgemm_kernel_8x4_msa.c b/kernel/mips/dgemm_kernel_8x4_msa.c index 1f0a2aee6..9286e7469 100644 --- a/kernel/mips/dgemm_kernel_8x4_msa.c +++ b/kernel/mips/dgemm_kernel_8x4_msa.c @@ -35,19 +35,24 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, #endif ) { - BLASLONG i, j, l; - FLOAT *pc0, *pc1, *pc2, *pc3; - FLOAT *pa0, *pb0; + BLASLONG i, j, l, temp; +#if defined(TRMMKERNEL) + BLASLONG off; +#endif + FLOAT *pc0, *pc1, *pc2, *pc3, *pa0, *pb0; FLOAT tmp0, tmp1, tmp2, tmp3; - FLOAT a0; - FLOAT b0, b1, b2, b3; + FLOAT a0, b0, b1, b2, b3; v2f64 v_alpha = {alpha, alpha}; v2f64 src_a0, src_a1, src_a2, src_a3, src_b, src_b0, src_b1; v2f64 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; v2f64 res0, res1, res2, res3, res4, res5, res6, res7; v2f64 res8, res9, res10, res11, res12, res13, res14, res15; - for (j = (n / 4); j--;) +#if defined(TRMMKERNEL) && !defined(LEFT) + off = -offset; +#endif + + for (j = (n >> 2); j--;) { pc0 = C; pc1 = pc0 + ldc; @@ -56,12 +61,34 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pa0 = A; - for (i = (m / 8); i--;) +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + + for (i = (m >> 3); i--;) { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; +#else + pa0 += off * 8; + pb0 = B + off * 4; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 8; // number of values in A +#else + temp = off + 4; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + + LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); + LD_DP2_INC(pb0, 2,
src_b0, src_b1); src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 = src_a0 * src_b; @@ -87,13 +114,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, res14 = src_a2 * src_b; res15 = src_a3 * src_b; - pa0 += 8; - pb0 += 4; - - for (l = ((k - 1) / 2); l--;) + for (l = ((temp - 1) >> 1); l--;) { - LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3); - LD_DP2(pb0, 2, src_b0, src_b1); + LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); + LD_DP2_INC(pb0, 2, src_b0, src_b1); src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 += src_a0 * src_b; @@ -119,11 +143,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, res14 += src_a2 * src_b; res15 += src_a3 * src_b; - pa0 += 8; - pb0 += 4; - - LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3); - LD_DP2(pb0, 2, src_b0, src_b1); + LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); + LD_DP2_INC(pb0, 2, src_b0, src_b1); src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 += src_a0 * src_b; @@ -148,15 +169,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, res13 += src_a1 * src_b; res14 += src_a2 * src_b; res15 += src_a3 * src_b; - - pa0 += 8; - pb0 += 4; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { - LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3); - LD_DP2(pb0, 2, src_b0, src_b1); + LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); + LD_DP2_INC(pb0, 2, src_b0, src_b1); src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 += src_a0 * src_b; @@ -181,11 +199,18 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, res13 += src_a1 * src_b; res14 += src_a2 * src_b; res15 += src_a3 * src_b; - - pa0 += 8; - pb0 += 4; } +#if defined(TRMMKERNEL) + dst0 = res0 * v_alpha; + dst1 = res1 * v_alpha; + dst2 = res2 * v_alpha; + dst3 = res3 * v_alpha; + dst4 = res4 * v_alpha; + dst5 = res5 * v_alpha; + dst6 = res6 * v_alpha; + dst7 = res7 * v_alpha; +#else 
LD_DP4(pc0, 2, dst0, dst1, dst2, dst3); LD_DP4(pc1, 2, dst4, dst5, dst6, dst7); @@ -197,10 +222,20 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, dst5 += res5 * v_alpha; dst6 += res6 * v_alpha; dst7 += res7 * v_alpha; - - ST_DP4(dst0, dst1, dst2, dst3, pc0, 2); - ST_DP4(dst4, dst5, dst6, dst7, pc1, 2); - +#endif + ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2); + ST_DP4_INC(dst4, dst5, dst6, dst7, pc1, 2); + +#if defined(TRMMKERNEL) + dst0 = res8 * v_alpha; + dst1 = res9 * v_alpha; + dst2 = res10 * v_alpha; + dst3 = res11 * v_alpha; + dst4 = res12 * v_alpha; + dst5 = res13 * v_alpha; + dst6 = res14 * v_alpha; + dst7 = res15 * v_alpha; +#else LD_DP4(pc2, 2, dst0, dst1, dst2, dst3); LD_DP4(pc3, 2, dst4, dst5, dst6, dst7); @@ -212,22 +247,53 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, dst5 += res13 * v_alpha; dst6 += res14 * v_alpha; dst7 += res15 * v_alpha; +#endif - ST_DP4(dst0, dst1, dst2, dst3, pc2, 2); - ST_DP4(dst4, dst5, dst6, dst7, pc3, 2); + ST_DP4_INC(dst0, dst1, dst2, dst3, pc2, 2); + ST_DP4_INC(dst4, dst5, dst6, dst7, pc3, 2); - pc0 += 8; - pc1 += 8; - pc2 += 8; - pc3 += 8; +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 8; // number of values in A +#else + temp -= 4; // number of values in B +#endif + pa0 += temp * 8; + pb0 += temp * 4; +#endif + +#ifdef LEFT + off += 8; // number of values in A +#endif +#endif } - for (i = ((m & 4) / 4); i--;) + if (m & 4) { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; +#else + pa0 += off * 4; + pb0 = B + off * 4; +#endif - LD_DP2(pa0, 2, src_a0, src_a1); - LD_DP2(pb0, 2, src_b0, src_b1); +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 4; // number of values in A +#else + temp = off + 4; // number of 
values in B +#endif +#else + pb0 = B; + temp = k; +#endif + + LD_DP2_INC(pa0, 2, src_a0, src_a1); + LD_DP2_INC(pb0, 2, src_b0, src_b1); src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 = src_a0 * src_b; @@ -245,13 +311,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, res6 = src_a0 * src_b; res7 = src_a1 * src_b; - pa0 += 4; - pb0 += 4; - - for (l = ((k - 1) / 2); l--;) + for (l = ((temp - 1) >> 1); l--;) { - LD_DP2(pa0, 2, src_a0, src_a1); - LD_DP2(pb0, 2, src_b0, src_b1); + LD_DP2_INC(pa0, 2, src_a0, src_a1); + LD_DP2_INC(pb0, 2, src_b0, src_b1); src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 += src_a0 * src_b; @@ -269,11 +332,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, res6 += src_a0 * src_b; res7 += src_a1 * src_b; - pa0 += 4; - pb0 += 4; - - LD_DP2(pa0, 2, src_a0, src_a1); - LD_DP2(pb0, 2, src_b0, src_b1); + LD_DP2_INC(pa0, 2, src_a0, src_a1); + LD_DP2_INC(pb0, 2, src_b0, src_b1); src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 += src_a0 * src_b; @@ -290,15 +350,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); res6 += src_a0 * src_b; res7 += src_a1 * src_b; - - pa0 += 4; - pb0 += 4; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { - LD_DP2(pa0, 2, src_a0, src_a1); - LD_DP2(pb0, 2, src_b0, src_b1); + LD_DP2_INC(pa0, 2, src_a0, src_a1); + LD_DP2_INC(pb0, 2, src_b0, src_b1); src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 += src_a0 * src_b; @@ -315,11 +372,18 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); res6 += src_a0 * src_b; res7 += src_a1 * src_b; - - pa0 += 4; - pb0 += 4; } +#if defined(TRMMKERNEL) + dst0 = res0 * v_alpha; + dst1 = res1 * v_alpha; + dst2 = res2 * v_alpha; + dst3 = res3 * v_alpha; + dst4 = res4 * v_alpha; 
+ dst5 = res5 * v_alpha; + dst6 = res6 * v_alpha; + dst7 = res7 * v_alpha; +#else LD_DP2(pc0, 2, dst0, dst1); LD_DP2(pc1, 2, dst2, dst3); LD_DP2(pc2, 2, dst4, dst5); @@ -333,24 +397,55 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, dst5 += res5 * v_alpha; dst6 += res6 * v_alpha; dst7 += res7 * v_alpha; +#endif + ST_DP2_INC(dst0, dst1, pc0, 2); + ST_DP2_INC(dst2, dst3, pc1, 2); + ST_DP2_INC(dst4, dst5, pc2, 2); + ST_DP2_INC(dst6, dst7, pc3, 2); + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 4; // number of values in A +#else + temp -= 4; // number of values in B +#endif + pa0 += temp * 4; + pb0 += temp * 4; +#endif - ST_DP2(dst0, dst1, pc0, 2); - ST_DP2(dst2, dst3, pc1, 2); - ST_DP2(dst4, dst5, pc2, 2); - ST_DP2(dst6, dst7, pc3, 2); - - pc0 += 4; - pc1 += 4; - pc2 += 4; - pc3 += 4; +#ifdef LEFT + off += 4; // number of values in A +#endif +#endif } - for (i = ((m & 2) / 2); i--;) + if (m & 2) { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; +#else + pa0 += off * 2; + pb0 = B + off * 4; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 2; // number of values in A +#else + temp = off + 4; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif src_a0 = LD_DP(pa0); - LD_DP2(pb0, 2, src_b0, src_b1); + pa0 += 2; + LD_DP2_INC(pb0, 2, src_b0, src_b1); src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 = src_a0 * src_b; @@ -364,13 +459,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); res3 = src_a0 * src_b; - pa0 += 2; - pb0 += 4; - - for (l = ((k - 1) / 2); l--;) + for (l = ((temp - 1) >> 1); l--;) { src_a0 = LD_DP(pa0); - LD_DP2(pb0, 2, src_b0, src_b1); + 
pa0 += 2; + LD_DP2_INC(pb0, 2, src_b0, src_b1); src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 += src_a0 * src_b; @@ -384,11 +477,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); res3 += src_a0 * src_b; - pa0 += 2; - pb0 += 4; - src_a0 = LD_DP(pa0); - LD_DP2(pb0, 2, src_b0, src_b1); + pa0 += 2; + LD_DP2_INC(pb0, 2, src_b0, src_b1); src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 += src_a0 * src_b; @@ -401,15 +492,13 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); res3 += src_a0 * src_b; - - pa0 += 2; - pb0 += 4; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { src_a0 = LD_DP(pa0); - LD_DP2(pb0, 2, src_b0, src_b1); + pa0 += 2; + LD_DP2_INC(pb0, 2, src_b0, src_b1); src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 += src_a0 * src_b; @@ -422,11 +511,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); res3 += src_a0 * src_b; - - pa0 += 2; - pb0 += 4; } +#if defined(TRMMKERNEL) + dst0 = res0 * v_alpha; + dst1 = res1 * v_alpha; + dst2 = res2 * v_alpha; + dst3 = res3 * v_alpha; +#else dst0 = LD_DP(pc0); dst1 = LD_DP(pc1); dst2 = LD_DP(pc2); @@ -436,21 +528,55 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, dst1 += res1 * v_alpha; dst2 += res2 * v_alpha; dst3 += res3 * v_alpha; - +#endif ST_DP(dst0, pc0); ST_DP(dst1, pc1); ST_DP(dst2, pc2); ST_DP(dst3, pc3); +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 2; // number of values in A +#else + temp -= 4; // number of values in B +#endif + pa0 += temp * 2; + pb0 += temp * 4; +#endif + +#ifdef LEFT + off += 2; // number of values in A +#endif +#endif pc0 += 2; pc1 
+= 2; pc2 += 2; pc3 += 2; } - for (i = (m & 1); i--;) + if (m & 1) { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; +#else + pa0 += off * 1; + pb0 = B + off * 4; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 1; // number of values in A +#else + temp = off + 4; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif a0 = pa0[0]; b0 = pb0[0]; @@ -468,7 +594,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pa0 += 1; pb0 += 4; - for (l = ((k - 1) / 2); l--;) + for (l = ((temp - 1) >> 1); l--;) { a0 = pa0[0]; b0 = pb0[0]; @@ -503,7 +629,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pb0 += 4; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { a0 = pa0[0]; b0 = pb0[0]; @@ -527,10 +653,34 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, tmp2 = alpha * tmp2; tmp3 = alpha * tmp3; +#if defined(TRMMKERNEL) + pc0[0] = tmp0; + pc1[0] = tmp1; + pc2[0] = tmp2; + pc3[0] = tmp3; +#else pc0[0] += tmp0; pc1[0] += tmp1; pc2[0] += tmp2; pc3[0] += tmp3; +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 1; // number of values in A +#else + temp -= 4; // number of values in B +#endif + pa0 += temp * 1; + pb0 += temp * 4; +#endif + +#ifdef LEFT + off += 1; // number of values in A +#endif +#endif pc0 += 1; pc1 += 1; @@ -538,25 +688,53 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pc3 += 1; } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 4; // number of values in A +#endif + l = (k << 2); B = B + l; i = (ldc << 2); C = C + i; } - for (j = ((n & 2) / 2); j--;) + if (n & 2) { pc0 = C; pc1 = pc0 + ldc; pa0 = A; - for (i = (m / 8); i--;) +#if defined(TRMMKERNEL) && 
defined(LEFT) + off = offset; +#endif + + for (i = (m >> 3); i--;) { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 8; + pb0 = B + off * 2; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 8; // number of values in A +#else + temp = off + 2; // number of values in B +#endif +#else pb0 = B; + temp = k; +#endif + - LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3); + LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); src_b0 = LD_DP(pb0); + pb0 += 2; src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 = src_a0 * src_b; @@ -570,13 +748,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, res6 = src_a2 * src_b; res7 = src_a3 * src_b; - pa0 += 8; - pb0 += 2; - - for (l = ((k - 1) / 2); l--;) + for (l = ((temp - 1) >> 1); l--;) { - LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3); + LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); src_b0 = LD_DP(pb0); + pb0 += 2; src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 += src_a0 * src_b; @@ -590,11 +766,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, res6 += src_a2 * src_b; res7 += src_a3 * src_b; - pa0 += 8; - pb0 += 2; - - LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3); + LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); src_b0 = LD_DP(pb0); + pb0 += 2; src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 += src_a0 * src_b; @@ -607,15 +781,13 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, res5 += src_a1 * src_b; res6 += src_a2 * src_b; res7 += src_a3 * src_b; - - pa0 += 8; - pb0 += 2; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { - LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3); + LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); src_b0 = LD_DP(pb0); + pb0 += 2; src_b = (v2f64) 
__msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 += src_a0 * src_b; @@ -628,11 +800,18 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, res5 += src_a1 * src_b; res6 += src_a2 * src_b; res7 += src_a3 * src_b; - - pa0 += 8; - pb0 += 2; } +#if defined(TRMMKERNEL) + dst0 = res0 * v_alpha; + dst1 = res1 * v_alpha; + dst2 = res2 * v_alpha; + dst3 = res3 * v_alpha; + dst4 = res4 * v_alpha; + dst5 = res5 * v_alpha; + dst6 = res6 * v_alpha; + dst7 = res7 * v_alpha; +#else LD_DP4(pc0, 2, dst0, dst1, dst2, dst3); LD_DP4(pc1, 2, dst4, dst5, dst6, dst7); @@ -644,20 +823,53 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, dst5 += res5 * v_alpha; dst6 += res6 * v_alpha; dst7 += res7 * v_alpha; +#endif + ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2); + ST_DP4_INC(dst4, dst5, dst6, dst7, pc1, 2); + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 8; // number of values in A +#else + temp -= 2; // number of values in B +#endif + pa0 += temp * 8; + pb0 += temp * 2; +#endif - ST_DP4(dst0, dst1, dst2, dst3, pc0, 2); - ST_DP4(dst4, dst5, dst6, dst7, pc1, 2); - - pc0 += 8; - pc1 += 8; +#ifdef LEFT + off += 8; // number of values in A +#endif +#endif } - for (i = ((m & 4) / 4); i--;) + if (m & 4) { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 4; + pb0 = B + off * 2; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 4; // number of values in A +#else + temp = off + 2; // number of values in B +#endif +#else pb0 = B; + temp = k; +#endif - LD_DP2(pa0, 2, src_a0, src_a1); + LD_DP2_INC(pa0, 2, src_a0, src_a1); src_b0 = LD_DP(pb0); + pb0 += 2; src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 = src_a0 * src_b; @@ -667,13 +879,11 @@ int 
CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, res2 = src_a0 * src_b; res3 = src_a1 * src_b; - pa0 += 4; - pb0 += 2; - - for (l = ((k - 1) / 2); l--;) + for (l = ((temp - 1) >> 1); l--;) { - LD_DP2(pa0, 2, src_a0, src_a1); + LD_DP2_INC(pa0, 2, src_a0, src_a1); src_b0 = LD_DP(pb0); + pb0 += 2; src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 += src_a0 * src_b; @@ -683,11 +893,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, res2 += src_a0 * src_b; res3 += src_a1 * src_b; - pa0 += 4; - pb0 += 2; - - LD_DP2(pa0, 2, src_a0, src_a1); + LD_DP2_INC(pa0, 2, src_a0, src_a1); src_b0 = LD_DP(pb0); + pb0 += 2; src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 += src_a0 * src_b; @@ -696,15 +904,13 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); res2 += src_a0 * src_b; res3 += src_a1 * src_b; - - pa0 += 4; - pb0 += 2; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { - LD_DP2(pa0, 2, src_a0, src_a1); + LD_DP2_INC(pa0, 2, src_a0, src_a1); src_b0 = LD_DP(pb0); + pb0 += 2; src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 += src_a0 * src_b; @@ -713,11 +919,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); res2 += src_a0 * src_b; res3 += src_a1 * src_b; - - pa0 += 4; - pb0 += 2; } +#if defined(TRMMKERNEL) + dst0 = res0 * v_alpha; + dst1 = res1 * v_alpha; + dst2 = res2 * v_alpha; + dst3 = res3 * v_alpha; +#else LD_DP2(pc0, 2, dst0, dst1); LD_DP2(pc1, 2, dst2, dst3); @@ -725,20 +934,54 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, dst1 += res1 * v_alpha; dst2 += res2 * v_alpha; dst3 += res3 * v_alpha; +#endif + ST_DP2_INC(dst0, dst1, pc0, 2); + ST_DP2_INC(dst2, dst3, pc1, 2); + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || 
(!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 4; // number of values in A +#else + temp -= 2; // number of values in B +#endif + pa0 += temp * 4; + pb0 += temp * 2; +#endif - ST_DP2(dst0, dst1, pc0, 2); - ST_DP2(dst2, dst3, pc1, 2); - - pc0 += 4; - pc1 += 4; +#ifdef LEFT + off += 4; // number of values in A +#endif +#endif } - for (i = ((m & 2) / 2); i--;) + if (m & 2) { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; +#else + pa0 += off * 2; + pb0 = B + off * 2; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 2; // number of values in A +#else + temp = off + 2; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif src_a0 = LD_DP(pa0); + pa0 += 2; src_b0 = LD_DP(pb0); + pb0 += 2; src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 = src_a0 * src_b; @@ -746,13 +989,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); res1 = src_a0 * src_b; - pa0 += 2; - pb0 += 2; - - for (l = ((k - 1) / 2); l--;) + for (l = ((temp - 1) >> 1); l--;) { src_a0 = LD_DP(pa0); + pa0 += 2; src_b0 = LD_DP(pb0); + pb0 += 2; src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 += src_a0 * src_b; @@ -760,53 +1002,86 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); res1 += src_a0 * src_b; - pa0 += 2; - pb0 += 2; - src_a0 = LD_DP(pa0); + pa0 += 2; src_b0 = LD_DP(pb0); + pb0 += 2; src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 += src_a0 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); res1 += src_a0 * src_b; - - pa0 += 2; - pb0 += 2; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { src_a0 = LD_DP(pa0); + pa0 += 2; src_b0 = LD_DP(pb0); + 
pb0 += 2; src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 += src_a0 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); res1 += src_a0 * src_b; - - pa0 += 2; - pb0 += 2; } +#if defined(TRMMKERNEL) + dst0 = res0 * v_alpha; + dst1 = res1 * v_alpha; +#else dst0 = LD_DP(pc0); dst1 = LD_DP(pc1); dst0 += res0 * v_alpha; dst1 += res1 * v_alpha; - +#endif ST_DP(dst0, pc0); ST_DP(dst1, pc1); +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 2; // number of values in A +#else + temp -= 2; // number of values in B +#endif + pa0 += temp * 2; + pb0 += temp * 2; +#endif + +#ifdef LEFT + off += 2; // number of values in A +#endif +#endif pc0 += 2; pc1 += 2; } - for (i = (m & 1); i--;) + if (m & 1) { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; +#else + pa0 += off * 1; + pb0 = B + off * 2; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 1; // number of values in A +#else + temp = off + 2; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif a0 = pa0[0]; b0 = pb0[0]; @@ -818,7 +1093,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pa0 += 1; pb0 += 2; - for (l = ((k - 1) / 2); l--;) + for (l = ((temp - 1) >> 1); l--;) { a0 = pa0[0]; b0 = pb0[0]; @@ -841,7 +1116,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pb0 += 2; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { a0 = pa0[0]; b0 = pb0[0]; @@ -857,29 +1132,77 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, tmp0 = alpha * tmp0; tmp1 = alpha * tmp1; +#if defined(TRMMKERNEL) + pc0[0] = tmp0; + pc1[0] = tmp1; +#else pc0[0] += tmp0; pc1[0] += tmp1; +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || 
(!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 1; // number of values in A +#else + temp -= 2; // number of values in B +#endif + pa0 += temp * 1; + pb0 += temp * 2; +#endif + +#ifdef LEFT + off += 1; // number of values in A +#endif +#endif pc0 += 1; pc1 += 1; } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 2; // number of values in A +#endif + l = (k << 1); B = B + l; i = (ldc << 1); C = C + i; } - for (j = (n & 1); j--;) + if (n & 1) { pc0 = C; pa0 = A; - for (i = (m / 8); i--;) +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + + for (i = (m >> 3); i--;) { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; +#else + pa0 += off * 8; + pb0 = B + off * 1; +#endif - LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3); +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 8; // number of values in A +#else + temp = off + 1; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + + LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); src_b[0] = pb0[0]; src_b[1] = pb0[0]; @@ -888,12 +1211,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, res2 = src_a2 * src_b; res3 = src_a3 * src_b; - pa0 += 8; pb0 += 1; - for (l = ((k - 1) / 2); l--;) + for (l = ((temp - 1) >> 1); l--;) { - LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3); + LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); src_b[0] = pb0[0]; src_b[1] = pb0[0]; @@ -902,10 +1224,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, res2 += src_a2 * src_b; res3 += src_a3 * src_b; - pa0 += 8; pb0 += 1; - LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3); + LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); src_b[0] = pb0[0]; src_b[1] = pb0[0]; @@ -914,13 +1235,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, res2 += 
src_a2 * src_b; res3 += src_a3 * src_b; - pa0 += 8; pb0 += 1; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { - LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3); + LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); src_b[0] = pb0[0]; src_b[1] = pb0[0]; @@ -929,85 +1249,156 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, res2 += src_a2 * src_b; res3 += src_a3 * src_b; - pa0 += 8; pb0 += 1; } +#if defined(TRMMKERNEL) + dst0 = res0 * v_alpha; + dst1 = res1 * v_alpha; + dst2 = res2 * v_alpha; + dst3 = res3 * v_alpha; +#else LD_DP4(pc0, 2, dst0, dst1, dst2, dst3); dst0 += res0 * v_alpha; dst1 += res1 * v_alpha; dst2 += res2 * v_alpha; dst3 += res3 * v_alpha; +#endif + ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2); + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 8; // number of values in A +#else + temp -= 1; // number of values in B +#endif + pa0 += temp * 8; + pb0 += temp * 1; +#endif - ST_DP4(dst0, dst1, dst2, dst3, pc0, 2); - - pc0 += 8; +#ifdef LEFT + off += 8; // number of values in A +#endif +#endif } - for (i = ((m & 4) / 4); i--;) + if (m & 4) { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 4; + pb0 = B + off * 1; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 4; // number of values in A +#else + temp = off + 1; // number of values in B +#endif +#else pb0 = B; + temp = k; +#endif - LD_DP2(pa0, 2, src_a0, src_a1); + LD_DP2_INC(pa0, 2, src_a0, src_a1); src_b[0] = pb0[0]; src_b[1] = pb0[0]; res0 = src_a0 * src_b; res1 = src_a1 * src_b; - pa0 += 4; pb0 += 1; - for (l = ((k - 1) / 2); l--;) + for (l = ((temp - 1) >> 1); l--;) { - LD_DP2(pa0, 2, src_a0, src_a1); + LD_DP2_INC(pa0, 2, src_a0, src_a1); src_b[0] = pb0[0]; src_b[1] = pb0[0]; res0 += src_a0 * 
src_b; res1 += src_a1 * src_b; - pa0 += 4; pb0 += 1; - LD_DP2(pa0, 2, src_a0, src_a1); + LD_DP2_INC(pa0, 2, src_a0, src_a1); src_b[0] = pb0[0]; src_b[1] = pb0[0]; res0 += src_a0 * src_b; res1 += src_a1 * src_b; - pa0 += 4; pb0 += 1; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { - LD_DP2(pa0, 2, src_a0, src_a1); + LD_DP2_INC(pa0, 2, src_a0, src_a1); src_b[0] = pb0[0]; src_b[1] = pb0[0]; res0 += src_a0 * src_b; res1 += src_a1 * src_b; - pa0 += 4; pb0 += 1; } +#if defined(TRMMKERNEL) + dst0 = res0 * v_alpha; + dst1 = res1 * v_alpha; +#else LD_DP2(pc0, 2, dst0, dst1); dst0 += res0 * v_alpha; dst1 += res1 * v_alpha; +#endif + ST_DP2_INC(dst0, dst1, pc0, 2); + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 4; // number of values in A +#else + temp -= 1; // number of values in B +#endif + pa0 += temp * 4; + pb0 += temp * 1; +#endif - ST_DP2(dst0, dst1, pc0, 2); - - pc0 += 4; +#ifdef LEFT + off += 4; // number of values in A +#endif +#endif } - for (i = ((m & 2) / 2); i--;) + if (m & 2) { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2; + pb0 = B + off * 1; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 2; // number of values in A +#else + temp = off + 1; // number of values in B +#endif +#else pb0 = B; + temp = k; +#endif src_a0 = LD_DP(pa0); src_b[0] = pb0[0]; @@ -1018,7 +1409,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pa0 += 2; pb0 += 1; - for (l = ((k - 1) / 2); l--;) + for (l = ((temp - 1) >> 1); l--;) { src_a0 = LD_DP(pa0); src_b[0] = pb0[0]; @@ -1039,7 +1430,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pb0 += 1; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { src_a0 = LD_DP(pa0); src_b[0] = pb0[0]; @@ 
-1051,18 +1442,55 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pb0 += 1; } +#if defined(TRMMKERNEL) + dst0 = res0 * v_alpha; +#else dst0 = LD_DP(pc0); dst0 += res0 * v_alpha; - +#endif ST_DP(dst0, pc0); +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 2; // number of values in A +#else + temp -= 1; // number of values in B +#endif + pa0 += temp * 2; + pb0 += temp * 1; +#endif + +#ifdef LEFT + off += 2; // number of values in A +#endif +#endif pc0 += 2; } - for (i = (m & 1); i--;) + if (m & 1) { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; +#else + pa0 += off * 1; + pb0 = B + off * 1; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 1; // number of values in A +#else + temp = off + 1; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif a0 = pa0[0]; b0 = pb0[0]; @@ -1071,7 +1499,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pa0 += 1; pb0 += 1; - for (l = ((k - 1) / 2); l--;) + for (l = ((temp - 1) >> 1); l--;) { a0 = pa0[0]; b0 = pb0[0]; @@ -1088,7 +1516,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pb0 += 1; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { a0 = pa0[0]; b0 = pb0[0]; @@ -1098,15 +1526,41 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pb0 += 1; } +#if defined(TRMMKERNEL) + pc0[0] = alpha * tmp0; +#else pc0[0] += alpha * tmp0; +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 1; // number of values in A +#else + temp -= 1; // number of values in B +#endif + pa0 += temp * 1; + pb0 += temp * 1; +#endif + +#ifdef LEFT + off += 1; // 
number of values in A +#endif +#endif pc0 += 1; } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 1; // number of values in A +#endif + l = (k << 0); B = B + l; i = (ldc << 0); C = C + i; } + return 0; } diff --git a/kernel/mips/dgemm_ncopy_4_msa.c b/kernel/mips/dgemm_ncopy_4_msa.c index bbd76070f..a61b2e806 100644 --- a/kernel/mips/dgemm_ncopy_4_msa.c +++ b/kernel/mips/dgemm_ncopy_4_msa.c @@ -32,8 +32,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, FLOAT * __restrict dst) { BLASLONG i, j; - FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4; - FLOAT *pdst; + FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *pdst; v2f64 src0, src1, src2, src3, src4, src5, src6, src7; v2f64 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; @@ -50,28 +49,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, for (i = (m >> 2); i--;) { - LD_DP2(psrc1, 2, src0, src1); - LD_DP2(psrc2, 2, src2, src3); - LD_DP2(psrc3, 2, src4, src5); - LD_DP2(psrc4, 2, src6, src7); + LD_DP2_INC(psrc1, 2, src0, src1); + LD_DP2_INC(psrc2, 2, src2, src3); + LD_DP2_INC(psrc3, 2, src4, src5); + LD_DP2_INC(psrc4, 2, src6, src7); - psrc1 += 4; - psrc2 += 4; - psrc3 += 4; - psrc4 += 4; - - dst0 = (v2f64) __msa_ilvr_d((v2i64) src2, (v2i64) src0); - dst1 = (v2f64) __msa_ilvr_d((v2i64) src6, (v2i64) src4); - dst2 = (v2f64) __msa_ilvr_d((v2i64) src3, (v2i64) src1); - dst3 = (v2f64) __msa_ilvr_d((v2i64) src7, (v2i64) src5); - - dst4 = (v2f64) __msa_ilvl_d((v2i64) src2, (v2i64) src0); - dst5 = (v2f64) __msa_ilvl_d((v2i64) src6, (v2i64) src4); - dst6 = (v2f64) __msa_ilvl_d((v2i64) src3, (v2i64) src1); - dst7 = (v2f64) __msa_ilvl_d((v2i64) src7, (v2i64) src5); - - ST_DP8(dst0, dst1, dst4, dst5, dst2, dst3, dst6, dst7, pdst, 2); - pdst += 16; + ILVRL_D2_DP(src2, src0, dst0, dst4); + ILVRL_D2_DP(src6, src4, dst1, dst5); + ILVRL_D2_DP(src3, src1, dst2, dst6); + ILVRL_D2_DP(src7, src5, dst3, dst7); + + ST_DP8_INC(dst0, dst1, dst4, dst5, dst2, dst3, dst6, dst7, pdst, 2); } for (i = 
(m & 3); i--;) @@ -91,18 +79,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, for (i = (m >> 2); i--;) { - LD_DP2(psrc1, 2, src0, src1); - LD_DP2(psrc2, 2, src2, src3); - psrc1 += 4; - psrc2 += 4; + LD_DP2_INC(psrc1, 2, src0, src1); + LD_DP2_INC(psrc2, 2, src2, src3); - dst0 = (v2f64) __msa_ilvr_d((v2i64) src2, (v2i64) src0); - dst1 = (v2f64) __msa_ilvr_d((v2i64) src3, (v2i64) src1); - dst4 = (v2f64) __msa_ilvl_d((v2i64) src2, (v2i64) src0); - dst5 = (v2f64) __msa_ilvl_d((v2i64) src3, (v2i64) src1); + ILVRL_D2_DP(src2, src0, dst0, dst4); + ILVRL_D2_DP(src3, src1, dst1, dst5); - ST_DP4(dst0, dst4, dst1, dst5, pdst, 2); - pdst += 8; + ST_DP4_INC(dst0, dst4, dst1, dst5, pdst, 2); } for (i = (m & 3); i--;) diff --git a/kernel/mips/dgemm_ncopy_8_msa.c b/kernel/mips/dgemm_ncopy_8_msa.c index 43c977582..86d019c4f 100644 --- a/kernel/mips/dgemm_ncopy_8_msa.c +++ b/kernel/mips/dgemm_ncopy_8_msa.c @@ -32,9 +32,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, FLOAT * __restrict dst) { BLASLONG i, j; - FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4; - FLOAT *psrc5, *psrc6, *psrc7, *psrc8; - FLOAT *pdst; + FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *psrc5, *psrc6, *psrc7; + FLOAT *psrc8, *pdst; v2f64 src0, src1, src2, src3, src4, src5, src6, src7; v2f64 src8, src9, src10, src11, src12, src13, src14, src15; v2f64 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; @@ -56,80 +55,51 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, for (i = (m >> 3); i--;) { - LD_DP2(psrc1, 2, src0, src1); - LD_DP2(psrc2, 2, src2, src3); - LD_DP2(psrc3, 2, src4, src5); - LD_DP2(psrc4, 2, src6, src7); - LD_DP2(psrc5, 2, src8, src9); - LD_DP2(psrc6, 2, src10, src11); - LD_DP2(psrc7, 2, src12, src13); - LD_DP2(psrc8, 2, src14, src15); - - dst0 = (v2f64) __msa_ilvr_d((v2i64) src2, (v2i64) src0); - dst1 = (v2f64) __msa_ilvr_d((v2i64) src6, (v2i64) src4); - dst2 = (v2f64) __msa_ilvr_d((v2i64) src10, (v2i64) src8); - dst3 = 
(v2f64) __msa_ilvr_d((v2i64) src14, (v2i64) src12); - dst4 = (v2f64) __msa_ilvl_d((v2i64) src2, (v2i64) src0); - dst5 = (v2f64) __msa_ilvl_d((v2i64) src6, (v2i64) src4); - dst6 = (v2f64) __msa_ilvl_d((v2i64) src10, (v2i64) src8); - dst7 = (v2f64) __msa_ilvl_d((v2i64) src14, (v2i64) src12); - - ST_DP8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 2); - - dst0 = (v2f64) __msa_ilvr_d((v2i64) src3, (v2i64) src1); - dst1 = (v2f64) __msa_ilvr_d((v2i64) src7, (v2i64) src5); - dst2 = (v2f64) __msa_ilvr_d((v2i64) src11, (v2i64) src9); - dst3 = (v2f64) __msa_ilvr_d((v2i64) src15, (v2i64) src13); - dst4 = (v2f64) __msa_ilvl_d((v2i64) src3, (v2i64) src1); - dst5 = (v2f64) __msa_ilvl_d((v2i64) src7, (v2i64) src5); - dst6 = (v2f64) __msa_ilvl_d((v2i64) src11, (v2i64) src9); - dst7 = (v2f64) __msa_ilvl_d((v2i64) src15, (v2i64) src13); - - ST_DP8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst + 16, - 2); - - LD_DP2(psrc1 + 4, 2, src0, src1); - LD_DP2(psrc2 + 4, 2, src2, src3); - LD_DP2(psrc3 + 4, 2, src4, src5); - LD_DP2(psrc4 + 4, 2, src6, src7); - LD_DP2(psrc5 + 4, 2, src8, src9); - LD_DP2(psrc6 + 4, 2, src10, src11); - LD_DP2(psrc7 + 4, 2, src12, src13); - LD_DP2(psrc8 + 4, 2, src14, src15); - - dst0 = (v2f64) __msa_ilvr_d((v2i64) src2, (v2i64) src0); - dst1 = (v2f64) __msa_ilvr_d((v2i64) src6, (v2i64) src4); - dst2 = (v2f64) __msa_ilvr_d((v2i64) src10, (v2i64) src8); - dst3 = (v2f64) __msa_ilvr_d((v2i64) src14, (v2i64) src12); - dst4 = (v2f64) __msa_ilvl_d((v2i64) src2, (v2i64) src0); - dst5 = (v2f64) __msa_ilvl_d((v2i64) src6, (v2i64) src4); - dst6 = (v2f64) __msa_ilvl_d((v2i64) src10, (v2i64) src8); - dst7 = (v2f64) __msa_ilvl_d((v2i64) src14, (v2i64) src12); - - ST_DP8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst + 32, - 2); - - dst0 = (v2f64) __msa_ilvr_d((v2i64) src3, (v2i64) src1); - dst1 = (v2f64) __msa_ilvr_d((v2i64) src7, (v2i64) src5); - dst2 = (v2f64) __msa_ilvr_d((v2i64) src11, (v2i64) src9); - dst3 = (v2f64) __msa_ilvr_d((v2i64) src15, 
(v2i64) src13); - dst4 = (v2f64) __msa_ilvl_d((v2i64) src3, (v2i64) src1); - dst5 = (v2f64) __msa_ilvl_d((v2i64) src7, (v2i64) src5); - dst6 = (v2f64) __msa_ilvl_d((v2i64) src11, (v2i64) src9); - dst7 = (v2f64) __msa_ilvl_d((v2i64) src15, (v2i64) src13); - - ST_DP8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst + 48, - 2); - - psrc1 += 8; - psrc2 += 8; - psrc3 += 8; - psrc4 += 8; - psrc5 += 8; - psrc6 += 8; - psrc7 += 8; - psrc8 += 8; - pdst += 64; + LD_DP2_INC(psrc1, 2, src0, src1); + LD_DP2_INC(psrc2, 2, src2, src3); + LD_DP2_INC(psrc3, 2, src4, src5); + LD_DP2_INC(psrc4, 2, src6, src7); + LD_DP2_INC(psrc5, 2, src8, src9); + LD_DP2_INC(psrc6, 2, src10, src11); + LD_DP2_INC(psrc7, 2, src12, src13); + LD_DP2_INC(psrc8, 2, src14, src15); + + ILVRL_D2_DP(src2, src0, dst0, dst4); + ILVRL_D2_DP(src6, src4, dst1, dst5); + ILVRL_D2_DP(src10, src8, dst2, dst6); + ILVRL_D2_DP(src14, src12, dst3, dst7); + + ST_DP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 2); + + ILVRL_D2_DP(src3, src1, dst0, dst4); + ILVRL_D2_DP(src7, src5, dst1, dst5); + ILVRL_D2_DP(src11, src9, dst2, dst6); + ILVRL_D2_DP(src15, src13, dst3, dst7); + + ST_DP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 2); + + LD_DP2_INC(psrc1, 2, src0, src1); + LD_DP2_INC(psrc2, 2, src2, src3); + LD_DP2_INC(psrc3, 2, src4, src5); + LD_DP2_INC(psrc4, 2, src6, src7); + LD_DP2_INC(psrc5, 2, src8, src9); + LD_DP2_INC(psrc6, 2, src10, src11); + LD_DP2_INC(psrc7, 2, src12, src13); + LD_DP2_INC(psrc8, 2, src14, src15); + + ILVRL_D2_DP(src2, src0, dst0, dst4); + ILVRL_D2_DP(src6, src4, dst1, dst5); + ILVRL_D2_DP(src10, src8, dst2, dst6); + ILVRL_D2_DP(src14, src12, dst3, dst7); + + ST_DP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 2); + + ILVRL_D2_DP(src3, src1, dst0, dst4); + ILVRL_D2_DP(src7, src5, dst1, dst5); + ILVRL_D2_DP(src11, src9, dst2, dst6); + ILVRL_D2_DP(src15, src13, dst3, dst7); + + ST_DP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 2); } for (i = (m 
& 7); i--;) @@ -155,27 +125,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, for (i = (m >> 2); i--;) { - LD_DP2(psrc1, 2, src0, src1); - LD_DP2(psrc2, 2, src2, src3); - LD_DP2(psrc3, 2, src4, src5); - LD_DP2(psrc4, 2, src6, src7); - psrc1 += 4; - psrc2 += 4; - psrc3 += 4; - psrc4 += 4; - - dst0 = (v2f64) __msa_ilvr_d((v2i64) src2, (v2i64) src0); - dst1 = (v2f64) __msa_ilvr_d((v2i64) src6, (v2i64) src4); - dst2 = (v2f64) __msa_ilvr_d((v2i64) src3, (v2i64) src1); - dst3 = (v2f64) __msa_ilvr_d((v2i64) src7, (v2i64) src5); - - dst4 = (v2f64) __msa_ilvl_d((v2i64) src2, (v2i64) src0); - dst5 = (v2f64) __msa_ilvl_d((v2i64) src6, (v2i64) src4); - dst6 = (v2f64) __msa_ilvl_d((v2i64) src3, (v2i64) src1); - dst7 = (v2f64) __msa_ilvl_d((v2i64) src7, (v2i64) src5); - - ST_DP8(dst0, dst1, dst4, dst5, dst2, dst3, dst6, dst7, pdst, 2); - pdst += 16; + LD_DP2_INC(psrc1, 2, src0, src1); + LD_DP2_INC(psrc2, 2, src2, src3); + LD_DP2_INC(psrc3, 2, src4, src5); + LD_DP2_INC(psrc4, 2, src6, src7); + + ILVRL_D2_DP(src2, src0, dst0, dst4); + ILVRL_D2_DP(src6, src4, dst1, dst5); + ILVRL_D2_DP(src3, src1, dst2, dst6); + ILVRL_D2_DP(src7, src5, dst3, dst7); + + ST_DP8_INC(dst0, dst1, dst4, dst5, dst2, dst3, dst6, dst7, pdst, 2); } for (i = (m & 3); i--;) @@ -200,11 +160,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, psrc1 += 2; psrc2 += 2; - dst0 = (v2f64) __msa_ilvr_d((v2i64) src1, (v2i64) src0); - dst1 = (v2f64) __msa_ilvl_d((v2i64) src1, (v2i64) src0); + ILVRL_D2_DP(src1, src0, dst0, dst1); - ST_DP2(dst0, dst1, pdst, 2); - pdst += 4; + ST_DP2_INC(dst0, dst1, pdst, 2); } if (m & 1) diff --git a/kernel/mips/dgemm_tcopy_4_msa.c b/kernel/mips/dgemm_tcopy_4_msa.c index f147d190e..a51c47429 100644 --- a/kernel/mips/dgemm_tcopy_4_msa.c +++ b/kernel/mips/dgemm_tcopy_4_msa.c @@ -55,14 +55,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, for (i = (n >> 2); i--;) { - LD_DP2(psrc1, 2, src0, src1); - LD_DP2(psrc2, 
2, src2, src3); - LD_DP2(psrc3, 2, src4, src5); - LD_DP2(psrc4, 2, src6, src7); - psrc1 += 4; - psrc2 += 4; - psrc3 += 4; - psrc4 += 4; + LD_DP2_INC(psrc1, 2, src0, src1); + LD_DP2_INC(psrc2, 2, src2, src3); + LD_DP2_INC(psrc3, 2, src4, src5); + LD_DP2_INC(psrc4, 2, src6, src7); ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2); pdst1 += m * 4; @@ -79,8 +75,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, psrc3 += 2; psrc4 += 2; - ST_DP4(src0, src1, src2, src3, pdst2, 2); - pdst2 += 8; + ST_DP4_INC(src0, src1, src2, src3, pdst2, 2); } if (n & 1) @@ -103,10 +98,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, for (i = (n >> 2); i--;) { - LD_DP2(psrc1, 2, src0, src1); - LD_DP2(psrc2, 2, src2, src3); - psrc1 += 4; - psrc2 += 4; + LD_DP2_INC(psrc1, 2, src0, src1); + LD_DP2_INC(psrc2, 2, src2, src3); ST_DP4(src0, src1, src2, src3, pdst1, 2); pdst1 += m * 4; @@ -119,8 +112,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, psrc1 += 2; psrc2 += 2; - ST_DP2(src0, src1, pdst2, 2); - pdst2 += 4; + ST_DP2_INC(src0, src1, pdst2, 2); } if (n & 1) @@ -137,8 +129,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, for (i = (n >> 2); i--;) { - LD_DP2(psrc1, 2, src0, src1); - psrc1 += 4; + LD_DP2_INC(psrc1, 2, src0, src1); ST_DP2(src0, src1, pdst1, 2); pdst1 += 4 * m; diff --git a/kernel/mips/dgemm_tcopy_8_msa.c b/kernel/mips/dgemm_tcopy_8_msa.c index d1ac49b5a..350ecb359 100644 --- a/kernel/mips/dgemm_tcopy_8_msa.c +++ b/kernel/mips/dgemm_tcopy_8_msa.c @@ -62,27 +62,19 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, for (i = (n >> 3); i--;) { - LD_DP4(psrc1, 2, src0, src1, src2, src3); - LD_DP4(psrc2, 2, src4, src5, src6, src7); - LD_DP4(psrc3, 2, src8, src9, src10, src11); - LD_DP4(psrc4, 2, src12, src13, src14, src15); - psrc1 += 8; - psrc2 += 8; - psrc3 += 8; - psrc4 += 8; + LD_DP4_INC(psrc1, 2, src0, src1, src2, src3); + 
LD_DP4_INC(psrc2, 2, src4, src5, src6, src7); + LD_DP4_INC(psrc3, 2, src8, src9, src10, src11); + LD_DP4_INC(psrc4, 2, src12, src13, src14, src15); ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2); ST_DP8(src8, src9, src10, src11, src12, src13, src14, src15, pdst1 + 16, 2); - LD_DP4(psrc5, 2, src0, src1, src2, src3); - LD_DP4(psrc6, 2, src4, src5, src6, src7); - LD_DP4(psrc7, 2, src8, src9, src10, src11); - LD_DP4(psrc8, 2, src12, src13, src14, src15); - psrc5 += 8; - psrc6 += 8; - psrc7 += 8; - psrc8 += 8; + LD_DP4_INC(psrc5, 2, src0, src1, src2, src3); + LD_DP4_INC(psrc6, 2, src4, src5, src6, src7); + LD_DP4_INC(psrc7, 2, src8, src9, src10, src11); + LD_DP4_INC(psrc8, 2, src12, src13, src14, src15); ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1 + 32, 2); @@ -93,27 +85,18 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, if (n & 4) { - LD_DP2(psrc1, 2, src0, src1); - LD_DP2(psrc2, 2, src2, src3); - LD_DP2(psrc3, 2, src4, src5); - LD_DP2(psrc4, 2, src6, src7); - LD_DP2(psrc5, 2, src8, src9); - LD_DP2(psrc6, 2, src10, src11); - LD_DP2(psrc7, 2, src12, src13); - LD_DP2(psrc8, 2, src14, src15); - psrc1 += 4; - psrc2 += 4; - psrc3 += 4; - psrc4 += 4; - psrc5 += 4; - psrc6 += 4; - psrc7 += 4; - psrc8 += 4; - - ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst2, 2); - ST_DP8(src8, src9, src10, src11, src12, src13, src14, src15, - pdst2 + 16, 2); - pdst2 += 32; + LD_DP2_INC(psrc1, 2, src0, src1); + LD_DP2_INC(psrc2, 2, src2, src3); + LD_DP2_INC(psrc3, 2, src4, src5); + LD_DP2_INC(psrc4, 2, src6, src7); + LD_DP2_INC(psrc5, 2, src8, src9); + LD_DP2_INC(psrc6, 2, src10, src11); + LD_DP2_INC(psrc7, 2, src12, src13); + LD_DP2_INC(psrc8, 2, src14, src15); + + ST_DP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst2, 2); + ST_DP8_INC(src8, src9, src10, src11, src12, src13, src14, src15, + pdst2, 2); } if (n & 2) @@ -135,8 +118,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, 
psrc7 += 2; psrc8 += 2; - ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst3, 2); - pdst3 += 16; + ST_DP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst3, 2); } if (n & 1) @@ -165,18 +147,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, for (i = (n >> 3); i--;) { - LD_DP4(psrc1, 2, src0, src1, src2, src3); - LD_DP4(psrc2, 2, src4, src5, src6, src7); - LD_DP4(psrc3, 2, src8, src9, src10, src11); - LD_DP4(psrc4, 2, src12, src13, src14, src15); - psrc1 += 8; - psrc2 += 8; - psrc3 += 8; - psrc4 += 8; - psrc5 += 8; - psrc6 += 8; - psrc7 += 8; - psrc8 += 8; + LD_DP4_INC(psrc1, 2, src0, src1, src2, src3); + LD_DP4_INC(psrc2, 2, src4, src5, src6, src7); + LD_DP4_INC(psrc3, 2, src8, src9, src10, src11); + LD_DP4_INC(psrc4, 2, src12, src13, src14, src15); ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2); ST_DP8(src8, src9, src10, src11, src12, src13, src14, src15, @@ -186,17 +160,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, if (n & 4) { - LD_DP2(psrc1, 2, src0, src1); - LD_DP2(psrc2, 2, src2, src3); - LD_DP2(psrc3, 2, src4, src5); - LD_DP2(psrc4, 2, src6, src7); - psrc1 += 4; - psrc2 += 4; - psrc3 += 4; - psrc4 += 4; - - ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst2, 2); - pdst2 += 16; + LD_DP2_INC(psrc1, 2, src0, src1); + LD_DP2_INC(psrc2, 2, src2, src3); + LD_DP2_INC(psrc3, 2, src4, src5); + LD_DP2_INC(psrc4, 2, src6, src7); + + ST_DP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst2, 2); } if (n & 2) @@ -210,8 +179,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, psrc3 += 2; psrc4 += 2; - ST_DP4(src0, src1, src2, src3, pdst3, 2); - pdst3 += 8; + ST_DP4_INC(src0, src1, src2, src3, pdst3, 2); } if (n & 1) @@ -234,10 +202,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, for (i = (n >> 3); i--;) { - LD_DP4(psrc1, 2, src0, src1, src2, src3); - LD_DP4(psrc2, 2, src4, src5, src6, src7); - psrc1 += 8; - 
psrc2 += 8; + LD_DP4_INC(psrc1, 2, src0, src1, src2, src3); + LD_DP4_INC(psrc2, 2, src4, src5, src6, src7); ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2); pdst1 += 8 * m; @@ -245,13 +211,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, if (n & 4) { - LD_DP2(psrc1, 2, src0, src1); - LD_DP2(psrc2, 2, src2, src3); - psrc1 += 4; - psrc2 += 4; + LD_DP2_INC(psrc1, 2, src0, src1); + LD_DP2_INC(psrc2, 2, src2, src3); - ST_DP4(src0, src1, src2, src3, pdst2, 2); - pdst2 += 8; + ST_DP4_INC(src0, src1, src2, src3, pdst2, 2); } if (n & 2) @@ -261,8 +224,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, psrc1 += 2; psrc2 += 2; - ST_DP2(src0, src1, pdst3, 2); - pdst3 += 4; + ST_DP2_INC(src0, src1, pdst3, 2); } if (n & 1) @@ -282,8 +244,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, for (i = (n >> 3); i--;) { - LD_DP4(psrc1, 2, src0, src1, src2, src3); - psrc1 += 8; + LD_DP4_INC(psrc1, 2, src0, src1, src2, src3); ST_DP4(src0, src1, src2, src3, pdst1, 2); pdst1 += 8 * m; @@ -291,11 +252,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, if (n & 4) { - LD_DP2(psrc1, 2, src0, src1); - psrc1 += 4; + LD_DP2_INC(psrc1, 2, src0, src1); - ST_DP2(src0, src1, pdst2, 2); - pdst2 += 4; + ST_DP2_INC(src0, src1, pdst2, 2); } if (n & 2) diff --git a/kernel/mips/macros_msa.h b/kernel/mips/macros_msa.h index 0efca7860..dbc185302 100644 --- a/kernel/mips/macros_msa.h +++ b/kernel/mips/macros_msa.h @@ -42,10 +42,82 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ST_D(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) #define ST_DP(...) 
ST_D(v2f64, __VA_ARGS__) -#define COPY_FLOAT_TO_VECTOR(a, b) \ - b = __msa_cast_to_vector_float(a); \ - b = (v4f32) __msa_splati_w((v4i32) b, 0); +#define COPY_FLOAT_TO_VECTOR(a) ( { \ + v4f32 out; \ + out = __msa_cast_to_vector_float(a); \ + out = (v4f32) __msa_splati_w((v4i32) out, 0); \ + out; \ +} ) +#define COPY_DOUBLE_TO_VECTOR(a) ( { \ + v2f64 out; \ + out = __msa_cast_to_vector_double(a); \ + out = (v2f64) __msa_splati_d((v2i64) out, 0); \ + out; \ +} ) + +/* Description : Load 2 variables with stride + Arguments : Inputs - psrc, stride + Outputs - out0, out1 +*/ +#define LD_GP2_INC(psrc, stride, out0, out1) \ +{ \ + out0 = *(psrc); \ + (psrc) += stride; \ + out1 = *(psrc); \ + (psrc) += stride; \ +} + +#define LD_GP3_INC(psrc, stride, out0, \ + out1, out2) \ +{ \ + LD_GP2_INC(psrc, stride, out0, out1); \ + out2 = *(psrc); \ + (psrc) += stride; \ +} + +#define LD_GP4_INC(psrc, stride, out0, \ + out1, out2, out3) \ +{ \ + LD_GP2_INC(psrc, stride, out0, out1); \ + LD_GP2_INC(psrc, stride, out2, out3); \ +} + +#define LD_GP5_INC(psrc, stride, out0, \ + out1, out2, out3, out4) \ +{ \ + LD_GP2_INC(psrc, stride, out0, out1); \ + LD_GP2_INC(psrc, stride, out2, out3); \ + out4 = *(psrc); \ + (psrc) += stride; \ +} + +#define LD_GP6_INC(psrc, stride, out0, \ + out1, out2, out3, \ + out4, out5) \ +{ \ + LD_GP2_INC(psrc, stride, out0, out1); \ + LD_GP2_INC(psrc, stride, out2, out3); \ + LD_GP2_INC(psrc, stride, out4, out5); \ +} + +#define LD_GP7_INC(psrc, stride, out0, \ + out1, out2, out3, \ + out4, out5, out6) \ +{ \ + LD_GP2_INC(psrc, stride, out0, out1); \ + LD_GP2_INC(psrc, stride, out2, out3); \ + LD_GP2_INC(psrc, stride, out4, out5); \ + out6 = *(psrc); \ + (psrc) += stride; \ +} + +#define LD_GP8_INC(psrc, stride, out0, out1, out2, \ + out3, out4, out5, out6, out7) \ +{ \ + LD_GP4_INC(psrc, stride, out0, out1, out2, out3); \ + LD_GP4_INC(psrc, stride, out4, out5, out6, out7); \ +} /* Description : Load 2 vectors of single precision floating point elements 
with stride Arguments : Inputs - psrc, stride @@ -58,6 +130,82 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. out1 = LD_SP((psrc) + stride); \ } +#define LD_SP4(psrc, stride, out0, out1, out2, out3) \ +{ \ + LD_SP2(psrc, stride, out0, out1) \ + LD_SP2(psrc + 2 * stride, stride, out2, out3) \ +} + +#define LD_SP2_INC(psrc, stride, out0, out1) \ +{ \ + out0 = LD_SP((psrc)); \ + (psrc) += stride; \ + out1 = LD_SP((psrc)); \ + (psrc) += stride; \ +} + +#define LD_SP3_INC(psrc, stride, out0, \ + out1, out2) \ +{ \ + LD_SP2_INC(psrc, stride, out0, out1); \ + out2 = LD_SP((psrc)); \ + (psrc) += stride; \ +} + +#define LD_SP4_INC(psrc, stride, out0, \ + out1, out2, out3) \ +{ \ + LD_SP2_INC(psrc, stride, out0, out1); \ + LD_SP2_INC(psrc, stride, out2, out3); \ +} + +#define LD_SP5_INC(psrc, stride, out0, \ + out1, out2, out3, out4) \ +{ \ + LD_SP2_INC(psrc, stride, out0, out1); \ + LD_SP2_INC(psrc, stride, out2, out3); \ + out4 = LD_SP((psrc)); \ + (psrc) += stride; \ +} + +#define LD_SP6_INC(psrc, stride, out0, \ + out1, out2, out3, \ + out4, out5) \ +{ \ + LD_SP2_INC(psrc, stride, out0, out1); \ + LD_SP2_INC(psrc, stride, out2, out3); \ + LD_SP2_INC(psrc, stride, out4, out5); \ +} + +#define LD_SP7_INC(psrc, stride, out0, \ + out1, out2, out3, \ + out4, out5, out6) \ +{ \ + LD_SP2_INC(psrc, stride, out0, out1); \ + LD_SP2_INC(psrc, stride, out2, out3); \ + LD_SP2_INC(psrc, stride, out4, out5); \ + out6 = LD_SP((psrc)); \ + (psrc) += stride; \ +} + +#define LD_SP8_INC(psrc, stride, out0, out1, out2, \ + out3, out4, out5, out6, out7) \ +{ \ + LD_SP4_INC(psrc, stride, out0, out1, out2, out3); \ + LD_SP4_INC(psrc, stride, out4, out5, out6, out7); \ +} + +#define LD_SP16_INC(psrc, stride, out0, out1, out2, \ + out3, out4, out5, out6, out7, out8, \ + out9, out10, out11, out12, out13, \ + out14, out15) \ +{ \ + LD_SP8_INC(psrc, stride, out0, out1, out2, \ + out3, out4, out5, out6, out7); \ + LD_SP8_INC(psrc, stride, out8, out9, out10, \ + out11, 
out12, out13, out14, out15); \ +} + /* Description : Load 2 vectors of double precision floating point elements with stride Arguments : Inputs - psrc, stride Outputs - out0, out1 @@ -75,6 +223,139 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. LD_DP2(psrc + 2 * stride, stride, out2, out3) \ } +#define LD_DP2_INC(psrc, stride, out0, out1) \ +{ \ + out0 = LD_DP(psrc); \ + (psrc) += stride; \ + out1 = LD_DP(psrc); \ + (psrc) += stride; \ +} + +#define LD_DP3_INC(psrc, stride, out0, \ + out1, out2) \ +{ \ + LD_DP2_INC(psrc, stride, out0, out1); \ + out2 = LD_DP((psrc)); \ + (psrc) += stride; \ +} + +#define LD_DP4_INC(psrc, stride, out0, \ + out1, out2, out3) \ +{ \ + LD_DP2_INC(psrc, stride, out0, out1); \ + LD_DP2_INC(psrc, stride, out2, out3); \ +} + +#define LD_DP5_INC(psrc, stride, out0, \ + out1, out2, out3, out4) \ +{ \ + LD_DP2_INC(psrc, stride, out0, out1); \ + LD_DP2_INC(psrc, stride, out2, out3); \ + out4 = LD_DP((psrc)); \ + (psrc) += stride; \ +} + +#define LD_DP6_INC(psrc, stride, out0, \ + out1, out2, out3, \ + out4, out5) \ +{ \ + LD_DP2_INC(psrc, stride, out0, out1); \ + LD_DP2_INC(psrc, stride, out2, out3); \ + LD_DP2_INC(psrc, stride, out4, out5); \ +} + +#define LD_DP7_INC(psrc, stride, out0, \ + out1, out2, out3, \ + out4, out5, out6) \ +{ \ + LD_DP2_INC(psrc, stride, out0, out1); \ + LD_DP2_INC(psrc, stride, out2, out3); \ + LD_DP2_INC(psrc, stride, out4, out5); \ + out6 = LD_DP((psrc)); \ + (psrc) += stride; \ +} + +#define LD_DP8_INC(psrc, stride, out0, out1, out2, \ + out3, out4, out5, out6, out7) \ +{ \ + LD_DP4_INC(psrc, stride, out0, out1, out2, out3); \ + LD_DP4_INC(psrc, stride, out4, out5, out6, out7); \ +} + +#define LD_DP16_INC(psrc, stride, out0, out1, out2, \ + out3, out4, out5, out6, out7, out8, \ + out9, out10, out11, out12, out13, \ + out14, out15) \ +{ \ + LD_DP8_INC(psrc, stride, out0, out1, out2, \ + out3, out4, out5, out6, out7); \ + LD_DP8_INC(psrc, stride, out8, out9, out10, \ + out11, out12, 
out13, out14, out15); \ +} + +/* Description : Store GP variable with stride + Arguments : Inputs - in0, in1, pdst, stride + Details : Store 4 single precision floating point elements from 'in0' to (pdst) + Store 4 single precision floating point elements from 'in1' to (pdst + stride) +*/ +#define ST_GP2_INC(in0, in1, \ + pdst, stride) \ +{ \ + *(pdst) = in0; \ + (pdst) += stride; \ + *(pdst) = in1; \ + (pdst) += stride; \ +} + +#define ST_GP3_INC(in0, in1, in2, \ + pdst, stride) \ +{ \ + ST_GP2_INC(in0, in1, pdst, stride); \ + *(pdst) = in2; \ + (pdst) += stride; \ +} + +#define ST_GP4_INC(in0, in1, in2, in3, \ + pdst, stride) \ +{ \ + ST_GP2_INC(in0, in1, pdst, stride); \ + ST_GP2_INC(in2, in3, pdst, stride); \ +} + +#define ST_GP5_INC(in0, in1, in2, in3, \ + in4, pdst, stride) \ +{ \ + ST_GP2_INC(in0, in1, pdst, stride); \ + ST_GP2_INC(in2, in3, pdst, stride); \ + *(pdst) = in4; \ + (pdst) += stride; \ +} + +#define ST_GP6_INC(in0, in1, in2, in3, \ + in4, in5, pdst, stride) \ +{ \ + ST_GP2_INC(in0, in1, pdst, stride); \ + ST_GP2_INC(in2, in3, pdst, stride); \ + ST_GP2_INC(in4, in5, pdst, stride); \ +} + +#define ST_GP7_INC(in0, in1, in2, in3, in4, \ + in5, in6, pdst, stride) \ +{ \ + ST_GP2_INC(in0, in1, pdst, stride); \ + ST_GP2_INC(in2, in3, pdst, stride); \ + ST_GP2_INC(in4, in5, pdst, stride); \ + *(pdst) = in6; \ + (pdst) += stride; \ +} + +#define ST_GP8_INC(in0, in1, in2, in3, in4, in5, \ + in6, in7, pdst, stride) \ +{ \ + ST_GP4_INC(in0, in1, in2, in3, pdst, stride); \ + ST_GP4_INC(in4, in5, in6, in7, pdst, stride); \ +} + /* Description : Store vectors of single precision floating point elements with stride Arguments : Inputs - in0, in1, pdst, stride Details : Store 4 single precision floating point elements from 'in0' to (pdst) @@ -98,6 +379,73 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ST_SP4(in4, in5, in6, in7, (pdst + 4 * stride), stride); \ } +#define ST_SP2_INC(in0, in1, pdst, stride) \ +{ \ + ST_SP(in0, (pdst)); \ + (pdst) += stride; \ + ST_SP(in1, (pdst)); \ + (pdst) += stride; \ +} + +#define ST_SP3_INC(in0, in1, in2, \ + pdst, stride) \ +{ \ + ST_SP2_INC(in0, in1, pdst, stride); \ + ST_SP(in2, (pdst)); \ + (pdst) += stride; \ +} + +#define ST_SP4_INC(in0, in1, in2, in3, \ + pdst, stride) \ +{ \ + ST_SP2_INC(in0, in1, pdst, stride); \ + ST_SP2_INC(in2, in3, pdst, stride); \ +} + +#define ST_SP5_INC(in0, in1, in2, in3, \ + in4, pdst, stride) \ +{ \ + ST_SP2_INC(in0, in1, pdst, stride); \ + ST_SP2_INC(in2, in3, pdst, stride); \ + ST_SP(in4, (pdst)); \ + (pdst) += stride; \ +} + +#define ST_SP6_INC(in0, in1, in2, in3, \ + in4, in5, pdst, stride) \ +{ \ + ST_SP2_INC(in0, in1, pdst, stride); \ + ST_SP2_INC(in2, in3, pdst, stride); \ + ST_SP2_INC(in4, in5, pdst, stride); \ +} + +#define ST_SP7_INC(in0, in1, in2, in3, in4, \ + in5, in6, pdst, stride) \ +{ \ + ST_SP2_INC(in0, in1, pdst, stride); \ + ST_SP2_INC(in2, in3, pdst, stride); \ + ST_SP2_INC(in4, in5, pdst, stride); \ + ST_SP(in6, (pdst)); \ + (pdst) += stride; \ +} + +#define ST_SP8_INC(in0, in1, in2, in3, in4, in5, \ + in6, in7, pdst, stride) \ +{ \ + ST_SP4_INC(in0, in1, in2, in3, pdst, stride); \ + ST_SP4_INC(in4, in5, in6, in7, pdst, stride); \ +} + +#define ST_SP16_INC(in0, in1, in2, in3, in4, in5, in6, \ + in7, in8, in9, in10, in11, in12, \ + in13, in14, in15, pdst, stride) \ +{ \ + ST_SP8_INC(in0, in1, in2, in3, in4, in5, in6, \ + in7, pdst, stride); \ + ST_SP8_INC(in8, in9, in10, in11, in12, in13, in14, \ + in15, pdst, stride); \ +} + /* Description : Store vectors of double precision floating point elements with stride Arguments : Inputs - in0, in1, pdst, stride Details : Store 2 double precision floating point elements from 'in0' to (pdst) @@ -121,6 +469,104 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ST_DP4(in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ } +#define ST_DP2_INC(in0, in1, pdst, stride) \ +{ \ + ST_DP(in0, (pdst)); \ + (pdst) += stride; \ + ST_DP(in1, (pdst)); \ + (pdst) += stride; \ +} + +#define ST_DP3_INC(in0, in1, in2, \ + pdst, stride) \ +{ \ + ST_DP2_INC(in0, in1, pdst, stride); \ + ST_DP(in2, (pdst)); \ + (pdst) += stride; \ +} + +#define ST_DP4_INC(in0, in1, in2, in3, \ + pdst, stride) \ +{ \ + ST_DP2_INC(in0, in1, pdst, stride); \ + ST_DP2_INC(in2, in3, pdst, stride); \ +} + +#define ST_DP5_INC(in0, in1, in2, in3, \ + in4, pdst, stride) \ +{ \ + ST_DP2_INC(in0, in1, pdst, stride); \ + ST_DP2_INC(in2, in3, pdst, stride); \ + ST_DP(in4, (pdst)); \ + (pdst) += stride; \ +} + +#define ST_DP6_INC(in0, in1, in2, in3, \ + in4, in5, pdst, stride) \ +{ \ + ST_DP2_INC(in0, in1, pdst, stride); \ + ST_DP2_INC(in2, in3, pdst, stride); \ + ST_DP2_INC(in4, in5, pdst, stride); \ +} + +#define ST_DP7_INC(in0, in1, in2, in3, in4, \ + in5, in6, pdst, stride) \ +{ \ + ST_DP2_INC(in0, in1, pdst, stride); \ + ST_DP2_INC(in2, in3, pdst, stride); \ + ST_DP2_INC(in4, in5, pdst, stride); \ + ST_DP(in6, (pdst)); \ + (pdst) += stride; \ +} + +#define ST_DP8_INC(in0, in1, in2, in3, in4, in5, \ + in6, in7, pdst, stride) \ +{ \ + ST_DP4_INC(in0, in1, in2, in3, pdst, stride); \ + ST_DP4_INC(in4, in5, in6, in7, pdst, stride); \ +} + +#define ST_DP16_INC(in0, in1, in2, in3, in4, in5, in6, \ + in7, in8, in9, in10, in11, in12, \ + in13, in14, in15, pdst, stride) \ +{ \ + ST_DP8_INC(in0, in1, in2, in3, in4, in5, in6, \ + in7, pdst, stride); \ + ST_DP8_INC(in8, in9, in10, in11, in12, in13, in14, \ + in15, pdst, stride); \ +} + +/* Description : shuffle elements in vector as shf_val + Arguments : Inputs - in0, in1 + Outputs - out0, out1 + Return Type - as per RTYPE +*/ +#define SHF_W2(RTYPE, in0, in1, out0, out1, shf_val) \ +{ \ + out0 = (RTYPE) __msa_shf_w((v4i32) in0, shf_val); \ + out1 = (RTYPE) __msa_shf_w((v4i32) in1, shf_val); \ +} +#define SHF_W2_SP(...) 
SHF_W2(v4f32, __VA_ARGS__) +#define SHF_W2_DP(...) SHF_W2(v2f64, __VA_ARGS__) + +#define SHF_W3(RTYPE, in0, in1, in2, out0, out1, out2, \ + shf_val) \ +{ \ + out0 = (RTYPE) __msa_shf_w((v4i32) in0, shf_val); \ + out1 = (RTYPE) __msa_shf_w((v4i32) in1, shf_val); \ + out2 = (RTYPE) __msa_shf_w((v4i32) in2, shf_val); \ +} +#define SHF_W3_SP(...) SHF_W3(v4f32, __VA_ARGS__) + +#define SHF_W4(RTYPE, in0, in1, in2, in3, \ + out0, out1, out2, out3, shf_val) \ +{ \ + SHF_W2(RTYPE, in0, in1, out0, out1, shf_val); \ + SHF_W2(RTYPE, in2, in3, out2, out3, shf_val); \ +} +#define SHF_W4_SP(...) SHF_W4(v4f32, __VA_ARGS__) +#define SHF_W4_DP(...) SHF_W4(v2f64, __VA_ARGS__) + /* Description : Interleave both left and right half of input vectors Arguments : Inputs - in0, in1 Outputs - out0, out1 @@ -134,12 +580,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. out1 = (RTYPE) __msa_ilvl_w((v4i32) in0, (v4i32) in1); \ } #define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__) +#define ILVRL_W2_SP(...) ILVRL_W2(v4f32, __VA_ARGS__) #define ILVRL_D2(RTYPE, in0, in1, out0, out1) \ { \ out0 = (RTYPE) __msa_ilvr_d((v2i64) in0, (v2i64) in1); \ out1 = (RTYPE) __msa_ilvl_d((v2i64) in0, (v2i64) in1); \ } +#define ILVRL_D2_SP(...) ILVRL_D2(v4f32, __VA_ARGS__) #define ILVRL_D2_DP(...) ILVRL_D2(v2f64, __VA_ARGS__) /* Description : Indexed word element values are replicated to all @@ -158,6 +606,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. out0 = (RTYPE) __msa_splati_w((v4i32) in, stidx); \ out1 = (RTYPE) __msa_splati_w((v4i32) in, (stidx+1)); \ } +#define SPLATI_W2_SP(...) SPLATI_W2(v4f32, __VA_ARGS__) #define SPLATI_W4(RTYPE, in, out0, out1, out2, out3) \ { \ @@ -166,22 +615,132 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. } #define SPLATI_W4_SP(...) 
SPLATI_W4(v4f32, __VA_ARGS__) +#define SPLATI_D2(RTYPE, in, out0, out1) \ +{ \ + out0 = (RTYPE) __msa_splati_d((v2i64) in, 0); \ + out1 = (RTYPE) __msa_splati_d((v2i64) in, 1); \ +} +#define SPLATI_D2_DP(...) SPLATI_D2(v2f64, __VA_ARGS__) + +/* Description : Pack even double word elements of vector pairs + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even double word elements of 'in0' are copied to the left half + of 'out0' & even double word elements of 'in1' are copied to + the right half of 'out0'. +*/ +#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ +{ \ + out0 = (RTYPE) __msa_pckev_d((v2i64) in0, (v2i64) in1); \ + out1 = (RTYPE) __msa_pckev_d((v2i64) in2, (v2i64) in3); \ +} +#define PCKEV_D2_SP(...) PCKEV_D2(v4f32, __VA_ARGS__) +#define PCKEV_D2_SD(...) PCKEV_D2(v2f64, __VA_ARGS__) + +#define PCKEV_D3(RTYPE, in0, in1, in2, in3, in4, in5, \ + out0, out1, out2) \ +{ \ + out0 = (RTYPE) __msa_pckev_d((v2i64) in0, (v2i64) in1); \ + out1 = (RTYPE) __msa_pckev_d((v2i64) in2, (v2i64) in3); \ + out2 = (RTYPE) __msa_pckev_d((v2i64) in4, (v2i64) in5); \ +} +#define PCKEV_D3_SP(...) PCKEV_D3(v4f32, __VA_ARGS__) + +#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3) \ +{ \ + PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ + PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3); \ +} +#define PCKEV_D4_SP(...) PCKEV_D4(v4f32, __VA_ARGS__) + +/* Description : pack both even and odd half of input vectors + Arguments : Inputs - in0, in1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even double word elements of 'in0' and 'in1' are copied to the + 'out0' & odd double word elements of 'in0' and 'in1' are + copied to the 'out1'. +*/ +#define PCKEVOD_W2(RTYPE, in0, in1, out0, out1) \ +{ \ + out0 = (RTYPE) __msa_pckev_w((v4i32) in0, (v4i32) in1); \ + out1 = (RTYPE) __msa_pckod_w((v4i32) in0, (v4i32) in1); \ +} +#define PCKEVOD_W2_SP(...) 
PCKEVOD_W2(v4f32, __VA_ARGS__) + +#define PCKEVOD_D2(RTYPE, in0, in1, out0, out1) \ +{ \ + out0 = (RTYPE) __msa_pckev_d((v2i64) in0, (v2i64) in1); \ + out1 = (RTYPE) __msa_pckod_d((v2i64) in0, (v2i64) in1); \ +} +#define PCKEVOD_D2_DP(...) PCKEVOD_D2(v2f64, __VA_ARGS__) + +/* Description : Multiplication of pairs of vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Details : Each element from 'in0' is multiplied with elements from 'in1' + and the result is written to 'out0' +*/ +#define MUL2(in0, in1, in2, in3, out0, out1) \ +{ \ + out0 = in0 * in1; \ + out1 = in2 * in3; \ +} +#define MUL3(in0, in1, in2, in3, in4, in5, \ + out0, out1, out2) \ +{ \ + out0 = in0 * in1; \ + out1 = in2 * in3; \ + out2 = in4 * in5; \ +} +#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3) \ +{ \ + MUL2(in0, in1, in2, in3, out0, out1); \ + MUL2(in4, in5, in6, in7, out2, out3); \ +} + +/* Description : Addition of 2 pairs of variables + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Details : Each element in 'in0' is added to 'in1' and result is written + to 'out0'. 
+*/ +#define ADD2(in0, in1, in2, in3, out0, out1) \ +{ \ + out0 = in0 + in1; \ + out1 = in2 + in3; \ +} +#define ADD3(in0, in1, in2, in3, in4, in5, \ + out0, out1, out2) \ +{ \ + out0 = in0 + in1; \ + out1 = in2 + in3; \ + out2 = in4 + in5; \ +} +#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3) \ +{ \ + ADD2(in0, in1, in2, in3, out0, out1); \ + ADD2(in4, in5, in6, in7, out2, out3); \ +} + /* Description : Transpose 4x4 block with word elements in vectors Arguments : Inputs - in0, in1, in2, in3 Outputs - out0, out1, out2, out3 Return Type - as per RTYPE */ -#define TRANSPOSE4x4_W(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) \ -{ \ - v4i32 s0_m, s1_m, s2_m, s3_m; \ - \ - ILVRL_W2_SW(in1, in0, s0_m, s1_m); \ - ILVRL_W2_SW(in3, in2, s2_m, s3_m); \ - \ - out0 = (RTYPE) __msa_ilvr_d((v2i64) s2_m, (v2i64) s0_m); \ - out1 = (RTYPE) __msa_ilvl_d((v2i64) s2_m, (v2i64) s0_m); \ - out2 = (RTYPE) __msa_ilvr_d((v2i64) s3_m, (v2i64) s1_m); \ - out3 = (RTYPE) __msa_ilvl_d((v2i64) s3_m, (v2i64) s1_m); \ +#define TRANSPOSE4x4_W(RTYPE, in0, in1, in2, in3, \ + out0, out1, out2, out3) \ +{ \ + v4i32 s0_m, s1_m, s2_m, s3_m; \ + \ + ILVRL_W2_SW(in1, in0, s0_m, s1_m); \ + ILVRL_W2_SW(in3, in2, s2_m, s3_m); \ + ILVRL_D2(RTYPE, s2_m, s0_m, out0, out1); \ + ILVRL_D2(RTYPE, s3_m, s1_m, out2, out3); \ } #define TRANSPOSE4x4_SP_SP(...) 
TRANSPOSE4x4_W(v4f32, __VA_ARGS__) diff --git a/kernel/mips/sgemm_kernel_8x8_msa.c b/kernel/mips/sgemm_kernel_8x8_msa.c index 611ebabac..1695471ad 100644 --- a/kernel/mips/sgemm_kernel_8x8_msa.c +++ b/kernel/mips/sgemm_kernel_8x8_msa.c @@ -35,20 +35,26 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, #endif ) { - BLASLONG i, j, l; + BLASLONG i, j, l, temp; +#if defined(TRMMKERNEL) + BLASLONG off; +#endif FLOAT *pc0, *pc1, *pc2, *pc3, *pc4, *pc5, *pc6, *pc7; FLOAT *pa0, *pb0; FLOAT tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; FLOAT tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; - FLOAT a0, a1; - FLOAT b0, b1, b2, b3, b4, b5, b6, b7; + FLOAT a0, a1, b0, b1, b2, b3, b4, b5, b6, b7; v4f32 v_alpha = {alpha, alpha, alpha, alpha}; v4f32 src_a0, src_a1, src_b, src_b0, src_b1; v4f32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; v4f32 res0, res1, res2, res3, res4, res5, res6, res7; v4f32 res8, res9, res10, res11, res12, res13, res14, res15; - for (j = (n / 8); j--;) +#if defined(TRMMKERNEL) && !defined(LEFT) + off = -offset; +#endif + + for (j = (n >> 3); j--;) { pc0 = C; pc1 = pc0 + ldc; @@ -59,13 +65,35 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pc6 = pc5 + ldc; pc7 = pc6 + ldc; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + pa0 = A; - for (i = (m / 8); i--;) + for (i = (m >> 3); i--;) { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; +#else + pa0 += off * 8; + pb0 = B + off * 8; +#endif - LD_SP2(pa0, 4, src_a0, src_a1); - LD_SP2(pb0, 4, src_b0, src_b1); +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 8; // number of values in A +#else + temp = off + 8; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + + LD_SP2_INC(pa0, 4, src_a0, src_a1); + LD_SP2_INC(pb0, 4, src_b0, src_b1); src_b = (v4f32) 
__msa_shf_w((v4i32) src_b0, 0); res0 = src_a0 * src_b; @@ -99,13 +127,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, res14 = src_a0 * src_b; res15 = src_a1 * src_b; - pa0 += 8; - pb0 += 8; - - for (l = ((k - 1) / 2); l--;) + for (l = ((temp - 1) >> 1); l--;) { - LD_SP2(pa0, 4, src_a0, src_a1); - LD_SP2(pb0, 4, src_b0, src_b1); + LD_SP2_INC(pa0, 4, src_a0, src_a1); + LD_SP2_INC(pb0, 4, src_b0, src_b1); src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 += src_a0 * src_b; @@ -139,11 +164,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, res14 += src_a0 * src_b; res15 += src_a1 * src_b; - pa0 += 8; - pb0 += 8; - - LD_SP2(pa0, 4, src_a0, src_a1); - LD_SP2(pb0, 4, src_b0, src_b1); + LD_SP2_INC(pa0, 4, src_a0, src_a1); + LD_SP2_INC(pb0, 4, src_b0, src_b1); src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 += src_a0 * src_b; @@ -176,15 +198,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF); res14 += src_a0 * src_b; res15 += src_a1 * src_b; - - pa0 += 8; - pb0 += 8; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { - LD_SP2(pa0, 4, src_a0, src_a1); - LD_SP2(pb0, 4, src_b0, src_b1); + LD_SP2_INC(pa0, 4, src_a0, src_a1); + LD_SP2_INC(pb0, 4, src_b0, src_b1); src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 += src_a0 * src_b; @@ -217,11 +236,18 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF); res14 += src_a0 * src_b; res15 += src_a1 * src_b; - - pa0 += 8; - pb0 += 8; } +#if defined(TRMMKERNEL) + dst0 = res0 * v_alpha; + dst1 = res1 * v_alpha; + dst2 = res2 * v_alpha; + dst3 = res3 * v_alpha; + dst4 = res4 * v_alpha; + dst5 = res5 * v_alpha; + dst6 = res6 * v_alpha; + dst7 = res7 * v_alpha; +#else LD_SP2(pc0, 4, dst0, dst1); LD_SP2(pc1, 4, dst2, dst3); LD_SP2(pc2, 4, dst4, dst5); @@ -235,12 +261,22 @@ int CNAME(BLASLONG m, BLASLONG n, 
BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, dst5 += res5 * v_alpha; dst6 += res6 * v_alpha; dst7 += res7 * v_alpha; - - ST_SP2(dst0, dst1, pc0, 4); - ST_SP2(dst2, dst3, pc1, 4); - ST_SP2(dst4, dst5, pc2, 4); - ST_SP2(dst6, dst7, pc3, 4); - +#endif + ST_SP2_INC(dst0, dst1, pc0, 4); + ST_SP2_INC(dst2, dst3, pc1, 4); + ST_SP2_INC(dst4, dst5, pc2, 4); + ST_SP2_INC(dst6, dst7, pc3, 4); + +#if defined(TRMMKERNEL) + dst0 = res8 * v_alpha; + dst1 = res9 * v_alpha; + dst2 = res10 * v_alpha; + dst3 = res11 * v_alpha; + dst4 = res12 * v_alpha; + dst5 = res13 * v_alpha; + dst6 = res14 * v_alpha; + dst7 = res15 * v_alpha; +#else LD_SP2(pc4, 4, dst0, dst1); LD_SP2(pc5, 4, dst2, dst3); LD_SP2(pc6, 4, dst4, dst5); @@ -254,28 +290,54 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, dst5 += res13 * v_alpha; dst6 += res14 * v_alpha; dst7 += res15 * v_alpha; +#endif + ST_SP2_INC(dst0, dst1, pc4, 4); + ST_SP2_INC(dst2, dst3, pc5, 4); + ST_SP2_INC(dst4, dst5, pc6, 4); + ST_SP2_INC(dst6, dst7, pc7, 4); + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 8; // number of values in A +#else + temp -= 8; // number of values in B +#endif + pa0 += temp * 8; + pb0 += temp * 8; +#endif - ST_SP2(dst0, dst1, pc4, 4); - ST_SP2(dst2, dst3, pc5, 4); - ST_SP2(dst4, dst5, pc6, 4); - ST_SP2(dst6, dst7, pc7, 4); - - pc0 += 8; - pc1 += 8; - pc2 += 8; - pc3 += 8; - pc4 += 8; - pc5 += 8; - pc6 += 8; - pc7 += 8; +#ifdef LEFT + off += 8; // number of values in A +#endif +#endif } - for (i = ((m & 4) / 4); i--;) + if (m & 4) { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; +#else + pa0 += off * 4; + pb0 = B + off * 8; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 4; // number of values in A +#else + temp = off + 8; // 
number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif src_a0 = LD_SP(pa0); - LD_SP2(pb0, 4, src_b0, src_b1); + LD_SP2_INC(pb0, 4, src_b0, src_b1); src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 = src_a0 * src_b; @@ -302,12 +364,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, res7 = src_a0 * src_b; pa0 += 4; - pb0 += 8; - for (l = ((k - 1) / 2); l--;) + for (l = ((temp - 1) >> 1); l--;) { src_a0 = LD_SP(pa0); - LD_SP2(pb0, 4, src_b0, src_b1); + LD_SP2_INC(pb0, 4, src_b0, src_b1); src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 += src_a0 * src_b; @@ -334,10 +395,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, res7 += src_a0 * src_b; pa0 += 4; - pb0 += 8; src_a0 = LD_SP(pa0); - LD_SP2(pb0, 4, src_b0, src_b1); + LD_SP2_INC(pb0, 4, src_b0, src_b1); src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 += src_a0 * src_b; @@ -364,13 +424,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, res7 += src_a0 * src_b; pa0 += 4; - pb0 += 8; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { src_a0 = LD_SP(pa0); - LD_SP2(pb0, 4, src_b0, src_b1); + LD_SP2_INC(pb0, 4, src_b0, src_b1); src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 += src_a0 * src_b; @@ -397,9 +456,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, res7 += src_a0 * src_b; pa0 += 4; - pb0 += 8; } +#if defined(TRMMKERNEL) + dst0 = res0 * v_alpha; + dst1 = res1 * v_alpha; + dst2 = res2 * v_alpha; + dst3 = res3 * v_alpha; +#else dst0 = LD_SP(pc0); dst1 = LD_SP(pc1); dst2 = LD_SP(pc2); @@ -409,12 +473,18 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, dst1 += res1 * v_alpha; dst2 += res2 * v_alpha; dst3 += res3 * v_alpha; - +#endif ST_SP(dst0, pc0); ST_SP(dst1, pc1); ST_SP(dst2, pc2); ST_SP(dst3, pc3); +#if defined(TRMMKERNEL) + dst0 = res4 * v_alpha; + dst1 = res5 * v_alpha; + dst2 = res6 * v_alpha; + dst3 = res7 * v_alpha; 
+#else dst0 = LD_SP(pc4); dst1 = LD_SP(pc5); dst2 = LD_SP(pc6); @@ -424,12 +494,29 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, dst1 += res5 * v_alpha; dst2 += res6 * v_alpha; dst3 += res7 * v_alpha; - +#endif ST_SP(dst0, pc4); ST_SP(dst1, pc5); ST_SP(dst2, pc6); ST_SP(dst3, pc7); +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 4; // number of values in A +#else + temp -= 8; // number of values in B +#endif + pa0 += temp * 4; + pb0 += temp * 8; +#endif + +#ifdef LEFT + off += 4; // number of values in A +#endif +#endif + pc0 += 4; pc1 += 4; pc2 += 4; @@ -440,9 +527,27 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pc7 += 4; } - for (i = ((m & 2) / 2); i--;) + if (m & 2) { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; +#else + pa0 += off * 2; + pb0 = B + off * 8; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 2; // number of values in A +#else + temp = off + 8; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif a0 = pa0[0]; b0 = pb0[0]; @@ -482,7 +587,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pa0 += 2; pb0 += 8; - for (l = ((k - 1) / 2); l--;) + for (l = ((temp - 1) >> 1); l--;) { a0 = pa0[0]; b0 = pb0[0]; @@ -561,7 +666,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pb0 += 8; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { a0 = pa0[0]; b0 = pb0[0]; @@ -611,6 +716,16 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, tmp12 = alpha * tmp12; tmp14 = alpha * tmp14; +#if defined(TRMMKERNEL) + pc0[0] = tmp0; + pc1[0] = tmp2; + pc2[0] = tmp4; + pc3[0] = tmp6; + pc4[0] = tmp8; + pc5[0] = tmp10; + pc6[0] = tmp12; + 
pc7[0] = tmp14; +#else pc0[0] += tmp0; pc1[0] += tmp2; pc2[0] += tmp4; @@ -619,7 +734,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pc5[0] += tmp10; pc6[0] += tmp12; pc7[0] += tmp14; - +#endif tmp1 = alpha * tmp1; tmp3 = alpha * tmp3; tmp5 = alpha * tmp5; @@ -629,6 +744,16 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, tmp13 = alpha * tmp13; tmp15 = alpha * tmp15; +#if defined(TRMMKERNEL) + pc0[1] = tmp1; + pc1[1] = tmp3; + pc2[1] = tmp5; + pc3[1] = tmp7; + pc4[1] = tmp9; + pc5[1] = tmp11; + pc6[1] = tmp13; + pc7[1] = tmp15; +#else pc0[1] += tmp1; pc1[1] += tmp3; pc2[1] += tmp5; @@ -637,6 +762,24 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pc5[1] += tmp11; pc6[1] += tmp13; pc7[1] += tmp15; +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 2; // number of values in A +#else + temp -= 8; // number of values in B +#endif + pa0 += temp * 2; + pb0 += temp * 8; +#endif + +#ifdef LEFT + off += 2; // number of values in A +#endif +#endif pc0 += 2; pc1 += 2; @@ -648,9 +791,27 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pc7 += 2; } - for (i = (m & 1); i--;) + if (m & 1) { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; +#else + pa0 += off * 1; + pb0 = B + off * 8; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 1; // number of values in A +#else + temp = off + 8; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif a0 = pa0[0]; b0 = pb0[0]; @@ -680,7 +841,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pa0 += 1; pb0 += 8; - for (l = ((k - 1) / 2); l--;) + for (l = ((temp - 1) >> 1); l--;) { a0 = pa0[0]; b0 = 
pb0[0]; @@ -739,14 +900,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pb0 += 8; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { a0 = pa0[0]; b0 = pb0[0]; tmp0 += a0 * b0; b1 = pb0[1]; - tmp1 += a0 * b1; + tmp1 += a0 * b1; b2 = pb0[2]; tmp2 += a0 * b2; @@ -779,6 +940,16 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, tmp6 = alpha * tmp6; tmp7 = alpha * tmp7; +#if defined(TRMMKERNEL) + pc0[0] = tmp0; + pc1[0] = tmp1; + pc2[0] = tmp2; + pc3[0] = tmp3; + pc4[0] = tmp4; + pc5[0] = tmp5; + pc6[0] = tmp6; + pc7[0] = tmp7; +#else pc0[0] += tmp0; pc1[0] += tmp1; pc2[0] += tmp2; @@ -787,7 +958,24 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pc5[0] += tmp5; pc6[0] += tmp6; pc7[0] += tmp7; +#endif +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 1; // number of values in A +#else + temp -= 8; // number of values in B +#endif + pa0 += temp * 1; + pb0 += temp * 8; +#endif + +#ifdef LEFT + off += 1; // number of values in A +#endif +#endif pc0 += 1; pc1 += 1; pc2 += 1; @@ -798,13 +986,17 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pc7 += 1; } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 8; // number of values in A +#endif + l = (k << 3); B = B + l; i = (ldc << 3); C = C + i; } - for (j = ((n & 4) / 4); j--;) + if (n & 4) { pc0 = C; pc1 = pc0 + ldc; @@ -813,11 +1005,33 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pa0 = A; - for (i = (m / 8); i--;) +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + + for (i = (m >> 3); i--;) { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; +#else + pa0 += off * 8; + pb0 = B + off * 4; +#endif - LD_SP2(pa0, 4, src_a0, src_a1); +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) 
&& defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 8; // number of values in A +#else + temp = off + 4; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + + LD_SP2_INC(pa0, 4, src_a0, src_a1); src_b0 = LD_SP(pb0); src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); @@ -836,12 +1050,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, res6 = src_a0 * src_b; res7 = src_a1 * src_b; - pa0 += 8; pb0 += 4; - for (l = ((k - 1) / 2); l--;) + for (l = ((temp - 1) >> 1); l--;) { - LD_SP2(pa0, 4, src_a0, src_a1); + LD_SP2_INC(pa0, 4, src_a0, src_a1); src_b0 = LD_SP(pb0); src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); @@ -860,10 +1073,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, res6 += src_a0 * src_b; res7 += src_a1 * src_b; - pa0 += 8; pb0 += 4; - LD_SP2(pa0, 4, src_a0, src_a1); + LD_SP2_INC(pa0, 4, src_a0, src_a1); src_b0 = LD_SP(pb0); src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); @@ -882,13 +1094,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, res6 += src_a0 * src_b; res7 += src_a1 * src_b; - pa0 += 8; pb0 += 4; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { - LD_SP2(pa0, 4, src_a0, src_a1); + LD_SP2_INC(pa0, 4, src_a0, src_a1); src_b0 = LD_SP(pb0); src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); @@ -907,10 +1118,19 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, res6 += src_a0 * src_b; res7 += src_a1 * src_b; - pa0 += 8; pb0 += 4; } +#if defined(TRMMKERNEL) + dst0 = res0 * v_alpha; + dst1 = res1 * v_alpha; + dst2 = res2 * v_alpha; + dst3 = res3 * v_alpha; + dst4 = res4 * v_alpha; + dst5 = res5 * v_alpha; + dst6 = res6 * v_alpha; + dst7 = res7 * v_alpha; +#else LD_SP2(pc0, 4, dst0, dst1); LD_SP2(pc1, 4, dst2, dst3); LD_SP2(pc2, 4, dst4, dst5); @@ -924,21 +1144,52 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, dst5 += res5 * v_alpha; dst6 += res6 * v_alpha; 
dst7 += res7 * v_alpha; +#endif - ST_SP2(dst0, dst1, pc0, 4); - ST_SP2(dst2, dst3, pc1, 4); - ST_SP2(dst4, dst5, pc2, 4); - ST_SP2(dst6, dst7, pc3, 4); + ST_SP2_INC(dst0, dst1, pc0, 4); + ST_SP2_INC(dst2, dst3, pc1, 4); + ST_SP2_INC(dst4, dst5, pc2, 4); + ST_SP2_INC(dst6, dst7, pc3, 4); + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 8; // number of values in A +#else + temp -= 4; // number of values in B +#endif + pa0 += temp * 8; + pb0 += temp * 4; +#endif - pc0 += 8; - pc1 += 8; - pc2 += 8; - pc3 += 8; +#ifdef LEFT + off += 8; // number of values in A +#endif +#endif } - for (i = ((m & 4) / 4); i--;) + if (m & 4) { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 4; + pb0 = B + off * 4; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 4; // number of values in A +#else + temp = off + 4; // number of values in B +#endif +#else pb0 = B; + temp = k; +#endif src_a0 = LD_SP(pa0); src_b0 = LD_SP(pb0); @@ -958,7 +1209,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pa0 += 4; pb0 += 4; - for (l = ((k - 1) / 2); l--;) + for (l = ((temp - 1) >> 1); l--;) { src_a0 = LD_SP(pa0); src_b0 = LD_SP(pb0); @@ -997,7 +1248,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pb0 += 4; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { src_a0 = LD_SP(pa0); src_b0 = LD_SP(pb0); @@ -1017,7 +1268,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pa0 += 4; pb0 += 4; } - +#if defined(TRMMKERNEL) + dst0 = res0 * v_alpha; + dst1 = res1 * v_alpha; + dst2 = res2 * v_alpha; + dst3 = res3 * v_alpha; +#else dst0 = LD_SP(pc0); dst1 = LD_SP(pc1); dst2 = LD_SP(pc2); @@ -1027,21 +1283,55 @@ int CNAME(BLASLONG m, BLASLONG n, 
BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, dst1 += res1 * v_alpha; dst2 += res2 * v_alpha; dst3 += res3 * v_alpha; - +#endif ST_SP(dst0, pc0); ST_SP(dst1, pc1); ST_SP(dst2, pc2); ST_SP(dst3, pc3); +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 4; // number of values in A +#else + temp -= 4; // number of values in B +#endif + pa0 += temp * 4; + pb0 += temp * 4; +#endif + +#ifdef LEFT + off += 4; // number of values in A +#endif +#endif pc0 += 4; pc1 += 4; pc2 += 4; pc3 += 4; } - for (i = ((m & 2) / 2); i--;) + if (m & 2) { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; +#else + pa0 += off * 2; + pb0 = B + off * 4; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 2; // number of values in A +#else + temp = off + 4; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif a0 = pa0[0]; b0 = pb0[0]; @@ -1065,7 +1355,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pa0 += 2; pb0 += 4; - for (l = ((k - 1) / 2); l--;) + for (l = ((temp - 1) >> 1); l--;) { a0 = pa0[0]; b0 = pb0[0]; @@ -1112,7 +1402,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pb0 += 4; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { a0 = pa0[0]; b0 = pb0[0]; @@ -1142,20 +1432,50 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, tmp4 = alpha * tmp4; tmp6 = alpha * tmp6; +#if defined(TRMMKERNEL) + pc0[0] = tmp0; + pc1[0] = tmp2; + pc2[0] = tmp4; + pc3[0] = tmp6; +#else pc0[0] += tmp0; pc1[0] += tmp2; pc2[0] += tmp4; pc3[0] += tmp6; - +#endif tmp1 = alpha * tmp1; tmp3 = alpha * tmp3; tmp5 = alpha * tmp5; tmp7 = alpha * tmp7; +#if defined(TRMMKERNEL) + pc0[1] = tmp1; + pc1[1] = tmp3; + pc2[1] = tmp5; + pc3[1] = tmp7; +#else pc0[1] += 
tmp1; pc1[1] += tmp3; pc2[1] += tmp5; pc3[1] += tmp7; +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 2; // number of values in A +#else + temp -= 4; // number of values in B +#endif + pa0 += temp * 2; + pb0 += temp * 4; +#endif + +#ifdef LEFT + off += 2; // number of values in A +#endif +#endif pc0 += 2; pc1 += 2; @@ -1163,9 +1483,27 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pc3 += 2; } - for (i = (m & 1); i--;) + if (m & 1) { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; +#else + pa0 += off * 1; + pb0 = B + off * 4; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 1; // number of values in A +#else + temp = off + 4; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif a0 = pa0[0]; b0 = pb0[0]; @@ -1183,7 +1521,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pa0 += 1; pb0 += 4; - for (l = ((k - 1) / 2); l--;) + for (l = ((temp - 1) >> 1); l--;) { a0 = pa0[0]; b0 = pb0[0]; @@ -1218,7 +1556,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pb0 += 4; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { a0 = pa0[0]; b0 = pb0[0]; @@ -1242,35 +1580,84 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, tmp2 = alpha * tmp2; tmp3 = alpha * tmp3; +#if defined(TRMMKERNEL) + pc0[0] = tmp0; + pc1[0] = tmp1; + pc2[0] = tmp2; + pc3[0] = tmp3; +#else pc0[0] += tmp0; pc1[0] += tmp1; pc2[0] += tmp2; pc3[0] += tmp3; +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 1; // number of values in A +#else + temp -= 4; // number of values in B +#endif + pa0 += temp * 1; + 
pb0 += temp * 4; +#endif +#ifdef LEFT + off += 1; // number of values in A +#endif +#endif pc0 += 1; pc1 += 1; pc2 += 1; pc3 += 1; } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 4; // number of values in A +#endif + l = (k << 2); B = B + l; i = (ldc << 2); C = C + i; } - for (j = ((n & 2) / 2); j--;) + if (n & 2) { pc0 = C; pc1 = pc0 + ldc; pa0 = A; - for (i = (m / 8); i--;) +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + + for (i = (m >> 3); i--;) { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; +#else + pa0 += off * 8; + pb0 = B + off * 2; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 8; // number of values in A +#else + temp = off + 2; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif - LD_SP2(pa0, 4, src_a0, src_a1); + LD_SP2_INC(pa0, 4, src_a0, src_a1); src_b0[0] = pb0[0]; src_b0[1] = pb0[1]; @@ -1282,12 +1669,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, res2 = src_a0 * src_b; res3 = src_a1 * src_b; - pa0 += 8; pb0 += 2; - for (l = ((k - 1) / 2); l--;) + for (l = ((temp - 1) >> 1); l--;) { - LD_SP2(pa0, 4, src_a0, src_a1); + LD_SP2_INC(pa0, 4, src_a0, src_a1); src_b0[0] = pb0[0]; src_b0[1] = pb0[1]; @@ -1299,10 +1685,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, res2 += src_a0 * src_b; res3 += src_a1 * src_b; - pa0 += 8; pb0 += 2; - LD_SP2(pa0, 4, src_a0, src_a1); + LD_SP2_INC(pa0, 4, src_a0, src_a1); src_b0[0] = pb0[0]; src_b0[1] = pb0[1]; @@ -1314,13 +1699,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, res2 += src_a0 * src_b; res3 += src_a1 * src_b; - pa0 += 8; pb0 += 2; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { - LD_SP2(pa0, 4, src_a0, src_a1); + LD_SP2_INC(pa0, 4, src_a0, src_a1); src_b0[0] = pb0[0]; src_b0[1] = pb0[1]; @@ 
-1332,10 +1716,15 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, res2 += src_a0 * src_b; res3 += src_a1 * src_b; - pa0 += 8; pb0 += 2; } +#if defined(TRMMKERNEL) + dst0 = res0 * v_alpha; + dst1 = res1 * v_alpha; + dst2 = res2 * v_alpha; + dst3 = res3 * v_alpha; +#else LD_SP2(pc0, 4, dst0, dst1); LD_SP2(pc1, 4, dst2, dst3); @@ -1343,17 +1732,49 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, dst1 += res1 * v_alpha; dst2 += res2 * v_alpha; dst3 += res3 * v_alpha; +#endif + ST_SP2_INC(dst0, dst1, pc0, 4); + ST_SP2_INC(dst2, dst3, pc1, 4); + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 8; // number of values in A +#else + temp -= 2; // number of values in B +#endif + pa0 += temp * 8; + pb0 += temp * 2; +#endif - ST_SP2(dst0, dst1, pc0, 4); - ST_SP2(dst2, dst3, pc1, 4); - - pc0 += 8; - pc1 += 8; +#ifdef LEFT + off += 8; // number of values in A +#endif +#endif } - for (i = ((m & 4) / 4); i--;) + if (m & 4) { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 4; + pb0 = B + off * 2; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 4; // number of values in A +#else + temp = off + 2; // number of values in B +#endif +#else pb0 = B; + temp = k; +#endif src_a0 = LD_SP(pa0); src_b0[0] = pb0[0]; @@ -1368,7 +1789,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pa0 += 4; pb0 += 2; - for (l = ((k - 1) / 2); l--;) + for (l = ((temp - 1) >> 1); l--;) { src_a0 = LD_SP(pa0); src_b0[0] = pb0[0]; @@ -1397,7 +1818,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pb0 += 2; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { src_a0 = LD_SP(pa0); src_b0[0] = pb0[0]; @@ -1413,22 
+1834,60 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pb0 += 2; } +#if defined(TRMMKERNEL) + dst0 = res0 * v_alpha; + dst1 = res1 * v_alpha; +#else dst0 = LD_SP(pc0); dst1 = LD_SP(pc1); dst0 += res0 * v_alpha; dst1 += res1 * v_alpha; - +#endif ST_SP(dst0, pc0); ST_SP(dst1, pc1); +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 4; // number of values in A +#else + temp -= 2; // number of values in B +#endif + pa0 += temp * 4; + pb0 += temp * 2; +#endif + +#ifdef LEFT + off += 4; // number of values in A +#endif +#endif pc0 += 4; pc1 += 4; } - for (i = ((m & 2) / 2); i--;) + if (m & 2) { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; +#else + pa0 += off * 2; + pb0 = B + off * 2; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 2; // number of values in A +#else + temp = off + 2; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif a0 = pa0[0]; b0 = pb0[0]; @@ -1444,7 +1903,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pa0 += 2; pb0 += 2; - for (l = ((k - 1) / 2); l--;) + for (l = ((temp - 1) >> 1); l--;) { a0 = pa0[0]; b0 = pb0[0]; @@ -1475,7 +1934,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pb0 += 2; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { a0 = pa0[0]; b0 = pb0[0]; @@ -1493,24 +1952,64 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, } tmp0 = alpha * tmp0; + tmp1 = alpha * tmp1; tmp2 = alpha * tmp2; + tmp3 = alpha * tmp3; +#if defined(TRMMKERNEL) + pc0[0] = tmp0; + pc1[0] = tmp2; + pc0[1] = tmp1; + pc1[1] = tmp3; +#else pc0[0] += tmp0; pc1[0] += tmp2; - - tmp1 = alpha * tmp1; - tmp3 = alpha * tmp3; - pc0[1] += tmp1; pc1[1] += tmp3; +#endif + 
+#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 2; // number of values in A +#else + temp -= 2; // number of values in B +#endif + pa0 += temp * 2; + pb0 += temp * 2; +#endif + +#ifdef LEFT + off += 2; // number of values in A +#endif +#endif pc0 += 2; pc1 += 2; } - for (i = (m & 1); i--;) + if (m & 1) { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; +#else + pa0 += off * 1; + pb0 = B + off * 2; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 1; // number of values in A +#else + temp = off + 2; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif a0 = pa0[0]; b0 = pb0[0]; @@ -1522,7 +2021,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pa0 += 1; pb0 += 2; - for (l = ((k - 1) / 2); l--;) + for (l = ((temp - 1) >> 1); l--;) { a0 = pa0[0]; b0 = pb0[0]; @@ -1545,7 +2044,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pb0 += 2; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { a0 = pa0[0]; b0 = pb0[0]; @@ -1561,87 +2060,166 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, tmp0 = alpha * tmp0; tmp1 = alpha * tmp1; +#if defined(TRMMKERNEL) + pc0[0] = tmp0; + pc1[0] = tmp1; +#else pc0[0] += tmp0; pc1[0] += tmp1; +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 1; // number of values in A +#else + temp -= 2; // number of values in B +#endif + pa0 += temp * 1; + pb0 += temp * 2; +#endif +#ifdef LEFT + off += 1; // number of values in A +#endif +#endif pc0 += 1; pc1 += 1; } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 2; // number of values in A +#endif l = (k << 1); B = B + l; i = (ldc << 
1); C = C + i; } - for (j = (n & 1); j--;) + if (n & 1) { pc0 = C; pa0 = A; - for (i = (m / 8); i--;) +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + + for (i = (m >> 3); i--;) { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; +#else + pa0 += off * 8; + pb0 = B + off * 1; +#endif - LD_SP2(pa0, 4, src_a0, src_a1); +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 8; // number of values in A +#else + temp = off + 1; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + + LD_SP2_INC(pa0, 4, src_a0, src_a1); src_b0[0] = pb0[0]; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 = src_a0 * src_b; res1 = src_a1 * src_b; - pa0 += 8; pb0 += 1; - for (l = ((k - 1) / 2); l--;) + for (l = ((temp - 1) >> 1); l--;) { - LD_SP2(pa0, 4, src_a0, src_a1); + LD_SP2_INC(pa0, 4, src_a0, src_a1); src_b0[0] = pb0[0]; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 += src_a0 * src_b; res1 += src_a1 * src_b; - pa0 += 8; pb0 += 1; - LD_SP2(pa0, 4, src_a0, src_a1); + LD_SP2_INC(pa0, 4, src_a0, src_a1); src_b0[0] = pb0[0]; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 += src_a0 * src_b; res1 += src_a1 * src_b; - pa0 += 8; pb0 += 1; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { - LD_SP2(pa0, 4, src_a0, src_a1); + LD_SP2_INC(pa0, 4, src_a0, src_a1); src_b0[0] = pb0[0]; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 += src_a0 * src_b; res1 += src_a1 * src_b; - pa0 += 8; pb0 += 1; } +#if defined(TRMMKERNEL) + dst0 = res0 * v_alpha; + dst1 = res1 * v_alpha; +#else LD_SP2(pc0, 4, dst0, dst1); dst0 += res0 * v_alpha; dst1 += res1 * v_alpha; +#endif + ST_SP2_INC(dst0, dst1, pc0, 4); + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 8; // number of values in A +#else + temp -= 1; // number of values 
in B +#endif + pa0 += temp * 8; + pb0 += temp * 1; +#endif - ST_SP2(dst0, dst1, pc0, 4); - - pc0 += 8; +#ifdef LEFT + off += 8; // number of values in A +#endif +#endif } - for (i = ((m & 4) / 4); i--;) + if (m & 4) { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 4; + pb0 = B + off * 1; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 4; // number of values in A +#else + temp = off + 1; // number of values in B +#endif +#else pb0 = B; + temp = k; +#endif src_a0 = LD_SP(pa0); src_b0[0] = pb0[0]; @@ -1652,7 +2230,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pa0 += 4; pb0 += 1; - for (l = ((k - 1) / 2); l--;) + for (l = ((temp - 1) >> 1); l--;) { src_a0 = LD_SP(pa0); src_b0[0] = pb0[0]; @@ -1673,7 +2251,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pb0 += 1; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { src_a0 = LD_SP(pa0); src_b0[0] = pb0[0]; @@ -1685,18 +2263,55 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pb0 += 1; } +#if defined(TRMMKERNEL) + dst0 = res0 * v_alpha; +#else dst0 = LD_SP(pc0); dst0 += res0 * v_alpha; - +#endif ST_SP(dst0, pc0); +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 4; // number of values in A +#else + temp -= 1; // number of values in B +#endif + pa0 += temp * 4; + pb0 += temp * 1; +#endif + +#ifdef LEFT + off += 4; // number of values in A +#endif +#endif pc0 += 4; } - for (i = (m & 2) / 2; i--;) + if (m & 2) { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2; + pb0 = B + off * 1; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && 
defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 2; // number of values in A +#else + temp = off + 1; // number of values in B +#endif +#else pb0 = B; + temp = k; +#endif a0 = pa0[0]; b0 = pb0[0]; @@ -1708,7 +2323,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pa0 += 2; pb0 += 1; - for (l = ((k - 1) / 2); l--;) + for (l = ((temp - 1) >> 1); l--;) { a0 = pa0[0]; b0 = pb0[0]; @@ -1731,7 +2346,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pb0 += 1; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { a0 = pa0[0]; b0 = pb0[0]; @@ -1744,18 +2359,55 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pb0 += 1; } - tmp0 = alpha * tmp0; +#if defined(TRMMKERNEL) + pc0[0] = tmp0; + pc0[1] = tmp1; +#else pc0[0] += tmp0; - - tmp1 = alpha * tmp1; pc0[1] += tmp1; +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 2; // number of values in A +#else + temp -= 1; // number of values in B +#endif + pa0 += temp * 2; + pb0 += temp * 1; +#endif + +#ifdef LEFT + off += 2; // number of values in A +#endif +#endif pc0 += 2; } - for (i = (m & 1); i--;) + if (m & 1) { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; +#else + pa0 += off * 1; + pb0 = B + off * 1; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 1; // number of values in A +#else + temp = off + 1; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif a0 = pa0[0]; b0 = pb0[0]; @@ -1764,7 +2416,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pa0 += 1; pb0 += 1; - for (l = ((k - 1) / 2); l--;) + for (l = ((temp - 1) >> 1); l--;) { a0 = pa0[0]; b0 = pb0[0]; @@ -1781,7 +2433,7 @@ int CNAME(BLASLONG m, 
BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pb0 += 1; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { a0 = pa0[0]; b0 = pb0[0]; @@ -1791,11 +2443,35 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pb0 += 1; } +#if defined(TRMMKERNEL) + pc0[0] = alpha * tmp0; +#else pc0[0] += alpha * tmp0; +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 1; // number of values in A +#else + temp -= 1; // number of values in B +#endif + pa0 += temp * 1; + pb0 += temp * 1; +#endif + +#ifdef LEFT + off += 1; // number of values in A +#endif +#endif pc0 += 1; } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 1; // number of values in A +#endif l = (k << 0); B = B + l; i = (ldc << 0); diff --git a/kernel/mips/sgemm_ncopy_8_msa.c b/kernel/mips/sgemm_ncopy_8_msa.c index 71048f1c3..8618c4435 100644 --- a/kernel/mips/sgemm_ncopy_8_msa.c +++ b/kernel/mips/sgemm_ncopy_8_msa.c @@ -28,14 +28,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #include "macros_msa.h" -int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, - FLOAT * __restrict dst) +int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst) { BLASLONG i, j; - FLOAT *psrc0; - FLOAT *psrc1, *psrc2, *psrc3, *psrc4; - FLOAT *psrc5, *psrc6, *psrc7, *psrc8; - FLOAT *pdst; + FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *psrc5, *psrc6, *psrc7; + FLOAT *psrc8, *pdst; v4f32 src0, src1, src2, src3, src4, src5, src6, src7; v4f32 src8, src9, src10, src11, src12, src13, src14, src15; v4f32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; @@ -58,22 +55,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, for (i = (m >> 3); i--;) { - LD_SP2(psrc1, 4, src0, src1); - LD_SP2(psrc2, 4, src2, src3); - LD_SP2(psrc3, 4, src4, src5); - LD_SP2(psrc4, 4, src6, src7); - LD_SP2(psrc5, 4, src8, src9); - LD_SP2(psrc6, 4, src10, src11); - LD_SP2(psrc7, 4, src12, src13); - LD_SP2(psrc8, 4, src14, src15); - psrc1 += 8; - psrc2 += 8; - psrc3 += 8; - psrc4 += 8; - psrc5 += 8; - psrc6 += 8; - psrc7 += 8; - psrc8 += 8; + LD_SP2_INC(psrc1, 4, src0, src1); + LD_SP2_INC(psrc2, 4, src2, src3); + LD_SP2_INC(psrc3, 4, src4, src5); + LD_SP2_INC(psrc4, 4, src6, src7); + LD_SP2_INC(psrc5, 4, src8, src9); + LD_SP2_INC(psrc6, 4, src10, src11); + LD_SP2_INC(psrc7, 4, src12, src13); + LD_SP2_INC(psrc8, 4, src14, src15); TRANSPOSE4x4_SP_SP(src0, src2, src4, src6, dst0, dst2, dst4, dst6); TRANSPOSE4x4_SP_SP(src8, src10, src12, src14, dst1, dst3, dst5, @@ -83,15 +72,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, TRANSPOSE4x4_SP_SP(src9, src11, src13, src15, dst9, dst11, dst13, dst15); - ST_SP2(dst0, dst1, pdst, 4); - ST_SP2(dst2, dst3, pdst + 8, 4); - ST_SP2(dst4, dst5, pdst + 16, 4); - ST_SP2(dst6, dst7, pdst + 24, 4); - ST_SP2(dst8, dst9, pdst + 32, 4); - ST_SP2(dst10, dst11, pdst + 40, 4); - ST_SP2(dst12, dst13, pdst + 48, 4); - ST_SP2(dst14, dst15, pdst + 56, 4); - pdst += 64; + 
ST_SP2_INC(dst0, dst1, pdst, 4); + ST_SP2_INC(dst2, dst3, pdst, 4); + ST_SP2_INC(dst4, dst5, pdst, 4); + ST_SP2_INC(dst6, dst7, pdst, 4); + ST_SP2_INC(dst8, dst9, pdst, 4); + ST_SP2_INC(dst10, dst11, pdst, 4); + ST_SP2_INC(dst12, dst13, pdst, 4); + ST_SP2_INC(dst14, dst15, pdst, 4); } for (i = (m & 7); i--;) @@ -128,9 +116,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, TRANSPOSE4x4_SP_SP(src0, src1, src2, src3, dst0, dst1, dst2, dst3); - ST_SP2(dst0, dst1, pdst, 4); - ST_SP2(dst2, dst3, pdst + 8, 4); - pdst += 16; + ST_SP2_INC(dst0, dst1, pdst, 4); + ST_SP2_INC(dst2, dst3, pdst, 4); } for (i = (m & 3); i--;) diff --git a/kernel/mips/sgemm_tcopy_8_msa.c b/kernel/mips/sgemm_tcopy_8_msa.c index 7d4aecb4b..3542eca21 100644 --- a/kernel/mips/sgemm_tcopy_8_msa.c +++ b/kernel/mips/sgemm_tcopy_8_msa.c @@ -28,14 +28,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include "macros_msa.h" -int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, - FLOAT * __restrict dst) +int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst) { BLASLONG i, j; - FLOAT *psrc0; - FLOAT *psrc1, *psrc2, *psrc3, *psrc4; - FLOAT *psrc5, *psrc6, *psrc7, *psrc8; - FLOAT *pdst0, *pdst1, *pdst2, *pdst3, *pdst4; + FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *psrc5, *psrc6, *psrc7; + FLOAT *psrc8, *pdst0, *pdst1, *pdst2, *pdst3, *pdst4; v4f32 src0, src1, src2, src3, src4, src5, src6, src7; v4f32 src8, src9, src10, src11, src12, src13, src14, src15; @@ -63,22 +60,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, for (i = (n >> 3); i--;) { - LD_SP2(psrc1, 4, src0, src1); - LD_SP2(psrc2, 4, src2, src3); - LD_SP2(psrc3, 4, src4, src5); - LD_SP2(psrc4, 4, src6, src7); - LD_SP2(psrc5, 4, src8, src9); - LD_SP2(psrc6, 4, src10, src11); - LD_SP2(psrc7, 4, src12, src13); - LD_SP2(psrc8, 4, src14, src15); - psrc1 += 8; - psrc2 += 8; - psrc3 += 8; - psrc4 += 8; - psrc5 += 8; - 
psrc6 += 8; - psrc7 += 8; - psrc8 += 8; + LD_SP2_INC(psrc1, 4, src0, src1); + LD_SP2_INC(psrc2, 4, src2, src3); + LD_SP2_INC(psrc3, 4, src4, src5); + LD_SP2_INC(psrc4, 4, src6, src7); + LD_SP2_INC(psrc5, 4, src8, src9); + LD_SP2_INC(psrc6, 4, src10, src11); + LD_SP2_INC(psrc7, 4, src12, src13); + LD_SP2_INC(psrc8, 4, src14, src15); ST_SP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 4); ST_SP8(src8, src9, src10, src11, src12, src13, src14, src15, @@ -105,8 +94,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, psrc7 += 4; psrc8 += 4; - ST_SP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst2, 4); - pdst2 += 32; + ST_SP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst2, 4); } if (n & 2) @@ -155,14 +143,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, for (i = (n >> 3); i--;) { - LD_SP2(psrc1, 4, src0, src1); - LD_SP2(psrc2, 4, src2, src3); - LD_SP2(psrc3, 4, src4, src5); - LD_SP2(psrc4, 4, src6, src7); - psrc1 += 8; - psrc2 += 8; - psrc3 += 8; - psrc4 += 8; + LD_SP2_INC(psrc1, 4, src0, src1); + LD_SP2_INC(psrc2, 4, src2, src3); + LD_SP2_INC(psrc3, 4, src4, src5); + LD_SP2_INC(psrc4, 4, src6, src7); ST_SP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 4); pdst1 += 8 * m; @@ -179,8 +163,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, psrc3 += 4; psrc4 += 4; - ST_SP4(src0, src1, src2, src3, pdst2, 4); - pdst2 += 16; + ST_SP4_INC(src0, src1, src2, src3, pdst2, 4); } if (n & 2) @@ -215,10 +198,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, for (i = (n >> 3); i--;) { - LD_SP2(psrc1, 4, src0, src1); - LD_SP2(psrc2, 4, src2, src3); - psrc1 += 8; - psrc2 += 8; + LD_SP2_INC(psrc1, 4, src0, src1); + LD_SP2_INC(psrc2, 4, src2, src3); ST_SP4(src0, src1, src2, src3, pdst1, 4); pdst1 += 8 * m; @@ -231,8 +212,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, psrc1 += 4; psrc2 += 4; - ST_SP2(src0, src1, pdst2, 
4); - pdst2 += 8; + ST_SP2_INC(src0, src1, pdst2, 4); } if (n & 2) @@ -260,8 +240,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, for (i = (n >> 3); i--;) { - LD_SP2(psrc1, 4, src0, src1); - psrc1 += 8; + LD_SP2_INC(psrc1, 4, src0, src1); ST_SP2(src0, src1, pdst1, 4); pdst1 += 8 * m; @@ -288,5 +267,5 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, } } - return 0; + return 0; } diff --git a/kernel/mips/strsm_kernel_LN_8x8_msa.c b/kernel/mips/strsm_kernel_LN_8x8_msa.c index 516b9752f..53891e64f 100644 --- a/kernel/mips/strsm_kernel_LN_8x8_msa.c +++ b/kernel/mips/strsm_kernel_LN_8x8_msa.c @@ -166,7 +166,7 @@ static void ssolve_8x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a = LD_SP(a + 32); SPLATI_W4_SP(src_a, src_a32, src_a33, src_a34, src_a35); - COPY_FLOAT_TO_VECTOR(*(a + 36), src_a36); + src_a36 = COPY_FLOAT_TO_VECTOR(*(a + 36)); res_c4 *= src_a36; res_c12 *= src_a36; @@ -220,9 +220,9 @@ static void ssolve_8x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c0 -= res_c2 * src_a16; res_c8 -= res_c10 * src_a16; - COPY_FLOAT_TO_VECTOR(*(a + 9), src_a9); - COPY_FLOAT_TO_VECTOR(*(a + 8), src_a8); - COPY_FLOAT_TO_VECTOR(*(a + 0), src_a0); + src_a9 = COPY_FLOAT_TO_VECTOR(*(a + 9)); + src_a8 = COPY_FLOAT_TO_VECTOR(*(a + 8)); + src_a0 = COPY_FLOAT_TO_VECTOR(*(a + 0)); res_c1 *= src_a9; res_c9 *= src_a9; @@ -306,7 +306,7 @@ static void ssolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO bb += 4; } - if (bk & 1) + if ((bk & 1) && (bk > 0)) { LD_SP2(aa, 4, src_a0, src_a1); @@ -374,7 +374,7 @@ static void ssolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a = LD_SP(a + 32); SPLATI_W4_SP(src_a, src_a32, src_a33, src_a34, src_a35); - COPY_FLOAT_TO_VECTOR(*(a + 36), src_a36); + src_a36 = COPY_FLOAT_TO_VECTOR(*(a + 36)); res_c4 *= src_a36; res_c3 -= res_c4 * src_a35; @@ -399,9 +399,9 @@ static void ssolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG 
ldc, BLASLO res_c1 -= res_c2 * src_a17; res_c0 -= res_c2 * src_a16; - COPY_FLOAT_TO_VECTOR(*(a + 9), src_a9); - COPY_FLOAT_TO_VECTOR(*(a + 8), src_a8); - COPY_FLOAT_TO_VECTOR(*(a + 0), src_a0); + src_a9 = COPY_FLOAT_TO_VECTOR(*(a + 9)); + src_a8 = COPY_FLOAT_TO_VECTOR(*(a + 8)); + src_a0 = COPY_FLOAT_TO_VECTOR(*(a + 0)); res_c1 *= src_a9; res_c0 -= res_c1 * src_a8; @@ -826,9 +826,9 @@ static void ssolve_4x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a9 = (v4f32) __msa_splati_w((v4i32) src_a8, 1); src_a8 = (v4f32) __msa_splati_w((v4i32) src_a8, 0); - COPY_FLOAT_TO_VECTOR(*(a + 5), src_a5); - COPY_FLOAT_TO_VECTOR(*(a + 4), src_a4); - COPY_FLOAT_TO_VECTOR(*(a + 0), src_a0); + src_a5 = COPY_FLOAT_TO_VECTOR(*(a + 5)); + src_a4 = COPY_FLOAT_TO_VECTOR(*(a + 4)); + src_a0 = COPY_FLOAT_TO_VECTOR(*(a + 0)); res_c3 *= src_a15; res_c7 *= src_a15; @@ -916,7 +916,7 @@ static void ssolve_4x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO bb += 4; } - if (bk & 1) + if ((bk & 1) && (bk > 0)) { src_a0 = LD_SP(aa); @@ -940,9 +940,9 @@ static void ssolve_4x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a10 = (v4f32) __msa_splati_w((v4i32) src_a8, 2); src_a9 = (v4f32) __msa_splati_w((v4i32) src_a8, 1); src_a8 = (v4f32) __msa_splati_w((v4i32) src_a8, 0); - COPY_FLOAT_TO_VECTOR(*(a + 5), src_a5); - COPY_FLOAT_TO_VECTOR(*(a + 4), src_a4); - COPY_FLOAT_TO_VECTOR(*(a + 0), src_a0); + src_a5 = COPY_FLOAT_TO_VECTOR(*(a + 5)); + src_a4 = COPY_FLOAT_TO_VECTOR(*(a + 4)); + src_a0 = COPY_FLOAT_TO_VECTOR(*(a + 0)); res_c3 *= src_a15; res_c2 -= res_c3 * src_a14; diff --git a/kernel/mips/strsm_kernel_LT_8x8_msa.c b/kernel/mips/strsm_kernel_LT_8x8_msa.c index c087fdae5..5834d77b2 100644 --- a/kernel/mips/strsm_kernel_LT_8x8_msa.c +++ b/kernel/mips/strsm_kernel_LT_8x8_msa.c @@ -162,7 +162,7 @@ static void ssolve_8x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a = LD_SP(a + 27); SPLATI_W4_SP(src_a, src_a27, src_a28, src_a29, src_a30); - 
COPY_FLOAT_TO_VECTOR(*(a + 31), src_a31); + src_a31 = COPY_FLOAT_TO_VECTOR(*(a + 31)); res_c3 *= src_a27; res_c11 *= src_a27; @@ -216,9 +216,9 @@ static void ssolve_8x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c7 -= res_c5 * src_a47; res_c15 -= res_c13 * src_a47; - COPY_FLOAT_TO_VECTOR(*(a + 54), src_a54); - COPY_FLOAT_TO_VECTOR(*(a + 55), src_a55); - COPY_FLOAT_TO_VECTOR(*(a + 63), src_a63); + src_a54 = COPY_FLOAT_TO_VECTOR(*(a + 54)); + src_a55 = COPY_FLOAT_TO_VECTOR(*(a + 55)); + src_a63 = COPY_FLOAT_TO_VECTOR(*(a + 63)); res_c6 *= src_a54; res_c14 *= src_a54; @@ -334,7 +334,7 @@ static void ssolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a = LD_SP(a + 27); SPLATI_W4_SP(src_a, src_a27, src_a28, src_a29, src_a30); - COPY_FLOAT_TO_VECTOR(*(a + 31), src_a31); + src_a31 = COPY_FLOAT_TO_VECTOR(*(a + 31)); res_c3 *= src_a27; res_c4 -= res_c3 * src_a28; @@ -359,9 +359,9 @@ static void ssolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c6 -= res_c5 * src_a46; res_c7 -= res_c5 * src_a47; - COPY_FLOAT_TO_VECTOR(*(a + 54), src_a54); - COPY_FLOAT_TO_VECTOR(*(a + 55), src_a55); - COPY_FLOAT_TO_VECTOR(*(a + 63), src_a63); + src_a54 = COPY_FLOAT_TO_VECTOR(*(a + 54)); + src_a55 = COPY_FLOAT_TO_VECTOR(*(a + 55)); + src_a63 = COPY_FLOAT_TO_VECTOR(*(a + 63)); res_c6 *= src_a54; res_c7 -= res_c6 * src_a55; @@ -780,7 +780,7 @@ static void ssolve_4x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO b += 8; } - if (bk & 1) + if ((bk & 1) && (bk > 0)) { src_a0 = LD_SP(a); @@ -813,9 +813,9 @@ static void ssolve_4x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a7 = (v4f32) __msa_splati_w((v4i32) src_a5, 2); src_a6 = (v4f32) __msa_splati_w((v4i32) src_a5, 1); src_a5 = (v4f32) __msa_splati_w((v4i32) src_a5, 0); - COPY_FLOAT_TO_VECTOR(*(a + 10), src_a10); - COPY_FLOAT_TO_VECTOR(*(a + 11), src_a11); - COPY_FLOAT_TO_VECTOR(*(a + 15), src_a15); + src_a10 = COPY_FLOAT_TO_VECTOR(*(a + 10)); + src_a11 = 
COPY_FLOAT_TO_VECTOR(*(a + 11)); + src_a15 = COPY_FLOAT_TO_VECTOR(*(a + 15)); res_c0 *= src_a0; res_c4 *= src_a0; @@ -902,7 +902,7 @@ static void ssolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO b += 4; } - if (bk & 1) + if ((bk & 1) && (bk > 0)) { src_a0 = LD_SP(a); @@ -926,9 +926,9 @@ static void ssolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a7 = (v4f32) __msa_splati_w((v4i32) src_a5, 2); src_a6 = (v4f32) __msa_splati_w((v4i32) src_a5, 1); src_a5 = (v4f32) __msa_splati_w((v4i32) src_a5, 0); - COPY_FLOAT_TO_VECTOR(*(a + 10), src_a10); - COPY_FLOAT_TO_VECTOR(*(a + 11), src_a11); - COPY_FLOAT_TO_VECTOR(*(a + 15), src_a15); + src_a10 = COPY_FLOAT_TO_VECTOR(*(a + 10)); + src_a11 = COPY_FLOAT_TO_VECTOR(*(a + 11)); + src_a15 = COPY_FLOAT_TO_VECTOR(*(a + 15)); res_c0 *= src_a0; res_c1 -= res_c0 * src_a1; diff --git a/kernel/mips/strsm_kernel_RN_8x8_msa.c b/kernel/mips/strsm_kernel_RN_8x8_msa.c index 69d7b5f72..642ee3757 100644 --- a/kernel/mips/strsm_kernel_RN_8x8_msa.c +++ b/kernel/mips/strsm_kernel_RN_8x8_msa.c @@ -144,7 +144,7 @@ static void ssolve_8x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b = LD_SP(b + 27); SPLATI_W4_SP(src_b, src_b27, src_b28, src_b29, src_b30); - COPY_FLOAT_TO_VECTOR(*(b + 31), src_b31); + src_b31 = COPY_FLOAT_TO_VECTOR(*(b + 31)); src_c4 *= src_b18; src_c5 *= src_b18; @@ -184,9 +184,9 @@ static void ssolve_8x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b46 = (v4f32) __msa_splati_w((v4i32) src_b45, 1); src_b45 = (v4f32) __msa_splati_w((v4i32) src_b45, 0); - COPY_FLOAT_TO_VECTOR(*(b + 54), src_b54); - COPY_FLOAT_TO_VECTOR(*(b + 55), src_b55); - COPY_FLOAT_TO_VECTOR(*(b + 63), src_b63); + src_b54 = COPY_FLOAT_TO_VECTOR(*(b + 54)); + src_b55 = COPY_FLOAT_TO_VECTOR(*(b + 55)); + src_b63 = COPY_FLOAT_TO_VECTOR(*(b + 63)); src_c8 *= src_b36; src_c9 *= src_b36; @@ -275,7 +275,7 @@ static void ssolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO b += 
4; } - if (bk & 1) + if ((bk & 1) && (bk > 0)) { LD_SP2(a, 4, src_a0, src_a1); @@ -300,9 +300,9 @@ static void ssolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b7 = (v4f32) __msa_splati_w((v4i32) src_b5, 2); src_b6 = (v4f32) __msa_splati_w((v4i32) src_b5, 1); src_b5 = (v4f32) __msa_splati_w((v4i32) src_b5, 0); - COPY_FLOAT_TO_VECTOR(*(b + 10), src_b10); - COPY_FLOAT_TO_VECTOR(*(b + 11), src_b11); - COPY_FLOAT_TO_VECTOR(*(b + 15), src_b15); + src_b10 = COPY_FLOAT_TO_VECTOR(*(b + 10)); + src_b11 = COPY_FLOAT_TO_VECTOR(*(b + 11)); + src_b15 = COPY_FLOAT_TO_VECTOR(*(b + 15)); src_c0 *= src_b0; src_c1 *= src_b0; @@ -351,8 +351,8 @@ static void ssolve_8x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO { LD_SP2(a, 4, src_a0, src_a1); - COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); - COPY_FLOAT_TO_VECTOR(*(b + 1), src_b1); + src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); + src_b1 = COPY_FLOAT_TO_VECTOR(*(b + 1)); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; @@ -364,8 +364,8 @@ static void ssolve_8x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO LD_SP2(a, 4, src_a0, src_a1); - COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); - COPY_FLOAT_TO_VECTOR(*(b + 1), src_b1); + src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); + src_b1 = COPY_FLOAT_TO_VECTOR(*(b + 1)); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; @@ -376,12 +376,12 @@ static void ssolve_8x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO b += 2; } - if (bk & 1) + if ((bk & 1) && (bk > 0)) { LD_SP2(a, 4, src_a0, src_a1); - COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); - COPY_FLOAT_TO_VECTOR(*(b + 1), src_b1); + src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); + src_b1 = COPY_FLOAT_TO_VECTOR(*(b + 1)); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; @@ -392,9 +392,9 @@ static void ssolve_8x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO b += 2; } - COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); - COPY_FLOAT_TO_VECTOR(*(b + 1), src_b1); - COPY_FLOAT_TO_VECTOR(*(b + 3), src_b3); 
+ src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); + src_b1 = COPY_FLOAT_TO_VECTOR(*(b + 1)); + src_b3 = COPY_FLOAT_TO_VECTOR(*(b + 3)); src_c0 *= src_b0; src_c1 *= src_b0; @@ -419,7 +419,7 @@ static void ssolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO { LD_SP2(a, 4, src_a0, src_a1); - COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); + src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; @@ -429,7 +429,7 @@ static void ssolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO LD_SP2(a, 4, src_a0, src_a1); - COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); + src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; @@ -439,7 +439,7 @@ static void ssolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO LD_SP2(a, 4, src_a0, src_a1); - COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); + src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; @@ -449,7 +449,7 @@ static void ssolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO LD_SP2(a, 4, src_a0, src_a1); - COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); + src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; @@ -458,13 +458,13 @@ static void ssolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO b += 1; } - if (bk & 3) + if ((bk & 3) && (bk > 0)) { if (bk & 2) { LD_SP2(a, 4, src_a0, src_a1); - COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); + src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; @@ -474,7 +474,7 @@ static void ssolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO LD_SP2(a, 4, src_a0, src_a1); - COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); + src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; @@ -487,7 +487,7 @@ static void ssolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO { LD_SP2(a, 4, src_a0, src_a1); - 
COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); + src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; @@ -497,7 +497,7 @@ static void ssolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO } } - COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); + src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); src_c0 *= src_b0; src_c1 *= src_b0; @@ -574,7 +574,7 @@ static void ssolve_4x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b = LD_SP(b + 27); SPLATI_W4_SP(src_b, src_b27, src_b28, src_b29, src_b30); - COPY_FLOAT_TO_VECTOR(*(b + 31), src_b31); + src_b31 = COPY_FLOAT_TO_VECTOR(*(b + 31)); src_b = LD_SP(b + 36); SPLATI_W4_SP(src_b, src_b36, src_b37, src_b38, src_b39); @@ -584,9 +584,9 @@ static void ssolve_4x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b46 = (v4f32) __msa_splati_w((v4i32) src_b45, 1); src_b45 = (v4f32) __msa_splati_w((v4i32) src_b45, 0); - COPY_FLOAT_TO_VECTOR(*(b + 54), src_b54); - COPY_FLOAT_TO_VECTOR(*(b + 55), src_b55); - COPY_FLOAT_TO_VECTOR(*(b + 63), src_b63); + src_b54 = COPY_FLOAT_TO_VECTOR(*(b + 54)); + src_b55 = COPY_FLOAT_TO_VECTOR(*(b + 55)); + src_b63 = COPY_FLOAT_TO_VECTOR(*(b + 63)); src_c0 *= src_b0; src_c1 -= src_c0 * src_b1; @@ -686,7 +686,7 @@ static void ssolve_4x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO b += 4; } - if (bk & 1) + if ((bk & 1) && (bk > 0)) { src_a0 = LD_SP(a); @@ -707,9 +707,9 @@ static void ssolve_4x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b7 = (v4f32) __msa_splati_w((v4i32) src_b5, 2); src_b6 = (v4f32) __msa_splati_w((v4i32) src_b5, 1); src_b5 = (v4f32) __msa_splati_w((v4i32) src_b5, 0); - COPY_FLOAT_TO_VECTOR(*(b + 10), src_b10); - COPY_FLOAT_TO_VECTOR(*(b + 11), src_b11); - COPY_FLOAT_TO_VECTOR(*(b + 15), src_b15); + src_b10 = COPY_FLOAT_TO_VECTOR(*(b + 10)); + src_b11 = COPY_FLOAT_TO_VECTOR(*(b + 11)); + src_b15 = COPY_FLOAT_TO_VECTOR(*(b + 15)); src_c0 *= src_b0; src_c1 -= src_c0 * src_b1; @@ -789,7 +789,7 @@ 
static void ssolve_4x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO b += 2; } - if (bk & 3) + if ((bk & 3) && (bk > 0)) { if (bk & 2) { @@ -831,9 +831,9 @@ static void ssolve_4x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO } } - COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); - COPY_FLOAT_TO_VECTOR(*(b + 1), src_b1); - COPY_FLOAT_TO_VECTOR(*(b + 3), src_b3); + src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); + src_b1 = COPY_FLOAT_TO_VECTOR(*(b + 1)); + src_b3 = COPY_FLOAT_TO_VECTOR(*(b + 3)); src_c0 *= src_b0; src_c1 -= src_c0 * src_b1; diff --git a/kernel/mips/strsm_kernel_RT_8x8_msa.c b/kernel/mips/strsm_kernel_RT_8x8_msa.c index eefd3a665..21e41c8fb 100644 --- a/kernel/mips/strsm_kernel_RT_8x8_msa.c +++ b/kernel/mips/strsm_kernel_RT_8x8_msa.c @@ -158,7 +158,7 @@ static void ssolve_8x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b = LD_SP(b + 32); SPLATI_W4_SP(src_b, src_b32, src_b33, src_b34, src_b35); - COPY_FLOAT_TO_VECTOR(*(b + 36), src_b36); + src_b36 = COPY_FLOAT_TO_VECTOR(*(b + 36)); src_c8 *= src_b36; src_c9 *= src_b36; @@ -203,9 +203,9 @@ static void ssolve_8x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO ST_SP2(src_c4, src_c5, c_nxt2line, 4); ST_SP2(src_c6, src_c7, c_nxt3line, 4); - COPY_FLOAT_TO_VECTOR(*(b + 9), src_b9); - COPY_FLOAT_TO_VECTOR(*(b + 8), src_b8); - COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); + src_b9 = COPY_FLOAT_TO_VECTOR(*(b + 9)); + src_b8 = COPY_FLOAT_TO_VECTOR(*(b + 8)); + src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); src_c2 *= src_b9; src_c3 *= src_b9; @@ -273,7 +273,7 @@ static void ssolve_8x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO bb += 4; } - if (bk & 1) + if ((bk & 1) && (bk > 0)) { LD_SP2(aa, 4, src_a0, src_a1); @@ -298,9 +298,9 @@ static void ssolve_8x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b10 = (v4f32) __msa_splati_w((v4i32) src_b8, 2); src_b9 = (v4f32) __msa_splati_w((v4i32) src_b8, 1); src_b8 = (v4f32) __msa_splati_w((v4i32) src_b8, 0); - 
COPY_FLOAT_TO_VECTOR(*(b + 5), src_b5); - COPY_FLOAT_TO_VECTOR(*(b + 4), src_b4); - COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); + src_b5 = COPY_FLOAT_TO_VECTOR(*(b + 5)); + src_b4 = COPY_FLOAT_TO_VECTOR(*(b + 4)); + src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); src_c7 *= src_b15; src_c6 *= src_b15; @@ -350,8 +350,8 @@ static void ssolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO { LD_SP2(aa, 4, src_a0, src_a1); - COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0); - COPY_FLOAT_TO_VECTOR(*(bb + 1), src_b1); + src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0)); + src_b1 = COPY_FLOAT_TO_VECTOR(*(bb + 1)); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; @@ -363,8 +363,8 @@ static void ssolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO LD_SP2(aa, 4, src_a0, src_a1); - COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0); - COPY_FLOAT_TO_VECTOR(*(bb + 1), src_b1); + src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0)); + src_b1 = COPY_FLOAT_TO_VECTOR(*(bb + 1)); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; @@ -375,12 +375,12 @@ static void ssolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO bb += 2; } - if (bk & 1) + if ((bk & 1) && (bk > 0)) { LD_SP2(aa, 4, src_a0, src_a1); - COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0); - COPY_FLOAT_TO_VECTOR(*(bb + 1), src_b1); + src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0)); + src_b1 = COPY_FLOAT_TO_VECTOR(*(bb + 1)); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; @@ -391,9 +391,9 @@ static void ssolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO a -= 16; b -= 4; - COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); - COPY_FLOAT_TO_VECTOR(*(b + 2), src_b2); - COPY_FLOAT_TO_VECTOR(*(b + 3), src_b3); + src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); + src_b2 = COPY_FLOAT_TO_VECTOR(*(b + 2)); + src_b3 = COPY_FLOAT_TO_VECTOR(*(b + 3)); src_c2 *= src_b3; src_c3 *= src_b3; @@ -419,7 +419,7 @@ static void ssolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { LD_SP2(aa, 4, src_a0, src_a1); - COPY_FLOAT_TO_VECTOR(*(bb + 0), 
src_b0); + src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0)); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; @@ -429,7 +429,7 @@ static void ssolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) LD_SP2(aa, 4, src_a0, src_a1); - COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0); + src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0)); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; @@ -439,7 +439,7 @@ static void ssolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) LD_SP2(aa, 4, src_a0, src_a1); - COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0); + src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0)); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; @@ -449,7 +449,7 @@ static void ssolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) LD_SP2(aa, 4, src_a0, src_a1); - COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0); + src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0)); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; @@ -458,13 +458,13 @@ static void ssolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) bb += 1; } - if (bk & 3) + if ((bk & 3) && (bk > 0)) { if (bk & 2) { LD_SP2(aa, 4, src_a0, src_a1); - COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0); + src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0)); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; @@ -474,7 +474,7 @@ static void ssolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) LD_SP2(aa, 4, src_a0, src_a1); - COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0); + src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0)); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; @@ -487,7 +487,7 @@ static void ssolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { LD_SP2(aa, 4, src_a0, src_a1); - COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0); + src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0)); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; @@ -497,7 +497,7 @@ static void ssolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) a -= 8; b -= 1; - COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); + src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); src_c0 *= src_b0; src_c1 *= src_b0; 
@@ -579,7 +579,7 @@ static void ssolve_4x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b = LD_SP(b + 32); SPLATI_W4_SP(src_b, src_b32, src_b33, src_b34, src_b35); - COPY_FLOAT_TO_VECTOR(*(b + 36), src_b36); + src_b36 = COPY_FLOAT_TO_VECTOR(*(b + 36)); src_b = LD_SP(b + 24); SPLATI_W4_SP(src_b, src_b24, src_b25, src_b26, src_b27); @@ -589,9 +589,9 @@ static void ssolve_4x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b17 = (v4f32) __msa_splati_w((v4i32) src_b16, 1); src_b16 = (v4f32) __msa_splati_w((v4i32) src_b16, 0); - COPY_FLOAT_TO_VECTOR(*(b + 9), src_b9); - COPY_FLOAT_TO_VECTOR(*(b + 8), src_b8); - COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); + src_b9 = COPY_FLOAT_TO_VECTOR(*(b + 9)); + src_b8 = COPY_FLOAT_TO_VECTOR(*(b + 8)); + src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); src_c7 *= src_b63; src_c6 -= src_c7 * src_b62; @@ -695,7 +695,7 @@ static void ssolve_4x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO bb += 4; } - if (bk & 1) + if ((bk & 1) && (bk > 0)) { src_a = LD_SP(aa); @@ -717,9 +717,9 @@ static void ssolve_4x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b10 = (v4f32) __msa_splati_w((v4i32) src_b8, 2); src_b9 = (v4f32) __msa_splati_w((v4i32) src_b8, 1); src_b8 = (v4f32) __msa_splati_w((v4i32) src_b8, 0); - COPY_FLOAT_TO_VECTOR(*(b + 5), src_b5); - COPY_FLOAT_TO_VECTOR(*(b + 4), src_b4); - COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); + src_b5 = COPY_FLOAT_TO_VECTOR(*(b + 5)); + src_b4 = COPY_FLOAT_TO_VECTOR(*(b + 4)); + src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); src_c3 *= src_b15; src_c2 -= src_c3 * src_b14; @@ -800,7 +800,7 @@ static void ssolve_4x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO bb += 2; } - if (bk & 3) + if ((bk & 3) && (bk > 0)) { if (bk & 2) { @@ -842,9 +842,9 @@ static void ssolve_4x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO a -= 8; b -= 4; - COPY_FLOAT_TO_VECTOR(*(b + 3), src_b3); - COPY_FLOAT_TO_VECTOR(*(b + 2), src_b2); - COPY_FLOAT_TO_VECTOR(*(b + 0), 
src_b0); + src_b3 = COPY_FLOAT_TO_VECTOR(*(b + 3)); + src_b2 = COPY_FLOAT_TO_VECTOR(*(b + 2)); + src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); src_c1 *= src_b3; src_c0 -= src_c1 * src_b2; diff --git a/kernel/mips/zgemm_kernel_4x4_msa.c b/kernel/mips/zgemm_kernel_4x4_msa.c new file mode 100644 index 000000000..a185c69dd --- /dev/null +++ b/kernel/mips/zgemm_kernel_4x4_msa.c @@ -0,0 +1,1589 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +#define ZGEMM_KERNEL_4X4_MSA(OP0, OP1, OP2, OP3, OP4) \ +{ \ + LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); \ + LD_DP4_INC(pb0, 2, src_b0, src_b1, src_b2, src_b3); \ + \ + PCKEVOD_D2_DP(src_a1, src_a0, src_a0r, src_a0i); \ + PCKEVOD_D2_DP(src_a3, src_a2, src_a1r, src_a1i); \ + \ + /* 0th col */ \ + SPLATI_D2_DP(src_b0, src_br, src_bi); \ + res0_r OP0## = src_a0r * src_br; \ + res0_r OP1## = src_a0i * src_bi; \ + res0_i OP2## = OP4 src_a0r * src_bi; \ + res0_i OP3## = src_a0i * src_br; \ + \ + res1_r OP0## = src_a1r * src_br; \ + res1_r OP1## = src_a1i * src_bi; \ + res1_i OP2## = OP4 src_a1r * src_bi; \ + res1_i OP3## = src_a1i * src_br; \ + \ + /* 1st col */ \ + SPLATI_D2_DP(src_b1, src_br, src_bi); \ + res2_r OP0## = src_a0r * src_br; \ + res2_r OP1## = src_a0i * src_bi; \ + res2_i OP2## = OP4 src_a0r * src_bi; \ + res2_i OP3## = src_a0i * src_br; \ + \ + res3_r OP0## = src_a1r * src_br; \ + res3_r OP1## = src_a1i * src_bi; \ + res3_i OP2## = OP4 src_a1r * src_bi; \ + res3_i OP3## = src_a1i * src_br; \ + \ + /* 2nd col */ \ + SPLATI_D2_DP(src_b2, src_br, src_bi); \ + res4_r OP0## = src_a0r * src_br; \ + res4_r OP1## = src_a0i * src_bi; \ + res4_i OP2## = OP4 src_a0r * src_bi; \ + res4_i OP3## = src_a0i * src_br; \ + \ + res5_r OP0## = src_a1r * src_br; \ + res5_r OP1## = src_a1i * src_bi; \ + res5_i OP2## = OP4 src_a1r * src_bi; \ + res5_i OP3## = src_a1i * src_br; \ + \ + /* 3rd col */ \ + SPLATI_D2_DP(src_b3, src_br, src_bi); \ + res6_r OP0## = src_a0r * src_br; \ + res6_r OP1## = src_a0i * src_bi; \ + res6_i OP2## = OP4 src_a0r * src_bi; \ + res6_i OP3## = src_a0i * src_br; \ + \ + res7_r OP0## = src_a1r * src_br; \ + res7_r OP1## = src_a1i * src_bi; \ + res7_i OP2## = OP4 src_a1r * src_bi; \ + res7_i OP3## = src_a1i * src_br; \ +} + +#define ZGEMM_KERNEL_2X4_MSA(OP0, OP1, OP2, OP3, OP4) \ +{ \ + LD_DP2_INC(pa0, 2, src_a0, 
src_a1); \ + LD_DP4_INC(pb0, 2, src_b0, src_b1, src_b2, src_b3); \ + \ + PCKEVOD_D2_DP(src_a1, src_a0, src_a0r, src_a0i); \ + \ + /* 0th col */ \ + SPLATI_D2_DP(src_b0, src_br, src_bi); \ + res0_r OP0## = src_a0r * src_br; \ + res0_r OP1## = src_a0i * src_bi; \ + res0_i OP2## = OP4 src_a0r * src_bi; \ + res0_i OP3## = src_a0i * src_br; \ + \ + /* 1st col */ \ + SPLATI_D2_DP(src_b1, src_br, src_bi); \ + res2_r OP0## = src_a0r * src_br; \ + res2_r OP1## = src_a0i * src_bi; \ + res2_i OP2## = OP4 src_a0r * src_bi; \ + res2_i OP3## = src_a0i * src_br; \ + \ + /* 2nd col */ \ + SPLATI_D2_DP(src_b2, src_br, src_bi); \ + res4_r OP0## = src_a0r * src_br; \ + res4_r OP1## = src_a0i * src_bi; \ + res4_i OP2## = OP4 src_a0r * src_bi; \ + res4_i OP3## = src_a0i * src_br; \ + \ + /* 3rd col */ \ + SPLATI_D2_DP(src_b3, src_br, src_bi); \ + res6_r OP0## = src_a0r * src_br; \ + res6_r OP1## = src_a0i * src_bi; \ + res6_i OP2## = OP4 src_a0r * src_bi; \ + res6_i OP3## = src_a0i * src_br; \ +} + +#define ZGEMM_KERNEL_1X4_MSA(OP0, OP1, OP2, OP3, OP4) \ +{ \ + src_a0 = LD_DP(pa0); \ + LD_DP4_INC(pb0, 2, src_b0, src_b1, src_b2, src_b3); \ + \ + PCKEVOD_D2_DP(src_a0, src_a0, src_a0r, src_a0i); \ + \ + /* 0th and 1st col */ \ + PCKEVOD_D2_DP(src_b1, src_b0, src_br, src_bi); \ + res0_r OP0## = src_a0r * src_br; \ + res0_r OP1## = src_a0i * src_bi; \ + res0_i OP2## = OP4 src_a0r * src_bi; \ + res0_i OP3## = src_a0i * src_br; \ + \ + /* 2nd and 3rd col */ \ + PCKEVOD_D2_DP(src_b3, src_b2, src_br, src_bi); \ + res1_r OP0## = src_a0r * src_br; \ + res1_r OP1## = src_a0i * src_bi; \ + res1_i OP2## = OP4 src_a0r * src_bi; \ + res1_i OP3## = src_a0i * src_br; \ +} + +#define ZGEMM_KERNEL_4X2_MSA(OP0, OP1, OP2, OP3, OP4) \ +{ \ + LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); \ + LD_DP2_INC(pb0, 2, src_b0, src_b1); \ + \ + PCKEVOD_D2_DP(src_a1, src_a0, src_a0r, src_a0i); \ + PCKEVOD_D2_DP(src_a3, src_a2, src_a1r, src_a1i); \ + \ + /* 0th col */ \ + SPLATI_D2_DP(src_b0, src_br, src_bi); \ + 
res0_r OP0## = src_a0r * src_br; \ + res0_r OP1## = src_a0i * src_bi; \ + res0_i OP2## = OP4 src_a0r * src_bi; \ + res0_i OP3## = src_a0i * src_br; \ + \ + res1_r OP0## = src_a1r * src_br; \ + res1_r OP1## = src_a1i * src_bi; \ + res1_i OP2## = OP4 src_a1r * src_bi; \ + res1_i OP3## = src_a1i * src_br; \ + \ + /* 1st col */ \ + SPLATI_D2_DP(src_b1, src_br, src_bi); \ + res2_r OP0## = src_a0r * src_br; \ + res2_r OP1## = src_a0i * src_bi; \ + res2_i OP2## = OP4 src_a0r * src_bi; \ + res2_i OP3## = src_a0i * src_br; \ + \ + res3_r OP0## = src_a1r * src_br; \ + res3_r OP1## = src_a1i * src_bi; \ + res3_i OP2## = OP4 src_a1r * src_bi; \ + res3_i OP3## = src_a1i * src_br; \ +} + +#define ZGEMM_KERNEL_2X2_MSA(OP0, OP1, OP2, OP3, OP4) \ +{ \ + LD_DP2_INC(pa0, 2, src_a0, src_a1); \ + LD_DP2_INC(pb0, 2, src_b0, src_b1); \ + \ + PCKEVOD_D2_DP(src_a1, src_a0, src_a0r, src_a0i); \ + \ + /* 0th col */ \ + SPLATI_D2_DP(src_b0, src_br, src_bi); \ + res0_r OP0## = src_a0r * src_br; \ + res0_r OP1## = src_a0i * src_bi; \ + res0_i OP2## = OP4 src_a0r * src_bi; \ + res0_i OP3## = src_a0i * src_br; \ + \ + /* 1st col */ \ + SPLATI_D2_DP(src_b1, src_br, src_bi); \ + res2_r OP0## = src_a0r * src_br; \ + res2_r OP1## = src_a0i * src_bi; \ + res2_i OP2## = OP4 src_a0r * src_bi; \ + res2_i OP3## = src_a0i * src_br; \ +} + +#define ZGEMM_KERNEL_1X2_MSA(OP0, OP1, OP2, OP3, OP4) \ +{ \ + src_a0 = LD_DP(pa0); \ + LD_DP2_INC(pb0, 2, src_b0, src_b1); \ + \ + PCKEVOD_D2_DP(src_a0, src_a0, src_a0r, src_a0i); \ + \ + /* 0th and 1st col */ \ + PCKEVOD_D2_DP(src_b1, src_b0, src_br, src_bi); \ + res0_r OP0## = src_a0r * src_br; \ + res0_r OP1## = src_a0i * src_bi; \ + res0_i OP2## = OP4 src_a0r * src_bi; \ + res0_i OP3## = src_a0i * src_br; \ +} + +#define ZGEMM_KERNEL_4X1_MSA(OP0, OP1, OP2, OP3, OP4) \ +{ \ + LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); \ + src_b0 = LD_DP(pb0); \ + \ + PCKEVOD_D2_DP(src_a1, src_a0, src_a0r, src_a0i); \ + PCKEVOD_D2_DP(src_a3, src_a2, src_a1r, src_a1i); \ + \ + 
/* 0th col */ \ + SPLATI_D2_DP(src_b0, src_br, src_bi); \ + res0_r OP0## = src_a0r * src_br; \ + res0_r OP1## = src_a0i * src_bi; \ + res0_i OP2## = OP4 src_a0r * src_bi; \ + res0_i OP3## = src_a0i * src_br; \ + \ + res1_r OP0## = src_a1r * src_br; \ + res1_r OP1## = src_a1i * src_bi; \ + res1_i OP2## = OP4 src_a1r * src_bi; \ + res1_i OP3## = src_a1i * src_br; \ +} + +#define ZGEMM_KERNEL_2X1_MSA(OP0, OP1, OP2, OP3, OP4) \ +{ \ + LD_DP2_INC(pa0, 2, src_a0, src_a1); \ + src_b0 = LD_DP(pb0); \ + \ + PCKEVOD_D2_DP(src_a1, src_a0, src_a0r, src_a0i); \ + \ + /* 0th col */ \ + SPLATI_D2_DP(src_b0, src_br, src_bi); \ + res0_r OP0## = src_a0r * src_br; \ + res0_r OP1## = src_a0i * src_bi; \ + res0_i OP2## = OP4 src_a0r * src_bi; \ + res0_i OP3## = src_a0i * src_br; \ +} + +#define ZGEMM_KERNEL_1X1(OP0, OP1, OP2, OP3, OP4) \ +{ \ + /* 0th col */ \ + a0_r = pa0[0]; \ + a0_i = pa0[1]; \ + b0_r = pb0[0]; \ + b0_i = pb0[1]; \ + \ + res0 OP0## = a0_r * b0_r; \ + res0 OP1## = a0_i * b0_i; \ + res1 OP2## = OP4 a0_r * b0_i; \ + res1 OP3## = a0_i * b0_r; \ +} + +#define ZGEMM_SCALE_4X4_MSA \ +{ \ + LD_DP4(pc0, 2, dst0, dst1, dst2, dst3); \ + \ + PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \ + PCKEVOD_D2_DP(dst3, dst2, dst1_r, dst1_i); \ + \ + dst0_r += alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i += alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + dst1_r += alpha_r * res1_r; \ + dst1_r -= alpha_i * res1_i; \ + dst1_i += alpha_r * res1_i; \ + dst1_i += alpha_i * res1_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ + ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3); \ + \ + LD_DP4(pc1, 2, dst4, dst5, dst6, dst7); \ + \ + PCKEVOD_D2_DP(dst5, dst4, dst0_r, dst0_i); \ + PCKEVOD_D2_DP(dst7, dst6, dst1_r, dst1_i); \ + \ + dst0_r += alpha_r * res2_r; \ + dst0_r -= alpha_i * res2_i; \ + dst0_i += alpha_r * res2_i; \ + dst0_i += alpha_i * res2_r; \ + \ + dst1_r += alpha_r * res3_r; \ + dst1_r -= alpha_i * res3_i; \ + dst1_i += alpha_r * res3_i; \ + dst1_i += alpha_i * 
res3_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst4, dst5); \ + ILVRL_D2_DP(dst1_i, dst1_r, dst6, dst7); \ + \ + ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2); \ + ST_DP4_INC(dst4, dst5, dst6, dst7, pc1, 2); \ + \ + LD_DP4(pc2, 2, dst0, dst1, dst2, dst3); \ + \ + PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \ + PCKEVOD_D2_DP(dst3, dst2, dst1_r, dst1_i); \ + \ + dst0_r += alpha_r * res4_r; \ + dst0_r -= alpha_i * res4_i; \ + dst0_i += alpha_r * res4_i; \ + dst0_i += alpha_i * res4_r; \ + \ + dst1_r += alpha_r * res5_r; \ + dst1_r -= alpha_i * res5_i; \ + dst1_i += alpha_r * res5_i; \ + dst1_i += alpha_i * res5_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ + ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3); \ + \ + LD_DP4(pc3, 2, dst4, dst5, dst6, dst7); \ + \ + PCKEVOD_D2_DP(dst5, dst4, dst0_r, dst0_i); \ + PCKEVOD_D2_DP(dst7, dst6, dst1_r, dst1_i); \ + \ + dst0_r += alpha_r * res6_r; \ + dst0_r -= alpha_i * res6_i; \ + dst0_i += alpha_r * res6_i; \ + dst0_i += alpha_i * res6_r; \ + \ + dst1_r += alpha_r * res7_r; \ + dst1_r -= alpha_i * res7_i; \ + dst1_i += alpha_r * res7_i; \ + dst1_i += alpha_i * res7_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst4, dst5); \ + ILVRL_D2_DP(dst1_i, dst1_r, dst6, dst7); \ + \ + ST_DP4_INC(dst0, dst1, dst2, dst3, pc2, 2); \ + ST_DP4_INC(dst4, dst5, dst6, dst7, pc3, 2); \ +} + +#define ZGEMM_SCALE_2X4_MSA \ +{ \ + LD_DP2(pc0, 2, dst0, dst1); \ + \ + PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \ + \ + dst0_r += alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i += alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ + \ + LD_DP2(pc1, 2, dst2, dst3); \ + \ + PCKEVOD_D2_DP(dst3, dst2, dst0_r, dst0_i); \ + \ + dst0_r += alpha_r * res2_r; \ + dst0_r -= alpha_i * res2_i; \ + dst0_i += alpha_r * res2_i; \ + dst0_i += alpha_i * res2_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3); \ + \ + ST_DP2_INC(dst0, dst1, pc0, 2); \ + ST_DP2_INC(dst2, dst3, pc1, 2); \ + \ + LD_DP2(pc2, 2, dst0, dst1); \ + 
\ + PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \ + \ + dst0_r += alpha_r * res4_r; \ + dst0_r -= alpha_i * res4_i; \ + dst0_i += alpha_r * res4_i; \ + dst0_i += alpha_i * res4_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ + \ + LD_DP2(pc3, 2, dst2, dst3); \ + \ + PCKEVOD_D2_DP(dst3, dst2, dst0_r, dst0_i); \ + \ + dst0_r += alpha_r * res6_r; \ + dst0_r -= alpha_i * res6_i; \ + dst0_i += alpha_r * res6_i; \ + dst0_i += alpha_i * res6_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3); \ + \ + ST_DP2_INC(dst0, dst1, pc2, 2); \ + ST_DP2_INC(dst2, dst3, pc3, 2); \ +} + +#define ZGEMM_SCALE_1X4_MSA \ +{ \ + dst0 = LD_DP(pc0); \ + dst1 = LD_DP(pc1); \ + \ + PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \ + \ + dst0_r += alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i += alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ + \ + dst2 = LD_DP(pc2); \ + dst3 = LD_DP(pc3); \ + \ + PCKEVOD_D2_DP(dst3, dst2, dst0_r, dst0_i); \ + \ + dst0_r += alpha_r * res1_r; \ + dst0_r -= alpha_i * res1_i; \ + dst0_i += alpha_r * res1_i; \ + dst0_i += alpha_i * res1_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3); \ + \ + ST_DP(dst0, pc0); \ + ST_DP(dst1, pc1); \ + ST_DP(dst2, pc2); \ + ST_DP(dst3, pc3); \ +} + +#define ZGEMM_SCALE_4X2_MSA \ +{ \ + LD_DP4(pc0, 2, dst0, dst1, dst2, dst3); \ + \ + PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \ + PCKEVOD_D2_DP(dst3, dst2, dst1_r, dst1_i); \ + \ + dst0_r += alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i += alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + dst1_r += alpha_r * res1_r; \ + dst1_r -= alpha_i * res1_i; \ + dst1_i += alpha_r * res1_i; \ + dst1_i += alpha_i * res1_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ + ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3); \ + \ + LD_DP4(pc1, 2, dst4, dst5, dst6, dst7); \ + \ + PCKEVOD_D2_DP(dst5, dst4, dst0_r, dst0_i); \ + PCKEVOD_D2_DP(dst7, dst6, dst1_r, dst1_i); \ + \ + dst0_r += alpha_r * res2_r; \ + dst0_r 
-= alpha_i * res2_i; \ + dst0_i += alpha_r * res2_i; \ + dst0_i += alpha_i * res2_r; \ + \ + dst1_r += alpha_r * res3_r; \ + dst1_r -= alpha_i * res3_i; \ + dst1_i += alpha_r * res3_i; \ + dst1_i += alpha_i * res3_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst4, dst5); \ + ILVRL_D2_DP(dst1_i, dst1_r, dst6, dst7); \ + \ + ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2); \ + ST_DP4_INC(dst4, dst5, dst6, dst7, pc1, 2); \ +} + +#define ZGEMM_SCALE_2X2_MSA \ +{ \ + LD_DP2(pc0, 2, dst0, dst1); \ + \ + PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \ + \ + dst0_r += alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i += alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ + \ + ST_DP2_INC(dst0, dst1, pc0, 2); \ + \ + LD_DP2(pc1, 2, dst2, dst3); \ + \ + PCKEVOD_D2_DP(dst3, dst2, dst0_r, dst0_i); \ + \ + dst0_r += alpha_r * res2_r; \ + dst0_r -= alpha_i * res2_i; \ + dst0_i += alpha_r * res2_i; \ + dst0_i += alpha_i * res2_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3); \ + \ + ST_DP2_INC(dst2, dst3, pc1, 2); \ +} + +#define ZGEMM_SCALE_1X2_MSA \ +{ \ + dst0 = LD_DP(pc0); \ + dst1 = LD_DP(pc1); \ + \ + PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \ + \ + dst0_r += alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i += alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ + \ + ST_DP(dst0, pc0); \ + ST_DP(dst1, pc1); \ +} + +#define ZGEMM_SCALE_4X1_MSA \ +{ \ + LD_DP4(pc0, 2, dst0, dst1, dst2, dst3); \ + \ + PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \ + PCKEVOD_D2_DP(dst3, dst2, dst1_r, dst1_i); \ + \ + dst0_r += alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i += alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + dst1_r += alpha_r * res1_r; \ + dst1_r -= alpha_i * res1_i; \ + dst1_i += alpha_r * res1_i; \ + dst1_i += alpha_i * res1_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ + ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3); \ + \ + ST_DP4_INC(dst0, 
dst1, dst2, dst3, pc0, 2); \ +} + +#define ZGEMM_SCALE_2X1_MSA \ +{ \ + LD_DP2(pc0, 2, dst0, dst1); \ + \ + PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \ + \ + dst0_r += alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i += alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ + \ + ST_DP2_INC(dst0, dst1, pc0, 2); \ +} + +#define ZGEMM_SCALE_1X1 \ +{ \ + pc0[0] += alphar * res0; \ + pc0[0] -= alphai * res1; \ + pc0[1] += alphar * res1; \ + pc0[1] += alphai * res0; \ +} + +#define ZGEMM_TRMM_SCALE_4X4_MSA \ +{ \ + dst0_r = alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i = alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + dst1_r = alpha_r * res1_r; \ + dst1_r -= alpha_i * res1_i; \ + dst1_i = alpha_r * res1_i; \ + dst1_i += alpha_i * res1_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ + ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3); \ + \ + dst0_r = alpha_r * res2_r; \ + dst0_r -= alpha_i * res2_i; \ + dst0_i = alpha_r * res2_i; \ + dst0_i += alpha_i * res2_r; \ + \ + dst1_r = alpha_r * res3_r; \ + dst1_r -= alpha_i * res3_i; \ + dst1_i = alpha_r * res3_i; \ + dst1_i += alpha_i * res3_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst4, dst5); \ + ILVRL_D2_DP(dst1_i, dst1_r, dst6, dst7); \ + \ + ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2); \ + ST_DP4_INC(dst4, dst5, dst6, dst7, pc1, 2); \ + \ + dst0_r = alpha_r * res4_r; \ + dst0_r -= alpha_i * res4_i; \ + dst0_i = alpha_r * res4_i; \ + dst0_i += alpha_i * res4_r; \ + \ + dst1_r = alpha_r * res5_r; \ + dst1_r -= alpha_i * res5_i; \ + dst1_i = alpha_r * res5_i; \ + dst1_i += alpha_i * res5_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ + ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3); \ + \ + dst0_r = alpha_r * res6_r; \ + dst0_r -= alpha_i * res6_i; \ + dst0_i = alpha_r * res6_i; \ + dst0_i += alpha_i * res6_r; \ + \ + dst1_r = alpha_r * res7_r; \ + dst1_r -= alpha_i * res7_i; \ + dst1_i = alpha_r * res7_i; \ + dst1_i += alpha_i * res7_r; \ + \ + 
ILVRL_D2_DP(dst0_i, dst0_r, dst4, dst5); \ + ILVRL_D2_DP(dst1_i, dst1_r, dst6, dst7); \ + \ + ST_DP4_INC(dst0, dst1, dst2, dst3, pc2, 2); \ + ST_DP4_INC(dst4, dst5, dst6, dst7, pc3, 2); \ +} + +#define ZGEMM_TRMM_SCALE_2X4_MSA \ +{ \ + dst0_r = alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i = alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ + \ + dst0_r = alpha_r * res2_r; \ + dst0_r -= alpha_i * res2_i; \ + dst0_i = alpha_r * res2_i; \ + dst0_i += alpha_i * res2_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3); \ + \ + ST_DP2_INC(dst0, dst1, pc0, 2); \ + ST_DP2_INC(dst2, dst3, pc1, 2); \ + \ + dst0_r = alpha_r * res4_r; \ + dst0_r -= alpha_i * res4_i; \ + dst0_i = alpha_r * res4_i; \ + dst0_i += alpha_i * res4_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ + \ + dst0_r = alpha_r * res6_r; \ + dst0_r -= alpha_i * res6_i; \ + dst0_i = alpha_r * res6_i; \ + dst0_i += alpha_i * res6_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3); \ + \ + ST_DP2_INC(dst0, dst1, pc2, 2); \ + ST_DP2_INC(dst2, dst3, pc3, 2); \ +} + +#define ZGEMM_TRMM_SCALE_1X4_MSA \ +{ \ + dst0_r = alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i = alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ + \ + dst0_r = alpha_r * res1_r; \ + dst0_r -= alpha_i * res1_i; \ + dst0_i = alpha_r * res1_i; \ + dst0_i += alpha_i * res1_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3); \ + \ + ST_DP(dst0, pc0); \ + ST_DP(dst1, pc1); \ + ST_DP(dst2, pc2); \ + ST_DP(dst3, pc3); \ +} + +#define ZGEMM_TRMM_SCALE_4X2_MSA \ +{ \ + dst0_r = alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i = alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + dst1_r = alpha_r * res1_r; \ + dst1_r -= alpha_i * res1_i; \ + dst1_i = alpha_r * res1_i; \ + dst1_i += alpha_i * res1_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ + ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3); \ + \ + dst0_r 
= alpha_r * res2_r; \ + dst0_r -= alpha_i * res2_i; \ + dst0_i = alpha_r * res2_i; \ + dst0_i += alpha_i * res2_r; \ + \ + dst1_r = alpha_r * res3_r; \ + dst1_r -= alpha_i * res3_i; \ + dst1_i = alpha_r * res3_i; \ + dst1_i += alpha_i * res3_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst4, dst5); \ + ILVRL_D2_DP(dst1_i, dst1_r, dst6, dst7); \ + \ + ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2); \ + ST_DP4_INC(dst4, dst5, dst6, dst7, pc1, 2); \ +} + +#define ZGEMM_TRMM_SCALE_2X2_MSA \ +{ \ + dst0_r = alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i = alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ + \ + ST_DP2_INC(dst0, dst1, pc0, 2); \ + \ + dst0_r = alpha_r * res2_r; \ + dst0_r -= alpha_i * res2_i; \ + dst0_i = alpha_r * res2_i; \ + dst0_i += alpha_i * res2_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3); \ + \ + ST_DP2_INC(dst2, dst3, pc1, 2); \ +} + +#define ZGEMM_TRMM_SCALE_1X2_MSA \ +{ \ + dst0_r = alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i = alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ + \ + ST_DP(dst0, pc0); \ + ST_DP(dst1, pc1); \ +} + +#define ZGEMM_TRMM_SCALE_4X1_MSA \ +{ \ + dst0_r = alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i = alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + dst1_r = alpha_r * res1_r; \ + dst1_r -= alpha_i * res1_i; \ + dst1_i = alpha_r * res1_i; \ + dst1_i += alpha_i * res1_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ + ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3); \ + \ + ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2); \ +} + +#define ZGEMM_TRMM_SCALE_2X1_MSA \ +{ \ + dst0_r = alpha_r * res0_r; \ + dst0_r -= alpha_i * res0_i; \ + dst0_i = alpha_r * res0_i; \ + dst0_i += alpha_i * res0_r; \ + \ + ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ + \ + ST_DP2_INC(dst0, dst1, pc0, 2); \ +} + +#define ZGEMM_TRMM_SCALE_1X1 \ +{ \ + pc0[0] = alphar * res0; \ + pc0[0] -= alphai * res1; \ + 
pc0[1] = alphar * res1; \ + pc0[1] += alphai * res0; \ +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai, + FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc +#ifdef TRMMKERNEL + , BLASLONG offset +#endif + ) +{ + BLASLONG i, j, l, temp; +#if defined(TRMMKERNEL) + BLASLONG off; +#endif + FLOAT *pc0, *pc1, *pc2, *pc3, *pa0, *pb0; + FLOAT res0, res1, a0_r, a0_i, b0_r, b0_i; + v2f64 src_a0, src_a1, src_a2, src_a3, src_b0, src_b1, src_b2, src_b3; + v2f64 src_a0r, src_a0i, src_a1r, src_a1i, src_br, src_bi; + v2f64 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v2f64 dst0_r, dst0_i, dst1_r, dst1_i, alpha_r, alpha_i; + v2f64 res0_r, res0_i, res1_r, res1_i, res2_r, res2_i, res3_r, res3_i; + v2f64 res4_r, res4_i, res5_r, res5_i, res6_r, res6_i, res7_r, res7_i; + + alpha_r = COPY_DOUBLE_TO_VECTOR(alphar); + alpha_i = COPY_DOUBLE_TO_VECTOR(alphai); + +#if defined(TRMMKERNEL) && !defined(LEFT) + off = -offset; +#endif + + for (j = (n >> 2); j--;) + { + pc0 = C; + pc1 = pc0 + 2 * ldc; + pc2 = pc1 + 2 * ldc; + pc3 = pc2 + 2 * ldc; + + pa0 = A; + +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + + for (i = (m >> 2); i--;) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2 * 4; + pb0 = B + off * 2 * 4; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 4; // number of values in A +#else + temp = off + 4; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + ZGEMM_KERNEL_4X4_MSA(, -, , +, +); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + ZGEMM_KERNEL_4X4_MSA(, +, , +, -); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + ZGEMM_KERNEL_4X4_MSA(, +, , -, +); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + 
ZGEMM_KERNEL_4X4_MSA(, -, , -, -); +#endif + + for (l = (temp - 1); l--;) + { +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + ZGEMM_KERNEL_4X4_MSA(+, -, +, +,); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + ZGEMM_KERNEL_4X4_MSA(+, +, -, +,); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + ZGEMM_KERNEL_4X4_MSA(+, +, +, -,); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + ZGEMM_KERNEL_4X4_MSA(+, -, -, -,); +#endif + } + +#if defined(TRMMKERNEL) + ZGEMM_TRMM_SCALE_4X4_MSA +#else + ZGEMM_SCALE_4X4_MSA +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 4; // number of values in A +#else + temp -= 4; // number of values in B +#endif + pa0 += temp * 2 * 4; + pb0 += temp * 2 * 4; +#endif + +#ifdef LEFT + off += 4; // number of values in A +#endif +#endif + } + + if (m & 2) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2 * 2; + pb0 = B + off * 2 * 4; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 2; // number of values in A +#else + temp = off + 4; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + ZGEMM_KERNEL_2X4_MSA(, -, , +, +); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + ZGEMM_KERNEL_2X4_MSA(, +, , +, -); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + ZGEMM_KERNEL_2X4_MSA(, +, , -, +); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + ZGEMM_KERNEL_2X4_MSA(, -, , -, -); +#endif + + for (l = (temp - 1); l--;) + { +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + ZGEMM_KERNEL_2X4_MSA(+, -, +, +,); +#endif +#if 
defined(NR) || defined(NC) || defined(TR) || defined(TC) + ZGEMM_KERNEL_2X4_MSA(+, +, -, +,); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + ZGEMM_KERNEL_2X4_MSA(+, +, +, -,); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + ZGEMM_KERNEL_2X4_MSA(+, -, -, -,); +#endif + } + +#if defined(TRMMKERNEL) + ZGEMM_TRMM_SCALE_2X4_MSA +#else + ZGEMM_SCALE_2X4_MSA +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 2; // number of values in A +#else + temp -= 4; // number of values in B +#endif + pa0 += temp * 2 * 2; + pb0 += temp * 2 * 4; +#endif + +#ifdef LEFT + off += 2; // number of values in A +#endif +#endif + } + + if (m & 1) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2 * 1; + pb0 = B + off * 2 * 4; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 1; // number of values in A +#else + temp = off + 4; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + ZGEMM_KERNEL_1X4_MSA(, -, , +, +); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + ZGEMM_KERNEL_1X4_MSA(, +, , +, -); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + ZGEMM_KERNEL_1X4_MSA(, +, , -, +); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + ZGEMM_KERNEL_1X4_MSA(, -, , -, -); +#endif + + pa0 += 2; + + for (l = (temp - 1); l--;) + { +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + ZGEMM_KERNEL_1X4_MSA(+, -, +, +,); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + ZGEMM_KERNEL_1X4_MSA(+, +, -, +,); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + 
ZGEMM_KERNEL_1X4_MSA(+, +, +, -,); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + ZGEMM_KERNEL_1X4_MSA(+, -, -, -,); +#endif + + pa0 += 2; + } + +#if defined(TRMMKERNEL) + ZGEMM_TRMM_SCALE_1X4_MSA +#else + ZGEMM_SCALE_1X4_MSA +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 1; // number of values in A +#else + temp -= 4; // number of values in B +#endif + pa0 += temp * 2 * 1; + pb0 += temp * 2 * 4; +#endif + +#ifdef LEFT + off += 1; // number of values in A +#endif +#endif + + pc0 += 2; + pc1 += 2; + pc2 += 2; + pc3 += 2; + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 4; // number of values in B +#endif + + l = k << 3; + B = B + l; + i = ldc << 3; + C = C + i; + } + + if (n & 2) + { + pc0 = C; + pc1 = pc0 + 2 * ldc; + + pa0 = A; + +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + + for (i = (m >> 2); i--;) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2 * 4; + pb0 = B + off * 2 * 2; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 4; // number of values in A +#else + temp = off + 2; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + ZGEMM_KERNEL_4X2_MSA(, -, , +, +); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + ZGEMM_KERNEL_4X2_MSA(, +, , +, -); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + ZGEMM_KERNEL_4X2_MSA(, +, , -, +); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + ZGEMM_KERNEL_4X2_MSA(, -, , -, -); +#endif + + for (l = (temp - 1); l--;) + { +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + ZGEMM_KERNEL_4X2_MSA(+, -, +, +,); +#endif +#if
defined(NR) || defined(NC) || defined(TR) || defined(TC) + ZGEMM_KERNEL_4X2_MSA(+, +, -, +,); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + ZGEMM_KERNEL_4X2_MSA(+, +, +, -,); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + ZGEMM_KERNEL_4X2_MSA(+, -, -, -,); +#endif + } + +#if defined(TRMMKERNEL) + ZGEMM_TRMM_SCALE_4X2_MSA +#else + ZGEMM_SCALE_4X2_MSA +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 4; // number of values in A +#else + temp -= 2; // number of values in B +#endif + pa0 += temp * 2 * 4; + pb0 += temp * 2 * 2; +#endif + +#ifdef LEFT + off += 4; // number of values in A +#endif +#endif + } + + if (m & 2) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2 * 2; + pb0 = B + off * 2 * 2; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 2; // number of values in A +#else + temp = off + 2; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + ZGEMM_KERNEL_2X2_MSA(, -, , +, +); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + ZGEMM_KERNEL_2X2_MSA(, +, , +, -); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + ZGEMM_KERNEL_2X2_MSA(, +, , -, +); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + ZGEMM_KERNEL_2X2_MSA(, -, , -, -); +#endif + + for (l = (temp - 1); l--;) + { +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + ZGEMM_KERNEL_2X2_MSA(+, -, +, +,); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + ZGEMM_KERNEL_2X2_MSA(+, +, -, +,); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + ZGEMM_KERNEL_2X2_MSA(+, +, +, 
-,); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + ZGEMM_KERNEL_2X2_MSA(+, -, -, -,); +#endif + } + +#if defined(TRMMKERNEL) + ZGEMM_TRMM_SCALE_2X2_MSA +#else + ZGEMM_SCALE_2X2_MSA +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 2; // number of values in A +#else + temp -= 2; // number of values in B +#endif + pa0 += temp * 2 * 2; + pb0 += temp * 2 * 2; +#endif + +#ifdef LEFT + off += 2; // number of values in A +#endif +#endif + } + + if (m & 1) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2 * 1; + pb0 = B + off * 2 * 2; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 1; // number of values in A +#else + temp = off + 2; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + ZGEMM_KERNEL_1X2_MSA(, -, , +, +); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + ZGEMM_KERNEL_1X2_MSA(, +, , +, -); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + ZGEMM_KERNEL_1X2_MSA(, +, , -, +); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + ZGEMM_KERNEL_1X2_MSA(, -, , -, -); +#endif + + pa0 += 2; + + for (l = (temp - 1); l--;) + { +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + ZGEMM_KERNEL_1X2_MSA(+, -, +, +,); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + ZGEMM_KERNEL_1X2_MSA(+, +, -, +,); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + ZGEMM_KERNEL_1X2_MSA(+, +, +, -,); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + ZGEMM_KERNEL_1X2_MSA(+, -, -, -,); +#endif + + pa0 += 2; + } + +#if defined(TRMMKERNEL) + 
ZGEMM_TRMM_SCALE_1X2_MSA +#else + ZGEMM_SCALE_1X2_MSA +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 1; // number of values in A +#else + temp -= 2; // number of values in B +#endif + pa0 += temp * 2 * 1; + pb0 += temp * 2 * 2; +#endif + +#ifdef LEFT + off += 1; // number of values in A +#endif +#endif + + pc0 += 2; + pc1 += 2; + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 2; // number of values in B +#endif + + l = k << 2; + B = B + l; + i = ldc << 2; + C = C + i; + } + + if (n & 1) + { + pc0 = C; + pa0 = A; + +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + + for (i = (m >> 2); i--;) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2 * 4; + pb0 = B + off * 2 * 1; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 4; // number of values in A +#else + temp = off + 1; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + ZGEMM_KERNEL_4X1_MSA(, -, , +, +); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + ZGEMM_KERNEL_4X1_MSA(, +, , +, -); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + ZGEMM_KERNEL_4X1_MSA(, +, , -, +); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + ZGEMM_KERNEL_4X1_MSA(, -, , -, -); +#endif + + pb0 += 2; + + for (l = (temp - 1); l--;) + { +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + ZGEMM_KERNEL_4X1_MSA(+, -, +, +,); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + ZGEMM_KERNEL_4X1_MSA(+, +, -, +,); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + ZGEMM_KERNEL_4X1_MSA(+, +, +, -,); +#endif +#if defined(RR) ||
defined(RC) || defined(CR) || defined(CC) + ZGEMM_KERNEL_4X1_MSA(+, -, -, -,); +#endif + + pb0 += 2; + } + +#if defined(TRMMKERNEL) + ZGEMM_TRMM_SCALE_4X1_MSA +#else + ZGEMM_SCALE_4X1_MSA +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 4; // number of values in A +#else + temp -= 1; // number of values in B +#endif + pa0 += temp * 2 * 4; + pb0 += temp * 2 * 1; +#endif + +#ifdef LEFT + off += 4; // number of values in A +#endif +#endif + } + + if (m & 2) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2 * 2; + pb0 = B + off * 2 * 1; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 2; // number of values in A +#else + temp = off + 1; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + ZGEMM_KERNEL_2X1_MSA(, -, , +, +); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + ZGEMM_KERNEL_2X1_MSA(, +, , +, -); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + ZGEMM_KERNEL_2X1_MSA(, +, , -, +); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + ZGEMM_KERNEL_2X1_MSA(, -, , -, -); +#endif + + pb0 += 2; + + for (l = (temp - 1); l--;) + { +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + ZGEMM_KERNEL_2X1_MSA(+, -, +, +,); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + ZGEMM_KERNEL_2X1_MSA(+, +, -, +,); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + ZGEMM_KERNEL_2X1_MSA(+, +, +, -,); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + ZGEMM_KERNEL_2X1_MSA(+, -, -, -,); +#endif + + pb0 += 2; + } + +#if defined(TRMMKERNEL) + ZGEMM_TRMM_SCALE_2X1_MSA +#else + 
ZGEMM_SCALE_2X1_MSA +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 2; // number of values in A +#else + temp -= 1; // number of values in B +#endif + pa0 += temp * 2 * 2; + pb0 += temp * 2 * 1; +#endif + +#ifdef LEFT + off += 2; // number of values in A +#endif +#endif + } + + if (m & 1) + { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 2 * 1; + pb0 = B + off * 2 * 1; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 1; // number of values in A +#else + temp = off + 1; // number of values in B +#endif +#else + pb0 = B; + temp = k; +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + ZGEMM_KERNEL_1X1(, -, , +, +); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + ZGEMM_KERNEL_1X1(, +, , +, -); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + ZGEMM_KERNEL_1X1(, +, , -, +); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + ZGEMM_KERNEL_1X1(, -, , -, -); +#endif + + pa0 += 2; + pb0 += 2; + + for (l = (temp - 1); l--;) + { +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + ZGEMM_KERNEL_1X1(+, -, +, +,); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + ZGEMM_KERNEL_1X1(+, +, -, +,); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + ZGEMM_KERNEL_1X1(+, +, +, -,); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + ZGEMM_KERNEL_1X1(+, -, -, -,); +#endif + + pa0 += 2; + pb0 += 2; + } + +#if defined(TRMMKERNEL) + ZGEMM_TRMM_SCALE_1X1 +#else + ZGEMM_SCALE_1X1 +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 1; // 
number of values in A +#else + temp -= 1; // number of values in B +#endif + pa0 += temp * 2 * 1; + pb0 += temp * 2 * 1; +#endif + +#ifdef LEFT + off += 1; // number of values in A +#endif +#endif + + pc0 += 2; + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 1; // number of values in A +#endif + + l = k << 1; + B = B + l; + i = ldc << 1; + C = C + i; + } + return 0; +} diff --git a/kernel/mips/zgemm_ncopy_4_msa.c b/kernel/mips/zgemm_ncopy_4_msa.c new file mode 100644 index 000000000..3ef46a571 --- /dev/null +++ b/kernel/mips/zgemm_ncopy_4_msa.c @@ -0,0 +1,144 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst) +{ + BLASLONG i, j; + FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *pdst; + v2f64 src0, src1, src2, src3, src4, src5, src6, src7; + v2f64 src8, src9, src10, src11, src12, src13, src14, src15; + + psrc0 = src; + pdst = dst; + lda *= 2; + + for (j = (n >> 2); j--;) + { + psrc1 = psrc0; + psrc2 = psrc1 + lda; + psrc3 = psrc2 + lda; + psrc4 = psrc3 + lda; + psrc0 += 4 * lda; + + for (i = (m >> 2); i--;) + { + LD_DP4_INC(psrc1, 2, src0, src1, src2, src3); + LD_DP4_INC(psrc2, 2, src4, src5, src6, src7); + LD_DP4_INC(psrc3, 2, src8, src9, src10, src11); + LD_DP4_INC(psrc4, 2, src12, src13, src14, src15); + + ST_DP8_INC(src0, src4, src8, src12, src1, src5, src9, src13, pdst, 2); + ST_DP8_INC(src2, src6, src10, src14, src3, src7, src11, src15, + pdst, 2); + } + + if (m & 2) + { + LD_DP2_INC(psrc1, 2, src0, src1); + LD_DP2_INC(psrc2, 2, src4, src5); + LD_DP2_INC(psrc3, 2, src8, src9); + LD_DP2_INC(psrc4, 2, src12, src13); + + ST_DP8_INC(src0, src4, src8, src12, src1, src5, src9, src13, pdst, 2); + } + + if (m & 1) + { + src0 = LD_DP(psrc1); + src4 = LD_DP(psrc2); + src8 = LD_DP(psrc3); + src12 = LD_DP(psrc4); + psrc1 += 2; + psrc2 += 2; + psrc3 += 2; + psrc4 += 2; + + ST_DP4_INC(src0, src4, src8, src12, pdst, 2); + } + } + + if (n & 2) + { + psrc1 = psrc0; + 
psrc2 = psrc1 + lda; + psrc0 += 2 * lda; + + for (i = (m >> 2); i--;) + { + LD_DP4_INC(psrc1, 2, src0, src1, src2, src3); + LD_DP4_INC(psrc2, 2, src4, src5, src6, src7); + + ST_DP8_INC(src0, src4, src1, src5, src2, src6, src3, src7, pdst, 2); + } + + if (m & 2) + { + LD_DP2_INC(psrc1, 2, src0, src1); + LD_DP2_INC(psrc2, 2, src4, src5); + + ST_DP4_INC(src0, src4, src1, src5, pdst, 2); + } + + if (m & 1) + { + src0 = LD_DP(psrc1); + src4 = LD_DP(psrc2); + psrc1 += 2; + psrc2 += 2; + + ST_DP2_INC(src0, src4, pdst, 2); + } + } + + if (n & 1) + { + psrc1 = psrc0; + + for (i = (m >> 2); i--;) + { + LD_DP4_INC(psrc1, 2, src0, src1, src2, src3); + ST_DP4_INC(src0, src1, src2, src3, pdst, 2); + } + + if (m & 2) + { + LD_DP2_INC(psrc1, 2, src0, src1); + ST_DP2_INC(src0, src1, pdst, 2); + } + + if (m & 1) + { + src0 = LD_DP(psrc1); + ST_DP(src0, pdst); + } + } + + return 0; +} diff --git a/kernel/mips/zgemm_tcopy_4_msa.c b/kernel/mips/zgemm_tcopy_4_msa.c new file mode 100644 index 000000000..70314cb21 --- /dev/null +++ b/kernel/mips/zgemm_tcopy_4_msa.c @@ -0,0 +1,161 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst) +{ + BLASLONG i, j; + FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4; + FLOAT *pdst0, *pdst1, *pdst2, *pdst3; + v2f64 src0, src1, src2, src3, src4, src5, src6, src7; + v2f64 src8, src9, src10, src11, src12, src13, src14, src15; + + psrc0 = src; + pdst0 = dst; + lda *= 2; + + pdst2 = dst + 2 * m * (n & ~3); + pdst3 = dst + 2 * m * (n & ~1); + + for (j = (m >> 2); j--;) + { + psrc1 = psrc0; + psrc2 = psrc1 + lda; + psrc3 = psrc2 + lda; + psrc4 = psrc3 + lda; + psrc0 += 4 * lda; + + pdst1 = pdst0; + pdst0 += 32; + + for (i = (n >> 2); i--;) + { + LD_DP4_INC(psrc1, 2, src0, src1, src2, src3); + LD_DP4_INC(psrc2, 2, src4, src5, src6, src7); + LD_DP4_INC(psrc3, 2, src8, src9, src10, src11); + LD_DP4_INC(psrc4, 2, src12, src13, src14, src15); + + ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2); + ST_DP8(src8, src9, src10, src11, src12, src13, src14, src15, + pdst1 + 16, 2); + pdst1 += m * 8; + } + + if (n & 2) + { + LD_DP2_INC(psrc1, 2, src0, src1); + LD_DP2_INC(psrc2, 2, src2, src3); + LD_DP2_INC(psrc3, 
2, src4, src5); + LD_DP2_INC(psrc4, 2, src6, src7); + + ST_DP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst2, 2); + } + + if (n & 1) + { + src0 = LD_DP(psrc1); + src1 = LD_DP(psrc2); + src2 = LD_DP(psrc3); + src3 = LD_DP(psrc4); + psrc1 += 2; + psrc2 += 2; + psrc3 += 2; + psrc4 += 2; + + ST_DP4_INC(src0, src1, src2, src3, pdst3, 2); + } + } + + if (m & 2) + { + psrc1 = psrc0; + psrc2 = psrc1 + lda; + psrc0 += 2 * lda; + + pdst1 = pdst0; + pdst0 += 16; + + for (i = (n >> 2); i--;) + { + LD_DP4_INC(psrc1, 2, src0, src1, src2, src3); + LD_DP4_INC(psrc2, 2, src4, src5, src6, src7); + + ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2); + + pdst1 += m * 8; + } + + if (n & 2) + { + LD_DP2_INC(psrc1, 2, src0, src1); + LD_DP2_INC(psrc2, 2, src2, src3); + + ST_DP4_INC(src0, src1, src2, src3, pdst2, 2); + } + + if (n & 1) + { + src0 = LD_DP(psrc1); + src1 = LD_DP(psrc2); + + ST_DP2_INC(src0, src1, pdst3, 2); + + psrc1 += 2; + psrc2 += 2; + } + } + + if (m & 1) + { + psrc1 = psrc0; + pdst1 = pdst0; + + for (i = (n >> 2); i--;) + { + LD_DP4_INC(psrc1, 2, src0, src1, src2, src3); + ST_DP4(src0, src1, src2, src3, pdst1, 2); + + pdst1 += m * 8; + } + + if (n & 2) + { + LD_DP2_INC(psrc1, 2, src0, src1); + ST_DP2_INC(src0, src1, pdst2, 2); + } + + if (n & 1) + { + src0 = LD_DP(psrc1); + ST_DP(src0, pdst3); + } + } + + return 0; +} diff --git a/param.h b/param.h index fdc9d1104..dd58744d4 100644 --- a/param.h +++ b/param.h @@ -2188,11 +2188,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define DGEMM_DEFAULT_UNROLL_M 8 #define DGEMM_DEFAULT_UNROLL_N 4 -#define CGEMM_DEFAULT_UNROLL_M 2 -#define CGEMM_DEFAULT_UNROLL_N 2 - -#define ZGEMM_DEFAULT_UNROLL_M 2 -#define ZGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 + +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 #define SGEMM_DEFAULT_P 128 #define DGEMM_DEFAULT_P 128 @@ -2227,11 +2227,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define DGEMM_DEFAULT_UNROLL_M 8 #define DGEMM_DEFAULT_UNROLL_N 4 -#define CGEMM_DEFAULT_UNROLL_M 2 -#define CGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 -#define ZGEMM_DEFAULT_UNROLL_M 2 -#define ZGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 #define SGEMM_DEFAULT_P 128 #define DGEMM_DEFAULT_P 128 From 925d4e1dc69a94de5733c03b022b7c2c7521935e Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Thu, 14 Jul 2016 13:46:01 +0530 Subject: [PATCH 51/70] Add IAMAX and NRM2 benchmarks --- benchmark/Makefile | 92 +++++++++++++++++++++- benchmark/iamax.c | 190 +++++++++++++++++++++++++++++++++++++++++++++ benchmark/nrm2.c | 190 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 470 insertions(+), 2 deletions(-) create mode 100644 benchmark/iamax.c create mode 100644 benchmark/nrm2.c diff --git a/benchmark/Makefile b/benchmark/Makefile index e78750ec2..e801ce4eb 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -173,7 +173,9 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ sgetri.goto dgetri.goto cgetri.goto zgetri.goto \ spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \ ssymm.goto dsymm.goto csymm.goto zsymm.goto \ - smallscaling + smallscaling \ + isamax.goto idamax.goto icamax.goto izamax.goto \ + snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ scholesky.acml dcholesky.acml 
ccholesky.acml zcholesky.acml \ @@ -226,7 +228,9 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \ sgetri.atlas dgetri.atlas cgetri.atlas zgetri.atlas \ spotrf.atlas dpotrf.atlas cpotrf.atlas zpotrf.atlas \ - ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas + ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas \ + isamax.atlas idamax.atlas icamax.atlas izamax.atlas \ + snrm2.atlas dnrm2.atlas scnrm2.atlas dznrm2.atlas mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ scholesky.mkl dcholesky.mkl ccholesky.mkl zcholesky.mkl \ @@ -1937,6 +1941,63 @@ zgemm3m.mkl : zgemm3m.$(SUFFIX) zgemm3m.veclib : zgemm3m.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +############################################## ISAMAX ############################################## +isamax.goto : isamax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +isamax.atlas : isamax.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +############################################## IDAMAX ############################################## +idamax.goto : idamax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +idamax.atlas : idamax.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +############################################## ICAMAX ############################################## +icamax.goto : icamax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +icamax.atlas : icamax.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +############################################## IZAMAX ############################################## +izamax.goto : izamax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB)
$(FEXTRALIB) -lm + +izamax.atlas : izamax.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +############################################## SNRM2 ############################################## +snrm2.goto : snrm2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +snrm2.atlas : snrm2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +############################################## DNRM2 ############################################## +dnrm2.goto : dnrm2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dnrm2.atlas : dnrm2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +############################################## SCNRM2 ############################################## +scnrm2.goto : scnrm2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +scnrm2.atlas : scnrm2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +############################################## DZNRM2 ############################################## +dznrm2.goto : dznrm2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dznrm2.atlas : dznrm2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + + ################################################################################################### slinpack.$(SUFFIX) : linpack.c @@ -2243,6 +2304,33 @@ cgemm3m.$(SUFFIX) : gemm3m.c zgemm3m.$(SUFFIX) : gemm3m.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +isamax.$(SUFFIX) : iamax.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +idamax.$(SUFFIX) : iamax.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +icamax.$(SUFFIX) : iamax.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ +
+izamax.$(SUFFIX) : iamax.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + +snrm2.$(SUFFIX) : nrm2.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dnrm2.$(SUFFIX) : nrm2.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +scnrm2.$(SUFFIX) : nrm2.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +dznrm2.$(SUFFIX) : nrm2.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + smallscaling: smallscaling.c ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(EXTRALIB) -fopenmp -lm -lpthread diff --git a/benchmark/iamax.c b/benchmark/iamax.c new file mode 100644 index 000000000..c55f41579 --- /dev/null +++ b/benchmark/iamax.c @@ -0,0 +1,190 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include <stdio.h> +#include <stdlib.h> +#ifdef __CYGWIN32__ +#include <sys/time.h> +#endif +#include "common.h" + + +#undef IAMAX + +#ifdef COMPLEX +#ifdef DOUBLE +#define IAMAX BLASFUNC(izamax) +#else +#define IAMAX BLASFUNC(icamax) +#endif +#else +#ifdef DOUBLE +#define IAMAX BLASFUNC(idamax) +#else +#define IAMAX BLASFUNC(isamax) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address =
shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x; + blasint m, i; + blasint inc_x=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Time\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + + fprintf(stderr, " %6d : ", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + + +#undef NRM2 + +#ifdef COMPLEX +#ifdef DOUBLE +#define NRM2 BLASFUNC(dznrm2) +#else +#define NRM2 BLASFUNC(scnrm2) +#endif +#else +#ifdef DOUBLE +#define NRM2 BLASFUNC(dnrm2) +#else +#define NRM2 BLASFUNC(snrm2) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= 
DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x; + blasint m, i; + blasint inc_x=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Time\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + + fprintf(stderr, " %6d : ", (int)m); + + + for (l=0; l Date: Thu, 14 Jul 2016 13:48:13 +0530 Subject: [PATCH 52/70] Add time prints in benchmark output --- benchmark/asum.c | 4 ++-- benchmark/axpy.c | 4 ++-- benchmark/copy.c | 4 ++-- benchmark/dot.c | 4 ++-- benchmark/gemv.c | 4 ++-- 
benchmark/rot.c | 4 ++-- benchmark/scal.c | 4 ++-- benchmark/swap.c | 4 ++-- benchmark/trmm.c | 4 ++-- benchmark/zdot.c | 4 ++-- 10 files changed, 20 insertions(+), 20 deletions(-) diff --git a/benchmark/asum.c b/benchmark/asum.c index beb6402f4..78ccdf47b 100644 --- a/benchmark/asum.c +++ b/benchmark/asum.c @@ -183,9 +183,9 @@ int main(int argc, char *argv[]){ timeg /= loops; #ifdef COMPLEX - fprintf(stderr, " %10.2f MFlops\n", 4. * (double)m / timeg * 1.e-6); + fprintf(stderr, " %10.2f MFlops %10.6f sec\n", 4. * (double)m / timeg * 1.e-6, timeg); #else - fprintf(stderr, " %10.2f MFlops\n", 2. * (double)m / timeg * 1.e-6); + fprintf(stderr, " %10.2f MFlops %10.6f sec\n", 2. * (double)m / timeg * 1.e-6, timeg); #endif } diff --git a/benchmark/axpy.c b/benchmark/axpy.c index a7206b690..37c7aeb63 100644 --- a/benchmark/axpy.c +++ b/benchmark/axpy.c @@ -190,8 +190,8 @@ int main(int argc, char *argv[]){ timeg /= loops; fprintf(stderr, - " %10.2f MFlops\n", - COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6); + " %10.2f MFlops %10.6f sec\n", + COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6, timeg); } diff --git a/benchmark/copy.c b/benchmark/copy.c index 15c45201c..ea5b38d68 100644 --- a/benchmark/copy.c +++ b/benchmark/copy.c @@ -190,8 +190,8 @@ int main(int argc, char *argv[]){ timeg /= loops; fprintf(stderr, - " %10.2f MBytes\n", - COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6); + " %10.2f MBytes %10.6f sec\n", + COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); } diff --git a/benchmark/dot.c b/benchmark/dot.c index 4c8d6cc38..50d05e532 100644 --- a/benchmark/dot.c +++ b/benchmark/dot.c @@ -184,8 +184,8 @@ int main(int argc, char *argv[]){ timeg /= loops; fprintf(stderr, - " %10.2f MFlops\n", - COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6); + " %10.2f MFlops %10.6f sec\n", + COMPSIZE * COMPSIZE * 2. 
* (double)m / timeg * 1.e-6, timeg); } diff --git a/benchmark/gemv.c b/benchmark/gemv.c index 42af2825a..c06e829d9 100644 --- a/benchmark/gemv.c +++ b/benchmark/gemv.c @@ -221,7 +221,7 @@ int main(int argc, char *argv[]){ timeg /= loops; - fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6); + fprintf(stderr, " %10.2f MFlops %10.6f sec\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6, timeg); } } @@ -258,7 +258,7 @@ int main(int argc, char *argv[]){ timeg /= loops; - fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6); + fprintf(stderr, " %10.2f MFlops %10.6f sec\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6, timeg); } } diff --git a/benchmark/rot.c b/benchmark/rot.c index 32322bebb..3ff783cc6 100644 --- a/benchmark/rot.c +++ b/benchmark/rot.c @@ -186,8 +186,8 @@ int main(int argc, char *argv[]){ timeg /= loops; fprintf(stderr, - " %10.2f MFlops\n", - COMPSIZE * COMPSIZE * 6. * (double)m / timeg * 1.e-6); + " %10.2f MFlops %10.6f sec\n", + COMPSIZE * COMPSIZE * 6. * (double)m / timeg * 1.e-6, timeg); } diff --git a/benchmark/scal.c b/benchmark/scal.c index 4c2da4d30..453c3234d 100644 --- a/benchmark/scal.c +++ b/benchmark/scal.c @@ -189,9 +189,9 @@ int main(int argc, char *argv[]){ timeg /= loops; #ifdef COMPLEX - fprintf(stderr, " %10.2f MFlops\n", 6. * (double)m / timeg * 1.e-6); + fprintf(stderr, " %10.2f MFlops %10.6f sec\n", 6. * (double)m / timeg * 1.e-6, timeg); #else - fprintf(stderr, " %10.2f MFlops\n", 1. * (double)m / timeg * 1.e-6); + fprintf(stderr, " %10.2f MFlops %10.6f sec\n", 1. * (double)m / timeg * 1.e-6, timeg); #endif } diff --git a/benchmark/swap.c b/benchmark/swap.c index 9f108ef50..368c59cd4 100644 --- a/benchmark/swap.c +++ b/benchmark/swap.c @@ -190,8 +190,8 @@ int main(int argc, char *argv[]){ timeg /= loops; fprintf(stderr, - " %10.2f MBytes\n", - COMPSIZE * sizeof(FLOAT) * 1. 
* (double)m / timeg * 1.e-6); + " %10.2f MBytes %10.6f sec\n", + COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); } diff --git a/benchmark/trmm.c b/benchmark/trmm.c index f81e9d912..54c7972db 100644 --- a/benchmark/trmm.c +++ b/benchmark/trmm.c @@ -191,8 +191,8 @@ int main(int argc, char *argv[]){ gettimeofday( &start, (struct timezone *)0); fprintf(stderr, - " %10.2f MFlops\n", - COMPSIZE * COMPSIZE * 1. * (double)m * (double)m * (double)m / time1 * 1.e-6); + " %10.2f MFlops %10.6f sec\n", + COMPSIZE * COMPSIZE * 1. * (double)m * (double)m * (double)m / time1 * 1.e-6, time1); } diff --git a/benchmark/zdot.c b/benchmark/zdot.c index d5ec99726..ed9d4d2e8 100644 --- a/benchmark/zdot.c +++ b/benchmark/zdot.c @@ -184,8 +184,8 @@ int main(int argc, char *argv[]){ timeg /= loops; fprintf(stderr, - " %10.2f MFlops\n", - COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6); + " %10.2f MFlops %10.6f sec\n", + COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6, timeg); } From 78782485b6f859d72be854ba6c2a0ec52d137adb Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Thu, 14 Jul 2016 13:49:15 +0530 Subject: [PATCH 53/70] Improvements to COPY and IAMAX kernels --- kernel/arm64/copy.S | 46 ++++----- kernel/arm64/iamax.S | 184 +++++++++++++++++++++++++++++++++++ kernel/arm64/izamax.S | 217 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 424 insertions(+), 23 deletions(-) diff --git a/kernel/arm64/copy.S b/kernel/arm64/copy.S index 17aa5a1e8..70eab96fb 100644 --- a/kernel/arm64/copy.S +++ b/kernel/arm64/copy.S @@ -58,43 +58,43 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
str TMPF, [Y], #SZ #else #if !defined(DOUBLE) - ld1 {v0.2s}, [X], #8 - st1 {v0.2s}, [Y], #8 + ldr d0, [X], #8 + str d0, [Y], #8 #else - ld1 {v0.2d}, [X], #16 - st1 {v0.2d}, [Y], #16 + ldr q0, [X], #16 + str q0, [Y], #16 #endif #endif .endm .macro KERNEL_F4 - #if !defined(COMPLEX) #if !defined(DOUBLE) - ld1 {v0.4s}, [X], #16 - st1 {v0.4s}, [Y], #16 + ldr q0, [X], #16 + str q0, [Y], #16 #else // DOUBLE - ld1 {v0.4s}, [X], #16 - ld1 {v1.4s}, [X], #16 - st1 {v0.4s}, [Y], #16 - st1 {v1.4s}, [Y], #16 + ldr q0, [X], #16 + str q0, [Y], #16 + ldr q1, [X], #16 + str q1, [Y], #16 + #endif #else // COMPLEX #if !defined(DOUBLE) - ld1 {v0.4s}, [X], #16 - ld1 {v1.4s}, [X], #16 - st1 {v0.4s}, [Y], #16 - st1 {v1.4s}, [Y], #16 + ldr q0, [X], #16 + str q0, [Y], #16 + ldr q1, [X], #16 + str q1, [Y], #16 #else // DOUBLE - ld1 {v0.4s}, [X], #16 - ld1 {v1.4s}, [X], #16 - ld1 {v2.4s}, [X], #16 - ld1 {v3.4s}, [X], #16 - st1 {v0.4s}, [Y], #16 - st1 {v1.4s}, [Y], #16 - st1 {v2.4s}, [Y], #16 - st1 {v3.4s}, [Y], #16 + ldr q0, [X], #16 + str q0, [Y], #16 + ldr q1, [X], #16 + str q1, [Y], #16 + ldr q2, [X], #16 + str q2, [Y], #16 + ldr q3, [X], #16 + str q3, [Y], #16 #endif #endif diff --git a/kernel/arm64/iamax.S b/kernel/arm64/iamax.S index 575c15e53..6c0d84f98 100644 --- a/kernel/arm64/iamax.S +++ b/kernel/arm64/iamax.S @@ -72,6 +72,148 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
fabs MAXF, MAXF .endm +.macro KERNEL_F8 +#if !defined(DOUBLE) + ldp q2, q3, [X], #32 + fabs v2.4s, v2.4s + fabs v3.4s, v3.4s + fmax v2.4s, v2.4s, v3.4s + fmaxv TMPF, v2.4s + fcmp MAXF, TMPF + fcsel MAXF, MAXF, TMPF, COND + csel INDEX, INDEX, Z, COND + add Z, Z, #8 +#else + ldp q2, q3, [X], #32 + ldp q4, q5, [X], #32 + fabs v2.2d, v2.2d + fabs v3.2d, v3.2d + fabs v4.2d, v4.2d + fabs v5.2d, v5.2d + + fmax v2.2d, v2.2d, v3.2d + fmax v4.2d, v4.2d, v5.2d + fmax v2.2d, v2.2d, v4.2d + fmaxp TMPF, v2.2d + + fcmp MAXF, TMPF + fcsel MAXF, MAXF, TMPF, COND + csel INDEX, INDEX, Z, COND + add Z, Z, #8 +#endif + PRFM PLDL1KEEP, [X, #1024] +.endm + +.macro KERNEL_F8_FINALIZE + sub x6, INDEX, #1 +#if !defined(DOUBLE) + lsl x6, x6, #2 + add x7, x7, x6 + ldp q2, q3, [x7] + fabs v2.4s, v2.4s + fabs v3.4s, v3.4s + + ins v4.s[0], v3.s[0] + ins v5.s[0], v3.s[1] + ins v6.s[0], v3.s[2] + ins v7.s[0], v3.s[3] + + add x6, INDEX, #7 + fcmp MAXF, s7 + csel INDEX, x6, INDEX, eq + + sub x6, x6, #1 + fcmp MAXF, s6 + csel INDEX, x6, INDEX, eq + + sub x6, x6, #1 + fcmp MAXF, s5 + csel INDEX, x6, INDEX, eq + + sub x6, x6, #1 + fcmp MAXF, s4 + csel INDEX, x6, INDEX, eq + + ins v4.s[0], v2.s[0] + ins v5.s[0], v2.s[1] + ins v6.s[0], v2.s[2] + ins v7.s[0], v2.s[3] + + sub x6, x6, #1 + fcmp MAXF, s7 + csel INDEX, x6, INDEX, eq + + sub x6, x6, #1 + fcmp MAXF, s6 + csel INDEX, x6, INDEX, eq + + sub x6, x6, #1 + fcmp MAXF, s5 + csel INDEX, x6, INDEX, eq + + sub x6, x6, #1 + fcmp MAXF, s4 + csel INDEX, x6, INDEX, eq +#else + add x6, x6, #4 + lsl x6, x6, #3 + add x7, x7, x6 + ldp q2, q3, [x7] + + fabs v2.2d, v2.2d + fabs v3.2d, v3.2d + + ins v4.d[0], v2.d[0] + ins v5.d[0], v2.d[1] + ins v6.d[0], v3.d[0] + ins v7.d[0], v3.d[1] + + add x6, INDEX, #7 + fcmp MAXF, d7 + csel INDEX, x6, INDEX, eq + + sub x6, x6, #1 + fcmp MAXF, d6 + csel INDEX, x6, INDEX, eq + + sub x6, x6, #1 + fcmp MAXF, d5 + csel INDEX, x6, INDEX, eq + + sub x6, x6, #1 + fcmp MAXF, d4 + csel INDEX, x6, INDEX, eq + + sub x7, x7, #32 + ldp q2, 
q3, [x7] + + fabs v2.2d, v2.2d + fabs v3.2d, v3.2d + + ins v4.d[0], v2.d[0] + ins v5.d[0], v2.d[1] + ins v6.d[0], v3.d[0] + ins v7.d[0], v3.d[1] + + sub x6, x6, #1 + fcmp MAXF, d7 + csel INDEX, x6, INDEX, eq + + sub x6, x6, #1 + fcmp MAXF, d6 + csel INDEX, x6, INDEX, eq + + sub x6, x6, #1 + fcmp MAXF, d5 + csel INDEX, x6, INDEX, eq + + sub x6, x6, #1 + fcmp MAXF, d4 + csel INDEX, x6, INDEX, eq +#endif +.endm + + .macro KERNEL_S1 ld1 TMPVF, [X], INC_X add Z, Z, #1 @@ -92,6 +234,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmp INC_X, xzr ble iamax_kernel_zero + cmp INC_X, #1 + bne iamax_kernel_S_BEGIN + mov x7, X + +iamax_kernel_F_BEGIN: + + INIT_S + + subs N, N, #1 + ble iamax_kernel_L999 + + asr I, N, #3 + cmp I, xzr + beq iamax_kernel_F1 + + add Z, Z, #1 +iamax_kernel_F8: + + KERNEL_F8 + + subs I, I, #1 + bne iamax_kernel_F8 + + KERNEL_F8_FINALIZE + + sub Z, Z, #1 +iamax_kernel_F1: + + ands I, N, #7 + ble iamax_kernel_L999 + +iamax_kernel_F10: + + KERNEL_S1 + + subs I, I, #1 + bne iamax_kernel_F10 + + b iamax_kernel_L999 + +iamax_kernel_S_BEGIN: + INIT_S subs N, N, #1 diff --git a/kernel/arm64/izamax.S b/kernel/arm64/izamax.S index ebdc671e0..9b252ec98 100644 --- a/kernel/arm64/izamax.S +++ b/kernel/arm64/izamax.S @@ -78,6 +78,179 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif .endm +.macro KERNEL_F8 +#if !defined(DOUBLE) + ldp q2, q3, [X], #32 + ldp q4, q5, [X], #32 + + fabs v2.4s, v2.4s + fabs v3.4s, v3.4s + fabs v4.4s, v4.4s + fabs v5.4s, v5.4s + + faddp v2.4s, v2.4s, v3.4s + faddp v3.4s, v4.4s, v5.4s + + fmax v2.4s, v2.4s, v3.4s + fmaxv TMPF, v2.4s + fcmp MAXF, TMPF + fcsel MAXF, MAXF, TMPF, COND + csel INDEX, INDEX, Z, COND + add Z, Z, #8 +#else + ldp q2, q3, [X], #32 + ldp q4, q5, [X], #32 + ldp q16, q17, [X], #32 + ldp q18, q19, [X], #32 + + fabs v2.2d, v2.2d + fabs v3.2d, v3.2d + fabs v4.2d, v4.2d + fabs v5.2d, v5.2d + fabs v16.2d, v16.2d + fabs v17.2d, v17.2d + fabs v18.2d, v18.2d + fabs v19.2d, v19.2d + + faddp v2.2d, v2.2d, v3.2d + faddp v3.2d, v4.2d, v5.2d + faddp v4.2d, v16.2d, v17.2d + faddp v5.2d, v18.2d, v19.2d + + fmax v2.2d, v2.2d, v3.2d + fmax v4.2d, v4.2d, v5.2d + fmax v2.2d, v2.2d, v4.2d + fmaxp TMPF, v2.2d + + fcmp MAXF, TMPF + fcsel MAXF, MAXF, TMPF, COND + csel INDEX, INDEX, Z, COND + add Z, Z, #8 +#endif + PRFM PLDL1KEEP, [X, #1024] +.endm + +.macro KERNEL_F8_FINALIZE + sub x6, INDEX, #1 +#if !defined(DOUBLE) + lsl x6, x6, #3 + add x7, x7, x6 + + ldp q2, q3, [x7] + ldp q4, q5, [x7, #32] + + fabs v2.4s, v2.4s + fabs v3.4s, v3.4s + fabs v4.4s, v4.4s + fabs v5.4s, v5.4s + + faddp v2.4s, v2.4s, v3.4s + faddp v3.4s, v4.4s, v5.4s + + ins v4.s[0], v3.s[3] + add x6, INDEX, #7 + fcmp MAXF, s4 + csel INDEX, x6, INDEX, eq + + ins v4.s[0], v3.s[2] + sub x6, x6, #1 + fcmp MAXF, s4 + csel INDEX, x6, INDEX, eq + + ins v4.s[0], v3.s[1] + sub x6, x6, #1 + fcmp MAXF, s4 + csel INDEX, x6, INDEX, eq + + ins v4.s[0], v3.s[0] + sub x6, x6, #1 + fcmp MAXF, s4 + csel INDEX, x6, INDEX, eq + + ins v4.s[0], v2.s[3] + sub x6, x6, #1 + fcmp MAXF, s4 + csel INDEX, x6, INDEX, eq + + ins v4.s[0], v2.s[2] + sub x6, x6, #1 + fcmp MAXF, s4 + csel INDEX, x6, INDEX, eq + + ins v4.s[0], v2.s[1] + sub x6, x6, #1 + fcmp MAXF, s4 + csel INDEX, x6, INDEX, eq + + ins v4.s[0], v2.s[0] + sub x6, x6, #1 + fcmp MAXF, s4 + csel INDEX, x6, INDEX, eq 
+#else + lsl x6, x6, #4 + add x7, x7, x6 + + ldp q2, q3, [x7] + ldp q4, q5, [x7, #32] + ldp q16, q17, [x7, #64] + ldp q18, q19, [x7, #96] + + fabs v2.2d, v2.2d + fabs v3.2d, v3.2d + fabs v4.2d, v4.2d + fabs v5.2d, v5.2d + fabs v16.2d, v16.2d + fabs v17.2d, v17.2d + fabs v18.2d, v18.2d + fabs v19.2d, v19.2d + + faddp v2.2d, v2.2d, v3.2d + faddp v3.2d, v4.2d, v5.2d + faddp v4.2d, v16.2d, v17.2d + faddp v5.2d, v18.2d, v19.2d + + ins v7.d[0], v5.d[1] + add x6, INDEX, #7 + fcmp MAXF, d7 + csel INDEX, x6, INDEX, eq + + ins v7.d[0], v5.d[0] + sub x6, x6, #1 + fcmp MAXF, d7 + csel INDEX, x6, INDEX, eq + + ins v7.d[0], v4.d[1] + sub x6, x6, #1 + fcmp MAXF, d7 + csel INDEX, x6, INDEX, eq + + ins v7.d[0], v4.d[0] + sub x6, x6, #1 + fcmp MAXF, d7 + csel INDEX, x6, INDEX, eq + + ins v7.d[0], v3.d[1] + sub x6, x6, #1 + fcmp MAXF, d7 + csel INDEX, x6, INDEX, eq + + ins v7.d[0], v3.d[0] + sub x6, x6, #1 + fcmp MAXF, d7 + csel INDEX, x6, INDEX, eq + + ins v7.d[0], v2.d[1] + sub x6, x6, #1 + fcmp MAXF, d7 + csel INDEX, x6, INDEX, eq + + ins v7.d[0], v2.d[0] + sub x6, x6, #1 + fcmp MAXF, d7 + csel INDEX, x6, INDEX, eq +#endif +.endm + .macro KERNEL_S1 #if !defined(DOUBLE) ld1 {v1.2s}, [X], INC_X @@ -107,6 +280,50 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
cmp INC_X, xzr ble iamax_kernel_zero + cmp INC_X, #1 + bne iamax_kernel_S_BEGIN + mov x7, X + + +iamax_kernel_F_BEGIN: + + INIT_S + + subs N, N, #1 + ble iamax_kernel_L999 + + asr I, N, #3 + cmp I, xzr + ble iamax_kernel_F1 + + add Z, Z, #1 + +iamax_kernel_F8: + + KERNEL_F8 + + subs I, I, #1 + bne iamax_kernel_F8 + + KERNEL_F8_FINALIZE + + sub Z, Z, #1 +iamax_kernel_F1: + + ands I, N, #7 + ble iamax_kernel_L999 + +iamax_kernel_F10: + + KERNEL_S1 + + subs I, I, #1 + bne iamax_kernel_F10 + + b iamax_kernel_L999 + +iamax_kernel_S_BEGIN: + INIT_S subs N, N, #1 From 8a40f1355e9711ce3d661c214f1644075c1e497b Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Thu, 14 Jul 2016 13:50:38 +0530 Subject: [PATCH 54/70] Improvements to GEMV kernels --- kernel/arm64/gemv_n.S | 9 ++ kernel/arm64/gemv_t.S | 17 ++- kernel/arm64/zgemv_n.S | 249 ++++++++++++++++------------------------- kernel/arm64/zgemv_t.S | 9 +- 4 files changed, 128 insertions(+), 156 deletions(-) diff --git a/kernel/arm64/gemv_n.S b/kernel/arm64/gemv_n.S index 6279c2250..162f721c3 100644 --- a/kernel/arm64/gemv_n.S +++ b/kernel/arm64/gemv_n.S @@ -68,6 +68,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SHZ 3 #endif +#define A_PRE_SIZE 768 +#define Y_PRE_SIZE 768 + /******************************************************************************/ .macro SAVE_REGS @@ -105,36 +108,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v2.4s, v3.4s}, [A_PTR], #32 ld1 {v4.4s, v5.4s}, [Y_IPTR], #32 fmla v4.4s, v1.4s, v2.4s + prfm PLDL1KEEP, [A_PTR, #A_PRE_SIZE] fmla v5.4s, v1.4s, v3.4s st1 {v4.4s, v5.4s}, [Y_OPTR], #32 ld1 {v6.4s, v7.4s}, [A_PTR], #32 ld1 {v8.4s, v9.4s}, [Y_IPTR], #32 fmla v8.4s, v1.4s, v6.4s + prfm PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE] fmla v9.4s, v1.4s, v7.4s st1 {v8.4s, v9.4s}, [Y_OPTR], #32 #else //DOUBLE ld1 {v2.2d, v3.2d}, [A_PTR], #32 ld1 {v4.2d, v5.2d}, [Y_IPTR], #32 fmla v4.2d, v1.2d, v2.2d + prfm PLDL1KEEP, [A_PTR, #A_PRE_SIZE] fmla v5.2d, v1.2d, v3.2d st1 {v4.2d, v5.2d}, [Y_OPTR], #32 ld1 {v6.2d, v7.2d}, [A_PTR], #32 ld1 {v8.2d, v9.2d}, [Y_IPTR], #32 fmla v8.2d, v1.2d, v6.2d + prfm PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE] fmla v9.2d, v1.2d, v7.2d st1 {v8.2d, v9.2d}, [Y_OPTR], #32 ld1 {v10.2d, v11.2d}, [A_PTR], #32 ld1 {v12.2d, v13.2d}, [Y_IPTR], #32 fmla v12.2d, v1.2d, v10.2d + prfm PLDL1KEEP, [A_PTR, #A_PRE_SIZE] fmla v13.2d, v1.2d, v11.2d st1 {v12.2d, v13.2d}, [Y_OPTR], #32 ld1 {v14.2d, v15.2d}, [A_PTR], #32 ld1 {v16.2d, v17.2d}, [Y_IPTR], #32 fmla v16.2d, v1.2d, v14.2d + prfm PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE] fmla v17.2d, v1.2d, v15.2d st1 {v16.2d, v17.2d}, [Y_OPTR], #32 #endif diff --git a/kernel/arm64/gemv_t.S b/kernel/arm64/gemv_t.S index 0145af621..28325f784 100644 --- a/kernel/arm64/gemv_t.S +++ b/kernel/arm64/gemv_t.S @@ -41,6 +41,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define J x11 /* loop variable */ #define I x12 /* loop variable */ +#define X_PREFETCH_SIZE 768 +#define A_PREFETCH_SIZE 768 + /******************************************************************************* * Macro definitions *******************************************************************************/ @@ -112,42 +115,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v5.4s, v6.4s, v7.4s, v8.4s}, [A_PTR], #64 ld1 {v9.4s, v10.4s, v11.4s, v12.4s}, [X_PTR], #64 fmla v1.4s, v5.4s, v9.4s + prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE] fmla v2.4s, v6.4s, v10.4s + prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE] fmla v3.4s, v7.4s, v11.4s + ld1 {v13.4s, v14.4s, v15.4s, v16.4s}, [A_PTR], #64 fmla v4.4s, v8.4s, v12.4s - ld1 {v13.4s, v14.4s, v15.4s, v16.4s}, [A_PTR], #64 ld1 {v17.4s, v18.4s, v19.4s, v20.4s}, [X_PTR], #64 fmla v1.4s, v13.4s, v17.4s + prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE] fmla v2.4s, v14.4s, v18.4s + prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE] fmla v3.4s, v15.4s, v19.4s fmla v4.4s, v16.4s, v20.4s #else ld1 {v5.2d, v6.2d, v7.2d, v8.2d}, [A_PTR], #64 ld1 {v9.2d, v10.2d, v11.2d, v12.2d}, [X_PTR], #64 fmla v1.2d, v5.2d, v9.2d + prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE] fmla v2.2d, v6.2d, v10.2d + prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE] fmla v3.2d, v7.2d, v11.2d fmla v4.2d, v8.2d, v12.2d ld1 {v13.2d, v14.2d, v15.2d, v16.2d}, [A_PTR], #64 ld1 {v17.2d, v18.2d, v19.2d, v20.2d}, [X_PTR], #64 fmla v1.2d, v13.2d, v17.2d + prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE] fmla v2.2d, v14.2d, v18.2d + prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE] fmla v3.2d, v15.2d, v19.2d fmla v4.2d, v16.2d, v20.2d ld1 {v5.2d, v6.2d, v7.2d, v8.2d}, [A_PTR], #64 ld1 {v9.2d, v10.2d, v11.2d, v12.2d}, [X_PTR], #64 fmla v1.2d, v5.2d, v9.2d + prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE] fmla v2.2d, v6.2d, v10.2d + prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE] fmla v3.2d, v7.2d, v11.2d fmla v4.2d, v8.2d, v12.2d ld1 {v13.2d, v14.2d, v15.2d, v16.2d}, [A_PTR], #64 ld1 {v17.2d, v18.2d, v19.2d, v20.2d}, [X_PTR], #64 fmla v1.2d, v13.2d, v17.2d + prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE] fmla v2.2d, v14.2d, v18.2d + prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE] fmla v3.2d, v15.2d, v19.2d fmla v4.2d, v16.2d, v20.2d #endif diff --git a/kernel/arm64/zgemv_n.S b/kernel/arm64/zgemv_n.S index 9e285e299..a28d1b0ce 100644 --- a/kernel/arm64/zgemv_n.S +++ b/kernel/arm64/zgemv_n.S @@ -43,6 +43,9 @@ 
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define Y_OPTR x13 /* loop Y vector address */ #define X_PTR x14 /* loop X vector address */ +#define A_PRE_SIZE 768 +#define Y_PRE_SIZE 768 + /******************************************************************************* * Macro definitions *******************************************************************************/ @@ -50,14 +53,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define ALPHA_R s0 #define ALPHA_I s1 -#define ALPHA_R_COPY s7 -#define ALPHA_I_COPY s8 #define SHZ 3 #else #define ALPHA_R d0 #define ALPHA_I d1 -#define ALPHA_R_COPY d7 -#define ALPHA_I_COPY d8 #define SHZ 4 #endif @@ -95,20 +94,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT - /********** INIT FOR F4 LOOP **********/ - fmov ALPHA_R_COPY, ALPHA_R - fmov ALPHA_I_COPY, ALPHA_I -#if !defined(DOUBLE) - ins v7.s[1], v7.s[0] // R(ALPHA), R(ALPHA) - ins v8.s[1], v8.s[0] // I(ALPHA), I(ALPHA) - ins v7.d[1], v7.d[0] - ins v8.d[1], v8.d[0] -#else - ins v7.d[1], v7.d[0] // R(ALPHA), R(ALPHA) - ins v8.d[1], v8.d[0] // I(ALPHA), I(ALPHA) -#endif - - /******* INIT FOR F1 AND S1 LOOP ******/ #if !defined(DOUBLE) ins v0.s[1], v0.s[0] // R(ALPHA), R(ALPHA) eor v2.16b, v2.16b, v2.16b @@ -129,47 +114,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro INIT_LOOP - /********** INIT_LOOP FOR F4 LOOP **********/ #if !defined(DOUBLE) - ld1 {v9.2s}, [X_PTR] // [I(X), R(X)] - ins v10.s[0], v9.s[1] - ins v9.s[1], v9.s[0] // [R(X), R(X)] - ins v10.s[1], v10.s[0] // [I(X), I(X)] - ins v9.d[1], v9.d[0] - ins v10.d[1], v10.d[0] + ld1 {v2.2s}, [X_PTR] // [I(X), R(X)] + ext v3.8b, v2.8b, v2.8b, #4 // [R(X), I(X)] + fmul v2.2s, v0.2s, v2.2s + fmla v2.2s, v1.2s, v3.2s // [I(TEMP), R(TEMP)] + ins v3.s[0], v2.s[1] + + /********** INIT_LOOP FOR F4 LOOP **********/ #if !defined(CONJ) #if !defined(XCONJ) - fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)] - fmls v11.4s, v10.4s, v8.4s // [- I(X) * I(ALPHA)] - fmul v12.4s, v9.4s, v8.4s // [+ R(X) * I(ALPHA)] - fmla v12.4s, v10.4s, v7.4s // [+ I(X) * R(ALPHA)] + dup v21.4s, v2.s[0] // R[TEMP] + dup v22.4s, v2.s[0] // R[TEMP] + eor v25.16b, v25.16b, v25.16b + fsub s25, s25, s3 + dup v23.4s, v25.s[0] // -I[TEMP] + dup v24.4s, v3.s[0] // I[TEMP] #else - fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)] - fmla v11.4s, v10.4s, v8.4s // [+ I(X) * I(ALPHA)] - fmul v12.4s, v9.4s, v8.4s // [+ R(X) * I(ALPHA)] - fmls v12.4s, v10.4s, v7.4s // [- I(X) * R(ALPHA)] + dup v21.4s, v2.s[0] // R[TEMP] + dup v22.4s, v2.s[0] // R[TEMP] + dup v23.4s, v3.s[0] // I[TEMP] + eor v25.16b, v25.16b, v25.16b + fsub s25, s25, s3 + dup v24.4s, v25.s[0] // -I[TEMP] #endif #else // CONJ #if !defined(XCONJ) - fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)] - fmls v11.4s, v10.4s, v8.4s // [+ I(X) * I(ALPHA)] - fmul v12.4s, v10.4s, v7.4s // [+ I(X) * R(ALPHA)] - fmls v12.4s, v9.4s, v8.4s // [- R(X) * I(ALPHA)] + dup v21.4s, v2.s[0] // R[TEMP] + eor v25.16b, v25.16b, v25.16b + fsub s25, s25, s2 + dup v22.4s, v25.s[0] // -R[TEMP] + dup v23.4s, v3.s[0] // I[TEMP] + dup v24.4s, v3.s[0] // I[TEMP] #else - fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)] - fmls v11.4s, v10.4s, v8.4s // [- I(X) * I(ALPHA)] - eor v12.16b, v12.16b, v12.16b - fmls v12.4s, v9.4s, v8.4s // [- R(X) * I(ALPHA)] - fmla v12.4s, v10.4s, v7.4s
// [- I(X) * R(ALPHA)] + dup v21.4s, v2.s[0] // R[TEMP] + eor v25.16b, v25.16b, v25.16b + fsub s25, s25, s2 + dup v22.4s, v25.s[0] // -R[TEMP] + + eor v25.16b, v25.16b, v25.16b + fsub s25, s25, s3 + dup v23.4s, v25.s[0] // -I[TEMP] + dup v24.4s, v25.s[0] // -I[TEMP] #endif #endif // CONJ + /****** INIT_LOOP FOR F1 AND S1 LOOP ******/ - ld1 {v2.2s}, [X_PTR] // [I(X), R(X)] - ext v3.8b, v2.8b, v2.8b, #4 // [R(X), I(X)] - fmul v2.2s, v0.2s, v2.2s - fmla v2.2s, v1.2s, v3.2s // [I(TEMP), R(TEMP)] - ins v3.s[0], v2.s[1] #if !defined(CONJ) #if !defined(XCONJ) eor v4.16b, v4.16b, v4.16b @@ -200,45 +191,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif // CONJ #else // DOUBLE + ld1 {v2.2d}, [X_PTR] // [I(X), R(X)] + ext v3.16b, v2.16b, v2.16b, #8 // [R(X), I(X)] + fmul v2.2d, v0.2d, v2.2d + fmla v2.2d, v1.2d, v3.2d // [I(TEMP), R(TEMP)] + ins v3.d[0], v2.d[1] // I(TEMP) - /********** INIT_LOOP FOR F4 LOOP **********/ - ld1 {v9.2d}, [X_PTR] // [I(X), R(X)] - ins v10.d[0], v9.d[1] - ins v9.d[1], v9.d[0] // [R(X), R(X)] - ins v10.d[1], v10.d[0] // [I(X), I(X)] + /****** INIT_LOOP FOR F4 LOOP ******/ #if !defined(CONJ) #if !defined(XCONJ) - fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)] - fmls v11.2d, v10.2d, v8.2d // [- I(X) * I(ALPHA)] - fmul v12.2d, v9.2d, v8.2d // [+ R(X) * I(ALPHA)] - fmla v12.2d, v10.2d, v7.2d // [+ I(X) * R(ALPHA)] + dup v21.2d, v2.d[0] // R[TEMP] + dup v22.2d, v2.d[0] // R[TEMP] + eor v25.16b, v25.16b, v25.16b + fsub d25, d25, d3 + dup v23.2d, v25.d[0] // -I[TEMP] + dup v24.2d, v3.d[0] // I[TEMP] #else - fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)] - fmla v11.2d, v10.2d, v8.2d // [+ I(X) * I(ALPHA)] - fmul v12.2d, v9.2d, v8.2d // [+ R(X) * I(ALPHA)] - fmls v12.2d, v10.2d, v7.2d // [- I(X) * R(ALPHA)] + dup v21.2d, v2.d[0] // R[TEMP] + dup v22.2d, v2.d[0] // R[TEMP] + dup v23.2d, v3.d[0] // I[TEMP] + eor v25.16b, v25.16b, v25.16b + fsub d25, d25, d3 + dup v24.2d, v25.d[0] // -I[TEMP] #endif #else // CONJ #if
!defined(XCONJ) - fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)] - fmls v11.2d, v10.2d, v8.2d // [+ I(X) * I(ALPHA)] - fmul v12.2d, v10.2d, v7.2d // [+ I(X) * R(ALPHA)] - fmls v12.2d, v9.2d, v8.2d // [- R(X) * I(ALPHA)] + dup v21.2d, v2.d[0] // R[TEMP] + eor v25.16b, v25.16b, v25.16b + fsub d25, d25, d2 + dup v22.2d, v25.d[0] // -R[TEMP] + dup v23.2d, v3.d[0] // I[TEMP] + dup v24.2d, v3.d[0] // I[TEMP] #else - fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)] - fmls v11.2d, v10.2d, v8.2d // [- I(X) * I(ALPHA)] - eor v12.16b, v12.16b, v12.16b - fmls v12.2d, v9.2d, v8.2d // [- R(X) * I(ALPHA)] - fmla v12.2d, v10.2d, v7.2d // [- I(X) * R(ALPHA)] + dup v21.2d, v2.d[0] // R[TEMP] + eor v25.16b, v25.16b, v25.16b + fsub d25, d25, d2 + dup v22.2d, v25.d[0] // -R[TEMP] + + eor v25.16b, v25.16b, v25.16b + fsub d25, d25, d3 + dup v23.2d, v25.d[0] // -I[TEMP] + dup v24.2d, v25.d[0] // -I[TEMP] #endif #endif // CONJ + /****** INIT_LOOP FOR F1 AND S1 LOOP ******/ - ld1 {v2.2d}, [X_PTR] // [I(X), R(X)] - ext v3.16b, v2.16b, v2.16b, #8 // [R(X), I(X)] - fmul v2.2d, v0.2d, v2.2d - fmla v2.2d, v1.2d, v3.2d // [I(TEMP), R(TEMP)] - ins v3.d[0], v2.d[1] // I(TEMP) #if !defined(CONJ) #if !defined(XCONJ) eor v4.16b, v4.16b, v4.16b @@ -276,91 +274,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v13.4s, v14.4s}, [A_PTR], #32 ld2 {v15.4s, v16.4s}, [Y_IPTR], #32 -#if !defined(CONJ) -#if !defined(XCONJ) - fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R] - fmls v15.4s, v12.4s, v14.4s // [- I(ALPHA * X) * A_I] - fmla v16.4s, v11.4s, v14.4s // [+ R(ALPHA * X) * A_I] - fmla v16.4s, v12.4s, v13.4s // [+ I(ALPHA * X) * A_R] -#else - fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R] - fmla v15.4s, v12.4s, v14.4s // [+ I(ALPHA * X) * A_I] - fmla v16.4s, v11.4s, v14.4s // [+ R(ALPHA * X) * A_I] - fmls v16.4s, v12.4s, v13.4s // [- I(ALPHA * X) * A_R] -#endif -#else // CONJ -#if !defined(XCONJ) - fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R] - fmla v15.4s, v12.4s, v14.4s // [+ I(ALPHA * X) * A_I] - fmls v16.4s, v11.4s, v14.4s // [- R(ALPHA * X) * A_I] - fmla v16.4s, v12.4s, v13.4s // [+ I(ALPHA * X) * A_R] -#else - fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R] - fmls v15.4s, v12.4s, v14.4s // [- I(ALPHA * X) * A_I] - fmls v16.4s, v11.4s, v14.4s // [- R(ALPHA * X) * A_I] - fmls v16.4s, v12.4s, v13.4s // [- I(ALPHA * X) * A_R] -#endif -#endif // CONJ + + prfm PLDL1KEEP, [A_PTR, #A_PRE_SIZE] + prfm PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE] + + fmla v15.4s, v21.4s, v13.4s + fmla v15.4s, v23.4s, v14.4s + fmla v16.4s, v22.4s, v14.4s + fmla v16.4s, v24.4s, v13.4s + st2 {v15.4s, v16.4s}, [Y_OPTR], #32 #else // DOUBLE ld2 {v13.2d, v14.2d}, [A_PTR], #32 ld2 {v15.2d, v16.2d}, [Y_IPTR], #32 -#if !defined(CONJ) -#if !defined(XCONJ) - fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R] - fmls v15.2d, v12.2d, v14.2d // [- I(ALPHA * X) * A_I] - fmla v16.2d, v11.2d, v14.2d // [+ R(ALPHA * X) * A_I] - fmla v16.2d, v12.2d, v13.2d // [+ I(ALPHA * X) * A_R] -#else - fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R] - fmla v15.2d, v12.2d, v14.2d // [+ I(ALPHA * X) * A_I] - fmla v16.2d, v11.2d, v14.2d // [+ R(ALPHA * X) * A_I] - fmls v16.2d, v12.2d, v13.2d // [- I(ALPHA * X) * A_R] -#endif -#else // CONJ -#if !defined(XCONJ) - fmla v15.2d, v11.2d, v13.2d // [+ 
R(ALPHA * X) * A_R] - fmla v15.2d, v12.2d, v14.2d // [+ I(ALPHA * X) * A_I] - fmls v16.2d, v11.2d, v14.2d // [- R(ALPHA * X) * A_I] - fmla v16.2d, v12.2d, v13.2d // [+ I(ALPHA * X) * A_R] -#else - fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R] - fmls v15.2d, v12.2d, v14.2d // [- I(ALPHA * X) * A_I] - fmls v16.2d, v11.2d, v14.2d // [- R(ALPHA * X) * A_I] - fmls v16.2d, v12.2d, v13.2d // [- I(ALPHA * X) * A_R] -#endif -#endif // CONJ + prfm PLDL1KEEP, [A_PTR, #A_PRE_SIZE] + + fmla v15.2d, v21.2d, v13.2d + fmla v15.2d, v23.2d, v14.2d + fmla v16.2d, v22.2d, v14.2d + fmla v16.2d, v24.2d, v13.2d + st2 {v15.2d, v16.2d}, [Y_OPTR], #32 ld2 {v17.2d, v18.2d}, [A_PTR], #32 ld2 {v19.2d, v20.2d}, [Y_IPTR], #32 -#if !defined(CONJ) -#if !defined(XCONJ) - fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R] - fmls v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I] - fmla v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I] - fmla v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R] -#else - fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R] - fmla v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I] - fmla v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I] - fmls v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R] -#endif -#else // CONJ -#if !defined(XCONJ) - fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R] - fmla v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I] - fmls v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I] - fmla v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R] -#else - fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R] - fmls v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I] - fmls v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I] - fmls v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R] -#endif -#endif // CONJ + prfm PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE] + + fmla v19.2d, v21.2d, v17.2d + fmla v19.2d, v23.2d, v18.2d + fmla v20.2d, v22.2d, v18.2d + fmla v20.2d, v24.2d, v17.2d + st2 {v19.2d, v20.2d}, [Y_OPTR], #32 #endif @@ -445,10 +391,7 @@ zgemv_n_kernel_F_LOOP: 
zgemv_n_kernel_F4: - KERNEL_F1 - KERNEL_F1 - KERNEL_F1 - KERNEL_F1 + KERNEL_F4 subs I, I, #1 bne zgemv_n_kernel_F4 diff --git a/kernel/arm64/zgemv_t.S b/kernel/arm64/zgemv_t.S index e61c17152..79ce9bcf2 100644 --- a/kernel/arm64/zgemv_t.S +++ b/kernel/arm64/zgemv_t.S @@ -41,6 +41,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define J x11 /* loop variable */ #define I x12 /* loop variable */ +#define A_PRE_SIZE 768 +#define X_PRE_SIZE 768 + /******************************************************************************* * Macro definitions *******************************************************************************/ @@ -139,6 +142,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v11.4s, v12.4s}, [X_PTR], #32 ld2 {v13.4s, v14.4s}, [A_PTR], #32 + prfm PLDL1STRM, [X_PTR, #X_PRE_SIZE] + prfm PLDL1STRM, [A_PTR, #A_PRE_SIZE] #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R] @@ -155,7 +160,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else // DOUBLE ld2 {v11.2d, v12.2d}, [X_PTR], #32 ld2 {v13.2d, v14.2d}, [A_PTR], #32 - prfm PLDL1STRM, [X_PTR, #512] + prfm PLDL1STRM, [X_PTR, #X_PRE_SIZE] #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R] @@ -171,7 +176,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v17.2d, v18.2d}, [X_PTR], #32 ld2 {v19.2d, v20.2d}, [A_PTR], #32 - prfm PLDL1STRM, [A_PTR, #512] + prfm PLDL1STRM, [A_PTR, #A_PRE_SIZE] #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R] From 0a5ff9f9f97e960589ca92618c677b72cb2e85fe Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Thu, 14 Jul 2016 13:51:17 +0530 Subject: [PATCH 55/70] Improvements to TRMM and GEMM kernels --- kernel/arm64/cgemm_kernel_8x4.S | 592 ++++++++++++--------- kernel/arm64/ctrmm_kernel_8x4.S | 640 +++++++++++++---------- kernel/arm64/dtrmm_kernel_8x4.S | 456 ++++++++++------- kernel/arm64/sgemm_kernel_16x4.S | 850 +++++++++++++++++-------------- kernel/arm64/strmm_kernel_16x4.S | 808 +++++++++++++++-------------- kernel/arm64/zgemm_kernel_4x4.S | 419 ++++++++------- kernel/arm64/ztrmm_kernel_4x4.S | 422 ++++++++------- param.h | 6 +- 8 files changed, 2380 insertions(+), 1813 deletions(-) diff --git a/kernel/arm64/cgemm_kernel_8x4.S b/kernel/arm64/cgemm_kernel_8x4.S index d58cef52d..5d1462808 100644 --- a/kernel/arm64/cgemm_kernel_8x4.S +++ b/kernel/arm64/cgemm_kernel_8x4.S @@ -46,17 +46,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define pCRow0 x12 #define pCRow1 x13 #define pCRow2 x14 -#define pA x15 +#define pCRow3 x15 +#define pA x16 +#define alphaR w17 +#define alphaI w18 #define alpha0_R s10 #define alphaV0_R v10.s[0] #define alpha0_I s11 #define alphaV0_I v11.s[0] -#define alpha1_R s14 -#define alphaV1_R v14.s[0] -#define alpha1_I s15 -#define alphaV1_I v15.s[0] +#define A_PRE_SIZE 2560 +#define B_PRE_SIZE 448 +#define C_PRE_SIZE 128 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define OP_rr fmla @@ -95,8 +97,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
// 12 pCRow0 // 13 pCRow1 // 14 pCRow2 -// 15 pA -// 16 +// 15 pCRow3 +// 16 pA // 17 // 18 must save // 19 must save @@ -121,14 +123,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //v05 pA1_00_I, pA1_01_I, pA1_02_I, pA1_03_I //v06 pA1_04_R, pA1_05_R, pA1_06_R, pA1_07_R //v07 pA1_04_I, pA1_05_I, pA1_06_I, pA1_07_I -//v08 must save pB0_00_R, pB0_01_R, pB0_02_R, pB0_03_R -//v09 must save pB0_00_I, pB0_01_I, pB0_02_I, pB0_03_I -//v10 must save ALPHA0_R -//v11 must save ALPHA0_I -//v12 must save pB1_00_R, pB1_01_R, pB1_02_R, pB1_03_R -//v13 must save pB1_00_I, pB1_01_I, pB1_02_I, pB1_03_I -//v14 must save ALPHA1_R -//v15 must save ALPHA1_I +//v08 must save pB0_00_R, pB0_01_R +//v09 must save pB0_00_I, pB0_01_I +//v10 must save pB0_02_R, pB0_03_R --> ALPHA0_R +//v11 must save pB0_02_I, pB0_03_I --> ALPHA0_I +//v12 must save pB1_00_R, pB1_01_R +//v13 must save pB1_00_I, pB1_01_I +//v14 must save pB1_02_R, pB1_03_R +//v15 must save pB1_02_I, pB1_03_I //v16 must save pC_00_R, pC_01_R, pC_02_R, pC_03_R //v17 must save pC_00_I, pC_01_I, pC_02_I, pC_03_I //v18 pC_04_R, pC_05_R, pC_06_R, pC_07_R @@ -171,8 +173,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_I - ld2 {v8.4s, v9.4s}, [pB] - add pB, pB, #32 + ld2 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 ld2 {v2.4s, v3.4s}, [pA] @@ -189,6 +192,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v17.4s, v1.4s, v8.s[0] + ld2 {v10.2s, v11.2s}, [pB] + add pB, pB, #16 + fmul v18.4s, v2.4s, v8.s[0] OP_ii v18.4s, v3.4s, v9.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ @@ -200,6 +206,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif OP_ir v19.4s, v3.4s, v8.s[0] + ld2 {v12.2s, v13.2s}, [pB] + add pB, pB, #16 + fmul v20.4s, v0.4s, v8.s[1] OP_ii v20.4s, v1.4s, v9.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ @@ -211,6 +220,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v21.4s, v1.4s, v8.s[1] + ld2 {v14.2s, v15.2s}, [pB] + add pB, pB, #16 + fmul v22.4s, v2.4s, v8.s[1] OP_ii v22.4s, v3.4s, v9.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ @@ -222,56 +234,59 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v23.4s, v3.4s, v8.s[1] - fmul v24.4s, v0.4s, v8.s[2] - OP_ii v24.4s, v1.4s, v9.s[2] + ld2 {v4.4s, v5.4s}, [pA] + add pA, pA, #32 + + fmul v24.4s, v0.4s, v10.s[0] + OP_ii v24.4s, v1.4s, v11.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v25.16b, v25.16b, v25.16b - fmls v25.4s, v0.4s, v9.s[2] + fmls v25.4s, v0.4s, v11.s[0] #else - fmul v25.4s, v0.4s, v9.s[2] + fmul v25.4s, v0.4s, v11.s[0] #endif - OP_ir v25.4s, v1.4s, v8.s[2] + OP_ir v25.4s, v1.4s, v10.s[0] - fmul v26.4s, v2.4s, v8.s[2] - OP_ii v26.4s, v3.4s, v9.s[2] + ld2 {v6.4s, v7.4s}, [pA] + add pA, pA, #32 + + fmul v26.4s, v2.4s, v10.s[0] + OP_ii v26.4s, v3.4s, v11.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v27.16b, v27.16b, v27.16b - fmls v27.4s, v2.4s, v9.s[2] + fmls v27.4s, v2.4s, v11.s[0] #else - fmul v27.4s, v2.4s, v9.s[2] + fmul v27.4s, v2.4s, v11.s[0] #endif - OP_ir v27.4s, v3.4s, v8.s[2] + OP_ir v27.4s, v3.4s, v10.s[0] - fmul v28.4s, v0.4s, v8.s[3] - OP_ii v28.4s, v1.4s, v9.s[3] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + + fmul v28.4s, v0.4s, v10.s[1] + OP_ii v28.4s, v1.4s, v11.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v29.16b, v29.16b, v29.16b 
- fmls v29.4s, v0.4s, v9.s[3] + fmls v29.4s, v0.4s, v11.s[1] #else - fmul v29.4s, v0.4s, v9.s[3] + fmul v29.4s, v0.4s, v11.s[1] #endif - OP_ir v29.4s, v1.4s, v8.s[3] + OP_ir v29.4s, v1.4s, v10.s[1] - fmul v30.4s, v2.4s, v8.s[3] - OP_ii v30.4s, v3.4s, v9.s[3] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] + + fmul v30.4s, v2.4s, v10.s[1] + OP_ii v30.4s, v3.4s, v11.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v31.16b, v31.16b, v31.16b - fmls v31.4s, v2.4s, v9.s[3] + fmls v31.4s, v2.4s, v11.s[1] #else - fmul v31.4s, v2.4s, v9.s[3] + fmul v31.4s, v2.4s, v11.s[1] #endif - OP_ir v31.4s, v3.4s, v8.s[3] - - ld2 {v12.4s, v13.4s}, [pB] - add pB, pB, #32 - ld2 {v4.4s, v5.4s}, [pA] - add pA, pA, #32 - ld2 {v6.4s, v7.4s}, [pA] - add pA, pA, #32 + OP_ir v31.4s, v3.4s, v10.s[1] .endm .macro KERNEL8x4_M1 @@ -280,47 +295,56 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri v17.4s, v0.4s, v9.s[0] OP_ir v17.4s, v1.4s, v8.s[0] + ld2 {v12.2s, v13.2s}, [pB] + add pB, pB, #16 + OP_rr v18.4s, v2.4s, v8.s[0] OP_ii v18.4s, v3.4s, v9.s[0] OP_ri v19.4s, v2.4s, v9.s[0] OP_ir v19.4s, v3.4s, v8.s[0] + ld2 {v4.4s, v5.4s}, [pA] + add pA, pA, #32 + OP_rr v20.4s, v0.4s, v8.s[1] OP_ii v20.4s, v1.4s, v9.s[1] OP_ri v21.4s, v0.4s, v9.s[1] OP_ir v21.4s, v1.4s, v8.s[1] + ld2 {v6.4s, v7.4s}, [pA] + add pA, pA, #32 + OP_rr v22.4s, v2.4s, v8.s[1] OP_ii v22.4s, v3.4s, v9.s[1] OP_ri v23.4s, v2.4s, v9.s[1] OP_ir v23.4s, v3.4s, v8.s[1] - OP_rr v24.4s, v0.4s, v8.s[2] - OP_ii v24.4s, v1.4s, v9.s[2] - OP_ri v25.4s, v0.4s, v9.s[2] - OP_ir v25.4s, v1.4s, v8.s[2] + ld2 {v14.2s, v15.2s}, [pB] + add pB, pB, #16 - OP_rr v26.4s, v2.4s, v8.s[2] - OP_ii v26.4s, v3.4s, v9.s[2] - OP_ri v27.4s, v2.4s, v9.s[2] - OP_ir v27.4s, v3.4s, v8.s[2] + OP_rr v24.4s, v0.4s, v10.s[0] + OP_ii v24.4s, v1.4s, v11.s[0] + OP_ri v25.4s, v0.4s, v11.s[0] + OP_ir v25.4s, v1.4s, v10.s[0] - OP_rr v28.4s, v0.4s, v8.s[3] - OP_ii v28.4s, v1.4s, 
v9.s[3] - OP_ri v29.4s, v0.4s, v9.s[3] - OP_ir v29.4s, v1.4s, v8.s[3] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] - OP_rr v30.4s, v2.4s, v8.s[3] - OP_ii v30.4s, v3.4s, v9.s[3] - OP_ri v31.4s, v2.4s, v9.s[3] - OP_ir v31.4s, v3.4s, v8.s[3] + OP_rr v26.4s, v2.4s, v10.s[0] + OP_ii v26.4s, v3.4s, v11.s[0] + OP_ri v27.4s, v2.4s, v11.s[0] + OP_ir v27.4s, v3.4s, v10.s[0] - ld2 {v12.4s, v13.4s}, [pB] // For next round - add pB, pB, #32 - ld2 {v4.4s, v5.4s}, [pA] // For next round - add pA, pA, #32 - ld2 {v6.4s, v7.4s}, [pA] - add pA, pA, #32 + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] + + OP_rr v28.4s, v0.4s, v10.s[1] + OP_ii v28.4s, v1.4s, v11.s[1] + OP_ri v29.4s, v0.4s, v11.s[1] + OP_ir v29.4s, v1.4s, v10.s[1] + + OP_rr v30.4s, v2.4s, v10.s[1] + OP_ii v30.4s, v3.4s, v11.s[1] + OP_ri v31.4s, v2.4s, v11.s[1] + OP_ir v31.4s, v3.4s, v10.s[1] .endm .macro KERNEL8x4_M2 @@ -329,47 +353,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri v17.4s, v4.4s, v13.s[0] OP_ir v17.4s, v5.4s, v12.s[0] + ld2 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + OP_rr v18.4s, v6.4s, v12.s[0] OP_ii v18.4s, v7.4s, v13.s[0] OP_ri v19.4s, v6.4s, v13.s[0] OP_ir v19.4s, v7.4s, v12.s[0] + ld2 {v0.4s, v1.4s}, [pA] + add pA, pA, #32 + OP_rr v20.4s, v4.4s, v12.s[1] OP_ii v20.4s, v5.4s, v13.s[1] OP_ri v21.4s, v4.4s, v13.s[1] OP_ir v21.4s, v5.4s, v12.s[1] + ld2 {v2.4s, v3.4s}, [pA] + add pA, pA, #32 + OP_rr v22.4s, v6.4s, v12.s[1] OP_ii v22.4s, v7.4s, v13.s[1] OP_ri v23.4s, v6.4s, v13.s[1] OP_ir v23.4s, v7.4s, v12.s[1] - OP_rr v24.4s, v4.4s, v12.s[2] - OP_ii v24.4s, v5.4s, v13.s[2] - OP_ri v25.4s, v4.4s, v13.s[2] - OP_ir v25.4s, v5.4s, v12.s[2] + ld2 {v10.2s, v11.2s}, [pB] + add pB, pB, #16 - OP_rr v26.4s, v6.4s, v12.s[2] - OP_ii v26.4s, v7.4s, v13.s[2] - OP_ri v27.4s, v6.4s, v13.s[2] - OP_ir v27.4s, v7.4s, v12.s[2] + OP_rr v24.4s, v4.4s, v14.s[0] + OP_ii v24.4s, v5.4s, v15.s[0] + OP_ri v25.4s, v4.4s, v15.s[0] + OP_ir v25.4s, v5.4s, v14.s[0] - OP_rr v28.4s, v4.4s, v12.s[3] - OP_ii v28.4s, v5.4s, 
v13.s[3] - OP_ri v29.4s, v4.4s, v13.s[3] - OP_ir v29.4s, v5.4s, v12.s[3] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] - OP_rr v30.4s, v6.4s, v12.s[3] - OP_ii v30.4s, v7.4s, v13.s[3] - OP_ri v31.4s, v6.4s, v13.s[3] - OP_ir v31.4s, v7.4s, v12.s[3] + OP_rr v26.4s, v6.4s, v14.s[0] + OP_ii v26.4s, v7.4s, v15.s[0] + OP_ri v27.4s, v6.4s, v15.s[0] + OP_ir v27.4s, v7.4s, v14.s[0] - ld2 {v8.4s, v9.4s}, [pB] - add pB, pB, #32 - ld2 {v0.4s, v1.4s}, [pA] - add pA, pA, #32 - ld2 {v2.4s, v3.4s}, [pA] - add pA, pA, #32 + OP_rr v28.4s, v4.4s, v14.s[1] + OP_ii v28.4s, v5.4s, v15.s[1] + OP_ri v29.4s, v4.4s, v15.s[1] + OP_ir v29.4s, v5.4s, v14.s[1] + + OP_rr v30.4s, v6.4s, v14.s[1] + OP_ii v30.4s, v7.4s, v15.s[1] + OP_ri v31.4s, v6.4s, v15.s[1] + OP_ir v31.4s, v7.4s, v14.s[1] .endm .macro KERNEL8x4_E @@ -388,157 +419,174 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri v21.4s, v4.4s, v13.s[1] OP_ir v21.4s, v5.4s, v12.s[1] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + OP_rr v22.4s, v6.4s, v12.s[1] OP_ii v22.4s, v7.4s, v13.s[1] OP_ri v23.4s, v6.4s, v13.s[1] OP_ir v23.4s, v7.4s, v12.s[1] - OP_rr v24.4s, v4.4s, v12.s[2] - OP_ii v24.4s, v5.4s, v13.s[2] - OP_ri v25.4s, v4.4s, v13.s[2] - OP_ir v25.4s, v5.4s, v12.s[2] - - OP_rr v26.4s, v6.4s, v12.s[2] - OP_ii v26.4s, v7.4s, v13.s[2] - OP_ri v27.4s, v6.4s, v13.s[2] - OP_ir v27.4s, v7.4s, v12.s[2] - - OP_rr v28.4s, v4.4s, v12.s[3] - OP_ii v28.4s, v5.4s, v13.s[3] - OP_ri v29.4s, v4.4s, v13.s[3] - OP_ir v29.4s, v5.4s, v12.s[3] - - OP_rr v30.4s, v6.4s, v12.s[3] - OP_ii v30.4s, v7.4s, v13.s[3] - OP_ri v31.4s, v6.4s, v13.s[3] - OP_ir v31.4s, v7.4s, v12.s[3] - + OP_rr v24.4s, v4.4s, v14.s[0] + OP_ii v24.4s, v5.4s, v15.s[0] + OP_ri v25.4s, v4.4s, v15.s[0] + OP_ir v25.4s, v5.4s, v14.s[0] + + OP_rr v26.4s, v6.4s, v14.s[0] + OP_ii v26.4s, v7.4s, v15.s[0] + OP_ri v27.4s, v6.4s, v15.s[0] + OP_ir v27.4s, v7.4s, v14.s[0] + + OP_rr v28.4s, v4.4s, v14.s[1] + OP_ii v28.4s, v5.4s, v15.s[1] + OP_ri v29.4s, v4.4s, v15.s[1] + OP_ir v29.4s, v5.4s, 
v14.s[1] + + OP_rr v30.4s, v6.4s, v14.s[1] + OP_ii v30.4s, v7.4s, v15.s[1] + OP_ri v31.4s, v6.4s, v15.s[1] + OP_ir v31.4s, v7.4s, v14.s[1] .endm .macro KERNEL8x4_SUB - ld2 {v8.4s, v9.4s}, [pB] - add pB, pB, #32 + ld2 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 - ld2 {v2.4s, v3.4s}, [pA] - add pA, pA, #32 OP_rr v16.4s, v0.4s, v8.s[0] OP_ii v16.4s, v1.4s, v9.s[0] OP_ri v17.4s, v0.4s, v9.s[0] OP_ir v17.4s, v1.4s, v8.s[0] - OP_rr v18.4s, v2.4s, v8.s[0] - OP_ii v18.4s, v3.4s, v9.s[0] - OP_ri v19.4s, v2.4s, v9.s[0] - OP_ir v19.4s, v3.4s, v8.s[0] + ld2 {v2.4s, v3.4s}, [pA] + add pA, pA, #32 OP_rr v20.4s, v0.4s, v8.s[1] OP_ii v20.4s, v1.4s, v9.s[1] OP_ri v21.4s, v0.4s, v9.s[1] OP_ir v21.4s, v1.4s, v8.s[1] + ld2 {v10.2s, v11.2s}, [pB] + add pB, pB, #16 + + OP_rr v18.4s, v2.4s, v8.s[0] + OP_ii v18.4s, v3.4s, v9.s[0] + OP_ri v19.4s, v2.4s, v9.s[0] + OP_ir v19.4s, v3.4s, v8.s[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + OP_rr v22.4s, v2.4s, v8.s[1] OP_ii v22.4s, v3.4s, v9.s[1] OP_ri v23.4s, v2.4s, v9.s[1] OP_ir v23.4s, v3.4s, v8.s[1] - OP_rr v24.4s, v0.4s, v8.s[2] - OP_ii v24.4s, v1.4s, v9.s[2] - OP_ri v25.4s, v0.4s, v9.s[2] - OP_ir v25.4s, v1.4s, v8.s[2] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] - OP_rr v26.4s, v2.4s, v8.s[2] - OP_ii v26.4s, v3.4s, v9.s[2] - OP_ri v27.4s, v2.4s, v9.s[2] - OP_ir v27.4s, v3.4s, v8.s[2] + OP_rr v24.4s, v0.4s, v10.s[0] + OP_ii v24.4s, v1.4s, v11.s[0] + OP_ri v25.4s, v0.4s, v11.s[0] + OP_ir v25.4s, v1.4s, v10.s[0] - OP_rr v28.4s, v0.4s, v8.s[3] - OP_ii v28.4s, v1.4s, v9.s[3] - OP_ri v29.4s, v0.4s, v9.s[3] - OP_ir v29.4s, v1.4s, v8.s[3] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + OP_rr v26.4s, v2.4s, v10.s[0] + OP_ii v26.4s, v3.4s, v11.s[0] + OP_ri v27.4s, v2.4s, v11.s[0] + OP_ir v27.4s, v3.4s, v10.s[0] - OP_rr v30.4s, v2.4s, v8.s[3] - OP_ii v30.4s, v3.4s, v9.s[3] - OP_ri v31.4s, v2.4s, v9.s[3] - OP_ir v31.4s, v3.4s, v8.s[3] + OP_rr v28.4s, v0.4s, v10.s[1] + OP_ii v28.4s, v1.4s, v11.s[1] + OP_ri v29.4s, v0.4s, v11.s[1] 
+ OP_ir v29.4s, v1.4s, v10.s[1] + OP_rr v30.4s, v2.4s, v10.s[1] + OP_ii v30.4s, v3.4s, v11.s[1] + OP_ri v31.4s, v2.4s, v11.s[1] + OP_ir v31.4s, v3.4s, v10.s[1] .endm .macro SAVE8x4 - mov pCRow1, pCRow0 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI - ld2 {v0.4s, v1.4s}, [pCRow1] + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + ld2 {v0.4s, v1.4s}, [pCRow0] fmla v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I - fmla v1.4s, v16.4s, alphaV1_I - fmla v1.4s, v17.4s, alphaV1_R - st2 {v0.4s, v1.4s}, [pCRow1] + fmla v1.4s, v16.4s, alphaV0_I + fmla v1.4s, v17.4s, alphaV0_R + st2 {v0.4s, v1.4s}, [pCRow0] - add pCRow2, pCRow1, #32 + add pCRow0, pCRow0, #32 - ld2 {v2.4s, v3.4s}, [pCRow2] + ld2 {v2.4s, v3.4s}, [pCRow0] fmla v2.4s, v18.4s, alphaV0_R fmls v2.4s, v19.4s, alphaV0_I - fmla v3.4s, v18.4s, alphaV1_I - fmla v3.4s, v19.4s, alphaV1_R - st2 {v2.4s, v3.4s}, [pCRow2] + fmla v3.4s, v18.4s, alphaV0_I + fmla v3.4s, v19.4s, alphaV0_R + st2 {v2.4s, v3.4s}, [pCRow0] - add pCRow1, pCRow1, LDC + add pCRow0, pCRow0, #32 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] ld2 {v4.4s, v5.4s}, [pCRow1] fmla v4.4s, v20.4s, alphaV0_R fmls v4.4s, v21.4s, alphaV0_I - fmla v5.4s, v20.4s, alphaV1_I - fmla v5.4s, v21.4s, alphaV1_R + fmla v5.4s, v20.4s, alphaV0_I + fmla v5.4s, v21.4s, alphaV0_R st2 {v4.4s, v5.4s}, [pCRow1] - add pCRow2, pCRow1, #32 + add pCRow1, pCRow1, #32 - ld2 {v6.4s, v7.4s}, [pCRow2] + ld2 {v6.4s, v7.4s}, [pCRow1] fmla v6.4s, v22.4s, alphaV0_R fmls v6.4s, v23.4s, alphaV0_I - fmla v7.4s, v22.4s, alphaV1_I - fmla v7.4s, v23.4s, alphaV1_R - st2 {v6.4s, v7.4s}, [pCRow2] + fmla v7.4s, v22.4s, alphaV0_I + fmla v7.4s, v23.4s, alphaV0_R + st2 {v6.4s, v7.4s}, [pCRow1] - add pCRow1, pCRow1, LDC + add pCRow1, pCRow1, #32 + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] - ld2 {v0.4s, v1.4s}, [pCRow1] + ld2 {v0.4s, v1.4s}, [pCRow2] fmla v0.4s, v24.4s, alphaV0_R fmls v0.4s, v25.4s, alphaV0_I - fmla v1.4s, v24.4s, alphaV1_I - fmla v1.4s, v25.4s, alphaV1_R - st2 {v0.4s, v1.4s}, [pCRow1] + fmla v1.4s, 
v24.4s, alphaV0_I + fmla v1.4s, v25.4s, alphaV0_R + st2 {v0.4s, v1.4s}, [pCRow2] - add pCRow2, pCRow1, #32 + add pCRow2, pCRow2, #32 ld2 {v2.4s, v3.4s}, [pCRow2] fmla v2.4s, v26.4s, alphaV0_R fmls v2.4s, v27.4s, alphaV0_I - fmla v3.4s, v26.4s, alphaV1_I - fmla v3.4s, v27.4s, alphaV1_R + fmla v3.4s, v26.4s, alphaV0_I + fmla v3.4s, v27.4s, alphaV0_R st2 {v2.4s, v3.4s}, [pCRow2] - add pCRow1, pCRow1, LDC + add pCRow2, pCRow2, #32 + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] - ld2 {v4.4s, v5.4s}, [pCRow1] + ld2 {v4.4s, v5.4s}, [pCRow3] fmla v4.4s, v28.4s, alphaV0_R fmls v4.4s, v29.4s, alphaV0_I - fmla v5.4s, v28.4s, alphaV1_I - fmla v5.4s, v29.4s, alphaV1_R - st2 {v4.4s, v5.4s}, [pCRow1] + fmla v5.4s, v28.4s, alphaV0_I + fmla v5.4s, v29.4s, alphaV0_R + st2 {v4.4s, v5.4s}, [pCRow3] - add pCRow2, pCRow1, #32 + add pCRow3, pCRow3, #32 - ld2 {v6.4s, v7.4s}, [pCRow2] + ld2 {v6.4s, v7.4s}, [pCRow3] fmla v6.4s, v30.4s, alphaV0_R fmls v6.4s, v31.4s, alphaV0_I - fmla v7.4s, v30.4s, alphaV1_I - fmla v7.4s, v31.4s, alphaV1_R - st2 {v6.4s, v7.4s}, [pCRow2] + fmla v7.4s, v30.4s, alphaV0_I + fmla v7.4s, v31.4s, alphaV0_R + st2 {v6.4s, v7.4s}, [pCRow3] - add pCRow0, pCRow0, #64 + add pCRow3, pCRow3, #32 .endm /******************************************************************************/ @@ -720,13 +768,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x4 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 ld2 {v0.4s, v1.4s}, [pCRow1] fmla v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I - fmla v1.4s, v16.4s, alphaV1_I - fmla v1.4s, v17.4s, alphaV1_R + fmla v1.4s, v16.4s, alphaV0_I + fmla v1.4s, v17.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow1, pCRow1, LDC @@ -734,8 +785,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v4.4s, v5.4s}, [pCRow1] fmla v4.4s, v20.4s, alphaV0_R fmls v4.4s, v21.4s, alphaV0_I - fmla v5.4s, v20.4s, alphaV1_I - fmla v5.4s, v21.4s, alphaV1_R + fmla v5.4s, v20.4s, alphaV0_I + fmla v5.4s, v21.4s, alphaV0_R st2 {v4.4s, v5.4s}, [pCRow1] add pCRow1, pCRow1, LDC @@ -743,8 +794,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.4s, v1.4s}, [pCRow1] fmla v0.4s, v24.4s, alphaV0_R fmls v0.4s, v25.4s, alphaV0_I - fmla v1.4s, v24.4s, alphaV1_I - fmla v1.4s, v25.4s, alphaV1_R + fmla v1.4s, v24.4s, alphaV0_I + fmla v1.4s, v25.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow1, pCRow1, LDC @@ -752,8 +803,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v4.4s, v5.4s}, [pCRow1] fmla v4.4s, v28.4s, alphaV0_R fmls v4.4s, v29.4s, alphaV0_I - fmla v5.4s, v28.4s, alphaV1_I - fmla v5.4s, v29.4s, alphaV1_R + fmla v5.4s, v28.4s, alphaV0_I + fmla v5.4s, v29.4s, alphaV0_R st2 {v4.4s, v5.4s}, [pCRow1] add pCRow0, pCRow0, #32 @@ -800,13 +851,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE2x4 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 ld2 {v0.2s, v1.2s}, [pCRow1] fmla v0.2s, v16.2s, alphaV0_R fmls v0.2s, v17.2s, alphaV0_I - fmla v1.2s, v16.2s, alphaV1_I - fmla v1.2s, v17.2s, alphaV1_R + fmla v1.2s, v16.2s, alphaV0_I + fmla v1.2s, v17.2s, alphaV0_R st2 {v0.2s, v1.2s}, [pCRow1] add pCRow1, pCRow1, LDC @@ -814,8 +868,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v4.2s, v5.2s}, [pCRow1] fmla v4.2s, v20.2s, alphaV0_R fmls v4.2s, v21.2s, alphaV0_I - fmla v5.2s, v20.2s, alphaV1_I - fmla v5.2s, v21.2s, alphaV1_R + fmla v5.2s, v20.2s, alphaV0_I + fmla v5.2s, v21.2s, alphaV0_R st2 {v4.2s, v5.2s}, [pCRow1] add pCRow1, pCRow1, LDC @@ -823,8 +877,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v0.2s, v1.2s}, [pCRow1] fmla v0.2s, v24.2s, alphaV0_R fmls v0.2s, v25.2s, alphaV0_I - fmla v1.2s, v24.2s, alphaV1_I - fmla v1.2s, v25.2s, alphaV1_R + fmla v1.2s, v24.2s, alphaV0_I + fmla v1.2s, v25.2s, alphaV0_R st2 {v0.2s, v1.2s}, [pCRow1] add pCRow1, pCRow1, LDC @@ -832,8 +886,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v4.2s, v5.2s}, [pCRow1] fmla v4.2s, v28.2s, alphaV0_R fmls v4.2s, v29.2s, alphaV0_I - fmla v5.2s, v28.2s, alphaV1_I - fmla v5.2s, v29.2s, alphaV1_R + fmla v5.2s, v28.2s, alphaV0_I + fmla v5.2s, v29.2s, alphaV0_R st2 {v4.2s, v5.2s}, [pCRow1] add pCRow0, pCRow0, #16 @@ -880,13 +934,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x4 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 ld2 {v0.s, v1.s}[0], [pCRow1] fmla s0, s16, alphaV0_R fmls s0, s17, alphaV0_I - fmla s1, s16, alphaV1_I - fmla s1, s17, alphaV1_R + fmla s1, s16, alphaV0_I + fmla s1, s17, alphaV0_R st2 {v0.s, v1.s}[0], [pCRow1] add pCRow1, pCRow1, LDC @@ -894,8 +951,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v4.s, v5.s}[0], [pCRow1] fmla s4, s20, alphaV0_R fmls s4, s21, alphaV0_I - fmla s5, s20, alphaV1_I - fmla s5, s21, alphaV1_R + fmla s5, s20, alphaV0_I + fmla s5, s21, alphaV0_R st2 {v4.s, v5.s}[0], [pCRow1] add pCRow1, pCRow1, LDC @@ -903,8 +960,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.s, v1.s}[0], [pCRow1] fmla s0, s24, alphaV0_R fmls s0, s25, alphaV0_I - fmla s1, s24, alphaV1_I - fmla s1, s25, alphaV1_R + fmla s1, s24, alphaV0_I + fmla s1, s25, alphaV0_R st2 {v0.s, v1.s}[0], [pCRow1] add pCRow1, pCRow1, LDC @@ -912,8 +969,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v4.s, v5.s}[0], [pCRow1] fmla s4, s28, alphaV0_R fmls s4, s29, alphaV0_I - fmla s5, s28, alphaV1_I - fmla s5, s29, alphaV1_R + fmla s5, s28, alphaV0_I + fmla s5, s29, alphaV0_R st2 {v4.s, v5.s}[0], [pCRow1] add pCRow0, pCRow0, #8 @@ -962,13 +1019,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE8x2 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 ld2 {v0.4s, v1.4s}, [pCRow1] fmla v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I - fmla v1.4s, v16.4s, alphaV1_I - fmla v1.4s, v17.4s, alphaV1_R + fmla v1.4s, v16.4s, alphaV0_I + fmla v1.4s, v17.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow2, pCRow1, #32 @@ -976,8 +1036,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v2.4s, v3.4s}, [pCRow2] fmla v2.4s, v18.4s, alphaV0_R fmls v2.4s, v19.4s, alphaV0_I - fmla v3.4s, v18.4s, alphaV1_I - fmla v3.4s, v19.4s, alphaV1_R + fmla v3.4s, v18.4s, alphaV0_I + fmla v3.4s, v19.4s, alphaV0_R st2 {v2.4s, v3.4s}, [pCRow2] add pCRow1, pCRow1, LDC @@ -985,8 +1045,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v4.4s, v5.4s}, [pCRow1] fmla v4.4s, v20.4s, alphaV0_R fmls v4.4s, v21.4s, alphaV0_I - fmla v5.4s, v20.4s, alphaV1_I - fmla v5.4s, v21.4s, alphaV1_R + fmla v5.4s, v20.4s, alphaV0_I + fmla v5.4s, v21.4s, alphaV0_R st2 {v4.4s, v5.4s}, [pCRow1] add pCRow2, pCRow1, #32 @@ -994,8 +1054,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v6.4s, v7.4s}, [pCRow2] fmla v6.4s, v22.4s, alphaV0_R fmls v6.4s, v23.4s, alphaV0_I - fmla v7.4s, v22.4s, alphaV1_I - fmla v7.4s, v23.4s, alphaV1_R + fmla v7.4s, v22.4s, alphaV0_I + fmla v7.4s, v23.4s, alphaV0_R st2 {v6.4s, v7.4s}, [pCRow2] add pCRow0, pCRow0, #64 @@ -1028,13 +1088,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro SAVE4x2 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 ld2 {v0.4s, v1.4s}, [pCRow1] fmla v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I - fmla v1.4s, v16.4s, alphaV1_I - fmla v1.4s, v17.4s, alphaV1_R + fmla v1.4s, v16.4s, alphaV0_I + fmla v1.4s, v17.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow1, pCRow1, LDC @@ -1042,8 +1105,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v4.4s, v5.4s}, [pCRow1] fmla v4.4s, v20.4s, alphaV0_R fmls v4.4s, v21.4s, alphaV0_I - fmla v5.4s, v20.4s, alphaV1_I - fmla v5.4s, v21.4s, alphaV1_R + fmla v5.4s, v20.4s, alphaV0_I + fmla v5.4s, v21.4s, alphaV0_R st2 {v4.4s, v5.4s}, [pCRow1] add pCRow0, pCRow0, #32 @@ -1076,13 +1139,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE2x2 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 ld2 {v0.2s, v1.2s}, [pCRow1] fmla v0.2s, v16.2s, alphaV0_R fmls v0.2s, v17.2s, alphaV0_I - fmla v1.2s, v16.2s, alphaV1_I - fmla v1.2s, v17.2s, alphaV1_R + fmla v1.2s, v16.2s, alphaV0_I + fmla v1.2s, v17.2s, alphaV0_R st2 {v0.2s, v1.2s}, [pCRow1] add pCRow1, pCRow1, LDC @@ -1090,8 +1156,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v4.2s, v5.2s}, [pCRow1] fmla v4.2s, v20.2s, alphaV0_R fmls v4.2s, v21.2s, alphaV0_I - fmla v5.2s, v20.2s, alphaV1_I - fmla v5.2s, v21.2s, alphaV1_R + fmla v5.2s, v20.2s, alphaV0_I + fmla v5.2s, v21.2s, alphaV0_R st2 {v4.2s, v5.2s}, [pCRow1] add pCRow0, pCRow0, #16 @@ -1124,13 +1190,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro SAVE1x2 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 ld2 {v0.s, v1.s}[0], [pCRow1] fmla s0, s16, alphaV0_R fmls s0, s17, alphaV0_I - fmla s1, s16, alphaV1_I - fmla s1, s17, alphaV1_R + fmla s1, s16, alphaV0_I + fmla s1, s17, alphaV0_R st2 {v0.s, v1.s}[0], [pCRow1] add pCRow1, pCRow1, LDC @@ -1138,8 +1207,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v4.s, v5.s}[0], [pCRow1] fmla s4, s20, alphaV0_R fmls s4, s21, alphaV0_I - fmla s5, s20, alphaV1_I - fmla s5, s21, alphaV1_R + fmla s5, s20, alphaV0_I + fmla s5, s21, alphaV0_R st2 {v4.s, v5.s}[0], [pCRow1] add pCRow0, pCRow0, #8 @@ -1174,13 +1243,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE8x1 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 ld2 {v0.4s, v1.4s}, [pCRow1] fmla v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I - fmla v1.4s, v16.4s, alphaV1_I - fmla v1.4s, v17.4s, alphaV1_R + fmla v1.4s, v16.4s, alphaV0_I + fmla v1.4s, v17.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow1, pCRow1, #32 @@ -1188,8 +1260,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v2.4s, v3.4s}, [pCRow1] fmla v2.4s, v18.4s, alphaV0_R fmls v2.4s, v19.4s, alphaV0_I - fmla v3.4s, v18.4s, alphaV1_I - fmla v3.4s, v19.4s, alphaV1_R + fmla v3.4s, v18.4s, alphaV0_I + fmla v3.4s, v19.4s, alphaV0_R st2 {v2.4s, v3.4s}, [pCRow1] add pCRow0, pCRow0, #64 @@ -1216,13 +1288,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro SAVE4x1 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 ld2 {v0.4s, v1.4s}, [pCRow1] fmla v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I - fmla v1.4s, v16.4s, alphaV1_I - fmla v1.4s, v17.4s, alphaV1_R + fmla v1.4s, v16.4s, alphaV0_I + fmla v1.4s, v17.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow0, pCRow0, #32 @@ -1248,13 +1323,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE2x1 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 ld2 {v0.2s, v1.2s}, [pCRow1] fmla v0.2s, v16.2s, alphaV0_R fmls v0.2s, v17.2s, alphaV0_I - fmla v1.2s, v16.2s, alphaV1_I - fmla v1.2s, v17.2s, alphaV1_R + fmla v1.2s, v16.2s, alphaV0_I + fmla v1.2s, v17.2s, alphaV0_R st2 {v0.2s, v1.2s}, [pCRow1] add pCRow0, pCRow0, #16 @@ -1281,13 +1359,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x1 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 ld2 {v0.s, v1.s}[0], [pCRow1] fmla s0, s16, alphaV0_R fmls s0, s17, alphaV0_I - fmla s1, s16, alphaV1_I - fmla s1, s17, alphaV1_R + fmla s1, s16, alphaV0_I + fmla s1, s17, alphaV0_R st2 {v0.s, v1.s}[0], [pCRow1] add pCRow0, pCRow0, #8 @@ -1313,10 +1394,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] - fmov alpha0_R, s0 - fmov alpha0_I, s1 - fmov alpha1_R, s0 - fmov alpha1_I, s1 + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alphaR, s0 + fmov alphaI, s1 lsl LDC, LDC, #3 // ldc = ldc * 8 @@ -1330,8 +1412,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/******************************************************************************/ cgemm_kernel_L4_BEGIN: - mov pCRow0, pC // pCRow0 = C - add pC, pC, LDC, lsl #2 + mov pCRow0, pC + add pCRow1, pCRow0, LDC + add pCRow2, pCRow1, LDC + add pCRow3, pCRow2, LDC + + add pC, pCRow3, LDC mov pA, origPA // pA = start of A array @@ -1342,44 +1428,69 @@ cgemm_kernel_L4_M8_BEGIN: cmp counterI, #0 ble cgemm_kernel_L4_M4_BEGIN + .align 5 cgemm_kernel_L4_M8_20: mov pB, origPB - asr counterL , origK, #1 // L = K / 2 - cmp counterL , #2 // is there at least 4 to do? + asr counterL , origK, #3 + cmp counterL , #2 blt cgemm_kernel_L4_M8_32 - KERNEL8x4_I // do one in the K - KERNEL8x4_M2 // do another in the K + KERNEL8x4_I + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 subs counterL, counterL, #2 // subtract 2 ble cgemm_kernel_L4_M8_22a - .align 5 + .align 5 cgemm_kernel_L4_M8_22: KERNEL8x4_M1 KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 subs counterL, counterL, #1 bgt cgemm_kernel_L4_M8_22 - + .align 5 cgemm_kernel_L4_M8_22a: + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_E b cgemm_kernel_L4_M8_44 + .align 5 cgemm_kernel_L4_M8_32: tst counterL, #1 ble cgemm_kernel_L4_M8_40 KERNEL8x4_I - + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 KERNEL8x4_E b cgemm_kernel_L4_M8_44 @@ -1390,14 +1501,21 @@ cgemm_kernel_L4_M8_40: cgemm_kernel_L4_M8_44: - ands counterL , origK, #1 + ands counterL , origK, #7 ble cgemm_kernel_L4_M8_100 + .align 5 cgemm_kernel_L4_M8_46: KERNEL8x4_SUB + subs counterL, counterL, #1 + bne cgemm_kernel_L4_M8_46 + cgemm_kernel_L4_M8_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] SAVE8x4 diff --git a/kernel/arm64/ctrmm_kernel_8x4.S b/kernel/arm64/ctrmm_kernel_8x4.S index ce5cb0406..680fb56c3 100644 --- 
a/kernel/arm64/ctrmm_kernel_8x4.S +++ b/kernel/arm64/ctrmm_kernel_8x4.S @@ -46,20 +46,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define pCRow0 x12 #define pCRow1 x13 #define pCRow2 x14 -#define pA x15 -#define temp x16 -#define tempOffset x17 -#define tempK x18 +#define pCRow3 x15 +#define pA x16 +#define alphaR w17 +#define alphaI w18 +#define temp x19 +#define tempOffset x20 +#define tempK x21 #define alpha0_R s10 #define alphaV0_R v10.s[0] #define alpha0_I s11 #define alphaV0_I v11.s[0] -#define alpha1_R s14 -#define alphaV1_R v14.s[0] -#define alpha1_I s15 -#define alphaV1_I v15.s[0] +#define A_PRE_SIZE 2560 +#define B_PRE_SIZE 448 +#define C_PRE_SIZE 128 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define OP_rr fmla @@ -124,14 +126,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //v05 pA1_00_I, pA1_01_I, pA1_02_I, pA1_03_I //v06 pA1_04_R, pA1_05_R, pA1_06_R, pA1_07_R //v07 pA1_04_I, pA1_05_I, pA1_06_I, pA1_07_I -//v08 must save pB0_00_R, pB0_01_R, pB0_02_R, pB0_03_R -//v09 must save pB0_00_I, pB0_01_I, pB0_02_I, pB0_03_I -//v10 must save ALPHA0_R -//v11 must save ALPHA0_I -//v12 must save pB1_00_R, pB1_01_R, pB1_02_R, pB1_03_R -//v13 must save pB1_00_I, pB1_01_I, pB1_02_I, pB1_03_I -//v14 must save ALPHA1_R -//v15 must save ALPHA1_I +//v08 must save pB0_00_R, pB0_01_R +//v09 must save pB0_00_I, pB0_01_I +//v10 must save pB0_02_R, pB0_03_R --> ALPHA0_R +//v11 must save pB0_02_I, pB0_03_I --> ALPHA0_I +//v12 must save pB1_00_R, pB1_01_R +//v13 must save pB1_00_I, pB1_01_I +//v14 must save pB1_02_R, pB1_03_R +//v15 must save pB1_02_I, pB1_03_I //v16 must save pC_00_R, pC_01_R, pC_02_R, pC_03_R //v17 must save pC_00_I, pC_01_I, pC_02_I, pC_03_I //v18 pC_04_R, pC_05_R, pC_06_R, pC_07_R @@ -149,6 +151,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
//v30 pC_28_R, pC_29_R, pC_30_R, pC_31_R //v31 pC_28_I, pC_29_I, pC_30_I, pC_31_I + /******************************************************************************* * Macro definitions *******************************************************************************/ @@ -173,8 +176,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_I - ld2 {v8.4s, v9.4s}, [pB] - add pB, pB, #32 + ld2 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 ld2 {v2.4s, v3.4s}, [pA] @@ -191,6 +195,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v17.4s, v1.4s, v8.s[0] + ld2 {v10.2s, v11.2s}, [pB] + add pB, pB, #16 + fmul v18.4s, v2.4s, v8.s[0] OP_ii v18.4s, v3.4s, v9.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ @@ -202,6 +209,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v19.4s, v3.4s, v8.s[0] + ld2 {v12.2s, v13.2s}, [pB] + add pB, pB, #16 + fmul v20.4s, v0.4s, v8.s[1] OP_ii v20.4s, v1.4s, v9.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ @@ -213,6 +223,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v21.4s, v1.4s, v8.s[1] + ld2 {v14.2s, v15.2s}, [pB] + add pB, pB, #16 + fmul v22.4s, v2.4s, v8.s[1] OP_ii v22.4s, v3.4s, v9.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ @@ -224,56 +237,59 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif OP_ir v23.4s, v3.4s, v8.s[1] - fmul v24.4s, v0.4s, v8.s[2] - OP_ii v24.4s, v1.4s, v9.s[2] + ld2 {v4.4s, v5.4s}, [pA] + add pA, pA, #32 + + fmul v24.4s, v0.4s, v10.s[0] + OP_ii v24.4s, v1.4s, v11.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v25.16b, v25.16b, v25.16b - fmls v25.4s, v0.4s, v9.s[2] + fmls v25.4s, v0.4s, v11.s[0] #else - fmul v25.4s, v0.4s, v9.s[2] + fmul v25.4s, v0.4s, v11.s[0] #endif - OP_ir v25.4s, v1.4s, v8.s[2] + OP_ir v25.4s, v1.4s, v10.s[0] - fmul v26.4s, v2.4s, v8.s[2] - OP_ii v26.4s, v3.4s, v9.s[2] + ld2 {v6.4s, v7.4s}, [pA] + add pA, pA, #32 + + fmul v26.4s, v2.4s, v10.s[0] + OP_ii v26.4s, v3.4s, v11.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v27.16b, v27.16b, v27.16b - fmls v27.4s, v2.4s, v9.s[2] + fmls v27.4s, v2.4s, v11.s[0] #else - fmul v27.4s, v2.4s, v9.s[2] + fmul v27.4s, v2.4s, v11.s[0] #endif - OP_ir v27.4s, v3.4s, v8.s[2] + OP_ir v27.4s, v3.4s, v10.s[0] - fmul v28.4s, v0.4s, v8.s[3] - OP_ii v28.4s, v1.4s, v9.s[3] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + + fmul v28.4s, v0.4s, v10.s[1] + OP_ii v28.4s, v1.4s, v11.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v29.16b, v29.16b, v29.16b - fmls v29.4s, v0.4s, v9.s[3] + fmls v29.4s, v0.4s, v11.s[1] #else - fmul v29.4s, v0.4s, v9.s[3] + fmul v29.4s, v0.4s, v11.s[1] #endif - OP_ir v29.4s, v1.4s, v8.s[3] + OP_ir v29.4s, v1.4s, v10.s[1] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] - fmul v30.4s, v2.4s, v8.s[3] - OP_ii v30.4s, v3.4s, v9.s[3] + fmul v30.4s, v2.4s, v10.s[1] + OP_ii v30.4s, v3.4s, v11.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v31.16b, v31.16b, v31.16b - fmls v31.4s, v2.4s, v9.s[3] + fmls v31.4s, v2.4s, v11.s[1] #else - fmul v31.4s, v2.4s, 
v9.s[3] + fmul v31.4s, v2.4s, v11.s[1] #endif - OP_ir v31.4s, v3.4s, v8.s[3] - - ld2 {v12.4s, v13.4s}, [pB] - add pB, pB, #32 - ld2 {v4.4s, v5.4s}, [pA] - add pA, pA, #32 - ld2 {v6.4s, v7.4s}, [pA] - add pA, pA, #32 + OP_ir v31.4s, v3.4s, v10.s[1] .endm .macro KERNEL8x4_M1 @@ -282,47 +298,56 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri v17.4s, v0.4s, v9.s[0] OP_ir v17.4s, v1.4s, v8.s[0] + ld2 {v12.2s, v13.2s}, [pB] + add pB, pB, #16 + OP_rr v18.4s, v2.4s, v8.s[0] OP_ii v18.4s, v3.4s, v9.s[0] OP_ri v19.4s, v2.4s, v9.s[0] OP_ir v19.4s, v3.4s, v8.s[0] + ld2 {v4.4s, v5.4s}, [pA] + add pA, pA, #32 + OP_rr v20.4s, v0.4s, v8.s[1] OP_ii v20.4s, v1.4s, v9.s[1] OP_ri v21.4s, v0.4s, v9.s[1] OP_ir v21.4s, v1.4s, v8.s[1] + ld2 {v6.4s, v7.4s}, [pA] + add pA, pA, #32 + OP_rr v22.4s, v2.4s, v8.s[1] OP_ii v22.4s, v3.4s, v9.s[1] OP_ri v23.4s, v2.4s, v9.s[1] OP_ir v23.4s, v3.4s, v8.s[1] - OP_rr v24.4s, v0.4s, v8.s[2] - OP_ii v24.4s, v1.4s, v9.s[2] - OP_ri v25.4s, v0.4s, v9.s[2] - OP_ir v25.4s, v1.4s, v8.s[2] + ld2 {v14.2s, v15.2s}, [pB] + add pB, pB, #16 - OP_rr v26.4s, v2.4s, v8.s[2] - OP_ii v26.4s, v3.4s, v9.s[2] - OP_ri v27.4s, v2.4s, v9.s[2] - OP_ir v27.4s, v3.4s, v8.s[2] + OP_rr v24.4s, v0.4s, v10.s[0] + OP_ii v24.4s, v1.4s, v11.s[0] + OP_ri v25.4s, v0.4s, v11.s[0] + OP_ir v25.4s, v1.4s, v10.s[0] - OP_rr v28.4s, v0.4s, v8.s[3] - OP_ii v28.4s, v1.4s, v9.s[3] - OP_ri v29.4s, v0.4s, v9.s[3] - OP_ir v29.4s, v1.4s, v8.s[3] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] - OP_rr v30.4s, v2.4s, v8.s[3] - OP_ii v30.4s, v3.4s, v9.s[3] - OP_ri v31.4s, v2.4s, v9.s[3] - OP_ir v31.4s, v3.4s, v8.s[3] + OP_rr v26.4s, v2.4s, v10.s[0] + OP_ii v26.4s, v3.4s, v11.s[0] + OP_ri v27.4s, v2.4s, v11.s[0] + OP_ir v27.4s, v3.4s, v10.s[0] - ld2 {v12.4s, v13.4s}, [pB] // For next round - add pB, pB, #32 - ld2 {v4.4s, v5.4s}, [pA] // For next round - add pA, pA, #32 - ld2 {v6.4s, v7.4s}, [pA] - add pA, pA, #32 + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] + + OP_rr v28.4s, v0.4s, v10.s[1] 
+ OP_ii v28.4s, v1.4s, v11.s[1] + OP_ri v29.4s, v0.4s, v11.s[1] + OP_ir v29.4s, v1.4s, v10.s[1] + + OP_rr v30.4s, v2.4s, v10.s[1] + OP_ii v30.4s, v3.4s, v11.s[1] + OP_ri v31.4s, v2.4s, v11.s[1] + OP_ir v31.4s, v3.4s, v10.s[1] .endm .macro KERNEL8x4_M2 @@ -331,47 +356,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri v17.4s, v4.4s, v13.s[0] OP_ir v17.4s, v5.4s, v12.s[0] + ld2 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + OP_rr v18.4s, v6.4s, v12.s[0] OP_ii v18.4s, v7.4s, v13.s[0] OP_ri v19.4s, v6.4s, v13.s[0] OP_ir v19.4s, v7.4s, v12.s[0] + ld2 {v0.4s, v1.4s}, [pA] + add pA, pA, #32 + OP_rr v20.4s, v4.4s, v12.s[1] OP_ii v20.4s, v5.4s, v13.s[1] OP_ri v21.4s, v4.4s, v13.s[1] OP_ir v21.4s, v5.4s, v12.s[1] + ld2 {v2.4s, v3.4s}, [pA] + add pA, pA, #32 + OP_rr v22.4s, v6.4s, v12.s[1] OP_ii v22.4s, v7.4s, v13.s[1] OP_ri v23.4s, v6.4s, v13.s[1] OP_ir v23.4s, v7.4s, v12.s[1] - OP_rr v24.4s, v4.4s, v12.s[2] - OP_ii v24.4s, v5.4s, v13.s[2] - OP_ri v25.4s, v4.4s, v13.s[2] - OP_ir v25.4s, v5.4s, v12.s[2] + ld2 {v10.2s, v11.2s}, [pB] + add pB, pB, #16 - OP_rr v26.4s, v6.4s, v12.s[2] - OP_ii v26.4s, v7.4s, v13.s[2] - OP_ri v27.4s, v6.4s, v13.s[2] - OP_ir v27.4s, v7.4s, v12.s[2] + OP_rr v24.4s, v4.4s, v14.s[0] + OP_ii v24.4s, v5.4s, v15.s[0] + OP_ri v25.4s, v4.4s, v15.s[0] + OP_ir v25.4s, v5.4s, v14.s[0] - OP_rr v28.4s, v4.4s, v12.s[3] - OP_ii v28.4s, v5.4s, v13.s[3] - OP_ri v29.4s, v4.4s, v13.s[3] - OP_ir v29.4s, v5.4s, v12.s[3] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] - OP_rr v30.4s, v6.4s, v12.s[3] - OP_ii v30.4s, v7.4s, v13.s[3] - OP_ri v31.4s, v6.4s, v13.s[3] - OP_ir v31.4s, v7.4s, v12.s[3] + OP_rr v26.4s, v6.4s, v14.s[0] + OP_ii v26.4s, v7.4s, v15.s[0] + OP_ri v27.4s, v6.4s, v15.s[0] + OP_ir v27.4s, v7.4s, v14.s[0] - ld2 {v8.4s, v9.4s}, [pB] - add pB, pB, #32 - ld2 {v0.4s, v1.4s}, [pA] - add pA, pA, #32 - ld2 {v2.4s, v3.4s}, [pA] - add pA, pA, #32 + OP_rr v28.4s, v4.4s, v14.s[1] + OP_ii v28.4s, v5.4s, v15.s[1] + OP_ri v29.4s, v4.4s, v15.s[1] + OP_ir 
v29.4s, v5.4s, v14.s[1] + + OP_rr v30.4s, v6.4s, v14.s[1] + OP_ii v30.4s, v7.4s, v15.s[1] + OP_ri v31.4s, v6.4s, v15.s[1] + OP_ir v31.4s, v7.4s, v14.s[1] .endm .macro KERNEL8x4_E @@ -390,157 +422,166 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri v21.4s, v4.4s, v13.s[1] OP_ir v21.4s, v5.4s, v12.s[1] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + OP_rr v22.4s, v6.4s, v12.s[1] OP_ii v22.4s, v7.4s, v13.s[1] OP_ri v23.4s, v6.4s, v13.s[1] OP_ir v23.4s, v7.4s, v12.s[1] - OP_rr v24.4s, v4.4s, v12.s[2] - OP_ii v24.4s, v5.4s, v13.s[2] - OP_ri v25.4s, v4.4s, v13.s[2] - OP_ir v25.4s, v5.4s, v12.s[2] - - OP_rr v26.4s, v6.4s, v12.s[2] - OP_ii v26.4s, v7.4s, v13.s[2] - OP_ri v27.4s, v6.4s, v13.s[2] - OP_ir v27.4s, v7.4s, v12.s[2] - - OP_rr v28.4s, v4.4s, v12.s[3] - OP_ii v28.4s, v5.4s, v13.s[3] - OP_ri v29.4s, v4.4s, v13.s[3] - OP_ir v29.4s, v5.4s, v12.s[3] - - OP_rr v30.4s, v6.4s, v12.s[3] - OP_ii v30.4s, v7.4s, v13.s[3] - OP_ri v31.4s, v6.4s, v13.s[3] - OP_ir v31.4s, v7.4s, v12.s[3] - + OP_rr v24.4s, v4.4s, v14.s[0] + OP_ii v24.4s, v5.4s, v15.s[0] + OP_ri v25.4s, v4.4s, v15.s[0] + OP_ir v25.4s, v5.4s, v14.s[0] + + OP_rr v26.4s, v6.4s, v14.s[0] + OP_ii v26.4s, v7.4s, v15.s[0] + OP_ri v27.4s, v6.4s, v15.s[0] + OP_ir v27.4s, v7.4s, v14.s[0] + + OP_rr v28.4s, v4.4s, v14.s[1] + OP_ii v28.4s, v5.4s, v15.s[1] + OP_ri v29.4s, v4.4s, v15.s[1] + OP_ir v29.4s, v5.4s, v14.s[1] + + OP_rr v30.4s, v6.4s, v14.s[1] + OP_ii v30.4s, v7.4s, v15.s[1] + OP_ri v31.4s, v6.4s, v15.s[1] + OP_ir v31.4s, v7.4s, v14.s[1] .endm .macro KERNEL8x4_SUB - ld2 {v8.4s, v9.4s}, [pB] - add pB, pB, #32 + ld2 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 - ld2 {v2.4s, v3.4s}, [pA] - add pA, pA, #32 OP_rr v16.4s, v0.4s, v8.s[0] OP_ii v16.4s, v1.4s, v9.s[0] OP_ri v17.4s, v0.4s, v9.s[0] OP_ir v17.4s, v1.4s, v8.s[0] - OP_rr v18.4s, v2.4s, v8.s[0] - OP_ii v18.4s, v3.4s, v9.s[0] - OP_ri v19.4s, v2.4s, v9.s[0] - OP_ir v19.4s, v3.4s, v8.s[0] + ld2 {v2.4s, v3.4s}, 
[pA] + add pA, pA, #32 OP_rr v20.4s, v0.4s, v8.s[1] OP_ii v20.4s, v1.4s, v9.s[1] OP_ri v21.4s, v0.4s, v9.s[1] OP_ir v21.4s, v1.4s, v8.s[1] + ld2 {v10.2s, v11.2s}, [pB] + add pB, pB, #16 + + OP_rr v18.4s, v2.4s, v8.s[0] + OP_ii v18.4s, v3.4s, v9.s[0] + OP_ri v19.4s, v2.4s, v9.s[0] + OP_ir v19.4s, v3.4s, v8.s[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + OP_rr v22.4s, v2.4s, v8.s[1] OP_ii v22.4s, v3.4s, v9.s[1] OP_ri v23.4s, v2.4s, v9.s[1] OP_ir v23.4s, v3.4s, v8.s[1] - OP_rr v24.4s, v0.4s, v8.s[2] - OP_ii v24.4s, v1.4s, v9.s[2] - OP_ri v25.4s, v0.4s, v9.s[2] - OP_ir v25.4s, v1.4s, v8.s[2] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] - OP_rr v26.4s, v2.4s, v8.s[2] - OP_ii v26.4s, v3.4s, v9.s[2] - OP_ri v27.4s, v2.4s, v9.s[2] - OP_ir v27.4s, v3.4s, v8.s[2] + OP_rr v24.4s, v0.4s, v10.s[0] + OP_ii v24.4s, v1.4s, v11.s[0] + OP_ri v25.4s, v0.4s, v11.s[0] + OP_ir v25.4s, v1.4s, v10.s[0] - OP_rr v28.4s, v0.4s, v8.s[3] - OP_ii v28.4s, v1.4s, v9.s[3] - OP_ri v29.4s, v0.4s, v9.s[3] - OP_ir v29.4s, v1.4s, v8.s[3] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] - OP_rr v30.4s, v2.4s, v8.s[3] - OP_ii v30.4s, v3.4s, v9.s[3] - OP_ri v31.4s, v2.4s, v9.s[3] - OP_ir v31.4s, v3.4s, v8.s[3] + OP_rr v26.4s, v2.4s, v10.s[0] + OP_ii v26.4s, v3.4s, v11.s[0] + OP_ri v27.4s, v2.4s, v11.s[0] + OP_ir v27.4s, v3.4s, v10.s[0] + OP_rr v28.4s, v0.4s, v10.s[1] + OP_ii v28.4s, v1.4s, v11.s[1] + OP_ri v29.4s, v0.4s, v11.s[1] + OP_ir v29.4s, v1.4s, v10.s[1] + + OP_rr v30.4s, v2.4s, v10.s[1] + OP_ii v30.4s, v3.4s, v11.s[1] + OP_ri v31.4s, v2.4s, v11.s[1] + OP_ir v31.4s, v3.4s, v10.s[1] .endm .macro SAVE8x4 - mov pCRow1, pCRow0 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] fmul v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I - fmul v1.4s, v16.4s, alphaV1_I - fmla v1.4s, v17.4s, alphaV1_R - st2 {v0.4s, v1.4s}, [pCRow1] - - add pCRow2, pCRow1, #32 + fmul v1.4s, v16.4s, alphaV0_I + fmla v1.4s, v17.4s, alphaV0_R + st2 {v0.4s, v1.4s}, [pCRow0] + add pCRow0, pCRow0, #32 fmul v2.4s, 
v18.4s, alphaV0_R fmls v2.4s, v19.4s, alphaV0_I - fmul v3.4s, v18.4s, alphaV1_I - fmla v3.4s, v19.4s, alphaV1_R - st2 {v2.4s, v3.4s}, [pCRow2] - - add pCRow1, pCRow1, LDC + fmul v3.4s, v18.4s, alphaV0_I + fmla v3.4s, v19.4s, alphaV0_R + st2 {v2.4s, v3.4s}, [pCRow0] + add pCRow0, pCRow0, #32 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] fmul v4.4s, v20.4s, alphaV0_R fmls v4.4s, v21.4s, alphaV0_I - fmul v5.4s, v20.4s, alphaV1_I - fmla v5.4s, v21.4s, alphaV1_R + fmul v5.4s, v20.4s, alphaV0_I + fmla v5.4s, v21.4s, alphaV0_R st2 {v4.4s, v5.4s}, [pCRow1] - add pCRow2, pCRow1, #32 - + add pCRow1, pCRow1, #32 fmul v6.4s, v22.4s, alphaV0_R fmls v6.4s, v23.4s, alphaV0_I - fmul v7.4s, v22.4s, alphaV1_I - fmla v7.4s, v23.4s, alphaV1_R - st2 {v6.4s, v7.4s}, [pCRow2] - - add pCRow1, pCRow1, LDC + fmul v7.4s, v22.4s, alphaV0_I + fmla v7.4s, v23.4s, alphaV0_R + st2 {v6.4s, v7.4s}, [pCRow1] + add pCRow1, pCRow1, #32 + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] fmul v0.4s, v24.4s, alphaV0_R fmls v0.4s, v25.4s, alphaV0_I - fmul v1.4s, v24.4s, alphaV1_I - fmla v1.4s, v25.4s, alphaV1_R - st2 {v0.4s, v1.4s}, [pCRow1] - - add pCRow2, pCRow1, #32 + fmul v1.4s, v24.4s, alphaV0_I + fmla v1.4s, v25.4s, alphaV0_R + st2 {v0.4s, v1.4s}, [pCRow2] + add pCRow2, pCRow2, #32 fmul v2.4s, v26.4s, alphaV0_R fmls v2.4s, v27.4s, alphaV0_I - fmul v3.4s, v26.4s, alphaV1_I - fmla v3.4s, v27.4s, alphaV1_R + fmul v3.4s, v26.4s, alphaV0_I + fmla v3.4s, v27.4s, alphaV0_R st2 {v2.4s, v3.4s}, [pCRow2] - add pCRow1, pCRow1, LDC - + add pCRow2, pCRow2, #32 + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] fmul v4.4s, v28.4s, alphaV0_R fmls v4.4s, v29.4s, alphaV0_I - fmul v5.4s, v28.4s, alphaV1_I - fmla v5.4s, v29.4s, alphaV1_R - st2 {v4.4s, v5.4s}, [pCRow1] - - add pCRow2, pCRow1, #32 + fmul v5.4s, v28.4s, alphaV0_I + fmla v5.4s, v29.4s, alphaV0_R + st2 {v4.4s, v5.4s}, [pCRow3] + add pCRow3, pCRow3, #32 fmul v6.4s, v30.4s, alphaV0_R fmls v6.4s, v31.4s, alphaV0_I - fmul v7.4s, v30.4s, alphaV1_I - fmla v7.4s, v31.4s, alphaV1_R - st2 
{v6.4s, v7.4s}, [pCRow2] + fmul v7.4s, v30.4s, alphaV0_I + fmla v7.4s, v31.4s, alphaV0_R + st2 {v6.4s, v7.4s}, [pCRow3] - add pCRow0, pCRow0, #64 + add pCRow3, pCRow3, #32 .endm /******************************************************************************/ @@ -722,13 +763,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x4 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 fmul v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I - fmul v1.4s, v16.4s, alphaV1_I - fmla v1.4s, v17.4s, alphaV1_R + fmul v1.4s, v16.4s, alphaV0_I + fmla v1.4s, v17.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow1, pCRow1, LDC @@ -736,8 +780,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmul v4.4s, v20.4s, alphaV0_R fmls v4.4s, v21.4s, alphaV0_I - fmul v5.4s, v20.4s, alphaV1_I - fmla v5.4s, v21.4s, alphaV1_R + fmul v5.4s, v20.4s, alphaV0_I + fmla v5.4s, v21.4s, alphaV0_R st2 {v4.4s, v5.4s}, [pCRow1] add pCRow1, pCRow1, LDC @@ -745,8 +789,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmul v0.4s, v24.4s, alphaV0_R fmls v0.4s, v25.4s, alphaV0_I - fmul v1.4s, v24.4s, alphaV1_I - fmla v1.4s, v25.4s, alphaV1_R + fmul v1.4s, v24.4s, alphaV0_I + fmla v1.4s, v25.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow1, pCRow1, LDC @@ -754,8 +798,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmul v4.4s, v28.4s, alphaV0_R fmls v4.4s, v29.4s, alphaV0_I - fmul v5.4s, v28.4s, alphaV1_I - fmla v5.4s, v29.4s, alphaV1_R + fmul v5.4s, v28.4s, alphaV0_I + fmla v5.4s, v29.4s, alphaV0_R st2 {v4.4s, v5.4s}, [pCRow1] add pCRow0, pCRow0, #32 @@ -802,13 +846,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro SAVE2x4 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 fmul v0.2s, v16.2s, alphaV0_R fmls v0.2s, v17.2s, alphaV0_I - fmul v1.2s, v16.2s, alphaV1_I - fmla v1.2s, v17.2s, alphaV1_R + fmul v1.2s, v16.2s, alphaV0_I + fmla v1.2s, v17.2s, alphaV0_R st2 {v0.2s, v1.2s}, [pCRow1] add pCRow1, pCRow1, LDC @@ -816,8 +863,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmul v4.2s, v20.2s, alphaV0_R fmls v4.2s, v21.2s, alphaV0_I - fmul v5.2s, v20.2s, alphaV1_I - fmla v5.2s, v21.2s, alphaV1_R + fmul v5.2s, v20.2s, alphaV0_I + fmla v5.2s, v21.2s, alphaV0_R st2 {v4.2s, v5.2s}, [pCRow1] add pCRow1, pCRow1, LDC @@ -825,8 +872,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmul v0.2s, v24.2s, alphaV0_R fmls v0.2s, v25.2s, alphaV0_I - fmul v1.2s, v24.2s, alphaV1_I - fmla v1.2s, v25.2s, alphaV1_R + fmul v1.2s, v24.2s, alphaV0_I + fmla v1.2s, v25.2s, alphaV0_R st2 {v0.2s, v1.2s}, [pCRow1] add pCRow1, pCRow1, LDC @@ -834,8 +881,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmul v4.2s, v28.2s, alphaV0_R fmls v4.2s, v29.2s, alphaV0_I - fmul v5.2s, v28.2s, alphaV1_I - fmla v5.2s, v29.2s, alphaV1_R + fmul v5.2s, v28.2s, alphaV0_I + fmla v5.2s, v29.2s, alphaV0_R st2 {v4.2s, v5.2s}, [pCRow1] add pCRow0, pCRow0, #16 @@ -882,13 +929,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x4 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 fmul s0, s16, alphaV0_R fmls s0, s17, alphaV0_I - fmul s1, s16, alphaV1_I - fmla s1, s17, alphaV1_R + fmul s1, s16, alphaV0_I + fmla s1, s17, alphaV0_R st2 {v0.s, v1.s}[0], [pCRow1] add pCRow1, pCRow1, LDC @@ -896,8 +946,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
fmul s4, s20, alphaV0_R fmls s4, s21, alphaV0_I - fmul s5, s20, alphaV1_I - fmla s5, s21, alphaV1_R + fmul s5, s20, alphaV0_I + fmla s5, s21, alphaV0_R st2 {v4.s, v5.s}[0], [pCRow1] add pCRow1, pCRow1, LDC @@ -905,8 +955,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmul s0, s24, alphaV0_R fmls s0, s25, alphaV0_I - fmul s1, s24, alphaV1_I - fmla s1, s25, alphaV1_R + fmul s1, s24, alphaV0_I + fmla s1, s25, alphaV0_R st2 {v0.s, v1.s}[0], [pCRow1] add pCRow1, pCRow1, LDC @@ -914,8 +964,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmul s4, s28, alphaV0_R fmls s4, s29, alphaV0_I - fmul s5, s28, alphaV1_I - fmla s5, s29, alphaV1_R + fmul s5, s28, alphaV0_I + fmla s5, s29, alphaV0_R st2 {v4.s, v5.s}[0], [pCRow1] add pCRow0, pCRow0, #8 @@ -964,13 +1014,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE8x2 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 fmul v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I - fmul v1.4s, v16.4s, alphaV1_I - fmla v1.4s, v17.4s, alphaV1_R + fmul v1.4s, v16.4s, alphaV0_I + fmla v1.4s, v17.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow2, pCRow1, #32 @@ -978,8 +1031,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmul v2.4s, v18.4s, alphaV0_R fmls v2.4s, v19.4s, alphaV0_I - fmul v3.4s, v18.4s, alphaV1_I - fmla v3.4s, v19.4s, alphaV1_R + fmul v3.4s, v18.4s, alphaV0_I + fmla v3.4s, v19.4s, alphaV0_R st2 {v2.4s, v3.4s}, [pCRow2] add pCRow1, pCRow1, LDC @@ -987,8 +1040,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmul v4.4s, v20.4s, alphaV0_R fmls v4.4s, v21.4s, alphaV0_I - fmul v5.4s, v20.4s, alphaV1_I - fmla v5.4s, v21.4s, alphaV1_R + fmul v5.4s, v20.4s, alphaV0_I + fmla v5.4s, v21.4s, alphaV0_R st2 {v4.4s, v5.4s}, [pCRow1] add pCRow2, pCRow1, #32 @@ -996,8 +1049,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
fmul v6.4s, v22.4s, alphaV0_R fmls v6.4s, v23.4s, alphaV0_I - fmul v7.4s, v22.4s, alphaV1_I - fmla v7.4s, v23.4s, alphaV1_R + fmul v7.4s, v22.4s, alphaV0_I + fmla v7.4s, v23.4s, alphaV0_R st2 {v6.4s, v7.4s}, [pCRow2] add pCRow0, pCRow0, #64 @@ -1030,13 +1083,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x2 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 fmul v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I - fmul v1.4s, v16.4s, alphaV1_I - fmla v1.4s, v17.4s, alphaV1_R + fmul v1.4s, v16.4s, alphaV0_I + fmla v1.4s, v17.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow1, pCRow1, LDC @@ -1044,8 +1100,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmul v4.4s, v20.4s, alphaV0_R fmls v4.4s, v21.4s, alphaV0_I - fmul v5.4s, v20.4s, alphaV1_I - fmla v5.4s, v21.4s, alphaV1_R + fmul v5.4s, v20.4s, alphaV0_I + fmla v5.4s, v21.4s, alphaV0_R st2 {v4.4s, v5.4s}, [pCRow1] add pCRow0, pCRow0, #32 @@ -1078,13 +1134,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE2x2 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 fmul v0.2s, v16.2s, alphaV0_R fmls v0.2s, v17.2s, alphaV0_I - fmul v1.2s, v16.2s, alphaV1_I - fmla v1.2s, v17.2s, alphaV1_R + fmul v1.2s, v16.2s, alphaV0_I + fmla v1.2s, v17.2s, alphaV0_R st2 {v0.2s, v1.2s}, [pCRow1] add pCRow1, pCRow1, LDC @@ -1092,8 +1151,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmul v4.2s, v20.2s, alphaV0_R fmls v4.2s, v21.2s, alphaV0_I - fmul v5.2s, v20.2s, alphaV1_I - fmla v5.2s, v21.2s, alphaV1_R + fmul v5.2s, v20.2s, alphaV0_I + fmla v5.2s, v21.2s, alphaV0_R st2 {v4.2s, v5.2s}, [pCRow1] add pCRow0, pCRow0, #16 @@ -1126,13 +1185,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro SAVE1x2 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 fmul s0, s16, alphaV0_R fmls s0, s17, alphaV0_I - fmul s1, s16, alphaV1_I - fmla s1, s17, alphaV1_R + fmul s1, s16, alphaV0_I + fmla s1, s17, alphaV0_R st2 {v0.s, v1.s}[0], [pCRow1] add pCRow1, pCRow1, LDC @@ -1140,8 +1202,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmul s4, s20, alphaV0_R fmls s4, s21, alphaV0_I - fmul s5, s20, alphaV1_I - fmla s5, s21, alphaV1_R + fmul s5, s20, alphaV0_I + fmla s5, s21, alphaV0_R st2 {v4.s, v5.s}[0], [pCRow1] add pCRow0, pCRow0, #8 @@ -1176,13 +1238,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE8x1 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 fmul v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I - fmul v1.4s, v16.4s, alphaV1_I - fmla v1.4s, v17.4s, alphaV1_R + fmul v1.4s, v16.4s, alphaV0_I + fmla v1.4s, v17.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow1, pCRow1, #32 @@ -1190,8 +1255,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmul v2.4s, v18.4s, alphaV0_R fmls v2.4s, v19.4s, alphaV0_I - fmul v3.4s, v18.4s, alphaV1_I - fmla v3.4s, v19.4s, alphaV1_R + fmul v3.4s, v18.4s, alphaV0_I + fmla v3.4s, v19.4s, alphaV0_R st2 {v2.4s, v3.4s}, [pCRow1] add pCRow0, pCRow0, #64 @@ -1218,13 +1283,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x1 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 fmul v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I - fmul v1.4s, v16.4s, alphaV1_I - fmla v1.4s, v17.4s, alphaV1_R + fmul v1.4s, v16.4s, alphaV0_I + fmla v1.4s, v17.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow0, pCRow0, #32 @@ -1250,13 +1318,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro SAVE2x1 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 fmul v0.2s, v16.2s, alphaV0_R fmls v0.2s, v17.2s, alphaV0_I - fmul v1.2s, v16.2s, alphaV1_I - fmla v1.2s, v17.2s, alphaV1_R + fmul v1.2s, v16.2s, alphaV0_I + fmla v1.2s, v17.2s, alphaV0_R st2 {v0.2s, v1.2s}, [pCRow1] add pCRow0, pCRow0, #16 @@ -1283,13 +1354,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x1 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 fmul s0, s16, alphaV0_R fmls s0, s17, alphaV0_I - fmul s1, s16, alphaV1_I - fmla s1, s17, alphaV1_R + fmul s1, s16, alphaV0_I + fmla s1, s17, alphaV0_R st2 {v0.s, v1.s}[0], [pCRow1] add pCRow0, pCRow0, #8 @@ -1315,10 +1389,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] - fmov alpha0_R, s0 - fmov alpha0_I, s1 - fmov alpha1_R, s0 - fmov alpha1_I, s1 + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alphaR, s0 + fmov alphaI, s1 lsl LDC, LDC, #3 // ldc = ldc * 8 @@ -1335,8 +1410,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /******************************************************************************/ ctrmm_kernel_L4_BEGIN: - mov pCRow0, pC // pCRow0 = C - add pC, pC, LDC, lsl #2 + mov pCRow0, pC + add pCRow1, pCRow0, LDC + add pCRow2, pCRow1, LDC + add pCRow3, pCRow2, LDC + + add pC, pCRow3, LDC + #if defined(LEFT) mov tempOffset, offset @@ -1370,40 +1450,64 @@ ctrmm_kernel_L4_M8_20: add tempK, tempOffset, #4 #endif - asr counterL , tempK, #1 // L = K / 2 - cmp counterL , #2 // is there at least 4 to do? 
+ asr counterL , tempK, #3 + cmp counterL , #2 blt ctrmm_kernel_L4_M8_32 - KERNEL8x4_I // do one in the K - KERNEL8x4_M2 // do another in the K + KERNEL8x4_I + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 subs counterL, counterL, #2 // subtract 2 ble ctrmm_kernel_L4_M8_22a - .align 5 + .align 5 ctrmm_kernel_L4_M8_22: KERNEL8x4_M1 KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 subs counterL, counterL, #1 bgt ctrmm_kernel_L4_M8_22 - + .align 5 ctrmm_kernel_L4_M8_22a: + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_E b ctrmm_kernel_L4_M8_44 + .align 5 ctrmm_kernel_L4_M8_32: tst counterL, #1 ble ctrmm_kernel_L4_M8_40 KERNEL8x4_I - + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 KERNEL8x4_E b ctrmm_kernel_L4_M8_44 @@ -1414,13 +1518,17 @@ ctrmm_kernel_L4_M8_40: ctrmm_kernel_L4_M8_44: - ands counterL , tempK, #1 + ands counterL , tempK, #7 ble ctrmm_kernel_L4_M8_100 + .align 5 ctrmm_kernel_L4_M8_46: KERNEL8x4_SUB + subs counterL, counterL, #1 + bne ctrmm_kernel_L4_M8_46 + ctrmm_kernel_L4_M8_100: SAVE8x4 @@ -1440,6 +1548,9 @@ ctrmm_kernel_L4_M8_100: #if defined(LEFT) add tempOffset, tempOffset, #8 #endif + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] ctrmm_kernel_L4_M8_END: subs counterI, counterI, #1 @@ -1454,9 +1565,8 @@ ctrmm_kernel_L4_M4_BEGIN: tst counterI, #4 ble ctrmm_kernel_L4_M2_BEGIN -ctrmm_kernel_L4_M4_20: - INIT4x4 +ctrmm_kernel_L4_M4_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB @@ -1475,38 +1585,47 @@ ctrmm_kernel_L4_M4_20: add tempK, tempOffset, #4 #endif - asr counterL , tempK, #3 // counterL = counterL / 8 - cmp counterL , #0 - ble ctrmm_kernel_L4_M4_40 + asr counterL , tempK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? 
+ blt ctrmm_kernel_L4_M4_32 -ctrmm_kernel_L4_M4_22: + KERNEL4x4_I // do one in the K + KERNEL4x4_M2 // do another in the K - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB + subs counterL, counterL, #2 + ble ctrmm_kernel_L4_M4_22a + .align 5 - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB + +ctrmm_kernel_L4_M4_22: + + KERNEL4x4_M1 + KERNEL4x4_M2 subs counterL, counterL, #1 bgt ctrmm_kernel_L4_M4_22 - +ctrmm_kernel_L4_M4_22a: + KERNEL4x4_M1 + KERNEL4x4_E + b ctrmm_kernel_L4_M4_44 +ctrmm_kernel_L4_M4_32: + tst counterL, #1 + ble ctrmm_kernel_L4_M4_40 + KERNEL4x4_I + KERNEL4x4_E + b ctrmm_kernel_L4_M4_44 ctrmm_kernel_L4_M4_40: - ands counterL , tempK, #7 // counterL = counterL % 8 - ble ctrmm_kernel_L4_M4_100 + INIT4x4 -ctrmm_kernel_L4_M4_42: +ctrmm_kernel_L4_M4_44: + ands counterL , tempK, #1 + ble ctrmm_kernel_L4_M4_100 +ctrmm_kernel_L4_M4_46: KERNEL4x4_SUB - subs counterL, counterL, #1 - bgt ctrmm_kernel_L4_M4_42 - ctrmm_kernel_L4_M4_100: SAVE4x4 @@ -1528,7 +1647,6 @@ ctrmm_kernel_L4_M4_100: ctrmm_kernel_L4_M4_END: - ctrmm_kernel_L4_M2_BEGIN: mov counterI, origM diff --git a/kernel/arm64/dtrmm_kernel_8x4.S b/kernel/arm64/dtrmm_kernel_8x4.S index b06c7560d..2b8173715 100644 --- a/kernel/arm64/dtrmm_kernel_8x4.S +++ b/kernel/arm64/dtrmm_kernel_8x4.S @@ -46,19 +46,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define pCRow0 x12 #define pCRow1 x13 #define pCRow2 x14 -#define pA x15 -#define temp x16 -#define tempOffset x17 -#define tempK x18 +#define pCRow3 x15 +#define pA x16 +#define alpha x17 +#define temp x18 +#define tempOffset x19 +#define tempK x20 #define alpha0 d10 #define alphaV0 v10.d[0] -#define alpha1 d11 -#define alphaV1 v11.d[0] -#define alpha2 d14 -#define alphaV2 v14.d[0] -#define alpha3 d15 -#define alphaV3 v15.d[0] + +#define A_PRE_SIZE 2560 +#define B_PRE_SIZE 448 +#define C_PRE_SIZE 128 // 00 origM // 01 origN @@ -101,14 +101,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //v05 pA1_2, pA1_3 //v06 pA1_4, pA1_5 //v07 pA1_6, pA1_7 -//v08 must save pB0_0, pB0_1 -//v09 must save pB0_2, pB0_3 -//v10 must save ALPHA0 -//v11 must save ALPHA1 -//v12 must save pB1_0, pB1_1 -//v13 must save pB1_2, pB1_3 -//v14 must save ALPHA2 -//v15 must save ALPHA3 +//v08 must save pB0_0 +//v09 must save pB0_1 +//v10 must save pB0_2 --> ALPHA0 +//v11 must save pB0_3 +//v12 must save pB1_0 +//v13 must save pB1_1 +//v14 must save pB1_2 +//v15 must save pB1_3 //v16 must save C00, C01 //v17 must save C02, C03 //v18 C04, C05 @@ -150,186 +150,249 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL8x4_I - ld1 {v0.2d, v1.2d}, [pA] - add pA, pA, #32 - ld1 {v8.2d, v9.2d}, [pB] - add pB, pB, #32 - ld1 {v2.2d, v3.2d}, [pA] - add pA, pA, #32 + ldp q0, q1, [pA], #32 + + ldp d8, d9, [pB], #16 fmul v16.2d, v0.2d, v8.d[0] + fmul v20.2d, v0.2d, v9.d[0] + + ldp d10, d11, [pB], #16 + fmul v17.2d, v1.2d, v8.d[0] + fmul v21.2d, v1.2d, v9.d[0] + + ldp q2, q3, [pA], #32 + + fmul v24.2d, v0.2d, v10.d[0] + fmul v28.2d, v0.2d, v11.d[0] + + ldp q4, q5, [pA], #32 + + fmul v25.2d, v1.2d, v10.d[0] + fmul v29.2d, v1.2d, v11.d[0] + + ldp d12, d13, [pB], #16 + fmul v18.2d, v2.2d, v8.d[0] - fmul v19.2d, v3.2d, v8.d[0] + fmul v22.2d, v2.2d, v9.d[0] - fmul v20.2d, v0.2d, v8.d[1] - fmul v21.2d, v1.2d, v8.d[1] - fmul v22.2d, v2.2d, v8.d[1] - fmul v23.2d, v3.2d, v8.d[1] + ldp d14, d15, [pB], #16 - fmul v24.2d, v0.2d, v9.d[0] - fmul v25.2d, v1.2d, v9.d[0] - fmul v26.2d, v2.2d, v9.d[0] - fmul v27.2d, v3.2d, v9.d[0] + fmul v26.2d, v2.2d, v10.d[0] + fmul v30.2d, v2.2d, v11.d[0] - fmul v28.2d, v0.2d, v9.d[1] - fmul v29.2d, v1.2d, v9.d[1] - fmul v30.2d, v2.2d, v9.d[1] - fmul v31.2d, v3.2d, v9.d[1] + ldp q6, q7, [pA], #32 - ld1 {v4.2d, v5.2d}, [pA] - add pA, pA, #32 - ld1 {v12.2d, v13.2d}, [pB] - add pB, pB, #32 - ld1 {v6.2d, v7.2d}, [pA] - add pA, pA, #32 + fmul v19.2d, v3.2d, v8.d[0] + fmul v27.2d, v3.2d, v10.d[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + + fmul v31.2d, v3.2d, v11.d[0] + fmul v23.2d, v3.2d, v9.d[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] .endm .macro KERNEL8x4_M1 fmla v16.2d, v0.2d, v8.d[0] + fmla v20.2d, v0.2d, v9.d[0] + + ldp q4, q5, [pA], #32 + + fmla v24.2d, v0.2d, v10.d[0] + fmla v28.2d, v0.2d, v11.d[0] + + ldp d12, d13, [pB], #16 + fmla v17.2d, v1.2d, v8.d[0] - fmla v18.2d, v2.2d, v8.d[0] - fmla v19.2d, v3.2d, v8.d[0] + fmla v25.2d, v1.2d, v10.d[0] - fmla v20.2d, v0.2d, v8.d[1] - fmla v21.2d, v1.2d, v8.d[1] - fmla v22.2d, v2.2d, v8.d[1] - fmla v23.2d, v3.2d, v8.d[1] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] - fmla v24.2d, v0.2d, v9.d[0] - fmla v25.2d, v1.2d, 
v9.d[0] - fmla v26.2d, v2.2d, v9.d[0] - fmla v27.2d, v3.2d, v9.d[0] + fmla v21.2d, v1.2d, v9.d[0] + fmla v29.2d, v1.2d, v11.d[0] - fmla v28.2d, v0.2d, v9.d[1] - fmla v29.2d, v1.2d, v9.d[1] - fmla v30.2d, v2.2d, v9.d[1] - fmla v31.2d, v3.2d, v9.d[1] + ldp d14, d15, [pB], #16 - ld1 {v4.2d, v5.2d}, [pA] - add pA, pA, #32 - ld1 {v12.2d, v13.2d}, [pB] - add pB, pB, #32 - ld1 {v6.2d, v7.2d}, [pA] - add pA, pA, #32 + fmla v18.2d, v2.2d, v8.d[0] + fmla v22.2d, v2.2d, v9.d[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + + fmla v26.2d, v2.2d, v10.d[0] + fmla v30.2d, v2.2d, v11.d[0] + fmla v19.2d, v3.2d, v8.d[0] + fmla v23.2d, v3.2d, v9.d[0] + + ldp q6, q7, [pA], #32 - prfm PLDL1KEEP, [pA, #512] + fmla v27.2d, v3.2d, v10.d[0] + fmla v31.2d, v3.2d, v11.d[0] .endm .macro KERNEL8x4_M2 fmla v16.2d, v4.2d, v12.d[0] + fmla v20.2d, v4.2d, v13.d[0] + fmla v24.2d, v4.2d, v14.d[0] + fmla v28.2d, v4.2d, v15.d[0] + + ldp q0, q1, [pA], #32 + fmla v17.2d, v5.2d, v12.d[0] + fmla v25.2d, v5.2d, v14.d[0] + + ldp d8, d9, [pB], #16 + + fmla v21.2d, v5.2d, v13.d[0] + fmla v29.2d, v5.2d, v15.d[0] + + ldp d10, d11, [pB], #16 + fmla v18.2d, v6.2d, v12.d[0] - fmla v19.2d, v7.2d, v12.d[0] + fmla v22.2d, v6.2d, v13.d[0] - fmla v20.2d, v4.2d, v12.d[1] - fmla v21.2d, v5.2d, v12.d[1] - fmla v22.2d, v6.2d, v12.d[1] - fmla v23.2d, v7.2d, v12.d[1] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] - fmla v24.2d, v4.2d, v13.d[0] - fmla v25.2d, v5.2d, v13.d[0] - fmla v26.2d, v6.2d, v13.d[0] - fmla v27.2d, v7.2d, v13.d[0] + fmla v26.2d, v6.2d, v14.d[0] + fmla v30.2d, v6.2d, v15.d[0] - fmla v28.2d, v4.2d, v13.d[1] - fmla v29.2d, v5.2d, v13.d[1] - fmla v30.2d, v6.2d, v13.d[1] - fmla v31.2d, v7.2d, v13.d[1] + fmla v19.2d, v7.2d, v12.d[0] + fmla v23.2d, v7.2d, v13.d[0] - ld1 {v0.2d, v1.2d}, [pA] - add pA, pA, #32 - ld1 {v8.2d, v9.2d}, [pB] - add pB, pB, #32 - ld1 {v2.2d, v3.2d}, [pA] - add pA, pA, #32 + ldp q2, q3, [pA], #32 - prfm PLDL1KEEP, [pB, #512] + fmla v27.2d, v7.2d, v14.d[0] + fmla v31.2d, v7.2d, v15.d[0] .endm .macro 
KERNEL8x4_E fmla v16.2d, v4.2d, v12.d[0] + fmla v20.2d, v4.2d, v13.d[0] + fmla v24.2d, v4.2d, v14.d[0] + fmla v28.2d, v4.2d, v15.d[0] + fmla v17.2d, v5.2d, v12.d[0] - fmla v18.2d, v6.2d, v12.d[0] - fmla v19.2d, v7.2d, v12.d[0] + fmla v25.2d, v5.2d, v14.d[0] + fmla v21.2d, v5.2d, v13.d[0] + fmla v29.2d, v5.2d, v15.d[0] - fmla v20.2d, v4.2d, v12.d[1] - fmla v21.2d, v5.2d, v12.d[1] - fmla v22.2d, v6.2d, v12.d[1] - fmla v23.2d, v7.2d, v12.d[1] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] - fmla v24.2d, v4.2d, v13.d[0] - fmla v25.2d, v5.2d, v13.d[0] - fmla v26.2d, v6.2d, v13.d[0] - fmla v27.2d, v7.2d, v13.d[0] + fmla v18.2d, v6.2d, v12.d[0] + fmla v22.2d, v6.2d, v13.d[0] + fmla v26.2d, v6.2d, v14.d[0] + fmla v30.2d, v6.2d, v15.d[0] - fmla v28.2d, v4.2d, v13.d[1] - fmla v29.2d, v5.2d, v13.d[1] - fmla v30.2d, v6.2d, v13.d[1] - fmla v31.2d, v7.2d, v13.d[1] + fmla v19.2d, v7.2d, v12.d[0] + fmla v23.2d, v7.2d, v13.d[0] + fmla v27.2d, v7.2d, v14.d[0] + fmla v31.2d, v7.2d, v15.d[0] .endm .macro KERNEL8x4_SUB - ld1 {v0.2d, v1.2d}, [pA] - add pA, pA, #32 - ld1 {v8.2d, v9.2d}, [pB] - add pB, pB, #32 - ld1 {v2.2d, v3.2d}, [pA] - add pA, pA, #32 + ldp q0, q1, [pA], #32 + + ldp d8, d9, [pB], #16 fmla v16.2d, v0.2d, v8.d[0] + fmla v20.2d, v0.2d, v9.d[0] + + ldp d10, d11, [pB], #16 + fmla v17.2d, v1.2d, v8.d[0] + fmla v21.2d, v1.2d, v9.d[0] + + ldp q2, q3, [pA], #32 + + fmla v24.2d, v0.2d, v10.d[0] + fmla v28.2d, v0.2d, v11.d[0] + + fmla v25.2d, v1.2d, v10.d[0] + fmla v29.2d, v1.2d, v11.d[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla v18.2d, v2.2d, v8.d[0] - fmla v19.2d, v3.2d, v8.d[0] + fmla v22.2d, v2.2d, v9.d[0] - fmla v20.2d, v0.2d, v8.d[1] - fmla v21.2d, v1.2d, v8.d[1] - fmla v22.2d, v2.2d, v8.d[1] - fmla v23.2d, v3.2d, v8.d[1] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] - fmla v24.2d, v0.2d, v9.d[0] - fmla v25.2d, v1.2d, v9.d[0] - fmla v26.2d, v2.2d, v9.d[0] - fmla v27.2d, v3.2d, v9.d[0] + fmla v26.2d, v2.2d, v10.d[0] + fmla v30.2d, v2.2d, v11.d[0] - fmla v28.2d, v0.2d, v9.d[1] - fmla 
v29.2d, v1.2d, v9.d[1] - fmla v30.2d, v2.2d, v9.d[1] - fmla v31.2d, v3.2d, v9.d[1] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + fmla v19.2d, v3.2d, v8.d[0] + fmla v27.2d, v3.2d, v10.d[0] + + fmla v31.2d, v3.2d, v11.d[0] + fmla v23.2d, v3.2d, v9.d[0] .endm .macro SAVE8x4 - add pCRow1, pCRow0, LDC + fmov alpha0, alpha + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] fmul v0.2d, v16.2d, alphaV0 - fmul v1.2d, v17.2d, alphaV1 - fmul v2.2d, v18.2d, alphaV2 - fmul v3.2d, v19.2d, alphaV3 - st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] + fmul v1.2d, v17.2d, alphaV0 + stp q0, q1, [pCRow0] - add pCRow2, pCRow1, LDC + add pCRow0, pCRow0, #32 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + fmul v2.2d, v18.2d, alphaV0 + fmul v3.2d, v19.2d, alphaV0 + stp q2, q3, [pCRow0] + + add pCRow0, pCRow0, #32 + + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] fmul v4.2d, v20.2d, alphaV0 - fmul v5.2d, v21.2d, alphaV1 - fmul v6.2d, v22.2d, alphaV2 - fmul v7.2d, v23.2d, alphaV3 - st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] + fmul v5.2d, v21.2d, alphaV0 + stp q4, q5, [pCRow1] - add pCRow1, pCRow2, LDC + add pCRow1, pCRow1, #32 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + fmul v6.2d, v22.2d, alphaV0 + fmul v7.2d, v23.2d, alphaV0 + stp q6, q7, [pCRow1] + + add pCRow1, pCRow1, #32 + + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] fmul v0.2d, v24.2d, alphaV0 - fmul v1.2d, v25.2d, alphaV1 - fmul v2.2d, v26.2d, alphaV2 - fmul v3.2d, v27.2d, alphaV3 - st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow2] + fmul v1.2d, v25.2d, alphaV0 + stp q0, q1, [pCRow2] + + add pCRow2, pCRow2, #32 + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + fmul v2.2d, v26.2d, alphaV0 + fmul v3.2d, v27.2d, alphaV0 + stp q2, q3, [pCRow2] + + add pCRow2, pCRow2, #32 + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] fmul v4.2d, v28.2d, alphaV0 - fmul v5.2d, v29.2d, alphaV1 - fmul v6.2d, v30.2d, alphaV2 - fmul v7.2d, v31.2d, alphaV3 - st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] + fmul v5.2d, v29.2d, alphaV0 + stp q4, q5, [pCRow3] - add pCRow0, pCRow0, #64 + add pCRow3, pCRow3, #32 + prfm 
PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + + fmul v6.2d, v30.2d, alphaV0 + fmul v7.2d, v31.2d, alphaV0 + stp q6, q7, [pCRow3] + + add pCRow3, pCRow3, #32 .endm /******************************************************************************/ @@ -365,26 +428,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x4 + fmov alpha0, alpha fmul v8.2d, v16.2d, alphaV0 - fmul v9.2d, v17.2d, alphaV1 + fmul v9.2d, v17.2d, alphaV0 st1 {v8.2d, v9.2d}, [pCRow0] add pCRow1, pCRow0, LDC - fmul v12.2d, v20.2d, alphaV2 - fmul v13.2d, v21.2d, alphaV3 + fmul v12.2d, v20.2d, alphaV0 + fmul v13.2d, v21.2d, alphaV0 st1 {v12.2d, v13.2d}, [pCRow1] add pCRow2, pCRow1, LDC fmul v8.2d, v24.2d, alphaV0 - fmul v9.2d, v25.2d, alphaV1 + fmul v9.2d, v25.2d, alphaV0 st1 {v8.2d, v9.2d}, [pCRow2] add pCRow1, pCRow2, LDC - fmul v12.2d, v28.2d, alphaV2 - fmul v13.2d, v29.2d, alphaV3 + fmul v12.2d, v28.2d, alphaV0 + fmul v13.2d, v29.2d, alphaV0 st1 {v12.2d, v13.2d}, [pCRow1] add pCRow0, pCRow0, #32 @@ -413,22 +477,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE2x4 + fmov alpha0, alpha fmul v8.2d, v16.2d, alphaV0 st1 {v8.2d}, [pCRow0] add pCRow1, pCRow0, LDC - fmul v12.2d, v20.2d, alphaV1 + fmul v12.2d, v20.2d, alphaV0 st1 {v12.2d}, [pCRow1] add pCRow2, pCRow1, LDC - fmul v8.2d, v24.2d, alphaV2 + fmul v8.2d, v24.2d, alphaV0 st1 {v8.2d}, [pCRow2] add pCRow1, pCRow2, LDC - fmul v12.2d, v28.2d, alphaV3 + fmul v12.2d, v28.2d, alphaV0 st1 {v12.2d}, [pCRow1] add pCRow0, pCRow0, #16 @@ -453,6 +518,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x4 + fmov alpha0, alpha + add pCRow1, pCRow0, LDC fmul v8.2d, v16.2d, alphaV0 @@ -462,7 +529,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
add pCRow2, pCRow1, LDC add pCRow1, pCRow2, LDC - fmul v12.2d, v20.2d, alphaV1 + fmul v12.2d, v20.2d, alphaV0 st1 {v12.d}[0], [pCRow2] st1 {v12.d}[1], [pCRow1] @@ -502,18 +569,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE8x2 + fmov alpha0, alpha add pCRow1, pCRow0, LDC fmul v0.2d, v16.2d, alphaV0 - fmul v1.2d, v17.2d, alphaV1 - fmul v2.2d, v18.2d, alphaV2 - fmul v3.2d, v19.2d, alphaV3 + fmul v1.2d, v17.2d, alphaV0 + fmul v2.2d, v18.2d, alphaV0 + fmul v3.2d, v19.2d, alphaV0 st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] fmul v4.2d, v20.2d, alphaV0 - fmul v5.2d, v21.2d, alphaV1 - fmul v6.2d, v22.2d, alphaV2 - fmul v7.2d, v23.2d, alphaV3 + fmul v5.2d, v21.2d, alphaV0 + fmul v6.2d, v22.2d, alphaV0 + fmul v7.2d, v23.2d, alphaV0 st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] add pCRow0, pCRow0, #64 @@ -541,14 +609,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x2 + fmov alpha0, alpha fmul v8.2d, v16.2d, alphaV0 - fmul v9.2d, v17.2d, alphaV1 + fmul v9.2d, v17.2d, alphaV0 st1 {v8.2d, v9.2d}, [pCRow0] add pCRow1, pCRow0, LDC - fmul v12.2d, v20.2d, alphaV2 - fmul v13.2d, v21.2d, alphaV3 + fmul v12.2d, v20.2d, alphaV0 + fmul v13.2d, v21.2d, alphaV0 st1 {v12.2d, v13.2d}, [pCRow1] add pCRow0, pCRow0, #32 @@ -573,12 +642,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE2x2 + fmov alpha0, alpha fmul v8.2d, v16.2d, alphaV0 st1 {v8.2d}, [pCRow0] add pCRow1 , pCRow0, LDC - fmul v12.2d, v20.2d, alphaV1 + fmul v12.2d, v20.2d, alphaV0 st1 {v12.2d}, [pCRow1] add pCRow0, pCRow0, #16 @@ -601,6 +671,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x2 + fmov alpha0, alpha add pCRow1 , pCRow0, LDC fmul v8.2d, v16.2d, alphaV0 @@ -636,10 +707,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro SAVE8x1 + fmov alpha0, alpha fmul v0.2d, v16.2d, alphaV0 - fmul v1.2d, v17.2d, alphaV1 - fmul v2.2d, v18.2d, alphaV2 - fmul v3.2d, v19.2d, alphaV3 + fmul v1.2d, v17.2d, alphaV0 + fmul v2.2d, v18.2d, alphaV0 + fmul v3.2d, v19.2d, alphaV0 st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] add pCRow0, pCRow0, #64 @@ -665,8 +737,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x1 + fmov alpha0, alpha fmul v8.2d, v16.2d, alphaV0 - fmul v9.2d, v17.2d, alphaV1 + fmul v9.2d, v17.2d, alphaV0 st1 {v8.2d, v9.2d}, [pCRow0] add pCRow0, pCRow0, #32 @@ -690,6 +763,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE2x1 + fmov alpha0, alpha fmul v8.2d, v16.2d, alphaV0 st1 {v8.2d}, [pCRow0] @@ -713,6 +787,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x1 + fmov alpha0, alpha fmul d8, d16, alpha0 str d8, [pCRow0] @@ -739,10 +814,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] - fmov alpha0, d0 - fmov alpha1, d0 - fmov alpha2, d0 - fmov alpha3, d0 + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alpha, d0 lsl LDC, LDC, #3 // ldc = ldc * 8 @@ -759,8 +834,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/******************************************************************************/ dtrmm_kernel_L4_BEGIN: - mov pCRow0, pC // pCRow0 = C - add pC, pC, LDC, lsl #2 + mov pCRow0, pC + add pCRow1, pCRow0, LDC + add pCRow2, pCRow1, LDC + add pCRow3, pCRow2, LDC + + add pC, pCRow3, LDC + #if defined(LEFT) mov tempOffset, offset @@ -774,6 +854,7 @@ dtrmm_kernel_L4_M8_BEGIN: cmp counterI, #0 ble dtrmm_kernel_L4_M4_BEGIN + .align 5 dtrmm_kernel_L4_M8_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -794,40 +875,64 @@ dtrmm_kernel_L4_M8_20: add tempK, tempOffset, #4 #endif - asr counterL , tempK, #1 // L = K / 2 + asr counterL , tempK, #3 // L = K / 8 cmp counterL , #2 // is there at least 4 to do? blt dtrmm_kernel_L4_M8_32 KERNEL8x4_I // do one in the K KERNEL8x4_M2 // do another in the K + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 subs counterL, counterL, #2 // subtract 2 ble dtrmm_kernel_L4_M8_22a - .align 5 + .align 5 dtrmm_kernel_L4_M8_22: KERNEL8x4_M1 KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 subs counterL, counterL, #1 bgt dtrmm_kernel_L4_M8_22 - + .align 5 dtrmm_kernel_L4_M8_22a: + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_E b dtrmm_kernel_L4_M8_44 + .align 5 dtrmm_kernel_L4_M8_32: tst counterL, #1 ble dtrmm_kernel_L4_M8_40 KERNEL8x4_I - + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 KERNEL8x4_E b dtrmm_kernel_L4_M8_44 @@ -838,13 +943,17 @@ dtrmm_kernel_L4_M8_40: dtrmm_kernel_L4_M8_44: - ands counterL , tempK, #1 + ands counterL , tempK, #7 ble dtrmm_kernel_L4_M8_100 + .align 5 dtrmm_kernel_L4_M8_46: KERNEL8x4_SUB + subs counterL, counterL, #1 + bne dtrmm_kernel_L4_M8_46 + dtrmm_kernel_L4_M8_100: SAVE8x4 @@ -864,6 +973,9 @@ dtrmm_kernel_L4_M8_100: #if defined(LEFT) add tempOffset, tempOffset, #8 #endif + prfm 
PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] dtrmm_kernel_L4_M8_END: subs counterI, counterI, #1 diff --git a/kernel/arm64/sgemm_kernel_16x4.S b/kernel/arm64/sgemm_kernel_16x4.S index 68366d9f2..6e3645b76 100644 --- a/kernel/arm64/sgemm_kernel_16x4.S +++ b/kernel/arm64/sgemm_kernel_16x4.S @@ -46,16 +46,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define pCRow0 x12 #define pCRow1 x13 #define pCRow2 x14 -#define pA x15 +#define pCRow3 x15 +#define pA x16 +#define alpha w17 #define alpha0 s10 #define alphaV0 v10.s[0] -#define alpha1 s11 -#define alphaV1 v11.s[0] -#define alpha2 s14 -#define alphaV2 v14.s[0] -#define alpha3 s15 -#define alphaV3 v15.s[0] + +#define A_PRE_SIZE 2560 +#define B_PRE_SIZE 224 +#define C_PRE_SIZE 160 + // 00 origM // 01 origN @@ -98,14 +99,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //v05 pA1_04, pA1_05, pA1_06, pA1_07 //v06 pA1_08, pA1_09, pA1_10, pA1_11 //v07 pA1_12, pA1_13, pA1_14, pA1_15 -//v08 must save pB00, pB01 -//v09 must save pB02, pB03 -//v10 must save ALPHA0 -//v11 must save ALPHA1 -//v12 must save pB10, pB11 -//v13 must save pB12, pB13 -//v14 must save ALPHA2 -//v15 must save ALPHA3 +//v08 must save pB00 +//v09 must save pB01 +//v10 must save pB02 +//v11 must save pB03 +//v12 must save pB10 +//v13 must save pB11 +//v14 must save pB12 +//v15 must save pB13 //v16 must save C00, C01, C02, C03 //v17 must save C04, C05, C06, C07 //v18 C08, C09, C10, C11 @@ -147,206 +148,249 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL16x4_I - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.4s}, [pA] - add pA, pA, #16 - ld1 {v1.4s}, [pA] - add pA, pA, #16 - ld1 {v2.4s}, [pA] - add pA, pA, #16 - ld1 {v3.4s}, [pA] - add pA, pA, #16 + ldp q0, q1, [pA], #32 + + ldp s8, s9, [pB], #8 fmul v16.4s, v0.4s, v8.s[0] + fmul v20.4s, v0.4s, v9.s[0] + + ldp s10, s11, [pB], #8 + + fmul v24.4s, v0.4s, v10.s[0] + fmul v28.4s, v0.4s, v11.s[0] + + ldp q2, q3, [pA], #32 + fmul v17.4s, v1.4s, v8.s[0] + fmul v21.4s, v1.4s, v9.s[0] + + ldp q4, q5, [pA], #32 + + fmul v25.4s, v1.4s, v10.s[0] + fmul v29.4s, v1.4s, v11.s[0] + + ldp s12, s13, [pB], #8 + fmul v18.4s, v2.4s, v8.s[0] + fmul v22.4s, v2.4s, v9.s[0] + + ldp s14, s15, [pB], #8 + fmul v19.4s, v3.4s, v8.s[0] + fmul v23.4s, v3.4s, v9.s[0] - fmul v20.4s, v0.4s, v8.s[1] - fmul v21.4s, v1.4s, v8.s[1] - fmul v22.4s, v2.4s, v8.s[1] - fmul v23.4s, v3.4s, v8.s[1] + ldp q6, q7, [pA], #32 - fmul v24.4s, v0.4s, v9.s[0] - fmul v25.4s, v1.4s, v9.s[0] - fmul v26.4s, v2.4s, v9.s[0] - fmul v27.4s, v3.4s, v9.s[0] + fmul v26.4s, v2.4s, v10.s[0] + fmul v30.4s, v2.4s, v11.s[0] - fmul v28.4s, v0.4s, v9.s[1] - fmul v29.4s, v1.4s, v9.s[1] - fmul v30.4s, v2.4s, v9.s[1] - fmul v31.4s, v3.4s, v9.s[1] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] - ld1 {v12.2s, v13.2s}, [pB] - add pB, pB, #16 - ld1 {v4.4s}, [pA] - add pA, pA, #16 - ld1 {v5.4s}, [pA] - add pA, pA, #16 - ld1 {v6.4s}, [pA] - add pA, pA, #16 - ld1 {v7.4s}, [pA] - add pA, pA, #16 + fmul v27.4s, v3.4s, v10.s[0] + fmul v31.4s, v3.4s, v11.s[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] .endm .macro KERNEL16x4_M1 fmla v16.4s, v0.4s, v8.s[0] fmla v17.4s, v1.4s, v8.s[0] + + ldp q4, q5, [pA], #32 + fmla v18.4s, v2.4s, v8.s[0] fmla v19.4s, v3.4s, v8.s[0] - fmla v20.4s, v0.4s, v8.s[1] - fmla v21.4s, v1.4s, v8.s[1] - fmla v22.4s, v2.4s, v8.s[1] - fmla v23.4s, v3.4s, v8.s[1] + fmla v20.4s, v0.4s, v9.s[0] + fmla v21.4s, v1.4s, v9.s[0] - fmla v24.4s, v0.4s, v9.s[0] - fmla v25.4s, v1.4s, v9.s[0] - fmla v26.4s, v2.4s, v9.s[0] - fmla 
v27.4s, v3.4s, v9.s[0] + ldp s12, s13, [pB], #8 - fmla v28.4s, v0.4s, v9.s[1] - fmla v29.4s, v1.4s, v9.s[1] - fmla v30.4s, v2.4s, v9.s[1] - fmla v31.4s, v3.4s, v9.s[1] + fmla v22.4s, v2.4s, v9.s[0] + fmla v23.4s, v3.4s, v9.s[0] - ld1 {v12.2s, v13.2s}, [pB] - add pB, pB, #16 - ld1 {v4.4s}, [pA] - add pA, pA, #16 - ld1 {v5.4s}, [pA] - add pA, pA, #16 - ld1 {v6.4s}, [pA] - add pA, pA, #16 - ld1 {v7.4s}, [pA] - add pA, pA, #16 + ldp s14, s15, [pB], #8 + + fmla v24.4s, v0.4s, v10.s[0] + fmla v25.4s, v1.4s, v10.s[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] + + fmla v26.4s, v2.4s, v10.s[0] + fmla v27.4s, v3.4s, v10.s[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + + fmla v28.4s, v0.4s, v11.s[0] + fmla v29.4s, v1.4s, v11.s[0] + + ldp q6, q7, [pA], #32 + + fmla v30.4s, v2.4s, v11.s[0] + fmla v31.4s, v3.4s, v11.s[0] .endm .macro KERNEL16x4_M2 fmla v16.4s, v4.4s, v12.s[0] fmla v17.4s, v5.4s, v12.s[0] + + ldp q0, q1, [pA], #32 + fmla v18.4s, v6.4s, v12.s[0] fmla v19.4s, v7.4s, v12.s[0] - fmla v20.4s, v4.4s, v12.s[1] - fmla v21.4s, v5.4s, v12.s[1] - fmla v22.4s, v6.4s, v12.s[1] - fmla v23.4s, v7.4s, v12.s[1] + fmla v20.4s, v4.4s, v13.s[0] + fmla v21.4s, v5.4s, v13.s[0] - fmla v24.4s, v4.4s, v13.s[0] - fmla v25.4s, v5.4s, v13.s[0] - fmla v26.4s, v6.4s, v13.s[0] - fmla v27.4s, v7.4s, v13.s[0] + ldp s8, s9, [pB], #8 - fmla v28.4s, v4.4s, v13.s[1] - fmla v29.4s, v5.4s, v13.s[1] - fmla v30.4s, v6.4s, v13.s[1] - fmla v31.4s, v7.4s, v13.s[1] + fmla v22.4s, v6.4s, v13.s[0] + fmla v23.4s, v7.4s, v13.s[0] - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.4s}, [pA] - add pA, pA, #16 - ld1 {v1.4s}, [pA] - add pA, pA, #16 - ld1 {v2.4s}, [pA] - add pA, pA, #16 - ld1 {v3.4s}, [pA] - add pA, pA, #16 + ldp s10, s11, [pB], #8 + + fmla v24.4s, v4.4s, v14.s[0] + fmla v25.4s, v5.4s, v14.s[0] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + fmla v26.4s, v6.4s, v14.s[0] + fmla v27.4s, v7.4s, v14.s[0] + + ldp q2, q3, [pA], #32 + + fmla v28.4s, v4.4s, v15.s[0] + fmla v29.4s, v5.4s, v15.s[0] + + fmla v30.4s, 
v6.4s, v15.s[0] + fmla v31.4s, v7.4s, v15.s[0] .endm .macro KERNEL16x4_E fmla v16.4s, v4.4s, v12.s[0] + fmla v20.4s, v4.4s, v13.s[0] + fmla v24.4s, v4.4s, v14.s[0] + fmla v28.4s, v4.4s, v15.s[0] + fmla v17.4s, v5.4s, v12.s[0] - fmla v18.4s, v6.4s, v12.s[0] - fmla v19.4s, v7.4s, v12.s[0] + fmla v21.4s, v5.4s, v13.s[0] + fmla v25.4s, v5.4s, v14.s[0] + fmla v29.4s, v5.4s, v15.s[0] - fmla v20.4s, v4.4s, v12.s[1] - fmla v21.4s, v5.4s, v12.s[1] - fmla v22.4s, v6.4s, v12.s[1] - fmla v23.4s, v7.4s, v12.s[1] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] - fmla v24.4s, v4.4s, v13.s[0] - fmla v25.4s, v5.4s, v13.s[0] - fmla v26.4s, v6.4s, v13.s[0] - fmla v27.4s, v7.4s, v13.s[0] + fmla v18.4s, v6.4s, v12.s[0] + fmla v22.4s, v6.4s, v13.s[0] + fmla v26.4s, v6.4s, v14.s[0] + fmla v30.4s, v6.4s, v15.s[0] - fmla v28.4s, v4.4s, v13.s[1] - fmla v29.4s, v5.4s, v13.s[1] - fmla v30.4s, v6.4s, v13.s[1] - fmla v31.4s, v7.4s, v13.s[1] + fmla v19.4s, v7.4s, v12.s[0] + fmla v23.4s, v7.4s, v13.s[0] + fmla v27.4s, v7.4s, v14.s[0] + fmla v31.4s, v7.4s, v15.s[0] .endm .macro KERNEL16x4_SUB - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.4s}, [pA] - add pA, pA, #16 - ld1 {v1.4s}, [pA] - add pA, pA, #16 - ld1 {v2.4s}, [pA] - add pA, pA, #16 - ld1 {v3.4s}, [pA] - add pA, pA, #16 + ldp q0, q1, [pA], #32 + ldp s8, s9, [pB], #8 fmla v16.4s, v0.4s, v8.s[0] + fmla v20.4s, v0.4s, v9.s[0] + + ldp s10, s11, [pB], #8 + + fmla v24.4s, v0.4s, v10.s[0] + fmla v28.4s, v0.4s, v11.s[0] + + ldp q2, q3, [pA], #32 + fmla v17.4s, v1.4s, v8.s[0] + fmla v21.4s, v1.4s, v9.s[0] + + fmla v25.4s, v1.4s, v10.s[0] + fmla v29.4s, v1.4s, v11.s[0] + fmla v18.4s, v2.4s, v8.s[0] + fmla v22.4s, v2.4s, v9.s[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla v19.4s, v3.4s, v8.s[0] + fmla v23.4s, v3.4s, v9.s[0] - fmla v20.4s, v0.4s, v8.s[1] - fmla v21.4s, v1.4s, v8.s[1] - fmla v22.4s, v2.4s, v8.s[1] - fmla v23.4s, v3.4s, v8.s[1] + fmla v26.4s, v2.4s, v10.s[0] + fmla v30.4s, v2.4s, v11.s[0] - fmla v24.4s, v0.4s, v9.s[0] - fmla v25.4s, v1.4s, 
v9.s[0] - fmla v26.4s, v2.4s, v9.s[0] - fmla v27.4s, v3.4s, v9.s[0] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] - fmla v28.4s, v0.4s, v9.s[1] - fmla v29.4s, v1.4s, v9.s[1] - fmla v30.4s, v2.4s, v9.s[1] - fmla v31.4s, v3.4s, v9.s[1] + fmla v27.4s, v3.4s, v10.s[0] + fmla v31.4s, v3.4s, v11.s[0] .endm .macro SAVE16x4 - add pCRow1, pCRow0, LDC + fmov alpha0, alpha - ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0] + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + ldp q0, q1, [pCRow0] fmla v0.4s, v16.4s, alphaV0 - fmla v1.4s, v17.4s, alphaV1 - fmla v2.4s, v18.4s, alphaV2 - fmla v3.4s, v19.4s, alphaV3 - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0] + fmla v1.4s, v17.4s, alphaV0 + stp q0, q1, [pCRow0] - add pCRow2, pCRow1, LDC + add pCRow0, pCRow0, #32 - ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] + ldp q2, q3, [pCRow0] + fmla v2.4s, v18.4s, alphaV0 + fmla v3.4s, v19.4s, alphaV0 + stp q2, q3, [pCRow0] + + add pCRow0, pCRow0, #32 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ldp q4, q5, [pCRow1] fmla v4.4s, v20.4s, alphaV0 - fmla v5.4s, v21.4s, alphaV1 - fmla v6.4s, v22.4s, alphaV2 - fmla v7.4s, v23.4s, alphaV3 - st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] + fmla v5.4s, v21.4s, alphaV0 + stp q4, q5, [pCRow1] + + add pCRow1, pCRow1, #32 + + ldp q6, q7, [pCRow1] + fmla v6.4s, v22.4s, alphaV0 + fmla v7.4s, v23.4s, alphaV0 + stp q6, q7, [pCRow1] - add pCRow1, pCRow2, LDC + add pCRow1, pCRow1, #32 + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] - ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow2] + ldp q0, q1, [pCRow2] fmla v0.4s, v24.4s, alphaV0 - fmla v1.4s, v25.4s, alphaV1 - fmla v2.4s, v26.4s, alphaV2 - fmla v3.4s, v27.4s, alphaV3 - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow2] + fmla v1.4s, v25.4s, alphaV0 + stp q0, q1, [pCRow2] - ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] + add pCRow2, pCRow2, #32 + + ldp q2, q3, [pCRow2] + fmla v2.4s, v26.4s, alphaV0 + fmla v3.4s, v27.4s, alphaV0 + stp q2, q3, [pCRow2] + + add pCRow2, pCRow2, #32 + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + + ldp q4, q5, [pCRow3] fmla v4.4s, v28.4s, 
alphaV0 - fmla v5.4s, v29.4s, alphaV1 - fmla v6.4s, v30.4s, alphaV2 - fmla v7.4s, v31.4s, alphaV3 - st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] + fmla v5.4s, v29.4s, alphaV0 + stp q4, q5, [pCRow3] - add pCRow0, pCRow0, #64 + add pCRow3, pCRow3, #32 + + ldp q6, q7, [pCRow3] + fmla v6.4s, v30.4s, alphaV0 + fmla v7.4s, v31.4s, alphaV0 + stp q6, q7, [pCRow3] + + add pCRow3, pCRow3, #32 .endm /******************************************************************************/ @@ -363,264 +407,217 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_I - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.4s}, [pA] - add pA, pA, #16 - ld1 {v1.4s}, [pA] - add pA, pA, #16 + ldp s8, s9, [pB], #8 + ldp s10, s11, [pB], #8 + + ldr q0, [pA], #16 + ldr q1, [pA], #16 fmul v16.4s, v0.4s, v8.s[0] fmul v17.4s, v1.4s, v8.s[0] - fmul v20.4s, v0.4s, v8.s[1] - fmul v21.4s, v1.4s, v8.s[1] - fmul v24.4s, v0.4s, v9.s[0] - fmul v25.4s, v1.4s, v9.s[0] - fmul v28.4s, v0.4s, v9.s[1] - fmul v29.4s, v1.4s, v9.s[1] - - ld1 {v12.2s, v13.2s}, [pB] - add pB, pB, #16 - ld1 {v4.4s}, [pA] - add pA, pA, #16 - ld1 {v5.4s}, [pA] - add pA, pA, #16 + fmul v20.4s, v0.4s, v9.s[0] + fmul v21.4s, v1.4s, v9.s[0] + fmul v24.4s, v0.4s, v10.s[0] + fmul v25.4s, v1.4s, v10.s[0] + fmul v28.4s, v0.4s, v11.s[0] + fmul v29.4s, v1.4s, v11.s[0] + + ldp s12, s13, [pB], #8 + ldp s14, s15, [pB], #8 + + ldr q4, [pA], #16 + ldr q5, [pA], #16 .endm .macro KERNEL8x4_M1 fmla v16.4s, v0.4s, v8.s[0] fmla v17.4s, v1.4s, v8.s[0] - fmla v20.4s, v0.4s, v8.s[1] - fmla v21.4s, v1.4s, v8.s[1] - fmla v24.4s, v0.4s, v9.s[0] - fmla v25.4s, v1.4s, v9.s[0] - fmla v28.4s, v0.4s, v9.s[1] - fmla v29.4s, v1.4s, v9.s[1] - - ld1 {v12.2s, v13.2s}, [pB] - add pB, pB, #16 - ld1 {v4.4s}, [pA] - add pA, pA, #16 - ld1 {v5.4s}, [pA] - add pA, pA, #16 + fmla v20.4s, v0.4s, v9.s[0] + fmla v21.4s, v1.4s, v9.s[0] + fmla v24.4s, v0.4s, v10.s[0] + fmla v25.4s, v1.4s, v10.s[0] + fmla v28.4s, v0.4s, v11.s[0] + fmla v29.4s, 
v1.4s, v11.s[0] + + ldp s12, s13, [pB], #8 + ldp s14, s15, [pB], #8 + + ldr q4, [pA], #16 + ldr q5, [pA], #16 .endm .macro KERNEL8x4_M2 fmla v16.4s, v4.4s, v12.s[0] fmla v17.4s, v5.4s, v12.s[0] - fmla v20.4s, v4.4s, v12.s[1] - fmla v21.4s, v5.4s, v12.s[1] - fmla v24.4s, v4.4s, v13.s[0] - fmla v25.4s, v5.4s, v13.s[0] - fmla v28.4s, v4.4s, v13.s[1] - fmla v29.4s, v5.4s, v13.s[1] - - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.4s}, [pA] - add pA, pA, #16 - ld1 {v1.4s}, [pA] - add pA, pA, #16 + fmla v20.4s, v4.4s, v13.s[0] + fmla v21.4s, v5.4s, v13.s[0] + fmla v24.4s, v4.4s, v14.s[0] + fmla v25.4s, v5.4s, v14.s[0] + fmla v28.4s, v4.4s, v15.s[0] + fmla v29.4s, v5.4s, v15.s[0] + + ldp s8, s9, [pB], #8 + ldp s10, s11, [pB], #8 + + ldr q0, [pA], #16 + ldr q1, [pA], #16 .endm .macro KERNEL8x4_E fmla v16.4s, v4.4s, v12.s[0] fmla v17.4s, v5.4s, v12.s[0] - fmla v20.4s, v4.4s, v12.s[1] - fmla v21.4s, v5.4s, v12.s[1] - fmla v24.4s, v4.4s, v13.s[0] - fmla v25.4s, v5.4s, v13.s[0] - fmla v28.4s, v4.4s, v13.s[1] - fmla v29.4s, v5.4s, v13.s[1] + fmla v20.4s, v4.4s, v13.s[0] + fmla v21.4s, v5.4s, v13.s[0] + fmla v24.4s, v4.4s, v14.s[0] + fmla v25.4s, v5.4s, v14.s[0] + fmla v28.4s, v4.4s, v15.s[0] + fmla v29.4s, v5.4s, v15.s[0] .endm .macro KERNEL8x4_SUB - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.4s}, [pA] - add pA, pA, #16 - ld1 {v1.4s}, [pA] - add pA, pA, #16 + ldp s8, s9, [pB], #8 + ldp s10, s11, [pB], #8 + + ldr q0, [pA], #16 + ldr q1, [pA], #16 fmla v16.4s, v0.4s, v8.s[0] fmla v17.4s, v1.4s, v8.s[0] - fmla v20.4s, v0.4s, v8.s[1] - fmla v21.4s, v1.4s, v8.s[1] - fmla v24.4s, v0.4s, v9.s[0] - fmla v25.4s, v1.4s, v9.s[0] - fmla v28.4s, v0.4s, v9.s[1] - fmla v29.4s, v1.4s, v9.s[1] + fmla v20.4s, v0.4s, v9.s[0] + fmla v21.4s, v1.4s, v9.s[0] + fmla v24.4s, v0.4s, v10.s[0] + fmla v25.4s, v1.4s, v10.s[0] + fmla v28.4s, v0.4s, v11.s[0] + fmla v29.4s, v1.4s, v11.s[0] .endm .macro SAVE8x4 - add pCRow1, pCRow0, LDC + fmov alpha0, alpha - ld1 {v0.4s, v1.4s}, [pCRow0] + ldp 
q0, q1, [pCRow0] fmla v0.4s, v16.4s, alphaV0 - fmla v1.4s, v17.4s, alphaV1 - st1 {v0.4s, v1.4s}, [pCRow0] + fmla v1.4s, v17.4s, alphaV0 + stp q0, q1, [pCRow0] - add pCRow2, pCRow1, LDC + add pCRow0, pCRow0, #32 - ld1 {v4.4s, v5.4s}, [pCRow1] - fmla v4.4s, v20.4s, alphaV0 - fmla v5.4s, v21.4s, alphaV1 - st1 {v4.4s, v5.4s}, [pCRow1] + ldp q2, q3, [pCRow1] + fmla v2.4s, v20.4s, alphaV0 + fmla v3.4s, v21.4s, alphaV0 + stp q2, q3, [pCRow1] - add pCRow1, pCRow2, LDC + add pCRow1, pCRow1, #32 - ld1 {v0.4s, v1.4s}, [pCRow2] - fmla v0.4s, v24.4s, alphaV0 - fmla v1.4s, v25.4s, alphaV1 - st1 {v0.4s, v1.4s}, [pCRow2] + ldp q4, q5, [pCRow2] + fmla v4.4s, v24.4s, alphaV0 + fmla v5.4s, v25.4s, alphaV0 + stp q4, q5, [pCRow2] - ld1 {v4.4s, v5.4s}, [pCRow1] - fmla v4.4s, v28.4s, alphaV0 - fmla v5.4s, v29.4s, alphaV1 - st1 {v4.4s, v5.4s}, [pCRow1] + add pCRow2, pCRow2, #32 - add pCRow0, pCRow0, #32 + ldp q6, q7, [pCRow3] + fmla v6.4s, v28.4s, alphaV0 + fmla v7.4s, v29.4s, alphaV0 + stp q6, q7, [pCRow3] + + add pCRow3, pCRow3, #32 .endm /******************************************************************************/ .macro INIT4x4 fmov s16, wzr - fmov s17, s16 - fmov s20, s17 - fmov s21, s16 - fmov s24, s17 - fmov s25, s16 - fmov s28, s17 - fmov s29, s16 + fmov s20, wzr + fmov s24, wzr + fmov s28, wzr .endm .macro KERNEL4x4_I - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.2s, v1.2s}, [pA] - add pA, pA, #16 + ldp s8, s9, [pB], #8 + ldp s10, s11, [pB], #8 - fmul v16.2s, v0.2s, v8.s[0] - fmul v29.2s, v1.2s, v9.s[1] + ldr q0, [pA], #16 - fmul v20.2s, v0.2s, v8.s[1] - fmul v25.2s, v1.2s, v9.s[0] - - fmul v24.2s, v0.2s, v9.s[0] - fmul v21.2s, v1.2s, v8.s[1] + fmul v16.4s, v0.4s, v8.s[0] + fmul v20.4s, v0.4s, v9.s[0] + fmul v24.4s, v0.4s, v10.s[0] + fmul v28.4s, v0.4s, v11.s[0] - fmul v28.2s, v0.2s, v9.s[1] - fmul v17.2s, v1.2s, v8.s[0] + ldp s12, s13, [pB], #8 + ldp s14, s15, [pB], #8 - ld1 {v12.2s, v13.2s}, [pB] - add pB, pB, #16 - ld1 {v4.2s, v5.2s}, [pA] - add pA, pA, #16 + ldr 
q1, [pA], #16 .endm .macro KERNEL4x4_M1 - fmla v16.2s, v0.2s, v8.s[0] - fmla v29.2s, v1.2s, v9.s[1] - - ld1 {v12.2s, v13.2s}, [pB] // For next round - add pB, pB, #16 - - fmla v20.2s, v0.2s, v8.s[1] - fmla v25.2s, v1.2s, v9.s[0] - - ld1 {v4.2s, v5.2s}, [pA] // For next round - add pA, pA, #16 - - fmla v24.2s, v0.2s, v9.s[0] - fmla v21.2s, v1.2s, v8.s[1] + fmla v16.4s, v0.4s, v8.s[0] + fmla v20.4s, v0.4s, v9.s[0] + fmla v24.4s, v0.4s, v10.s[0] + fmla v28.4s, v0.4s, v11.s[0] - prfm PLDL1KEEP, [pB, #512] + ldp s12, s13, [pB], #8 + ldp s14, s15, [pB], #8 - fmla v28.2s, v0.2s, v9.s[1] - fmla v17.2s, v1.2s, v8.s[0] + ldr q1, [pA], #16 .endm .macro KERNEL4x4_M2 - fmla v16.2s, v4.2s, v12.s[0] - fmla v29.2s, v5.2s, v13.s[1] + fmla v16.4s, v1.4s, v12.s[0] + fmla v20.4s, v1.4s, v13.s[0] + fmla v24.4s, v1.4s, v14.s[0] + fmla v28.4s, v1.4s, v15.s[0] - ld1 {v8.2s, v9.2s}, [pB] // For next round - add pB, pB, #16 + ldp s8, s9, [pB], #8 + ldp s10, s11, [pB], #8 - fmla v20.2s, v4.2s, v12.s[1] - fmla v25.2s, v5.2s, v13.s[0] - - ld1 {v0.2s, v1.2s}, [pA] // For next round - add pA, pA, #16 - - fmla v24.2s, v4.2s, v13.s[0] - fmla v21.2s, v5.2s, v12.s[1] - - prfm PLDL1KEEP, [pA, #512] - - fmla v28.2s, v4.2s, v13.s[1] - fmla v17.2s, v5.2s, v12.s[0] + ldr q0, [pA], #16 .endm .macro KERNEL4x4_E - fmla v16.2s, v4.2s, v12.s[0] - fmla v29.2s, v5.2s, v13.s[1] + fmla v16.4s, v1.4s, v12.s[0] + fmla v20.4s, v1.4s, v13.s[0] + fmla v24.4s, v1.4s, v14.s[0] + fmla v28.4s, v1.4s, v15.s[0] +.endm - fmla v20.2s, v4.2s, v12.s[1] - fmla v25.2s, v5.2s, v13.s[0] +.macro KERNEL4x4_SUB + ldp s8, s9, [pB], #8 + ldp s10, s11, [pB], #8 - fmla v24.2s, v4.2s, v13.s[0] - fmla v21.2s, v5.2s, v12.s[1] + ldr q0, [pA], #16 - fmla v28.2s, v4.2s, v13.s[1] - fmla v17.2s, v5.2s, v12.s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v20.4s, v0.4s, v9.s[0] + fmla v24.4s, v0.4s, v10.s[0] + fmla v28.4s, v0.4s, v11.s[0] .endm -.macro KERNEL4x4_SUB - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.2s, v1.2s}, [pA] - add pA, pA, #16 
- - fmla v16.2s, v0.2s, v8.s[0] - fmla v29.2s, v1.2s, v9.s[1] +.macro SAVE4x4 + fmov alpha0, alpha - fmla v20.2s, v0.2s, v8.s[1] - fmla v25.2s, v1.2s, v9.s[0] + ldr q0, [pCRow0] + fmla v0.4s, v16.4s, alphaV0 + str q0, [pCRow0] - fmla v24.2s, v0.2s, v9.s[0] - fmla v21.2s, v1.2s, v8.s[1] + add pCRow0, pCRow0, #16 - fmla v28.2s, v0.2s, v9.s[1] - fmla v17.2s, v1.2s, v8.s[0] -.endm + ldr q1, [pCRow1] + fmla v1.4s, v20.4s, alphaV0 + str q1, [pCRow1] -.macro SAVE4x4 - ld1 {v8.2s, v9.2s}, [pCRow0] - fmla v8.2s, v16.2s, alphaV0 - fmla v9.2s, v17.2s, alphaV1 - st1 {v8.2s, v9.2s}, [pCRow0] + add pCRow1, pCRow1, #16 - add pCRow1, pCRow0, LDC - ld1 {v12.2s, v13.2s}, [pCRow1] - fmla v12.2s, v20.2s, alphaV2 - fmla v13.2s, v21.2s, alphaV3 - st1 {v12.2s, v13.2s}, [pCRow1] + ldr q2, [pCRow2] + fmla v2.4s, v24.4s, alphaV0 + str q2, [pCRow2] - add pCRow2, pCRow1, LDC - ld1 {v8.2s, v9.2s}, [pCRow2] - fmla v8.2s, v24.2s, alphaV0 - fmla v9.2s, v25.2s, alphaV1 - st1 {v8.2s, v9.2s}, [pCRow2] + add pCRow2, pCRow2, #16 - add pCRow1, pCRow2, LDC - ld1 {v12.2s, v13.2s}, [pCRow1] - fmla v12.2s, v28.2s, alphaV2 - fmla v13.2s, v29.2s, alphaV3 - st1 {v12.2s, v13.2s}, [pCRow1] + ldr q3, [pCRow3] + fmla v3.4s, v28.4s, alphaV0 + str q3, [pCRow3] - add pCRow0, pCRow0, #16 + add pCRow3, pCRow3, #16 .endm /******************************************************************************/ @@ -633,38 +630,43 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL2x4_SUB - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.2s}, [pA] - add pA, pA, #8 + ldp s8, s9, [pB], #8 + ldp s10, s11, [pB], #8 + + ldr d0, [pA], #8 fmla v16.2s, v0.2s, v8.s[0] - fmla v20.2s, v0.2s, v8.s[1] - fmla v24.2s, v0.2s, v9.s[0] - fmla v28.2s, v0.2s, v9.s[1] + fmla v20.2s, v0.2s, v9.s[0] + fmla v24.2s, v0.2s, v10.s[0] + fmla v28.2s, v0.2s, v11.s[0] .endm .macro SAVE2x4 - ld1 {v8.2s}, [pCRow0] - fmla v8.2s, v16.2s, alphaV0 - st1 {v8.2s}, [pCRow0] + fmov alpha0, alpha - add pCRow1, pCRow0, LDC - ld1 {v12.2s}, [pCRow1] - fmla v12.2s, v20.2s, alphaV1 - st1 {v12.2s}, [pCRow1] + ldr d0, [pCRow0] + fmla v0.2s, v16.2s, alphaV0 + str d0, [pCRow0] - add pCRow2, pCRow1, LDC - ld1 {v8.2s}, [pCRow2] - fmla v8.2s, v24.2s, alphaV2 - st1 {v8.2s}, [pCRow2] + add pCRow0, pCRow0, #8 - add pCRow1, pCRow2, LDC - ld1 {v12.2s}, [pCRow1] - fmla v12.2s, v28.2s, alphaV3 - st1 {v12.2s}, [pCRow1] + ldr d1, [pCRow1] + fmla v1.2s, v20.2s, alphaV0 + str d1, [pCRow1] - add pCRow0, pCRow0, #8 + add pCRow1, pCRow1, #8 + + ldr d0, [pCRow2] + fmla v0.2s, v24.2s, alphaV0 + str d0, [pCRow2] + + add pCRow2, pCRow2, #8 + + ldr d1, [pCRow3] + fmla v1.2s, v28.2s, alphaV0 + str d1, [pCRow3] + + add pCRow3, pCRow3, #8 .endm /******************************************************************************/ @@ -686,22 +688,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro SAVE1x4 - add pCRow1, pCRow0, LDC + fmov alpha0, alpha + ld1 {v8.s}[0], [pCRow0] ld1 {v8.s}[1], [pCRow1] fmla v8.2s, v16.2s, alphaV0 st1 {v8.s}[0], [pCRow0] st1 {v8.s}[1], [pCRow1] - add pCRow2, pCRow1, LDC - add pCRow1, pCRow2, LDC + add pCRow0, pCRow0, #4 + add pCRow1, pCRow1, #4 + ld1 {v12.s}[0], [pCRow2] - ld1 {v12.s}[1], [pCRow1] - fmla v12.2s, v20.2s, alphaV1 + ld1 {v12.s}[1], [pCRow3] + fmla v12.2s, v20.2s, alphaV0 st1 {v12.s}[0], [pCRow2] - st1 {v12.s}[1], [pCRow1] + st1 {v12.s}[1], [pCRow3] - add pCRow0, pCRow0, #4 + add pCRow2, pCRow2, #4 + add pCRow3, pCRow3, #4 .endm /******************************************************************************/ @@ -741,20 +746,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE16x2 + fmov alpha0, alpha + add pCRow1, pCRow0, LDC ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0] fmla v0.4s, v16.4s, alphaV0 - fmla v1.4s, v17.4s, alphaV1 - fmla v2.4s, v18.4s, alphaV2 - fmla v3.4s, v19.4s, alphaV3 + fmla v1.4s, v17.4s, alphaV0 + fmla v2.4s, v18.4s, alphaV0 + fmla v3.4s, v19.4s, alphaV0 st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0] ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] fmla v4.4s, v20.4s, alphaV0 - fmla v5.4s, v21.4s, alphaV1 - fmla v6.4s, v22.4s, alphaV2 - fmla v7.4s, v23.4s, alphaV3 + fmla v5.4s, v21.4s, alphaV0 + fmla v6.4s, v22.4s, alphaV0 + fmla v7.4s, v23.4s, alphaV0 st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] add pCRow0, pCRow0, #64 @@ -785,18 +792,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro SAVE8x2 + fmov alpha0, alpha + add pCRow1, pCRow0, LDC ld1 {v0.4s, v1.4s}, [pCRow0] fmla v0.4s, v16.4s, alphaV0 - fmla v1.4s, v17.4s, alphaV1 + fmla v1.4s, v17.4s, alphaV0 st1 {v0.4s, v1.4s}, [pCRow0] add pCRow2, pCRow1, LDC ld1 {v4.4s, v5.4s}, [pCRow1] fmla v4.4s, v20.4s, alphaV0 - fmla v5.4s, v21.4s, alphaV1 + fmla v5.4s, v21.4s, alphaV0 st1 {v4.4s, v5.4s}, [pCRow1] add pCRow0, pCRow0, #32 @@ -824,15 +833,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x2 + fmov alpha0, alpha + ld1 {v8.2s, v9.2s}, [pCRow0] fmla v8.2s, v16.2s, alphaV0 - fmla v9.2s, v17.2s, alphaV1 + fmla v9.2s, v17.2s, alphaV0 st1 {v8.2s, v9.2s}, [pCRow0] add pCRow1, pCRow0, LDC ld1 {v12.2s, v13.2s}, [pCRow1] - fmla v12.2s, v20.2s, alphaV2 - fmla v13.2s, v21.2s, alphaV3 + fmla v12.2s, v20.2s, alphaV0 + fmla v13.2s, v21.2s, alphaV0 st1 {v12.2s, v13.2s}, [pCRow1] add pCRow0, pCRow0, #16 @@ -857,13 +868,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE2x2 + fmov alpha0, alpha + ld1 {v8.2s}, [pCRow0] fmla v8.2s, v16.2s, alphaV0 st1 {v8.2s}, [pCRow0] add pCRow1 , pCRow0, LDC ld1 {v12.2s}, [pCRow1] - fmla v12.2s, v20.2s, alphaV1 + fmla v12.2s, v20.2s, alphaV0 st1 {v12.2s}, [pCRow1] add pCRow0, pCRow0, #8 @@ -886,6 +899,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x2 + fmov alpha0, alpha + add pCRow1 , pCRow0, LDC ld1 {v8.s}[0], [pCRow0] ld1 {v8.s}[1], [pCRow1] @@ -925,11 +940,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro SAVE16x1 + fmov alpha0, alpha + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0] fmla v0.4s, v16.4s, alphaV0 - fmla v1.4s, v17.4s, alphaV1 - fmla v2.4s, v18.4s, alphaV2 - fmla v3.4s, v19.4s, alphaV3 + fmla v1.4s, v17.4s, alphaV0 + fmla v2.4s, v18.4s, alphaV0 + fmla v3.4s, v19.4s, alphaV0 st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0] add pCRow0, pCRow0, #64 @@ -956,9 +973,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE8x1 + fmov alpha0, alpha + ld1 {v0.4s, v1.4s}, [pCRow0] fmla v0.4s, v16.4s, alphaV0 - fmla v1.4s, v17.4s, alphaV1 + fmla v1.4s, v17.4s, alphaV0 st1 {v0.4s, v1.4s}, [pCRow0] add pCRow0, pCRow0, #32 @@ -983,9 +1002,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x1 + fmov alpha0, alpha + ld1 {v8.2s, v9.2s}, [pCRow0] fmla v8.2s, v16.2s, alphaV0 - fmla v9.2s, v17.2s, alphaV1 + fmla v9.2s, v17.2s, alphaV0 st1 {v8.2s, v9.2s}, [pCRow0] add pCRow0, pCRow0, #16 @@ -1008,6 +1029,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE2x1 + fmov alpha0, alpha + ld1 {v8.2s}, [pCRow0] fmla v8.2s, v16.2s, alphaV0 st1 {v8.2s}, [pCRow0] @@ -1032,6 +1055,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro SAVE1x1 + fmov alpha0, alpha + ldr s8, [pCRow0] fmla s8, s16, alphaV0 str s8, [pCRow0] @@ -1061,10 +1086,10 @@ sgemm_kernel_begin: stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] - fmov alpha0, s0 - fmov alpha1, s0 - fmov alpha2, s0 - fmov alpha3, s0 + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alpha, s0 lsl LDC, LDC, #2 // ldc = ldc * 4 @@ -1078,8 +1103,12 @@ sgemm_kernel_begin: /******************************************************************************/ sgemm_kernel_L4_BEGIN: - mov pCRow0, pC // pCRow0 = C - add pC, pC, LDC, lsl #2 + mov pCRow0, pC + add pCRow1, pCRow0, LDC + add pCRow2, pCRow1, LDC + add pCRow3, pCRow2, LDC + + add pC, pCRow3, LDC mov pA, origPA // pA = start of A array @@ -1090,42 +1119,69 @@ sgemm_kernel_L4_M16_BEGIN: cmp counterI, #0 ble sgemm_kernel_L4_M8_BEGIN + .align 5 sgemm_kernel_L4_M16_20: mov pB, origPB - asr counterL , origK, #1 // L = K / 2 - cmp counterL , #2 // is there at least 4 to do? + asr counterL , origK, #3 + cmp counterL , #2 blt sgemm_kernel_L4_M16_32 - KERNEL16x4_I // do one in the K - KERNEL16x4_M2 // do another in the K + KERNEL16x4_I + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 subs counterL, counterL, #2 ble sgemm_kernel_L4_M16_22a - .align 5 + .align 5 sgemm_kernel_L4_M16_22: KERNEL16x4_M1 KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 subs counterL, counterL, #1 bgt sgemm_kernel_L4_M16_22 + .align 5 sgemm_kernel_L4_M16_22a: + KERNEL16x4_M1 + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 KERNEL16x4_M1 KERNEL16x4_E b sgemm_kernel_L4_M16_44 + .align 5 sgemm_kernel_L4_M16_32: tst counterL, #1 ble sgemm_kernel_L4_M16_40 KERNEL16x4_I + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 + KERNEL16x4_M1 KERNEL16x4_E b sgemm_kernel_L4_M16_44 @@ -1136,14 +1192,20 @@ sgemm_kernel_L4_M16_40: 
sgemm_kernel_L4_M16_44: - ands counterL , origK, #1 + ands counterL , origK, #7 ble sgemm_kernel_L4_M16_100 + .align 5 sgemm_kernel_L4_M16_46: KERNEL16x4_SUB + subs counterL, counterL, #1 + bne sgemm_kernel_L4_M16_46 sgemm_kernel_L4_M16_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] SAVE16x4 diff --git a/kernel/arm64/strmm_kernel_16x4.S b/kernel/arm64/strmm_kernel_16x4.S index 28b321651..77e05103d 100644 --- a/kernel/arm64/strmm_kernel_16x4.S +++ b/kernel/arm64/strmm_kernel_16x4.S @@ -46,19 +46,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define pCRow0 x12 #define pCRow1 x13 #define pCRow2 x14 -#define pA x15 -#define temp x16 -#define tempOffset x17 -#define tempK x18 +#define pCRow3 x15 +#define pA x16 +#define alpha w17 +#define temp x18 +#define tempOffset x19 +#define tempK x20 #define alpha0 s10 #define alphaV0 v10.s[0] -#define alpha1 s11 -#define alphaV1 v11.s[0] -#define alpha2 s14 -#define alphaV2 v14.s[0] -#define alpha3 s15 -#define alphaV3 v15.s[0] + +#define A_PRE_SIZE 2560 +#define B_PRE_SIZE 224 +#define C_PRE_SIZE 160 + // 00 origM // 01 origN @@ -101,14 +102,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //v05 pA1_04, pA1_05, pA1_06, pA1_07 //v06 pA1_08, pA1_09, pA1_10, pA1_11 //v07 pA1_12, pA1_13, pA1_14, pA1_15 -//v08 must save pB00, pB01 -//v09 must save pB02, pB03 -//v10 must save ALPHA0 -//v11 must save ALPHA1 -//v12 must save pB10, pB11 -//v13 must save pB12, pB13 -//v14 must save ALPHA2 -//v15 must save ALPHA3 +//v08 must save pB00 +//v09 must save pB01 +//v10 must save pB02 +//v11 must save pB03 +//v12 must save pB10 +//v13 must save pB11 +//v14 must save pB12 +//v15 must save pB13 //v16 must save C00, C01, C02, C03 //v17 must save C04, C05, C06, C07 //v18 C08, C09, C10, C11 @@ -150,202 +151,240 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL16x4_I - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.4s}, [pA] - add pA, pA, #16 - ld1 {v1.4s}, [pA] - add pA, pA, #16 - ld1 {v2.4s}, [pA] - add pA, pA, #16 - ld1 {v3.4s}, [pA] - add pA, pA, #16 + ldp q0, q1, [pA], #32 + + ldp s8, s9, [pB], #8 fmul v16.4s, v0.4s, v8.s[0] + fmul v20.4s, v0.4s, v9.s[0] + + ldp s10, s11, [pB], #8 + + fmul v24.4s, v0.4s, v10.s[0] + fmul v28.4s, v0.4s, v11.s[0] + + ldp q2, q3, [pA], #32 + fmul v17.4s, v1.4s, v8.s[0] + fmul v21.4s, v1.4s, v9.s[0] + + ldp q4, q5, [pA], #32 + + fmul v25.4s, v1.4s, v10.s[0] + fmul v29.4s, v1.4s, v11.s[0] + + ldp s12, s13, [pB], #8 + fmul v18.4s, v2.4s, v8.s[0] + fmul v22.4s, v2.4s, v9.s[0] + + ldp s14, s15, [pB], #8 + fmul v19.4s, v3.4s, v8.s[0] + fmul v23.4s, v3.4s, v9.s[0] - fmul v20.4s, v0.4s, v8.s[1] - fmul v21.4s, v1.4s, v8.s[1] - fmul v22.4s, v2.4s, v8.s[1] - fmul v23.4s, v3.4s, v8.s[1] + ldp q6, q7, [pA], #32 - fmul v24.4s, v0.4s, v9.s[0] - fmul v25.4s, v1.4s, v9.s[0] - fmul v26.4s, v2.4s, v9.s[0] - fmul v27.4s, v3.4s, v9.s[0] + fmul v26.4s, v2.4s, v10.s[0] + fmul v30.4s, v2.4s, v11.s[0] - fmul v28.4s, v0.4s, v9.s[1] - fmul v29.4s, v1.4s, v9.s[1] - fmul v30.4s, v2.4s, v9.s[1] - fmul v31.4s, v3.4s, v9.s[1] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] - ld1 {v12.2s, v13.2s}, [pB] - add pB, pB, #16 - ld1 {v4.4s}, [pA] - add pA, pA, #16 - ld1 {v5.4s}, [pA] - add pA, pA, #16 - ld1 {v6.4s}, [pA] - add pA, pA, #16 - ld1 {v7.4s}, [pA] - add pA, pA, #16 + fmul v27.4s, v3.4s, v10.s[0] + fmul v31.4s, v3.4s, v11.s[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] .endm .macro KERNEL16x4_M1 fmla v16.4s, v0.4s, v8.s[0] fmla v17.4s, v1.4s, v8.s[0] + + ldp q4, q5, [pA], #32 + fmla v18.4s, v2.4s, v8.s[0] fmla v19.4s, v3.4s, v8.s[0] - fmla v20.4s, v0.4s, v8.s[1] - fmla v21.4s, v1.4s, v8.s[1] - fmla v22.4s, v2.4s, v8.s[1] - fmla v23.4s, v3.4s, v8.s[1] + fmla v20.4s, v0.4s, v9.s[0] + fmla v21.4s, v1.4s, v9.s[0] - fmla v24.4s, v0.4s, v9.s[0] - fmla v25.4s, v1.4s, v9.s[0] - fmla v26.4s, v2.4s, v9.s[0] - fmla 
v27.4s, v3.4s, v9.s[0] + ldp s12, s13, [pB], #8 - fmla v28.4s, v0.4s, v9.s[1] - fmla v29.4s, v1.4s, v9.s[1] - fmla v30.4s, v2.4s, v9.s[1] - fmla v31.4s, v3.4s, v9.s[1] + fmla v22.4s, v2.4s, v9.s[0] + fmla v23.4s, v3.4s, v9.s[0] - ld1 {v12.2s, v13.2s}, [pB] - add pB, pB, #16 - ld1 {v4.4s}, [pA] - add pA, pA, #16 - ld1 {v5.4s}, [pA] - add pA, pA, #16 - ld1 {v6.4s}, [pA] - add pA, pA, #16 - ld1 {v7.4s}, [pA] - add pA, pA, #16 + ldp s14, s15, [pB], #8 + + fmla v24.4s, v0.4s, v10.s[0] + fmla v25.4s, v1.4s, v10.s[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] + + fmla v26.4s, v2.4s, v10.s[0] + fmla v27.4s, v3.4s, v10.s[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + + fmla v28.4s, v0.4s, v11.s[0] + fmla v29.4s, v1.4s, v11.s[0] + + ldp q6, q7, [pA], #32 + + fmla v30.4s, v2.4s, v11.s[0] + fmla v31.4s, v3.4s, v11.s[0] .endm .macro KERNEL16x4_M2 fmla v16.4s, v4.4s, v12.s[0] fmla v17.4s, v5.4s, v12.s[0] + + ldp q0, q1, [pA], #32 + fmla v18.4s, v6.4s, v12.s[0] fmla v19.4s, v7.4s, v12.s[0] - fmla v20.4s, v4.4s, v12.s[1] - fmla v21.4s, v5.4s, v12.s[1] - fmla v22.4s, v6.4s, v12.s[1] - fmla v23.4s, v7.4s, v12.s[1] + fmla v20.4s, v4.4s, v13.s[0] + fmla v21.4s, v5.4s, v13.s[0] - fmla v24.4s, v4.4s, v13.s[0] - fmla v25.4s, v5.4s, v13.s[0] - fmla v26.4s, v6.4s, v13.s[0] - fmla v27.4s, v7.4s, v13.s[0] + ldp s8, s9, [pB], #8 - fmla v28.4s, v4.4s, v13.s[1] - fmla v29.4s, v5.4s, v13.s[1] - fmla v30.4s, v6.4s, v13.s[1] - fmla v31.4s, v7.4s, v13.s[1] + fmla v22.4s, v6.4s, v13.s[0] + fmla v23.4s, v7.4s, v13.s[0] - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.4s}, [pA] - add pA, pA, #16 - ld1 {v1.4s}, [pA] - add pA, pA, #16 - ld1 {v2.4s}, [pA] - add pA, pA, #16 - ld1 {v3.4s}, [pA] - add pA, pA, #16 + ldp s10, s11, [pB], #8 + + fmla v24.4s, v4.4s, v14.s[0] + fmla v25.4s, v5.4s, v14.s[0] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + fmla v26.4s, v6.4s, v14.s[0] + fmla v27.4s, v7.4s, v14.s[0] + + ldp q2, q3, [pA], #32 + + fmla v28.4s, v4.4s, v15.s[0] + fmla v29.4s, v5.4s, v15.s[0] + + fmla v30.4s, 
v6.4s, v15.s[0] + fmla v31.4s, v7.4s, v15.s[0] .endm .macro KERNEL16x4_E fmla v16.4s, v4.4s, v12.s[0] + fmla v20.4s, v4.4s, v13.s[0] + fmla v24.4s, v4.4s, v14.s[0] + fmla v28.4s, v4.4s, v15.s[0] + fmla v17.4s, v5.4s, v12.s[0] - fmla v18.4s, v6.4s, v12.s[0] - fmla v19.4s, v7.4s, v12.s[0] + fmla v21.4s, v5.4s, v13.s[0] + fmla v25.4s, v5.4s, v14.s[0] + fmla v29.4s, v5.4s, v15.s[0] - fmla v20.4s, v4.4s, v12.s[1] - fmla v21.4s, v5.4s, v12.s[1] - fmla v22.4s, v6.4s, v12.s[1] - fmla v23.4s, v7.4s, v12.s[1] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] - fmla v24.4s, v4.4s, v13.s[0] - fmla v25.4s, v5.4s, v13.s[0] - fmla v26.4s, v6.4s, v13.s[0] - fmla v27.4s, v7.4s, v13.s[0] + fmla v18.4s, v6.4s, v12.s[0] + fmla v22.4s, v6.4s, v13.s[0] + fmla v26.4s, v6.4s, v14.s[0] + fmla v30.4s, v6.4s, v15.s[0] - fmla v28.4s, v4.4s, v13.s[1] - fmla v29.4s, v5.4s, v13.s[1] - fmla v30.4s, v6.4s, v13.s[1] - fmla v31.4s, v7.4s, v13.s[1] + fmla v19.4s, v7.4s, v12.s[0] + fmla v23.4s, v7.4s, v13.s[0] + fmla v27.4s, v7.4s, v14.s[0] + fmla v31.4s, v7.4s, v15.s[0] .endm .macro KERNEL16x4_SUB - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.4s}, [pA] - add pA, pA, #16 - ld1 {v1.4s}, [pA] - add pA, pA, #16 - ld1 {v2.4s}, [pA] - add pA, pA, #16 - ld1 {v3.4s}, [pA] - add pA, pA, #16 + ldp q0, q1, [pA], #32 + ldp s8, s9, [pB], #8 fmla v16.4s, v0.4s, v8.s[0] + fmla v20.4s, v0.4s, v9.s[0] + + ldp s10, s11, [pB], #8 + + fmla v24.4s, v0.4s, v10.s[0] + fmla v28.4s, v0.4s, v11.s[0] + + ldp q2, q3, [pA], #32 + fmla v17.4s, v1.4s, v8.s[0] + fmla v21.4s, v1.4s, v9.s[0] + + fmla v25.4s, v1.4s, v10.s[0] + fmla v29.4s, v1.4s, v11.s[0] + fmla v18.4s, v2.4s, v8.s[0] + fmla v22.4s, v2.4s, v9.s[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla v19.4s, v3.4s, v8.s[0] + fmla v23.4s, v3.4s, v9.s[0] - fmla v20.4s, v0.4s, v8.s[1] - fmla v21.4s, v1.4s, v8.s[1] - fmla v22.4s, v2.4s, v8.s[1] - fmla v23.4s, v3.4s, v8.s[1] + fmla v26.4s, v2.4s, v10.s[0] + fmla v30.4s, v2.4s, v11.s[0] - fmla v24.4s, v0.4s, v9.s[0] - fmla v25.4s, v1.4s, 
v9.s[0] - fmla v26.4s, v2.4s, v9.s[0] - fmla v27.4s, v3.4s, v9.s[0] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] - fmla v28.4s, v0.4s, v9.s[1] - fmla v29.4s, v1.4s, v9.s[1] - fmla v30.4s, v2.4s, v9.s[1] - fmla v31.4s, v3.4s, v9.s[1] + fmla v27.4s, v3.4s, v10.s[0] + fmla v31.4s, v3.4s, v11.s[0] .endm .macro SAVE16x4 - add pCRow1, pCRow0, LDC + fmov alpha0, alpha + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] fmul v0.4s, v16.4s, alphaV0 - fmul v1.4s, v17.4s, alphaV1 - fmul v2.4s, v18.4s, alphaV2 - fmul v3.4s, v19.4s, alphaV3 - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0] + fmul v1.4s, v17.4s, alphaV0 + stp q0, q1, [pCRow0] - add pCRow2, pCRow1, LDC + add pCRow0, pCRow0, #32 + + fmul v2.4s, v18.4s, alphaV0 + fmul v3.4s, v19.4s, alphaV0 + stp q2, q3, [pCRow0] + + add pCRow0, pCRow0, #32 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] fmul v4.4s, v20.4s, alphaV0 - fmul v5.4s, v21.4s, alphaV1 - fmul v6.4s, v22.4s, alphaV2 - fmul v7.4s, v23.4s, alphaV3 - st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] + fmul v5.4s, v21.4s, alphaV0 + stp q4, q5, [pCRow1] + + add pCRow1, pCRow1, #32 - add pCRow1, pCRow2, LDC + fmul v6.4s, v22.4s, alphaV0 + fmul v7.4s, v23.4s, alphaV0 + stp q6, q7, [pCRow1] + + add pCRow1, pCRow1, #32 + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] fmul v0.4s, v24.4s, alphaV0 - fmul v1.4s, v25.4s, alphaV1 - fmul v2.4s, v26.4s, alphaV2 - fmul v3.4s, v27.4s, alphaV3 - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow2] + fmul v1.4s, v25.4s, alphaV0 + stp q0, q1, [pCRow2] + + add pCRow2, pCRow2, #32 + + fmul v2.4s, v26.4s, alphaV0 + fmul v3.4s, v27.4s, alphaV0 + stp q2, q3, [pCRow2] + + add pCRow2, pCRow2, #32 + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] fmul v4.4s, v28.4s, alphaV0 - fmul v5.4s, v29.4s, alphaV1 - fmul v6.4s, v30.4s, alphaV2 - fmul v7.4s, v31.4s, alphaV3 - st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] + fmul v5.4s, v29.4s, alphaV0 + stp q4, q5, [pCRow3] - add pCRow0, pCRow0, #64 + add pCRow3, pCRow3, #32 + fmul v6.4s, v30.4s, alphaV0 + fmul v7.4s, v31.4s, alphaV0 + stp q6, q7, [pCRow3] + + add 
pCRow3, pCRow3, #32 .endm /******************************************************************************/ @@ -362,260 +401,209 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_I - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.4s}, [pA] - add pA, pA, #16 - ld1 {v1.4s}, [pA] - add pA, pA, #16 + ldp s8, s9, [pB], #8 + ldp s10, s11, [pB], #8 + + ldr q0, [pA], #16 + ldr q1, [pA], #16 fmul v16.4s, v0.4s, v8.s[0] fmul v17.4s, v1.4s, v8.s[0] - fmul v20.4s, v0.4s, v8.s[1] - fmul v21.4s, v1.4s, v8.s[1] - fmul v24.4s, v0.4s, v9.s[0] - fmul v25.4s, v1.4s, v9.s[0] - fmul v28.4s, v0.4s, v9.s[1] - fmul v29.4s, v1.4s, v9.s[1] - - ld1 {v12.2s, v13.2s}, [pB] - add pB, pB, #16 - ld1 {v4.4s}, [pA] - add pA, pA, #16 - ld1 {v5.4s}, [pA] - add pA, pA, #16 + fmul v20.4s, v0.4s, v9.s[0] + fmul v21.4s, v1.4s, v9.s[0] + fmul v24.4s, v0.4s, v10.s[0] + fmul v25.4s, v1.4s, v10.s[0] + fmul v28.4s, v0.4s, v11.s[0] + fmul v29.4s, v1.4s, v11.s[0] + + ldp s12, s13, [pB], #8 + ldp s14, s15, [pB], #8 + + ldr q4, [pA], #16 + ldr q5, [pA], #16 .endm .macro KERNEL8x4_M1 fmla v16.4s, v0.4s, v8.s[0] fmla v17.4s, v1.4s, v8.s[0] - fmla v20.4s, v0.4s, v8.s[1] - fmla v21.4s, v1.4s, v8.s[1] - fmla v24.4s, v0.4s, v9.s[0] - fmla v25.4s, v1.4s, v9.s[0] - fmla v28.4s, v0.4s, v9.s[1] - fmla v29.4s, v1.4s, v9.s[1] - - ld1 {v12.2s, v13.2s}, [pB] - add pB, pB, #16 - ld1 {v4.4s}, [pA] - add pA, pA, #16 - ld1 {v5.4s}, [pA] - add pA, pA, #16 + fmla v20.4s, v0.4s, v9.s[0] + fmla v21.4s, v1.4s, v9.s[0] + fmla v24.4s, v0.4s, v10.s[0] + fmla v25.4s, v1.4s, v10.s[0] + fmla v28.4s, v0.4s, v11.s[0] + fmla v29.4s, v1.4s, v11.s[0] + + ldp s12, s13, [pB], #8 + ldp s14, s15, [pB], #8 + + ldr q4, [pA], #16 + ldr q5, [pA], #16 .endm .macro KERNEL8x4_M2 fmla v16.4s, v4.4s, v12.s[0] fmla v17.4s, v5.4s, v12.s[0] - fmla v20.4s, v4.4s, v12.s[1] - fmla v21.4s, v5.4s, v12.s[1] - fmla v24.4s, v4.4s, v13.s[0] - fmla v25.4s, v5.4s, v13.s[0] - fmla v28.4s, v4.4s, v13.s[1] - fmla v29.4s, 
v5.4s, v13.s[1] - - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.4s}, [pA] - add pA, pA, #16 - ld1 {v1.4s}, [pA] - add pA, pA, #16 + fmla v20.4s, v4.4s, v13.s[0] + fmla v21.4s, v5.4s, v13.s[0] + fmla v24.4s, v4.4s, v14.s[0] + fmla v25.4s, v5.4s, v14.s[0] + fmla v28.4s, v4.4s, v15.s[0] + fmla v29.4s, v5.4s, v15.s[0] + + ldp s8, s9, [pB], #8 + ldp s10, s11, [pB], #8 + + ldr q0, [pA], #16 + ldr q1, [pA], #16 .endm .macro KERNEL8x4_E fmla v16.4s, v4.4s, v12.s[0] fmla v17.4s, v5.4s, v12.s[0] - fmla v20.4s, v4.4s, v12.s[1] - fmla v21.4s, v5.4s, v12.s[1] - fmla v24.4s, v4.4s, v13.s[0] - fmla v25.4s, v5.4s, v13.s[0] - fmla v28.4s, v4.4s, v13.s[1] - fmla v29.4s, v5.4s, v13.s[1] + fmla v20.4s, v4.4s, v13.s[0] + fmla v21.4s, v5.4s, v13.s[0] + fmla v24.4s, v4.4s, v14.s[0] + fmla v25.4s, v5.4s, v14.s[0] + fmla v28.4s, v4.4s, v15.s[0] + fmla v29.4s, v5.4s, v15.s[0] .endm .macro KERNEL8x4_SUB - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.4s}, [pA] - add pA, pA, #16 - ld1 {v1.4s}, [pA] - add pA, pA, #16 + ldp s8, s9, [pB], #8 + ldp s10, s11, [pB], #8 + + ldr q0, [pA], #16 + ldr q1, [pA], #16 fmla v16.4s, v0.4s, v8.s[0] fmla v17.4s, v1.4s, v8.s[0] - fmla v20.4s, v0.4s, v8.s[1] - fmla v21.4s, v1.4s, v8.s[1] - fmla v24.4s, v0.4s, v9.s[0] - fmla v25.4s, v1.4s, v9.s[0] - fmla v28.4s, v0.4s, v9.s[1] - fmla v29.4s, v1.4s, v9.s[1] + fmla v20.4s, v0.4s, v9.s[0] + fmla v21.4s, v1.4s, v9.s[0] + fmla v24.4s, v0.4s, v10.s[0] + fmla v25.4s, v1.4s, v10.s[0] + fmla v28.4s, v0.4s, v11.s[0] + fmla v29.4s, v1.4s, v11.s[0] .endm .macro SAVE8x4 - add pCRow1, pCRow0, LDC + fmov alpha0, alpha fmul v0.4s, v16.4s, alphaV0 - fmul v1.4s, v17.4s, alphaV1 - st1 {v0.4s, v1.4s}, [pCRow0] + fmul v1.4s, v17.4s, alphaV0 + stp q0, q1, [pCRow0] - add pCRow2, pCRow1, LDC + add pCRow0, pCRow0, #32 - fmul v4.4s, v20.4s, alphaV0 - fmul v5.4s, v21.4s, alphaV1 - st1 {v4.4s, v5.4s}, [pCRow1] + fmul v2.4s, v20.4s, alphaV0 + fmul v3.4s, v21.4s, alphaV0 + stp q2, q3, [pCRow1] - add pCRow1, pCRow2, LDC + add 
pCRow1, pCRow1, #32 - fmul v0.4s, v24.4s, alphaV0 - fmul v1.4s, v25.4s, alphaV1 - st1 {v0.4s, v1.4s}, [pCRow2] + fmul v4.4s, v24.4s, alphaV0 + fmul v5.4s, v25.4s, alphaV0 + stp q4, q5, [pCRow2] - fmul v4.4s, v28.4s, alphaV0 - fmul v5.4s, v29.4s, alphaV1 - st1 {v4.4s, v5.4s}, [pCRow1] + add pCRow2, pCRow2, #32 - add pCRow0, pCRow0, #32 + fmul v6.4s, v28.4s, alphaV0 + fmul v7.4s, v29.4s, alphaV0 + stp q6, q7, [pCRow3] + + add pCRow3, pCRow3, #32 .endm /******************************************************************************/ .macro INIT4x4 fmov s16, wzr - fmov s17, s16 - fmov s20, s17 - fmov s21, s16 - fmov s24, s17 - fmov s25, s16 - fmov s28, s17 - fmov s29, s16 + fmov s20, wzr + fmov s24, wzr + fmov s28, wzr .endm .macro KERNEL4x4_I - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.2s, v1.2s}, [pA] - add pA, pA, #16 + ldp s8, s9, [pB], #8 + ldp s10, s11, [pB], #8 - fmul v16.2s, v0.2s, v8.s[0] - fmul v29.2s, v1.2s, v9.s[1] + ldr q0, [pA], #16 - fmul v20.2s, v0.2s, v8.s[1] - fmul v25.2s, v1.2s, v9.s[0] - - fmul v24.2s, v0.2s, v9.s[0] - fmul v21.2s, v1.2s, v8.s[1] + fmul v16.4s, v0.4s, v8.s[0] + fmul v20.4s, v0.4s, v9.s[0] + fmul v24.4s, v0.4s, v10.s[0] + fmul v28.4s, v0.4s, v11.s[0] - fmul v28.2s, v0.2s, v9.s[1] - fmul v17.2s, v1.2s, v8.s[0] + ldp s12, s13, [pB], #8 + ldp s14, s15, [pB], #8 - ld1 {v12.2s, v13.2s}, [pB] - add pB, pB, #16 - ld1 {v4.2s, v5.2s}, [pA] - add pA, pA, #16 + ldr q1, [pA], #16 .endm .macro KERNEL4x4_M1 - fmla v16.2s, v0.2s, v8.s[0] - fmla v29.2s, v1.2s, v9.s[1] - - ld1 {v12.2s, v13.2s}, [pB] // For next round - add pB, pB, #16 - - fmla v20.2s, v0.2s, v8.s[1] - fmla v25.2s, v1.2s, v9.s[0] - - ld1 {v4.2s, v5.2s}, [pA] // For next round - add pA, pA, #16 - - fmla v24.2s, v0.2s, v9.s[0] - fmla v21.2s, v1.2s, v8.s[1] + fmla v16.4s, v0.4s, v8.s[0] + fmla v20.4s, v0.4s, v9.s[0] + fmla v24.4s, v0.4s, v10.s[0] + fmla v28.4s, v0.4s, v11.s[0] - prfm PLDL1KEEP, [pB, #512] + ldp s12, s13, [pB], #8 + ldp s14, s15, [pB], #8 - fmla v28.2s, v0.2s, 
v9.s[1] - fmla v17.2s, v1.2s, v8.s[0] + ldr q1, [pA], #16 .endm .macro KERNEL4x4_M2 - fmla v16.2s, v4.2s, v12.s[0] - fmla v29.2s, v5.2s, v13.s[1] - - ld1 {v8.2s, v9.2s}, [pB] // For next round - add pB, pB, #16 + fmla v16.4s, v1.4s, v12.s[0] + fmla v20.4s, v1.4s, v13.s[0] + fmla v24.4s, v1.4s, v14.s[0] + fmla v28.4s, v1.4s, v15.s[0] - fmla v20.2s, v4.2s, v12.s[1] - fmla v25.2s, v5.2s, v13.s[0] + ldp s8, s9, [pB], #8 + ldp s10, s11, [pB], #8 - ld1 {v0.2s, v1.2s}, [pA] // For next round - add pA, pA, #16 - - fmla v24.2s, v4.2s, v13.s[0] - fmla v21.2s, v5.2s, v12.s[1] - - prfm PLDL1KEEP, [pA, #512] - - fmla v28.2s, v4.2s, v13.s[1] - fmla v17.2s, v5.2s, v12.s[0] + ldr q0, [pA], #16 .endm .macro KERNEL4x4_E - fmla v16.2s, v4.2s, v12.s[0] - fmla v29.2s, v5.2s, v13.s[1] - - fmla v20.2s, v4.2s, v12.s[1] - fmla v25.2s, v5.2s, v13.s[0] - - fmla v24.2s, v4.2s, v13.s[0] - fmla v21.2s, v5.2s, v12.s[1] - - fmla v28.2s, v4.2s, v13.s[1] - fmla v17.2s, v5.2s, v12.s[0] + fmla v16.4s, v1.4s, v12.s[0] + fmla v20.4s, v1.4s, v13.s[0] + fmla v24.4s, v1.4s, v14.s[0] + fmla v28.4s, v1.4s, v15.s[0] .endm .macro KERNEL4x4_SUB - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.2s, v1.2s}, [pA] - add pA, pA, #16 + ldp s8, s9, [pB], #8 + ldp s10, s11, [pB], #8 - fmla v16.2s, v0.2s, v8.s[0] - fmla v29.2s, v1.2s, v9.s[1] + ldr q0, [pA], #16 - fmla v20.2s, v0.2s, v8.s[1] - fmla v25.2s, v1.2s, v9.s[0] - - fmla v24.2s, v0.2s, v9.s[0] - fmla v21.2s, v1.2s, v8.s[1] - - fmla v28.2s, v0.2s, v9.s[1] - fmla v17.2s, v1.2s, v8.s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v20.4s, v0.4s, v9.s[0] + fmla v24.4s, v0.4s, v10.s[0] + fmla v28.4s, v0.4s, v11.s[0] .endm .macro SAVE4x4 + fmov alpha0, alpha - fmul v8.2s, v16.2s, alphaV0 - fmul v9.2s, v17.2s, alphaV1 - st1 {v8.2s, v9.2s}, [pCRow0] + fmul v0.4s, v16.4s, alphaV0 + str q0, [pCRow0] - add pCRow1, pCRow0, LDC + add pCRow0, pCRow0, #16 - fmul v12.2s, v20.2s, alphaV2 - fmul v13.2s, v21.2s, alphaV3 - st1 {v12.2s, v13.2s}, [pCRow1] + fmul v1.4s, v20.4s, alphaV0 
+ str q1, [pCRow1] - add pCRow2, pCRow1, LDC + add pCRow1, pCRow1, #16 - fmul v8.2s, v24.2s, alphaV0 - fmul v9.2s, v25.2s, alphaV1 - st1 {v8.2s, v9.2s}, [pCRow2] + fmul v2.4s, v24.4s, alphaV0 + str q2, [pCRow2] - add pCRow1, pCRow2, LDC + add pCRow2, pCRow2, #16 - fmul v12.2s, v28.2s, alphaV2 - fmul v13.2s, v29.2s, alphaV3 - st1 {v12.2s, v13.2s}, [pCRow1] + fmul v3.4s, v28.4s, alphaV0 + str q3, [pCRow3] - add pCRow0, pCRow0, #16 + add pCRow3, pCRow3, #16 .endm /******************************************************************************/ @@ -628,34 +616,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL2x4_SUB - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.2s}, [pA] - add pA, pA, #8 + ldp s8, s9, [pB], #8 + ldp s10, s11, [pB], #8 + + ldr d0, [pA], #8 fmla v16.2s, v0.2s, v8.s[0] - fmla v20.2s, v0.2s, v8.s[1] - fmla v24.2s, v0.2s, v9.s[0] - fmla v28.2s, v0.2s, v9.s[1] + fmla v20.2s, v0.2s, v9.s[0] + fmla v24.2s, v0.2s, v10.s[0] + fmla v28.2s, v0.2s, v11.s[0] .endm .macro SAVE2x4 - fmul v8.2s, v16.2s, alphaV0 - st1 {v8.2s}, [pCRow0] + fmov alpha0, alpha - add pCRow1, pCRow0, LDC - fmul v12.2s, v20.2s, alphaV1 - st1 {v12.2s}, [pCRow1] + fmul v0.2s, v16.2s, alphaV0 + str d0, [pCRow0] - add pCRow2, pCRow1, LDC - fmul v8.2s, v24.2s, alphaV2 - st1 {v8.2s}, [pCRow2] + add pCRow0, pCRow0, #8 - add pCRow1, pCRow2, LDC - fmul v12.2s, v28.2s, alphaV3 - st1 {v12.2s}, [pCRow1] + fmul v1.2s, v20.2s, alphaV0 + str d1, [pCRow1] - add pCRow0, pCRow0, #8 + add pCRow1, pCRow1, #8 + + fmul v0.2s, v24.2s, alphaV0 + str d0, [pCRow2] + + add pCRow2, pCRow2, #8 + + fmul v1.2s, v28.2s, alphaV0 + str d1, [pCRow3] + + add pCRow3, pCRow3, #8 .endm /******************************************************************************/ @@ -677,20 +670,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro SAVE1x4 - add pCRow1, pCRow0, LDC + fmov alpha0, alpha fmul v8.2s, v16.2s, alphaV0 st1 {v8.s}[0], [pCRow0] st1 {v8.s}[1], [pCRow1] - add pCRow2, pCRow1, LDC - add pCRow1, pCRow2, LDC + add pCRow0, pCRow0, #4 + add pCRow1, pCRow1, #4 - fmul v12.2s, v20.2s, alphaV1 + fmul v12.2s, v20.2s, alphaV0 st1 {v12.s}[0], [pCRow2] - st1 {v12.s}[1], [pCRow1] + st1 {v12.s}[1], [pCRow3] - add pCRow0, pCRow0, #4 + add pCRow2, pCRow2, #4 + add pCRow3, pCRow3, #4 .endm /******************************************************************************/ @@ -730,18 +724,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE16x2 + fmov alpha0, alpha + add pCRow1, pCRow0, LDC fmul v0.4s, v16.4s, alphaV0 - fmul v1.4s, v17.4s, alphaV1 - fmul v2.4s, v18.4s, alphaV2 - fmul v3.4s, v19.4s, alphaV3 + fmul v1.4s, v17.4s, alphaV0 + fmul v2.4s, v18.4s, alphaV0 + fmul v3.4s, v19.4s, alphaV0 st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0] fmul v4.4s, v20.4s, alphaV0 - fmul v5.4s, v21.4s, alphaV1 - fmul v6.4s, v22.4s, alphaV2 - fmul v7.4s, v23.4s, alphaV3 + fmul v5.4s, v21.4s, alphaV0 + fmul v6.4s, v22.4s, alphaV0 + fmul v7.4s, v23.4s, alphaV0 st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] add pCRow0, pCRow0, #64 @@ -772,16 +768,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE8x2 + fmov alpha0, alpha + add pCRow1, pCRow0, LDC fmul v0.4s, v16.4s, alphaV0 - fmul v1.4s, v17.4s, alphaV1 + fmul v1.4s, v17.4s, alphaV0 st1 {v0.4s, v1.4s}, [pCRow0] add pCRow2, pCRow1, LDC fmul v4.4s, v20.4s, alphaV0 - fmul v5.4s, v21.4s, alphaV1 + fmul v5.4s, v21.4s, alphaV0 st1 {v4.4s, v5.4s}, [pCRow1] add pCRow0, pCRow0, #32 @@ -809,15 +807,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro SAVE4x2 + fmov alpha0, alpha fmul v8.2s, v16.2s, alphaV0 - fmul v9.2s, v17.2s, alphaV1 + fmul v9.2s, v17.2s, alphaV0 st1 {v8.2s, v9.2s}, [pCRow0] add pCRow1, pCRow0, LDC - fmul v12.2s, v20.2s, alphaV2 - fmul v13.2s, v21.2s, alphaV3 + fmul v12.2s, v20.2s, alphaV0 + fmul v13.2s, v21.2s, alphaV0 st1 {v12.2s, v13.2s}, [pCRow1] add pCRow0, pCRow0, #16 @@ -842,12 +841,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE2x2 + fmov alpha0, alpha + fmul v8.2s, v16.2s, alphaV0 st1 {v8.2s}, [pCRow0] add pCRow1 , pCRow0, LDC - fmul v12.2s, v20.2s, alphaV1 + fmul v12.2s, v20.2s, alphaV0 st1 {v12.2s}, [pCRow1] add pCRow0, pCRow0, #8 @@ -870,6 +871,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x2 + fmov alpha0, alpha + add pCRow1 , pCRow0, LDC fmul v8.2s, v16.2s, alphaV0 @@ -908,11 +911,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE16x1 + fmov alpha0, alpha fmul v0.4s, v16.4s, alphaV0 - fmul v1.4s, v17.4s, alphaV1 - fmul v2.4s, v18.4s, alphaV2 - fmul v3.4s, v19.4s, alphaV3 + fmul v1.4s, v17.4s, alphaV0 + fmul v2.4s, v18.4s, alphaV0 + fmul v3.4s, v19.4s, alphaV0 st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0] add pCRow0, pCRow0, #64 @@ -939,9 +943,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE8x1 + fmov alpha0, alpha fmul v0.4s, v16.4s, alphaV0 - fmul v1.4s, v17.4s, alphaV1 + fmul v1.4s, v17.4s, alphaV0 st1 {v0.4s, v1.4s}, [pCRow0] add pCRow0, pCRow0, #32 @@ -966,9 +971,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x1 + fmov alpha0, alpha fmul v8.2s, v16.2s, alphaV0 - fmul v9.2s, v17.2s, alphaV1 + fmul v9.2s, v17.2s, alphaV0 st1 {v8.2s, v9.2s}, [pCRow0] add pCRow0, pCRow0, #16 @@ -991,6 +997,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro SAVE2x1 + fmov alpha0, alpha fmul v8.2s, v16.2s, alphaV0 st1 {v8.2s}, [pCRow0] @@ -1015,6 +1022,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x1 + fmov alpha0, alpha fmul s8, s16, alpha0 str s8, [pCRow0] @@ -1043,10 +1051,10 @@ strmm_kernel_begin: stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] - fmov alpha0, s0 - fmov alpha1, s0 - fmov alpha2, s0 - fmov alpha3, s0 + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alpha, s0 lsl LDC, LDC, #2 // ldc = ldc * 4 @@ -1063,8 +1071,13 @@ strmm_kernel_begin: /******************************************************************************/ strmm_kernel_L4_BEGIN: - mov pCRow0, pC // pCRow0 = C - add pC, pC, LDC, lsl #2 + mov pCRow0, pC + add pCRow1, pCRow0, LDC + add pCRow2, pCRow1, LDC + add pCRow3, pCRow2, LDC + + add pC, pCRow3, LDC + #if defined(LEFT) mov tempOffset, offset @@ -1078,6 +1091,7 @@ strmm_kernel_L4_M16_BEGIN: cmp counterI, #0 ble strmm_kernel_L4_M8_BEGIN + .align 5 strmm_kernel_L4_M16_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -1098,38 +1112,64 @@ strmm_kernel_L4_M16_20: add tempK, tempOffset, #4 #endif - asr counterL , tempK, #1 // L = K / 2 - cmp counterL , #2 // is there at least 4 to do? 
+ asr counterL , tempK, #3 + cmp counterL , #2 blt strmm_kernel_L4_M16_32 - KERNEL16x4_I // do one in the K - KERNEL16x4_M2 // do another in the K + KERNEL16x4_I + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 subs counterL, counterL, #2 ble strmm_kernel_L4_M16_22a - .align 5 + .align 5 strmm_kernel_L4_M16_22: KERNEL16x4_M1 KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 subs counterL, counterL, #1 bgt strmm_kernel_L4_M16_22 + .align 5 strmm_kernel_L4_M16_22a: + KERNEL16x4_M1 + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 KERNEL16x4_M1 KERNEL16x4_E b strmm_kernel_L4_M16_44 + .align 5 strmm_kernel_L4_M16_32: tst counterL, #1 ble strmm_kernel_L4_M16_40 KERNEL16x4_I + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 + KERNEL16x4_M1 KERNEL16x4_E b strmm_kernel_L4_M16_44 @@ -1140,12 +1180,15 @@ strmm_kernel_L4_M16_40: strmm_kernel_L4_M16_44: - ands counterL , tempK, #1 + ands counterL , tempK, #7 ble strmm_kernel_L4_M16_100 + .align 5 strmm_kernel_L4_M16_46: KERNEL16x4_SUB + subs counterL, counterL, #1 + bne strmm_kernel_L4_M16_46 strmm_kernel_L4_M16_100: @@ -1166,6 +1209,9 @@ strmm_kernel_L4_M16_100: #if defined(LEFT) add tempOffset, tempOffset, #16 #endif + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] strmm_kernel_L4_M16_END: subs counterI, counterI, #1 diff --git a/kernel/arm64/zgemm_kernel_4x4.S b/kernel/arm64/zgemm_kernel_4x4.S index 1cb695e56..08a1531cf 100644 --- a/kernel/arm64/zgemm_kernel_4x4.S +++ b/kernel/arm64/zgemm_kernel_4x4.S @@ -46,20 +46,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define pCRow0 x12 #define pCRow1 x13 #define pCRow2 x14 -#define pA x15 -#define alpha_save_R x16 -#define alpha_save_I x17 +#define pCRow3 x15 +#define pA x16 +#define alphaR x17 +#define alphaI x18 #define alpha0_R d10 #define alphaV0_R v10.d[0] #define alpha0_I d11 #define alphaV0_I v11.d[0] -#define alpha1_R d14 -#define alphaV1_R v14.d[0] -#define alpha1_I d15 -#define alphaV1_I v15.d[0] - +#define A_PRE_SIZE 2560 +#define B_PRE_SIZE 448 +#define C_PRE_SIZE 128 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define OP_rr fmla @@ -98,10 +97,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // 12 pCRow0 // 13 pCRow1 // 14 pCRow2 -// 15 pA -// 16 alpha_save_R -// 17 alpha_save_I -// 18 must save +// 15 pCRow3 +// 16 pA +// 17 alpha_save_R +// 18 must save alpha_save_I // 19 must save // 20 must save // 21 must save @@ -175,12 +174,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x4_I ld2 {v8.2d, v9.2d}, [pB] add pB, pB, #32 - ld2 {v10.2d, v11.2d}, [pB] - add pB, pB, #32 ld2 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - ld2 {v2.2d, v3.2d}, [pA] - add pA, pA, #32 fmul v16.2d, v0.2d, v8.d[0] OP_ii v16.2d, v1.2d, v9.d[0] @@ -193,16 +188,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v17.2d, v1.2d, v8.d[0] - fmul v18.2d, v2.2d, v8.d[0] - OP_ii v18.2d, v3.2d, v9.d[0] -#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ - defined(RR) || defined(RC) || defined(CR) || defined(CC) - eor v19.16b, v19.16b, v19.16b - fmls v19.2d, v2.2d, v9.d[0] -#else - fmul v19.2d, v2.2d, v9.d[0] -#endif - OP_ir v19.2d, v3.2d, v8.d[0] + ld2 {v2.2d, v3.2d}, [pA] + add pA, pA, #32 fmul v20.2d, v0.2d, v8.d[1] OP_ii v20.2d, v1.2d, v9.d[1] @@ -215,6 +202,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif OP_ir v21.2d, v1.2d, v8.d[1] + ld2 {v10.2d, v11.2d}, [pB] + add pB, pB, #32 + fmul v22.2d, v2.2d, v8.d[1] OP_ii v22.2d, v3.2d, v9.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ @@ -226,6 +216,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v23.2d, v3.2d, v8.d[1] + ld2 {v12.2d, v13.2d}, [pB] + add pB, pB, #32 + + fmul v18.2d, v2.2d, v8.d[0] + OP_ii v18.2d, v3.2d, v9.d[0] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v19.16b, v19.16b, v19.16b + fmls v19.2d, v2.2d, v9.d[0] +#else + fmul v19.2d, v2.2d, v9.d[0] +#endif + OP_ir v19.2d, v3.2d, v8.d[0] + + ld2 {v4.2d, v5.2d} , [pA] + add pA, pA, #32 + fmul v24.2d, v0.2d, v10.d[0] OP_ii v24.2d, v1.2d, v11.d[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ @@ -237,6 +244,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v25.2d, v1.2d, v10.d[0] + ld2 {v6.2d, v7.2d} , [pA] + add pA, pA, #32 + fmul v26.2d, v2.2d, v10.d[0] OP_ii v26.2d, v3.2d, v11.d[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ @@ -248,6 +258,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v27.2d, v3.2d, v10.d[0] + ld2 {v14.2d, v15.2d}, [pB] + add pB, pB, #32 + fmul v28.2d, v0.2d, v10.d[1] OP_ii v28.2d, v1.2d, v11.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ @@ -259,6 +272,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v29.2d, v1.2d, v10.d[1] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmul v30.2d, v2.2d, v10.d[1] OP_ii v30.2d, v3.2d, v11.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ @@ -270,14 +285,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif OP_ir v31.2d, v3.2d, v10.d[1] - ld2 {v12.2d, v13.2d}, [pB] - add pB, pB, #32 - ld2 {v14.2d, v15.2d}, [pB] - add pB, pB, #32 - ld2 {v4.2d, v5.2d} , [pA] - add pA, pA, #32 - ld2 {v6.2d, v7.2d} , [pA] - add pA, pA, #32 + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] .endm .macro KERNEL4x4_M1 @@ -286,7 +294,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri v17.2d, v0.2d, v9.d[0] OP_ir v17.2d, v1.2d, v8.d[0] - ld2 {v12.2d, v13.2d}, [pB] // For next round + ld2 {v12.2d, v13.2d}, [pB] add pB, pB, #32 OP_rr v18.2d, v2.2d, v8.d[0] @@ -294,15 +302,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri v19.2d, v2.2d, v9.d[0] OP_ir v19.2d, v3.2d, v8.d[0] - ld2 {v14.2d, v15.2d}, [pB] // For next round - add pB, pB, #32 + ld2 {v4.2d, v5.2d} , [pA] + add pA, pA, #32 OP_rr v20.2d, v0.2d, v8.d[1] OP_ii v20.2d, v1.2d, v9.d[1] OP_ri v21.2d, v0.2d, v9.d[1] OP_ir v21.2d, v1.2d, v8.d[1] - ld2 {v4.2d, v5.2d} , [pA] // For next round + ld2 {v6.2d, v7.2d} , [pA] add pA, pA, #32 OP_rr v22.2d, v2.2d, v8.d[1] @@ -310,22 +318,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri v23.2d, v2.2d, v9.d[1] OP_ir v23.2d, v3.2d, v8.d[1] - ld2 {v6.2d, v7.2d} , [pA] // For next round - add pA, pA, #32 + ld2 {v14.2d, v15.2d}, [pB] + add pB, pB, #32 OP_rr v24.2d, v0.2d, v10.d[0] OP_ii v24.2d, v1.2d, v11.d[0] OP_ri v25.2d, v0.2d, v11.d[0] OP_ir v25.2d, v1.2d, v10.d[0] - prfm PLDL1KEEP, [pA, #512] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] OP_rr v26.2d, v2.2d, v10.d[0] OP_ii v26.2d, v3.2d, v11.d[0] OP_ri v27.2d, v2.2d, v11.d[0] OP_ir v27.2d, v3.2d, v10.d[0] - prfm PLDL1KEEP, [pB, #512] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] OP_rr v28.2d, v0.2d, v10.d[1] OP_ii v28.2d, v1.2d, v11.d[1] @@ -344,7 +352,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
OP_ri v17.2d, v4.2d, v13.d[0] OP_ir v17.2d, v5.2d, v12.d[0] - ld2 {v8.2d, v9.2d}, [pB] // For next round + ld2 {v8.2d, v9.2d}, [pB] add pB, pB, #32 OP_rr v18.2d, v6.2d, v12.d[0] @@ -352,15 +360,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri v19.2d, v6.2d, v13.d[0] OP_ir v19.2d, v7.2d, v12.d[0] - ld2 {v10.2d, v11.2d}, [pB] // For next round - add pB, pB, #32 + ld2 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 OP_rr v20.2d, v4.2d, v12.d[1] OP_ii v20.2d, v5.2d, v13.d[1] OP_ri v21.2d, v4.2d, v13.d[1] OP_ir v21.2d, v5.2d, v12.d[1] - ld2 {v0.2d, v1.2d}, [pA] // For next round + ld2 {v2.2d, v3.2d}, [pA] add pA, pA, #32 OP_rr v22.2d, v6.2d, v12.d[1] @@ -368,22 +376,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri v23.2d, v6.2d, v13.d[1] OP_ir v23.2d, v7.2d, v12.d[1] - ld2 {v2.2d, v3.2d}, [pA] // For next round - add pA, pA, #32 + ld2 {v10.2d, v11.2d}, [pB] + add pB, pB, #32 OP_rr v24.2d, v4.2d, v14.d[0] OP_ii v24.2d, v5.2d, v15.d[0] OP_ri v25.2d, v4.2d, v15.d[0] OP_ir v25.2d, v5.2d, v14.d[0] - prfm PLDL1KEEP, [pA, #512] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] OP_rr v26.2d, v6.2d, v14.d[0] OP_ii v26.2d, v7.2d, v15.d[0] OP_ri v27.2d, v6.2d, v15.d[0] OP_ir v27.2d, v7.2d, v14.d[0] - prfm PLDL1KEEP, [pB, #512] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] OP_rr v28.2d, v4.2d, v14.d[1] OP_ii v28.2d, v5.2d, v15.d[1] @@ -412,6 +420,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri v21.2d, v4.2d, v13.d[1] OP_ir v21.2d, v5.2d, v12.d[1] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + OP_rr v22.2d, v6.2d, v12.d[1] OP_ii v22.2d, v7.2d, v13.d[1] OP_ri v23.2d, v6.2d, v13.d[1] @@ -422,6 +432,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
OP_ri v25.2d, v4.2d, v15.d[0] OP_ir v25.2d, v5.2d, v14.d[0] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] + OP_rr v26.2d, v6.2d, v14.d[0] OP_ii v26.2d, v7.2d, v15.d[0] OP_ri v27.2d, v6.2d, v15.d[0] @@ -441,33 +453,40 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x4_SUB ld2 {v8.2d, v9.2d}, [pB] add pB, pB, #32 - ld2 {v10.2d, v11.2d}, [pB] - add pB, pB, #32 + ld2 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - ld2 {v2.2d, v3.2d}, [pA] - add pA, pA, #32 OP_rr v16.2d, v0.2d, v8.d[0] OP_ii v16.2d, v1.2d, v9.d[0] OP_ri v17.2d, v0.2d, v9.d[0] OP_ir v17.2d, v1.2d, v8.d[0] - OP_rr v18.2d, v2.2d, v8.d[0] - OP_ii v18.2d, v3.2d, v9.d[0] - OP_ri v19.2d, v2.2d, v9.d[0] - OP_ir v19.2d, v3.2d, v8.d[0] + ld2 {v2.2d, v3.2d}, [pA] + add pA, pA, #32 OP_rr v20.2d, v0.2d, v8.d[1] OP_ii v20.2d, v1.2d, v9.d[1] OP_ri v21.2d, v0.2d, v9.d[1] OP_ir v21.2d, v1.2d, v8.d[1] + ld2 {v10.2d, v11.2d}, [pB] + add pB, pB, #32 + + OP_rr v18.2d, v2.2d, v8.d[0] + OP_ii v18.2d, v3.2d, v9.d[0] + OP_ri v19.2d, v2.2d, v9.d[0] + OP_ir v19.2d, v3.2d, v8.d[0] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + OP_rr v22.2d, v2.2d, v8.d[1] OP_ii v22.2d, v3.2d, v9.d[1] OP_ri v23.2d, v2.2d, v9.d[1] OP_ir v23.2d, v3.2d, v8.d[1] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + OP_rr v24.2d, v0.2d, v10.d[0] OP_ii v24.2d, v1.2d, v11.d[0] OP_ri v25.2d, v0.2d, v11.d[0] @@ -490,74 +509,85 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro SAVE4x4 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI - mov pCRow1, pCRow0 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] - ld2 {v0.2d, v1.2d}, [pCRow1] + ld2 {v0.2d, v1.2d}, [pCRow0] fmla v0.2d, v16.2d, alphaV0_R fmls v0.2d, v17.2d, alphaV0_I - fmla v1.2d, v16.2d, alphaV1_I - fmla v1.2d, v17.2d, alphaV1_R - st2 {v0.2d, v1.2d}, [pCRow1] - add pCRow2, pCRow1, #32 - ld2 {v2.2d, v3.2d}, [pCRow2] + fmla v1.2d, v16.2d, alphaV0_I + fmla v1.2d, v17.2d, alphaV0_R + st2 {v0.2d, v1.2d}, [pCRow0] + + add pCRow0, pCRow0, #32 + + ld2 {v2.2d, v3.2d}, [pCRow0] fmla v2.2d, v18.2d, alphaV0_R fmls v2.2d, v19.2d, alphaV0_I - fmla v3.2d, v18.2d, alphaV1_I - fmla v3.2d, v19.2d, alphaV1_R - st2 {v2.2d, v3.2d}, [pCRow2] + fmla v3.2d, v18.2d, alphaV0_I + fmla v3.2d, v19.2d, alphaV0_R + st2 {v2.2d, v3.2d}, [pCRow0] + + add pCRow0, pCRow0, #32 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] - add pCRow1, pCRow1, LDC ld2 {v4.2d, v5.2d}, [pCRow1] fmla v4.2d, v20.2d, alphaV0_R fmls v4.2d, v21.2d, alphaV0_I - fmla v5.2d, v20.2d, alphaV1_I - fmla v5.2d, v21.2d, alphaV1_R + fmla v5.2d, v20.2d, alphaV0_I + fmla v5.2d, v21.2d, alphaV0_R st2 {v4.2d, v5.2d}, [pCRow1] - add pCRow2, pCRow1, #32 - ld2 {v6.2d, v7.2d}, [pCRow2] + + add pCRow1, pCRow1, #32 + + ld2 {v6.2d, v7.2d}, [pCRow1] fmla v6.2d, v22.2d, alphaV0_R fmls v6.2d, v23.2d, alphaV0_I - fmla v7.2d, v22.2d, alphaV1_I - fmla v7.2d, v23.2d, alphaV1_R - st2 {v6.2d, v7.2d}, [pCRow2] + fmla v7.2d, v22.2d, alphaV0_I + fmla v7.2d, v23.2d, alphaV0_R + st2 {v6.2d, v7.2d}, [pCRow1] - add pCRow1, pCRow1, LDC - ld2 {v0.2d, v1.2d}, [pCRow1] + add pCRow1, pCRow1, #32 + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + ld2 {v0.2d, v1.2d}, [pCRow2] fmla v0.2d, v24.2d, alphaV0_R fmls v0.2d, v25.2d, alphaV0_I - fmla v1.2d, v24.2d, alphaV1_I - fmla v1.2d, v25.2d, alphaV1_R - st2 {v0.2d, v1.2d}, [pCRow1] - add pCRow2, pCRow1, #32 + fmla v1.2d, v24.2d, alphaV0_I 
+ fmla v1.2d, v25.2d, alphaV0_R + st2 {v0.2d, v1.2d}, [pCRow2] + + add pCRow2, pCRow2, #32 + ld2 {v2.2d, v3.2d}, [pCRow2] fmla v2.2d, v26.2d, alphaV0_R fmls v2.2d, v27.2d, alphaV0_I - fmla v3.2d, v26.2d, alphaV1_I - fmla v3.2d, v27.2d, alphaV1_R + fmla v3.2d, v26.2d, alphaV0_I + fmla v3.2d, v27.2d, alphaV0_R st2 {v2.2d, v3.2d}, [pCRow2] - add pCRow1, pCRow1, LDC + add pCRow2, pCRow2, #32 + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] - ld2 {v4.2d, v5.2d}, [pCRow1] + ld2 {v4.2d, v5.2d}, [pCRow3] fmla v4.2d, v28.2d, alphaV0_R fmls v4.2d, v29.2d, alphaV0_I - fmla v5.2d, v28.2d, alphaV1_I - fmla v5.2d, v29.2d, alphaV1_R - st2 {v4.2d, v5.2d}, [pCRow1] - add pCRow2, pCRow1, #32 - ld2 {v6.2d, v7.2d}, [pCRow2] + fmla v5.2d, v28.2d, alphaV0_I + fmla v5.2d, v29.2d, alphaV0_R + st2 {v4.2d, v5.2d}, [pCRow3] + + add pCRow3, pCRow3, #32 + + ld2 {v6.2d, v7.2d}, [pCRow3] fmla v6.2d, v30.2d, alphaV0_R fmls v6.2d, v31.2d, alphaV0_I - fmla v7.2d, v30.2d, alphaV1_I - fmla v7.2d, v31.2d, alphaV1_R - st2 {v6.2d, v7.2d}, [pCRow2] + fmla v7.2d, v30.2d, alphaV0_I + fmla v7.2d, v31.2d, alphaV0_R + st2 {v6.2d, v7.2d}, [pCRow3] - add pCRow0, pCRow0, #64 + add pCRow3, pCRow3, #32 .endm /******************************************************************************/ @@ -604,18 +634,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE2x4 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.2d, v1.2d}, [pCRow1] fmla v0.2d, v16.2d, alphaV0_R fmls v0.2d, v17.2d, alphaV0_I - fmla v1.2d, v16.2d, alphaV1_I - fmla v1.2d, v17.2d, alphaV1_R + fmla v1.2d, v16.2d, alphaV0_I + fmla v1.2d, v17.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow1] add pCRow1, pCRow1, LDC @@ -623,8 +651,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v4.2d, v5.2d}, [pCRow1] fmla v4.2d, v20.2d, alphaV0_R fmls v4.2d, v21.2d, alphaV0_I - fmla v5.2d, v20.2d, alphaV1_I - fmla v5.2d, v21.2d, alphaV1_R + fmla v5.2d, v20.2d, alphaV0_I + fmla v5.2d, v21.2d, alphaV0_R st2 {v4.2d, v5.2d}, [pCRow1] add pCRow1, pCRow1, LDC @@ -632,8 +660,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.2d, v1.2d}, [pCRow1] fmla v0.2d, v24.2d, alphaV0_R fmls v0.2d, v25.2d, alphaV0_I - fmla v1.2d, v24.2d, alphaV1_I - fmla v1.2d, v25.2d, alphaV1_R + fmla v1.2d, v24.2d, alphaV0_I + fmla v1.2d, v25.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow1] add pCRow1, pCRow1, LDC @@ -641,8 +669,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v4.2d, v5.2d}, [pCRow1] fmla v4.2d, v28.2d, alphaV0_R fmls v4.2d, v29.2d, alphaV0_I - fmla v5.2d, v28.2d, alphaV1_I - fmla v5.2d, v29.2d, alphaV1_R + fmla v5.2d, v28.2d, alphaV0_I + fmla v5.2d, v29.2d, alphaV0_R st2 {v4.2d, v5.2d}, [pCRow1] add pCRow0, pCRow0, #32 @@ -691,18 +719,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x4 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.d, v1.d}[0], [pCRow1] fmla d0, d16, alphaV0_R fmls d0, d17, alphaV0_I - fmla d1, d16, alphaV1_I - fmla d1, d17, alphaV1_R + fmla d1, d16, alphaV0_I + fmla d1, d17, alphaV0_R st2 {v0.d, v1.d}[0], [pCRow1] add pCRow1, pCRow1, LDC @@ -710,8 +736,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v4.d, v5.d}[0], [pCRow1] fmla d4, d20, alphaV0_R fmls d4, d21, alphaV0_I - fmla d5, d20, alphaV1_I - fmla d5, d21, alphaV1_R + fmla d5, d20, alphaV0_I + fmla d5, d21, alphaV0_R st2 {v4.d, v5.d}[0], [pCRow1] add pCRow1, pCRow1, LDC @@ -719,8 +745,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v0.d, v1.d}[0], [pCRow1] fmla d0, d24, alphaV0_R fmls d0, d25, alphaV0_I - fmla d1, d24, alphaV1_I - fmla d1, d25, alphaV1_R + fmla d1, d24, alphaV0_I + fmla d1, d25, alphaV0_R st2 {v0.d, v1.d}[0], [pCRow1] add pCRow1, pCRow1, LDC @@ -728,8 +754,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v4.d, v5.d}[0], [pCRow1] fmla d4, d28, alphaV0_R fmls d4, d29, alphaV0_I - fmla d5, d28, alphaV1_I - fmla d5, d29, alphaV1_R + fmla d5, d28, alphaV0_I + fmla d5, d29, alphaV0_R st2 {v4.d, v5.d}[0], [pCRow1] add pCRow0, pCRow0, #16 @@ -778,25 +804,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x2 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.2d, v1.2d}, [pCRow1] fmla v0.2d, v16.2d, alphaV0_R fmls v0.2d, v17.2d, alphaV0_I - fmla v1.2d, v16.2d, alphaV1_I - fmla v1.2d, v17.2d, alphaV1_R + fmla v1.2d, v16.2d, alphaV0_I + fmla v1.2d, v17.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow1] add pCRow2, pCRow1, #32 ld2 {v2.2d, v3.2d}, [pCRow2] fmla v2.2d, v18.2d, alphaV0_R fmls v2.2d, v19.2d, alphaV0_I - fmla v3.2d, v18.2d, alphaV1_I - fmla v3.2d, v19.2d, alphaV1_R + fmla v3.2d, v18.2d, alphaV0_I + fmla v3.2d, v19.2d, alphaV0_R st2 {v2.2d, v3.2d}, [pCRow2] add pCRow1, pCRow1, LDC @@ -804,15 +828,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v4.2d, v5.2d}, [pCRow1] fmla v4.2d, v20.2d, alphaV0_R fmls v4.2d, v21.2d, alphaV0_I - fmla v5.2d, v20.2d, alphaV1_I - fmla v5.2d, v21.2d, alphaV1_R + fmla v5.2d, v20.2d, alphaV0_I + fmla v5.2d, v21.2d, alphaV0_R st2 {v4.2d, v5.2d}, [pCRow1] add pCRow2, pCRow1, #32 ld2 {v6.2d, v7.2d}, [pCRow2] fmla v6.2d, v22.2d, alphaV0_R fmls v6.2d, v23.2d, alphaV0_I - fmla v7.2d, v22.2d, alphaV1_I - fmla v7.2d, v23.2d, alphaV1_R + fmla v7.2d, v22.2d, alphaV0_I + fmla v7.2d, v23.2d, alphaV0_R st2 {v6.2d, v7.2d}, [pCRow2] add pCRow0, pCRow0, #64 @@ -845,18 +869,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE2x2 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.2d, v1.2d}, [pCRow1] fmla v0.2d, v16.2d, alphaV0_R fmls v0.2d, v17.2d, alphaV0_I - fmla v1.2d, v16.2d, alphaV1_I - fmla v1.2d, v17.2d, alphaV1_R + fmla v1.2d, v16.2d, alphaV0_I + fmla v1.2d, v17.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow1] add pCRow1, pCRow1, LDC @@ -864,8 +886,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v4.2d, v5.2d}, [pCRow1] fmla v4.2d, v20.2d, alphaV0_R fmls v4.2d, v21.2d, alphaV0_I - fmla v5.2d, v20.2d, alphaV1_I - fmla v5.2d, v21.2d, alphaV1_R + fmla v5.2d, v20.2d, alphaV0_I + fmla v5.2d, v21.2d, alphaV0_R st2 {v4.2d, v5.2d}, [pCRow1] add pCRow0, pCRow0, #32 @@ -898,18 +920,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro SAVE1x2 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.d, v1.d}[0], [pCRow1] fmla d0, d16, alphaV0_R fmls d0, d17, alphaV0_I - fmla d1, d16, alphaV1_I - fmla d1, d17, alphaV1_R + fmla d1, d16, alphaV0_I + fmla d1, d17, alphaV0_R st2 {v0.d, v1.d}[0], [pCRow1] add pCRow1, pCRow1, LDC @@ -917,8 +937,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v4.d, v5.d}[0], [pCRow1] fmla d4, d20, alphaV0_R fmls d4, d21, alphaV0_I - fmla d5, d20, alphaV1_I - fmla d5, d21, alphaV1_R + fmla d5, d20, alphaV0_I + fmla d5, d21, alphaV0_R st2 {v4.d, v5.d}[0], [pCRow1] add pCRow0, pCRow0, #16 @@ -953,25 +973,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x1 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.2d, v1.2d}, [pCRow1] fmla v0.2d, v16.2d, alphaV0_R fmls v0.2d, v17.2d, alphaV0_I - fmla v1.2d, v16.2d, alphaV1_I - fmla v1.2d, v17.2d, alphaV1_R + fmla v1.2d, v16.2d, alphaV0_I + fmla v1.2d, v17.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow1] add pCRow2, pCRow1, #32 ld2 {v2.2d, v3.2d}, [pCRow2] fmla v2.2d, v18.2d, alphaV0_R fmls v2.2d, v19.2d, alphaV0_I - fmla v3.2d, v18.2d, alphaV1_I - fmla v3.2d, v19.2d, alphaV1_R + fmla v3.2d, v18.2d, alphaV0_I + fmla v3.2d, v19.2d, alphaV0_R st2 {v2.2d, v3.2d}, [pCRow2] add pCRow0, pCRow0, #64 @@ -997,18 +1015,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro SAVE2x1 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.2d, v1.2d}, [pCRow1] fmla v0.2d, v16.2d, alphaV0_R fmls v0.2d, v17.2d, alphaV0_I - fmla v1.2d, v16.2d, alphaV1_I - fmla v1.2d, v17.2d, alphaV1_R + fmla v1.2d, v16.2d, alphaV0_I + fmla v1.2d, v17.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow1] add pCRow0, pCRow0, #32 @@ -1035,18 +1051,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x1 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.d, v1.d}[0], [pCRow1] fmla d0, d16, alphaV0_R fmls d0, d17, alphaV0_I - fmla d1, d16, alphaV1_I - fmla d1, d17, alphaV1_R + fmla d1, d16, alphaV0_I + fmla d1, d17, alphaV0_R st2 {v0.d, v1.d}[0], [pCRow1] add pCRow0, pCRow0, #16 @@ -1072,8 +1086,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] - fmov alpha_save_R, d0 - fmov alpha_save_I, d1 + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alphaR, d0 + fmov alphaI, d1 lsl LDC, LDC, #4 // ldc = ldc * 2 * 8 @@ -1085,8 +1102,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ble zgemm_kernel_L2_BEGIN zgemm_kernel_L4_BEGIN: - mov pCRow0, pC // pCRow0 = C - add pC, pC, LDC, lsl #2 + mov pCRow0, pC + add pCRow1, pCRow0, LDC + add pCRow2, pCRow1, LDC + add pCRow3, pCRow2, LDC + + add pC, pCRow3, LDC + mov pA, origPA // pA = start of A array zgemm_kernel_L4_M4_BEGIN: @@ -1096,42 +1118,68 @@ zgemm_kernel_L4_M4_BEGIN: cmp counterI, #0 ble zgemm_kernel_L4_M2_BEGIN + .align 5 zgemm_kernel_L4_M4_20: mov pB, origPB - asr counterL , origK, #1 // L = K / 2 - cmp counterL , #2 // is there at least 4 to do? 
+ asr counterL , origK, #3 + cmp counterL , #2 blt zgemm_kernel_L4_M4_32 - KERNEL4x4_I // do one in the K - KERNEL4x4_M2 // do another in the K + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 subs counterL, counterL, #2 // subtract 2 ble zgemm_kernel_L4_M4_22a - .align 5 + .align 5 zgemm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 subs counterL, counterL, #1 bgt zgemm_kernel_L4_M4_22 - + .align 5 zgemm_kernel_L4_M4_22a: + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_E b zgemm_kernel_L4_M4_44 + .align 5 zgemm_kernel_L4_M4_32: tst counterL, #1 ble zgemm_kernel_L4_M4_40 KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 KERNEL4x4_E b zgemm_kernel_L4_M4_44 @@ -1143,13 +1191,20 @@ zgemm_kernel_L4_M4_40: zgemm_kernel_L4_M4_44: - ands counterL , origK, #1 + ands counterL , origK, #7 ble zgemm_kernel_L4_M4_100 + .align 5 zgemm_kernel_L4_M4_46: KERNEL4x4_SUB + subs counterL, counterL, #1 + bne zgemm_kernel_L4_M4_46 + zgemm_kernel_L4_M4_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] SAVE4x4 diff --git a/kernel/arm64/ztrmm_kernel_4x4.S b/kernel/arm64/ztrmm_kernel_4x4.S index 7945870d6..77a7857ff 100644 --- a/kernel/arm64/ztrmm_kernel_4x4.S +++ b/kernel/arm64/ztrmm_kernel_4x4.S @@ -46,23 +46,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define pCRow0 x12 #define pCRow1 x13 #define pCRow2 x14 -#define pA x15 -#define alpha_save_R x16 -#define alpha_save_I x17 -#define temp x18 -#define tempOffset x19 -#define tempK x20 +#define pCRow3 x15 +#define pA x16 +#define alphaR x17 +#define alphaI x18 +#define temp x19 +#define tempOffset x20 +#define tempK x21 #define alpha0_R d10 #define alphaV0_R v10.d[0] #define alpha0_I d11 #define alphaV0_I v11.d[0] -#define alpha1_R d14 -#define alphaV1_R v14.d[0] -#define alpha1_I d15 -#define alphaV1_I v15.d[0] - +#define A_PRE_SIZE 2560 +#define B_PRE_SIZE 448 +#define C_PRE_SIZE 128 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define OP_rr fmla @@ -93,7 +92,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // 04 origPB // 05 pC // 06 origLDC -> LDC -// 07 offset +// 07 offset -> temp // 08 counterL // 09 counterI // 10 counterJ @@ -101,13 +100,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // 12 pCRow0 // 13 pCRow1 // 14 pCRow2 -// 15 pA -// 16 alpha_save_R -// 17 alpha_save_I -// 18 must save temp -// 19 must save tempOffset -// 20 must save tempK -// 21 must save +// 15 pCRow3 +// 16 pA +// 17 alphaR +// 18 must save alphaI +// 19 must save temp +// 20 must save tempOffset +// 21 must save tempK // 22 must save // 23 must save // 24 must save @@ -178,12 +177,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x4_I ld2 {v8.2d, v9.2d}, [pB] add pB, pB, #32 - ld2 {v10.2d, v11.2d}, [pB] - add pB, pB, #32 ld2 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - ld2 {v2.2d, v3.2d}, [pA] - add pA, pA, #32 fmul v16.2d, v0.2d, v8.d[0] OP_ii v16.2d, v1.2d, v9.d[0] @@ -196,16 +191,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif OP_ir v17.2d, v1.2d, v8.d[0] - fmul v18.2d, v2.2d, v8.d[0] - OP_ii v18.2d, v3.2d, v9.d[0] -#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ - defined(RR) || defined(RC) || defined(CR) || defined(CC) - eor v19.16b, v19.16b, v19.16b - fmls v19.2d, v2.2d, v9.d[0] -#else - fmul v19.2d, v2.2d, v9.d[0] -#endif - OP_ir v19.2d, v3.2d, v8.d[0] + ld2 {v2.2d, v3.2d}, [pA] + add pA, pA, #32 fmul v20.2d, v0.2d, v8.d[1] OP_ii v20.2d, v1.2d, v9.d[1] @@ -218,6 +205,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v21.2d, v1.2d, v8.d[1] + ld2 {v10.2d, v11.2d}, [pB] + add pB, pB, #32 + fmul v22.2d, v2.2d, v8.d[1] OP_ii v22.2d, v3.2d, v9.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ @@ -229,6 +219,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v23.2d, v3.2d, v8.d[1] + ld2 {v12.2d, v13.2d}, [pB] + add pB, pB, #32 + + fmul v18.2d, v2.2d, v8.d[0] + OP_ii v18.2d, v3.2d, v9.d[0] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v19.16b, v19.16b, v19.16b + fmls v19.2d, v2.2d, v9.d[0] +#else + fmul v19.2d, v2.2d, v9.d[0] +#endif + OP_ir v19.2d, v3.2d, v8.d[0] + + ld2 {v4.2d, v5.2d} , [pA] + add pA, pA, #32 + fmul v24.2d, v0.2d, v10.d[0] OP_ii v24.2d, v1.2d, v11.d[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ @@ -240,6 +247,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v25.2d, v1.2d, v10.d[0] + ld2 {v6.2d, v7.2d} , [pA] + add pA, pA, #32 + fmul v26.2d, v2.2d, v10.d[0] OP_ii v26.2d, v3.2d, v11.d[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ @@ -251,6 +261,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif OP_ir v27.2d, v3.2d, v10.d[0] + ld2 {v14.2d, v15.2d}, [pB] + add pB, pB, #32 + fmul v28.2d, v0.2d, v10.d[1] OP_ii v28.2d, v1.2d, v11.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ @@ -262,6 +275,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v29.2d, v1.2d, v10.d[1] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmul v30.2d, v2.2d, v10.d[1] OP_ii v30.2d, v3.2d, v11.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ @@ -273,14 +288,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v31.2d, v3.2d, v10.d[1] - ld2 {v12.2d, v13.2d}, [pB] - add pB, pB, #32 - ld2 {v14.2d, v15.2d}, [pB] - add pB, pB, #32 - ld2 {v4.2d, v5.2d} , [pA] - add pA, pA, #32 - ld2 {v6.2d, v7.2d} , [pA] - add pA, pA, #32 + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] .endm .macro KERNEL4x4_M1 @@ -289,7 +297,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri v17.2d, v0.2d, v9.d[0] OP_ir v17.2d, v1.2d, v8.d[0] - ld2 {v12.2d, v13.2d}, [pB] // For next round + ld2 {v12.2d, v13.2d}, [pB] add pB, pB, #32 OP_rr v18.2d, v2.2d, v8.d[0] @@ -297,15 +305,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri v19.2d, v2.2d, v9.d[0] OP_ir v19.2d, v3.2d, v8.d[0] - ld2 {v14.2d, v15.2d}, [pB] // For next round - add pB, pB, #32 + ld2 {v4.2d, v5.2d} , [pA] + add pA, pA, #32 OP_rr v20.2d, v0.2d, v8.d[1] OP_ii v20.2d, v1.2d, v9.d[1] OP_ri v21.2d, v0.2d, v9.d[1] OP_ir v21.2d, v1.2d, v8.d[1] - ld2 {v4.2d, v5.2d} , [pA] // For next round + ld2 {v6.2d, v7.2d} , [pA] add pA, pA, #32 OP_rr v22.2d, v2.2d, v8.d[1] @@ -313,22 +321,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
OP_ri v23.2d, v2.2d, v9.d[1] OP_ir v23.2d, v3.2d, v8.d[1] - ld2 {v6.2d, v7.2d} , [pA] // For next round - add pA, pA, #32 + ld2 {v14.2d, v15.2d}, [pB] + add pB, pB, #32 OP_rr v24.2d, v0.2d, v10.d[0] OP_ii v24.2d, v1.2d, v11.d[0] OP_ri v25.2d, v0.2d, v11.d[0] OP_ir v25.2d, v1.2d, v10.d[0] - prfm PLDL1KEEP, [pA, #512] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] OP_rr v26.2d, v2.2d, v10.d[0] OP_ii v26.2d, v3.2d, v11.d[0] OP_ri v27.2d, v2.2d, v11.d[0] OP_ir v27.2d, v3.2d, v10.d[0] - prfm PLDL1KEEP, [pB, #512] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] OP_rr v28.2d, v0.2d, v10.d[1] OP_ii v28.2d, v1.2d, v11.d[1] @@ -347,7 +355,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri v17.2d, v4.2d, v13.d[0] OP_ir v17.2d, v5.2d, v12.d[0] - ld2 {v8.2d, v9.2d}, [pB] // For next round + ld2 {v8.2d, v9.2d}, [pB] add pB, pB, #32 OP_rr v18.2d, v6.2d, v12.d[0] @@ -355,15 +363,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri v19.2d, v6.2d, v13.d[0] OP_ir v19.2d, v7.2d, v12.d[0] - ld2 {v10.2d, v11.2d}, [pB] // For next round - add pB, pB, #32 + ld2 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 OP_rr v20.2d, v4.2d, v12.d[1] OP_ii v20.2d, v5.2d, v13.d[1] OP_ri v21.2d, v4.2d, v13.d[1] OP_ir v21.2d, v5.2d, v12.d[1] - ld2 {v0.2d, v1.2d}, [pA] // For next round + ld2 {v2.2d, v3.2d}, [pA] add pA, pA, #32 OP_rr v22.2d, v6.2d, v12.d[1] @@ -371,22 +379,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
OP_ri v23.2d, v6.2d, v13.d[1] OP_ir v23.2d, v7.2d, v12.d[1] - ld2 {v2.2d, v3.2d}, [pA] // For next round - add pA, pA, #32 + ld2 {v10.2d, v11.2d}, [pB] + add pB, pB, #32 OP_rr v24.2d, v4.2d, v14.d[0] OP_ii v24.2d, v5.2d, v15.d[0] OP_ri v25.2d, v4.2d, v15.d[0] OP_ir v25.2d, v5.2d, v14.d[0] - prfm PLDL1KEEP, [pA, #512] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] OP_rr v26.2d, v6.2d, v14.d[0] OP_ii v26.2d, v7.2d, v15.d[0] OP_ri v27.2d, v6.2d, v15.d[0] OP_ir v27.2d, v7.2d, v14.d[0] - prfm PLDL1KEEP, [pB, #512] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] OP_rr v28.2d, v4.2d, v14.d[1] OP_ii v28.2d, v5.2d, v15.d[1] @@ -415,6 +423,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri v21.2d, v4.2d, v13.d[1] OP_ir v21.2d, v5.2d, v12.d[1] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + OP_rr v22.2d, v6.2d, v12.d[1] OP_ii v22.2d, v7.2d, v13.d[1] OP_ri v23.2d, v6.2d, v13.d[1] @@ -425,6 +435,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri v25.2d, v4.2d, v15.d[0] OP_ir v25.2d, v5.2d, v14.d[0] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] + OP_rr v26.2d, v6.2d, v14.d[0] OP_ii v26.2d, v7.2d, v15.d[0] OP_ri v27.2d, v6.2d, v15.d[0] @@ -444,33 +456,40 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL4x4_SUB ld2 {v8.2d, v9.2d}, [pB] add pB, pB, #32 - ld2 {v10.2d, v11.2d}, [pB] - add pB, pB, #32 + ld2 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - ld2 {v2.2d, v3.2d}, [pA] - add pA, pA, #32 OP_rr v16.2d, v0.2d, v8.d[0] OP_ii v16.2d, v1.2d, v9.d[0] OP_ri v17.2d, v0.2d, v9.d[0] OP_ir v17.2d, v1.2d, v8.d[0] - OP_rr v18.2d, v2.2d, v8.d[0] - OP_ii v18.2d, v3.2d, v9.d[0] - OP_ri v19.2d, v2.2d, v9.d[0] - OP_ir v19.2d, v3.2d, v8.d[0] + ld2 {v2.2d, v3.2d}, [pA] + add pA, pA, #32 OP_rr v20.2d, v0.2d, v8.d[1] OP_ii v20.2d, v1.2d, v9.d[1] OP_ri v21.2d, v0.2d, v9.d[1] OP_ir v21.2d, v1.2d, v8.d[1] + ld2 {v10.2d, v11.2d}, [pB] + add pB, pB, #32 + + OP_rr v18.2d, v2.2d, v8.d[0] + OP_ii v18.2d, v3.2d, v9.d[0] + OP_ri v19.2d, v2.2d, v9.d[0] + OP_ir v19.2d, v3.2d, v8.d[0] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + OP_rr v22.2d, v2.2d, v8.d[1] OP_ii v22.2d, v3.2d, v9.d[1] OP_ri v23.2d, v2.2d, v9.d[1] OP_ir v23.2d, v3.2d, v8.d[1] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + OP_rr v24.2d, v0.2d, v10.d[0] OP_ii v24.2d, v1.2d, v11.d[0] OP_ri v25.2d, v0.2d, v11.d[0] @@ -493,66 +512,77 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro SAVE4x4 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI - mov pCRow1, pCRow0 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] fmul v0.2d, v16.2d, alphaV0_R fmls v0.2d, v17.2d, alphaV0_I - fmul v1.2d, v16.2d, alphaV1_I - fmla v1.2d, v17.2d, alphaV1_R - st2 {v0.2d, v1.2d}, [pCRow1] - add pCRow2, pCRow1, #32 + fmul v1.2d, v16.2d, alphaV0_I + fmla v1.2d, v17.2d, alphaV0_R + st2 {v0.2d, v1.2d}, [pCRow0] + + add pCRow0, pCRow0, #32 + fmul v2.2d, v18.2d, alphaV0_R fmls v2.2d, v19.2d, alphaV0_I - fmul v3.2d, v18.2d, alphaV1_I - fmla v3.2d, v19.2d, alphaV1_R - st2 {v2.2d, v3.2d}, [pCRow2] + fmul v3.2d, v18.2d, alphaV0_I + fmla v3.2d, v19.2d, alphaV0_R + st2 {v2.2d, v3.2d}, [pCRow0] + + add pCRow0, pCRow0, #32 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] - add pCRow1, pCRow1, LDC fmul v4.2d, v20.2d, alphaV0_R fmls v4.2d, v21.2d, alphaV0_I - fmul v5.2d, v20.2d, alphaV1_I - fmla v5.2d, v21.2d, alphaV1_R + fmul v5.2d, v20.2d, alphaV0_I + fmla v5.2d, v21.2d, alphaV0_R st2 {v4.2d, v5.2d}, [pCRow1] - add pCRow2, pCRow1, #32 + + add pCRow1, pCRow1, #32 + fmul v6.2d, v22.2d, alphaV0_R fmls v6.2d, v23.2d, alphaV0_I - fmul v7.2d, v22.2d, alphaV1_I - fmla v7.2d, v23.2d, alphaV1_R - st2 {v6.2d, v7.2d}, [pCRow2] + fmul v7.2d, v22.2d, alphaV0_I + fmla v7.2d, v23.2d, alphaV0_R + st2 {v6.2d, v7.2d}, [pCRow1] + + add pCRow1, pCRow1, #32 + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] - add pCRow1, pCRow1, LDC fmul v0.2d, v24.2d, alphaV0_R fmls v0.2d, v25.2d, alphaV0_I - fmul v1.2d, v24.2d, alphaV1_I - fmla v1.2d, v25.2d, alphaV1_R - st2 {v0.2d, v1.2d}, [pCRow1] - add pCRow2, pCRow1, #32 + fmul v1.2d, v24.2d, alphaV0_I + fmla v1.2d, v25.2d, alphaV0_R + st2 {v0.2d, v1.2d}, [pCRow2] + + add pCRow2, pCRow2, #32 + fmul v2.2d, v26.2d, alphaV0_R fmls v2.2d, v27.2d, alphaV0_I - fmul v3.2d, v26.2d, alphaV1_I - fmla v3.2d, v27.2d, alphaV1_R + fmul v3.2d, v26.2d, alphaV0_I + fmla v3.2d, v27.2d, 
alphaV0_R st2 {v2.2d, v3.2d}, [pCRow2] - add pCRow1, pCRow1, LDC + add pCRow2, pCRow2, #32 + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] fmul v4.2d, v28.2d, alphaV0_R fmls v4.2d, v29.2d, alphaV0_I - fmul v5.2d, v28.2d, alphaV1_I - fmla v5.2d, v29.2d, alphaV1_R - st2 {v4.2d, v5.2d}, [pCRow1] - add pCRow2, pCRow1, #32 + fmul v5.2d, v28.2d, alphaV0_I + fmla v5.2d, v29.2d, alphaV0_R + st2 {v4.2d, v5.2d}, [pCRow3] + + add pCRow3, pCRow3, #32 + fmul v6.2d, v30.2d, alphaV0_R fmls v6.2d, v31.2d, alphaV0_I - fmul v7.2d, v30.2d, alphaV1_I - fmla v7.2d, v31.2d, alphaV1_R - st2 {v6.2d, v7.2d}, [pCRow2] + fmul v7.2d, v30.2d, alphaV0_I + fmla v7.2d, v31.2d, alphaV0_R + st2 {v6.2d, v7.2d}, [pCRow3] - add pCRow0, pCRow0, #64 + add pCRow3, pCRow3, #32 .endm /******************************************************************************/ @@ -599,41 +629,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE2x4 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI mov pCRow1, pCRow0 fmul v0.2d, v16.2d, alphaV0_R fmls v0.2d, v17.2d, alphaV0_I - fmul v1.2d, v16.2d, alphaV1_I - fmla v1.2d, v17.2d, alphaV1_R + fmul v1.2d, v16.2d, alphaV0_I + fmla v1.2d, v17.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow1] add pCRow1, pCRow1, LDC fmul v4.2d, v20.2d, alphaV0_R fmls v4.2d, v21.2d, alphaV0_I - fmul v5.2d, v20.2d, alphaV1_I - fmla v5.2d, v21.2d, alphaV1_R + fmul v5.2d, v20.2d, alphaV0_I + fmla v5.2d, v21.2d, alphaV0_R st2 {v4.2d, v5.2d}, [pCRow1] add pCRow1, pCRow1, LDC fmul v0.2d, v24.2d, alphaV0_R fmls v0.2d, v25.2d, alphaV0_I - fmul v1.2d, v24.2d, alphaV1_I - fmla v1.2d, v25.2d, alphaV1_R + fmul v1.2d, v24.2d, alphaV0_I + fmla v1.2d, v25.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow1] add pCRow1, pCRow1, LDC fmul v4.2d, v28.2d, alphaV0_R fmls v4.2d, v29.2d, alphaV0_I - fmul v5.2d, v28.2d, alphaV1_I - fmla v5.2d, v29.2d, alphaV1_R + fmul v5.2d, v28.2d, alphaV0_I 
+ fmla v5.2d, v29.2d, alphaV0_R st2 {v4.2d, v5.2d}, [pCRow1] add pCRow0, pCRow0, #32 @@ -682,41 +710,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x4 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI mov pCRow1, pCRow0 fmul d0, d16, alphaV0_R fmls d0, d17, alphaV0_I - fmul d1, d16, alphaV1_I - fmla d1, d17, alphaV1_R + fmul d1, d16, alphaV0_I + fmla d1, d17, alphaV0_R st2 {v0.d, v1.d}[0], [pCRow1] add pCRow1, pCRow1, LDC fmul d4, d20, alphaV0_R fmls d4, d21, alphaV0_I - fmul d5, d20, alphaV1_I - fmla d5, d21, alphaV1_R + fmul d5, d20, alphaV0_I + fmla d5, d21, alphaV0_R st2 {v4.d, v5.d}[0], [pCRow1] add pCRow1, pCRow1, LDC fmul d0, d24, alphaV0_R fmls d0, d25, alphaV0_I - fmul d1, d24, alphaV1_I - fmla d1, d25, alphaV1_R + fmul d1, d24, alphaV0_I + fmla d1, d25, alphaV0_R st2 {v0.d, v1.d}[0], [pCRow1] add pCRow1, pCRow1, LDC fmul d4, d28, alphaV0_R fmls d4, d29, alphaV0_I - fmul d5, d28, alphaV1_I - fmla d5, d29, alphaV1_R + fmul d5, d28, alphaV0_I + fmla d5, d29, alphaV0_R st2 {v4.d, v5.d}[0], [pCRow1] add pCRow0, pCRow0, #16 @@ -765,37 +791,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro SAVE4x2 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI mov pCRow1, pCRow0 fmul v0.2d, v16.2d, alphaV0_R fmls v0.2d, v17.2d, alphaV0_I - fmul v1.2d, v16.2d, alphaV1_I - fmla v1.2d, v17.2d, alphaV1_R + fmul v1.2d, v16.2d, alphaV0_I + fmla v1.2d, v17.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow1] add pCRow2, pCRow1, #32 fmul v2.2d, v18.2d, alphaV0_R fmls v2.2d, v19.2d, alphaV0_I - fmul v3.2d, v18.2d, alphaV1_I - fmla v3.2d, v19.2d, alphaV1_R + fmul v3.2d, v18.2d, alphaV0_I + fmla v3.2d, v19.2d, alphaV0_R st2 {v2.2d, v3.2d}, [pCRow2] add pCRow1, pCRow1, LDC fmul v4.2d, v20.2d, alphaV0_R fmls v4.2d, v21.2d, alphaV0_I - fmul v5.2d, v20.2d, alphaV1_I - fmla v5.2d, v21.2d, alphaV1_R + fmul v5.2d, v20.2d, alphaV0_I + fmla v5.2d, v21.2d, alphaV0_R st2 {v4.2d, v5.2d}, [pCRow1] add pCRow2, pCRow1, #32 fmul v6.2d, v22.2d, alphaV0_R fmls v6.2d, v23.2d, alphaV0_I - fmul v7.2d, v22.2d, alphaV1_I - fmla v7.2d, v23.2d, alphaV1_R + fmul v7.2d, v22.2d, alphaV0_I + fmla v7.2d, v23.2d, alphaV0_R st2 {v6.2d, v7.2d}, [pCRow2] add pCRow0, pCRow0, #64 @@ -828,25 +852,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro SAVE2x2 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI mov pCRow1, pCRow0 fmul v0.2d, v16.2d, alphaV0_R fmls v0.2d, v17.2d, alphaV0_I - fmul v1.2d, v16.2d, alphaV1_I - fmla v1.2d, v17.2d, alphaV1_R + fmul v1.2d, v16.2d, alphaV0_I + fmla v1.2d, v17.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow1] add pCRow1, pCRow1, LDC fmul v4.2d, v20.2d, alphaV0_R fmls v4.2d, v21.2d, alphaV0_I - fmul v5.2d, v20.2d, alphaV1_I - fmla v5.2d, v21.2d, alphaV1_R + fmul v5.2d, v20.2d, alphaV0_I + fmla v5.2d, v21.2d, alphaV0_R st2 {v4.2d, v5.2d}, [pCRow1] add pCRow0, pCRow0, #32 @@ -879,25 +901,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x2 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI mov pCRow1, pCRow0 fmul d0, d16, alphaV0_R fmls d0, d17, alphaV0_I - fmul d1, d16, alphaV1_I - fmla d1, d17, alphaV1_R + fmul d1, d16, alphaV0_I + fmla d1, d17, alphaV0_R st2 {v0.d, v1.d}[0], [pCRow1] add pCRow1, pCRow1, LDC fmul d4, d20, alphaV0_R fmls d4, d21, alphaV0_I - fmul d5, d20, alphaV1_I - fmla d5, d21, alphaV1_R + fmul d5, d20, alphaV0_I + fmla d5, d21, alphaV0_R st2 {v4.d, v5.d}[0], [pCRow1] add pCRow0, pCRow0, #16 @@ -932,23 +952,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro SAVE4x1 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI mov pCRow1, pCRow0 fmul v0.2d, v16.2d, alphaV0_R fmls v0.2d, v17.2d, alphaV0_I - fmul v1.2d, v16.2d, alphaV1_I - fmla v1.2d, v17.2d, alphaV1_R + fmul v1.2d, v16.2d, alphaV0_I + fmla v1.2d, v17.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow1] add pCRow2, pCRow1, #32 fmul v2.2d, v18.2d, alphaV0_R fmls v2.2d, v19.2d, alphaV0_I - fmul v3.2d, v18.2d, alphaV1_I - fmla v3.2d, v19.2d, alphaV1_R + fmul v3.2d, v18.2d, alphaV0_I + fmla v3.2d, v19.2d, alphaV0_R st2 {v2.2d, v3.2d}, [pCRow2] add pCRow0, pCRow0, #64 @@ -974,17 +992,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE2x1 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI mov pCRow1, pCRow0 fmul v0.2d, v16.2d, alphaV0_R fmls v0.2d, v17.2d, alphaV0_I - fmul v1.2d, v16.2d, alphaV1_I - fmla v1.2d, v17.2d, alphaV1_R + fmul v1.2d, v16.2d, alphaV0_I + fmla v1.2d, v17.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow1] add pCRow0, pCRow0, #32 @@ -1011,17 +1027,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x1 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI mov pCRow1, pCRow0 fmul d0, d16, alphaV0_R fmls d0, d17, alphaV0_I - fmul d1, d16, alphaV1_I - fmla d1, d17, alphaV1_R + fmul d1, d16, alphaV0_I + fmla d1, d17, alphaV0_R st2 {v0.d, v1.d}[0], [pCRow1] add pCRow0, pCRow0, #16 @@ -1047,8 +1061,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] - fmov alpha_save_R, d0 - fmov alpha_save_I, d1 + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alphaR, d0 + fmov alphaI, d1 lsl LDC, LDC, #4 // ldc = ldc * 2 * 8 @@ -1064,8 +1081,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ble ztrmm_kernel_L2_BEGIN ztrmm_kernel_L4_BEGIN: - mov pCRow0, pC // pCRow0 = C - add pC, pC, LDC, lsl #2 + mov pCRow0, pC + add pCRow1, pCRow0, LDC + add pCRow2, pCRow1, LDC + add pCRow3, pCRow2, LDC + + add pC, pCRow3, LDC + #if defined(LEFT) mov tempOffset, offset @@ -1079,6 +1101,7 @@ ztrmm_kernel_L4_M4_BEGIN: cmp counterI, #0 ble ztrmm_kernel_L4_M2_BEGIN + .align 5 ztrmm_kernel_L4_M4_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -1098,39 +1121,64 @@ ztrmm_kernel_L4_M4_20: add tempK, tempOffset, #4 #endif - asr counterL , tempK, #1 // L = K / 2 - cmp counterL , #2 // is there at least 4 to do? + asr counterL , tempK, #3 + cmp counterL , #2 blt ztrmm_kernel_L4_M4_32 - KERNEL4x4_I // do one in the K - KERNEL4x4_M2 // do another in the K + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 subs counterL, counterL, #2 ble ztrmm_kernel_L4_M4_22a - .align 5 + .align 5 ztrmm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 subs counterL, counterL, #1 bgt ztrmm_kernel_L4_M4_22 - + .align 5 ztrmm_kernel_L4_M4_22a: + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_E b ztrmm_kernel_L4_M4_44 + .align 5 ztrmm_kernel_L4_M4_32: tst counterL, #1 ble ztrmm_kernel_L4_M4_40 KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 KERNEL4x4_E b ztrmm_kernel_L4_M4_44 @@ -1142,12 +1190,16 @@ ztrmm_kernel_L4_M4_40: ztrmm_kernel_L4_M4_44: - ands counterL , tempK, #1 + 
ands counterL , tempK, #7 ble ztrmm_kernel_L4_M4_100 + .align 5 ztrmm_kernel_L4_M4_46: KERNEL4x4_SUB + subs counterL, counterL, #1 + bne ztrmm_kernel_L4_M4_46 + ztrmm_kernel_L4_M4_100: SAVE4x4 @@ -1167,6 +1219,10 @@ ztrmm_kernel_L4_M4_100: add tempOffset, tempOffset, #4 #endif + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + ztrmm_kernel_L4_M4_END: subs counterI, counterI, #1 bne ztrmm_kernel_L4_M4_20 diff --git a/param.h b/param.h index fdc9d1104..7635cb8fc 100644 --- a/param.h +++ b/param.h @@ -2341,13 +2341,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x03fffUL -#define SGEMM_DEFAULT_UNROLL_M 4 +#define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 -#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 8 #define DGEMM_DEFAULT_UNROLL_N 4 -#define CGEMM_DEFAULT_UNROLL_M 4 +#define CGEMM_DEFAULT_UNROLL_M 8 #define CGEMM_DEFAULT_UNROLL_N 4 #define ZGEMM_DEFAULT_UNROLL_M 4 From 9b69d8a8e5c6d3d4ccf95413cbf79bc4ad58bb87 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 14 Jul 2016 11:41:57 +0200 Subject: [PATCH 56/70] Update zgetrf2.f Trivial typo correction (ZERBLA => XERBLA) to fix #910 --- lapack-netlib/SRC/zgetrf2.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/SRC/zgetrf2.f b/lapack-netlib/SRC/zgetrf2.f index 290d4847e..7d28b5812 100644 --- a/lapack-netlib/SRC/zgetrf2.f +++ b/lapack-netlib/SRC/zgetrf2.f @@ -144,7 +144,7 @@ EXTERNAL DLAMCH, IZAMAX * .. * .. External Subroutines .. - EXTERNAL ZGEMM, ZSCAL, ZLASWP, ZTRSM, ZERBLA + EXTERNAL ZGEMM, ZSCAL, ZLASWP, ZTRSM, XERBLA * .. * .. Intrinsic Functions .. 
INTRINSIC MAX, MIN From 7de829f713dcf590c2877ca5e9674ff5a3060852 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 14 Jul 2016 12:22:55 +0200 Subject: [PATCH 57/70] Update dynamic.c Add Braswell (extended model 4, model 12) N3150 as Nehalem --- driver/others/dynamic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 9e8cce438..18f85c316 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -261,8 +261,8 @@ static gotoblas_t *get_coretype(void){ return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } } - //Intel Avoton - if (model == 13) { + //Intel Braswell / Avoton + if (model == 12 || model == 13) { openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; } From 97bd1e42c87ed72bcc2d7d124a679b8fcd67642d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 14 Jul 2016 12:25:17 +0200 Subject: [PATCH 58/70] Update cpuid_x86.c --- cpuid_x86.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpuid_x86.c b/cpuid_x86.c index e5938803d..4ce6a96f0 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1678,6 +1678,8 @@ int get_coretype(void){ #endif else return CORE_NEHALEM; + case 12: + // Braswell case 13: // Avoton return CORE_NEHALEM; From 154729908e8450d2e7d0523354b6c34ebf4b12f2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 14 Jul 2016 17:29:34 +0200 Subject: [PATCH 59/70] Update cpuid_x86.c --- cpuid_x86.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpuid_x86.c b/cpuid_x86.c index 4ce6a96f0..bbd377f67 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1172,6 +1172,8 @@ int get_cpuname(void){ #endif else return CPUTYPE_NEHALEM; + case 12: + // Braswell case 13: // Avoton return CPUTYPE_NEHALEM; From 7f28cd1f88145a701e5dbbf50558bb65fce79f61 Mon Sep 17 00:00:00 2001 From: Vicente Olivert Riera Date: Thu, 14 Jul 2016 17:20:51 +0100 Subject: [PATCH 60/70] Complete support for MIPS n32 ABI Signed-off-by: Vicente Olivert 
Riera --- Makefile.system | 27 +++++++++------------------ c_check | 9 +++++++-- f_check | 7 ++++++- 3 files changed, 22 insertions(+), 21 deletions(-) diff --git a/Makefile.system b/Makefile.system index 24a7a6406..bbcdb8240 100644 --- a/Makefile.system +++ b/Makefile.system @@ -502,13 +502,16 @@ endif ifdef NO_BINARY_MODE -ifeq ($(ARCH), $(filter $(ARCH),mips64 mips)) +ifeq ($(ARCH), $(filter $(ARCH),mips64)) ifdef BINARY64 CCOMMON_OPT += -mabi=64 else -CCOMMON_OPT += -mabi=32 +CCOMMON_OPT += -mabi=n32 endif BINARY_DEFINED = 1 +else ifeq ($(ARCH), $(filter $(ARCH),mips)) +CCOMMON_OPT += -mabi=32 +BINARY_DEFINED = 1 endif ifeq ($(CORE), LOONGSON3A) @@ -599,12 +602,14 @@ ifneq ($(NO_LAPACK), 1) EXTRALIB += -lgfortran endif ifdef NO_BINARY_MODE -ifeq ($(ARCH), $(filter $(ARCH),mips64 mips)) +ifeq ($(ARCH), $(filter $(ARCH),mips64)) ifdef BINARY64 FCOMMON_OPT += -mabi=64 else -FCOMMON_OPT += -mabi=32 +FCOMMON_OPT += -mabi=n32 endif +else ifeq ($(ARCH), $(filter $(ARCH),mips)) +FCOMMON_OPT += -mabi=32 endif else ifdef BINARY64 @@ -688,20 +693,6 @@ endif endif endif -ifeq ($(filter $(ARCH),mips64 mips)) -ifndef BINARY64 -FCOMMON_OPT += -m32 -else -FCOMMON_OPT += -m64 -endif -else -ifdef BINARY64 -FCOMMON_OPT += -mabi=64 -else -FCOMMON_OPT += -mabi=32 -endif -endif - ifeq ($(USE_OPENMP), 1) FCOMMON_OPT += -mp endif diff --git a/c_check b/c_check index 50ff360a2..9f457dfec 100644 --- a/c_check +++ b/c_check @@ -79,8 +79,13 @@ if ($os eq "AIX") { $defined = 1; } -if (($architecture eq "mips") || ($architecture eq "mips64")) { - $compiler_name .= " -mabi=32" if ($binary eq "32"); +if ($architecture eq "mips") { + $compiler_name .= " -mabi=32"; + $defined = 1; +} + +if ($architecture eq "mips64") { + $compiler_name .= " -mabi=n32" if ($binary eq "32"); $compiler_name .= " -mabi=64" if ($binary eq "64"); $defined = 1; } diff --git a/f_check b/f_check index 4c03ac768..3520e8b06 100644 --- a/f_check +++ b/f_check @@ -223,7 +223,12 @@ if (!$?) { } #For gfortran MIPS if ($?) 
{ - $link = `$compiler $openmp -mabi=32 -v ftest2.f 2>&1 && rm -f a.out a.exe`; + $mips_data = `$compiler_bin -E -dM - < /dev/null`; + if ($mips_data =~ /_MIPS_ISA_MIPS64/) { + $link = `$compiler $openmp -mabi=n32 -v ftest2.f 2>&1 && rm -f a.out a.exe`; + } else { + $link = `$compiler $openmp -mabi=32 -v ftest2.f 2>&1 && rm -f a.out a.exe`; + } } $binary = "" if ($?); } From 9e44f3ddd0abcb3f3a8bfa6518d77ac902337b62 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Thu, 14 Jul 2016 13:09:36 -0700 Subject: [PATCH 61/70] Refs #917 Avoid detecting gfortran bug on IBM POWER + Ubuntu --- f_check | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/f_check b/f_check index 3520e8b06..c7ad964e0 100644 --- a/f_check +++ b/f_check @@ -114,7 +114,7 @@ if ($compiler eq "") { $openmp = "-mp"; } - if ($data =~ /IBM/) { + if ($data =~ /IBM XL/) { $vendor = IBM; $openmp = "-openmp"; } From beb1d076a469a5a4a72c76b7eb3bf1ccf3e64a61 Mon Sep 17 00:00:00 2001 From: Shivraj Patil Date: Fri, 15 Jul 2016 18:38:25 +0530 Subject: [PATCH 62/70] Added MSA optimization for GEMV_N, GEMV_T, ASUM, DOT functions Signed-off-by: Shivraj Patil --- Makefile.system | 2 +- TargetList.txt | 1 + cpuid_mips64.c | 17 +- getarch.c | 15 + kernel/mips/KERNEL.P5600 | 32 +- kernel/mips/casum_msa.c | 338 +++++++++++++++++++ kernel/mips/cdot_msa.c | 361 ++++++++++++++++++++ kernel/mips/cgemv_n_msa.c | 611 +++++++++++++++++++++++++++++++++ kernel/mips/cgemv_t_msa.c | 583 ++++++++++++++++++++++++++++++++ kernel/mips/dasum_msa.c | 278 ++++++++++++++++ kernel/mips/ddot_msa.c | 189 +++++++++++ kernel/mips/dgemv_n_msa.c | 577 ++++++++++++++++++++++++++++++++ kernel/mips/dgemv_t_msa.c | 589 ++++++++++++++++++++++++++++++++ kernel/mips/sasum_msa.c | 333 ++++++++++++++++++ kernel/mips/sdot_msa.c | 208 ++++++++++++ kernel/mips/sgemv_n_msa.c | 515 ++++++++++++++++++++++++++++ kernel/mips/sgemv_t_msa.c | 463 +++++++++++++++++++++++++ kernel/mips/zasum_msa.c | 170 ++++++++++ kernel/mips/zdot_msa.c | 227 
+++++++++++++ kernel/mips/zgemv_n_msa.c | 667 +++++++++++++++++++++++++++++++++++++ kernel/mips/zgemv_t_msa.c | 544 ++++++++++++++++++++++++++++++ kernel/mips64/KERNEL.P6600 | 1 + param.h | 4 +- 23 files changed, 6705 insertions(+), 20 deletions(-) create mode 100644 kernel/mips/casum_msa.c create mode 100644 kernel/mips/cdot_msa.c create mode 100644 kernel/mips/cgemv_n_msa.c create mode 100644 kernel/mips/cgemv_t_msa.c create mode 100644 kernel/mips/dasum_msa.c create mode 100644 kernel/mips/ddot_msa.c create mode 100644 kernel/mips/dgemv_n_msa.c create mode 100644 kernel/mips/dgemv_t_msa.c create mode 100644 kernel/mips/sasum_msa.c create mode 100644 kernel/mips/sdot_msa.c create mode 100644 kernel/mips/sgemv_n_msa.c create mode 100644 kernel/mips/sgemv_t_msa.c create mode 100644 kernel/mips/zasum_msa.c create mode 100644 kernel/mips/zdot_msa.c create mode 100644 kernel/mips/zgemv_n_msa.c create mode 100644 kernel/mips/zgemv_t_msa.c create mode 100644 kernel/mips64/KERNEL.P6600 diff --git a/Makefile.system b/Makefile.system index bbcdb8240..1c48a251c 100644 --- a/Makefile.system +++ b/Makefile.system @@ -529,7 +529,7 @@ CCOMMON_OPT += -mmsa FCOMMON_OPT += -mmsa endif -ifeq ($(CORE), I6400) +ifneq ($(filter $(CORE), I6400 P6600),) CCOMMON_OPT += -mmsa FCOMMON_OPT += -mmsa endif diff --git a/TargetList.txt b/TargetList.txt index 248f643a7..52a60b49c 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -61,6 +61,7 @@ SICORTEX LOONGSON3A LOONGSON3B I6400 +P6600 5.IA64 CPU: ITANIUM2 diff --git a/cpuid_mips64.c b/cpuid_mips64.c index 13f1517d5..ac1554c79 100644 --- a/cpuid_mips64.c +++ b/cpuid_mips64.c @@ -75,13 +75,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define CPU_LOONGSON3A 2 #define CPU_LOONGSON3B 3 #define CPU_I6400 4 +#define CPU_P6600 5 static char *cpuname[] = { "UNKOWN", "SICORTEX", "LOONGSON3A", "LOONGSON3B", - "I6400" + "I6400", + "P6600" }; int detect(void){ @@ -161,6 +163,8 @@ void get_subarchitecture(void){ printf("LOONGSON3B"); }else if(detect()==CPU_I6400){ printf("I6400"); + }else if(detect()==CPU_P6600){ + printf("P6600"); }else{ printf("SICORTEX"); } @@ -198,6 +202,15 @@ void get_cpuconfig(void){ printf("#define DTB_DEFAULT_ENTRIES 64\n"); printf("#define DTB_SIZE 4096\n"); printf("#define L2_ASSOCIATIVE 8\n"); + }else if(detect()==CPU_P6600){ + printf("#define P6600\n"); + printf("#define L1_DATA_SIZE 65536\n"); + printf("#define L1_DATA_LINESIZE 32\n"); + printf("#define L2_SIZE 1048576\n"); + printf("#define L2_LINESIZE 32\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 8\n"); }else{ printf("#define SICORTEX\n"); printf("#define L1_DATA_SIZE 32768\n"); @@ -217,6 +230,8 @@ void get_libname(void){ printf("loongson3b\n"); }else if(detect()==CPU_I6400) { printf("i6400\n"); + }else if(detect()==CPU_P6600) { + printf("p6600\n"); }else{ printf("mips64\n"); } diff --git a/getarch.c b/getarch.c index 2f5d18a01..f8069e507 100644 --- a/getarch.c +++ b/getarch.c @@ -132,6 +132,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* #define FORCE_LOONGSON3A */ /* #define FORCE_LOONGSON3B */ /* #define FORCE_I6400 */ +/* #define FORCE_P6600 */ /* #define FORCE_P5600 */ /* #define FORCE_ITANIUM2 */ /* #define FORCE_SPARC */ @@ -715,6 +716,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#else #endif +#ifdef FORCE_P6600 +#define FORCE +#define ARCHITECTURE "MIPS" +#define SUBARCHITECTURE "P6600" +#define SUBDIRNAME "mips64" +#define ARCHCONFIG "-DP6600 " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " +#define LIBNAME "p6600" +#define CORENAME "P6600" +#else +#endif + #ifdef FORCE_P5600 #define FORCE #define ARCHITECTURE "MIPS" diff --git a/kernel/mips/KERNEL.P5600 b/kernel/mips/KERNEL.P5600 index 5d8bcb9ec..7bf90c905 100644 --- a/kernel/mips/KERNEL.P5600 +++ b/kernel/mips/KERNEL.P5600 @@ -30,10 +30,10 @@ IDMAXKERNEL = ../mips/imax.c ISMINKERNEL = ../mips/imin.c IDMINKERNEL = ../mips/imin.c -SASUMKERNEL = ../mips/asum.c -DASUMKERNEL = ../mips/asum.c -CASUMKERNEL = ../mips/zasum.c -ZASUMKERNEL = ../mips/zasum.c +SASUMKERNEL = ../mips/sasum_msa.c +DASUMKERNEL = ../mips/dasum_msa.c +CASUMKERNEL = ../mips/casum_msa.c +ZASUMKERNEL = ../mips/zasum_msa.c SAXPYKERNEL = ../mips/axpy.c DAXPYKERNEL = ../mips/axpy.c @@ -45,10 +45,10 @@ DCOPYKERNEL = ../mips/copy.c CCOPYKERNEL = ../mips/zcopy.c ZCOPYKERNEL = ../mips/zcopy.c -SDOTKERNEL = ../mips/dot.c -DDOTKERNEL = ../mips/dot.c -CDOTKERNEL = ../mips/zdot.c -ZDOTKERNEL = ../mips/zdot.c +SDOTKERNEL = ../mips/sdot_msa.c +DDOTKERNEL = ../mips/ddot_msa.c +CDOTKERNEL = ../mips/cdot_msa.c +ZDOTKERNEL = ../mips/zdot_msa.c SNRM2KERNEL = ../mips/nrm2.c DNRM2KERNEL = ../mips/nrm2.c @@ -70,15 +70,15 @@ DSWAPKERNEL = ../mips/swap.c CSWAPKERNEL = ../mips/zswap.c ZSWAPKERNEL = ../mips/zswap.c -SGEMVNKERNEL = ../mips/gemv_n.c -DGEMVNKERNEL = ../mips/gemv_n.c -CGEMVNKERNEL = ../mips/zgemv_n.c -ZGEMVNKERNEL = ../mips/zgemv_n.c +SGEMVNKERNEL = ../mips/sgemv_n_msa.c +DGEMVNKERNEL = ../mips/dgemv_n_msa.c +CGEMVNKERNEL = ../mips/cgemv_n_msa.c +ZGEMVNKERNEL = ../mips/zgemv_n_msa.c -SGEMVTKERNEL = ../mips/gemv_t.c -DGEMVTKERNEL = ../mips/gemv_t.c -CGEMVTKERNEL = ../mips/zgemv_t.c -ZGEMVTKERNEL = ../mips/zgemv_t.c 
+SGEMVTKERNEL = ../mips/sgemv_t_msa.c +DGEMVTKERNEL = ../mips/dgemv_t_msa.c +CGEMVTKERNEL = ../mips/cgemv_t_msa.c +ZGEMVTKERNEL = ../mips/zgemv_t_msa.c SGEMMKERNEL = ../mips/sgemm_kernel_8x8_msa.c SGEMMONCOPY = ../mips/sgemm_ncopy_8_msa.c diff --git a/kernel/mips/casum_msa.c b/kernel/mips/casum_msa.c new file mode 100644 index 000000000..454573d56 --- /dev/null +++ b/kernel/mips/casum_msa.c @@ -0,0 +1,338 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include <math.h> /* fabsf() for the scalar tail of the contiguous path */
+#include "macros_msa.h"
+/* AND_VEC_W(in): bitwise-AND a vector with and_vec (0x7FFFFFFF in every 32-bit lane), i.e. clear each lane's top bit; and_vec must be in scope. */
+#define AND_VEC_W(in) ((v4f32) ((v4i32) in & and_vec))
+/* Returns the sum of |re| + |im| over n complex elements of x, consecutive elements being inc_x apart; returns 0 when n <= 0 or inc_x <= 0. */
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    BLASLONG i, inc_x2;
+    FLOAT sumf = 0.0;
+    v4f32 src0, src1, src2, src3, src4, src5, src6, src7;
+    v4f32 sum_abs0, sum_abs1, sum_abs2, sum_abs3; /* four independent vector accumulators */
+    v4f32 zero_v = {0};
+    v4i32 and_vec = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};
+
+    if (n <= 0 || inc_x <= 0) return (sumf); /* nothing to sum: sumf is still 0.0 */
+
+    if (1 == inc_x) /* contiguous input: every lane of every load is used */
+    {
+        if (n > 15) /* seed the accumulators from the first 8 loads instead of adding to zero */
+        {
+            n -= 16;
+
+            LD_SP8_INC(x, 4, src0, src1, src2, src3, src4, src5, src6, src7);
+
+            sum_abs0 = AND_VEC_W(src0);
+            sum_abs1 = AND_VEC_W(src1);
+            sum_abs2 = AND_VEC_W(src2);
+            sum_abs3 = AND_VEC_W(src3);
+            sum_abs0 += AND_VEC_W(src4);
+            sum_abs1 += AND_VEC_W(src5);
+            sum_abs2 += AND_VEC_W(src6);
+            sum_abs3 += AND_VEC_W(src7);
+        }
+        else
+        {
+            sum_abs0 = zero_v;
+            sum_abs1 = zero_v;
+            sum_abs2 = zero_v;
+            sum_abs3 = zero_v;
+        }
+
+        for (i = (n >> 4); i--;) /* 8 loads per pass, one pass per 16 remaining elements */
+        {
+            LD_SP8_INC(x, 4, src0, src1, src2, src3, src4, src5, src6, src7);
+
+            sum_abs0 += AND_VEC_W(src0);
+            sum_abs1 += AND_VEC_W(src1);
+            sum_abs2 += AND_VEC_W(src2);
+            sum_abs3 += AND_VEC_W(src3);
+            sum_abs0 += AND_VEC_W(src4);
+            sum_abs1 += AND_VEC_W(src5);
+            sum_abs2 += AND_VEC_W(src6);
+            sum_abs3 += AND_VEC_W(src7);
+        }
+
+        if (n & 15) /* leftover: bits 8/4/2 of n select 4/2/1 further loads, bit 1 the scalar tail */
+        {
+            if ((n & 8) && (n & 4) && (n & 2))
+            {
+                LD_SP7_INC(x, 4, src0, src1, src2, src3, src4, src5, src6);
+
+                sum_abs0 += AND_VEC_W(src0);
+                sum_abs1 += AND_VEC_W(src1);
+                sum_abs2 += AND_VEC_W(src2);
+                sum_abs3 += AND_VEC_W(src3);
+                sum_abs0 += AND_VEC_W(src4);
+                sum_abs1 += AND_VEC_W(src5);
+                sum_abs2 += AND_VEC_W(src6);
+
+                sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
+
+                sumf = sum_abs0[0];
+                sumf += sum_abs0[1];
+                sumf += sum_abs0[2];
+                sumf += sum_abs0[3];
+            }
+            else if ((n & 8) && (n & 4))
+            {
+                LD_SP6_INC(x, 4, src0, src1, src2, src3, src4, src5);
+
+                sum_abs0 += AND_VEC_W(src0);
+                sum_abs1 += AND_VEC_W(src1);
+                sum_abs2 += AND_VEC_W(src2);
+                sum_abs3 += AND_VEC_W(src3);
+                sum_abs0 += AND_VEC_W(src4);
+                sum_abs1 += AND_VEC_W(src5);
+
+                sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
+
+                sumf = sum_abs0[0];
+                sumf += sum_abs0[1];
+                sumf += sum_abs0[2];
+                sumf += sum_abs0[3];
+            }
+            else if ((n & 8) && (n & 2))
+            {
+                LD_SP5_INC(x, 4, src0, src1, src2, src3, src4);
+
+                sum_abs0 += AND_VEC_W(src0);
+                sum_abs1 += AND_VEC_W(src1);
+                sum_abs2 += AND_VEC_W(src2);
+                sum_abs3 += AND_VEC_W(src3);
+                sum_abs0 += AND_VEC_W(src4);
+
+                sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
+
+                sumf = sum_abs0[0];
+                sumf += sum_abs0[1];
+                sumf += sum_abs0[2];
+                sumf += sum_abs0[3];
+            }
+            else if ((n & 4) && (n & 2))
+            {
+                LD_SP3_INC(x, 4, src0, src1, src2);
+
+                sum_abs0 += AND_VEC_W(src0);
+                sum_abs1 += AND_VEC_W(src1);
+                sum_abs2 += AND_VEC_W(src2);
+
+                sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
+
+                sumf = sum_abs0[0];
+                sumf += sum_abs0[1];
+                sumf += sum_abs0[2];
+                sumf += sum_abs0[3];
+            }
+            else if (n & 8)
+            {
+                LD_SP4_INC(x, 4, src0, src1, src2, src3);
+
+                sum_abs0 += AND_VEC_W(src0);
+                sum_abs1 += AND_VEC_W(src1);
+                sum_abs2 += AND_VEC_W(src2);
+                sum_abs3 += AND_VEC_W(src3);
+
+                sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
+
+                sumf = sum_abs0[0];
+                sumf += sum_abs0[1];
+                sumf += sum_abs0[2];
+                sumf += sum_abs0[3];
+            }
+            else if (n & 4)
+            {
+                LD_SP2_INC(x, 4, src0, src1);
+
+                sum_abs0 += AND_VEC_W(src0);
+                sum_abs1 += AND_VEC_W(src1);
+
+                sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
+
+                sumf = sum_abs0[0];
+                sumf += sum_abs0[1];
+                sumf += sum_abs0[2];
+                sumf += sum_abs0[3];
+            }
+            else if (n & 2)
+            {
+                src0 = LD_SP(x); x += 4;
+
+                sum_abs0 += AND_VEC_W(src0);
+
+                sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
+
+                sumf = sum_abs0[0];
+                sumf += sum_abs0[1];
+                sumf += sum_abs0[2];
+                sumf += sum_abs0[3];
+            }
+            else /* only the scalar tail is left: just fold the accumulators */
+            {
+                sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
+
+                sumf = sum_abs0[0];
+                sumf += sum_abs0[1];
+                sumf += sum_abs0[2];
+                sumf += sum_abs0[3];
+            }
+
+            if (n & 1) /* last element: its two FLOATs are added with scalar fabsf */
+            {
+                sumf += fabsf(*(x + 0));
+                sumf += fabsf(*(x + 1));
+            }
+        }
+        else
+        {
+            sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
+
+            sumf = sum_abs0[0];
+            sumf += sum_abs0[1];
+            sumf += sum_abs0[2];
+            sumf += sum_abs0[3];
+        }
+    }
+    else /* strided input: one load per element */
+    {
+        inc_x2 = 2 * inc_x; /* pointer step between consecutive elements, in FLOATs */
+
+        if (n > 8) /* seed the accumulators from the first 8 elements */
+        {
+            n -= 8;
+
+            LD_SP8_INC(x, inc_x2, src0, src1, src2, src3, src4, src5, src6, src7);
+
+            sum_abs0 = AND_VEC_W(src0);
+            sum_abs1 = AND_VEC_W(src1);
+            sum_abs2 = AND_VEC_W(src2);
+            sum_abs3 = AND_VEC_W(src3);
+            sum_abs0 += AND_VEC_W(src4);
+            sum_abs1 += AND_VEC_W(src5);
+            sum_abs2 += AND_VEC_W(src6);
+            sum_abs3 += AND_VEC_W(src7);
+        }
+        else
+        {
+            sum_abs0 = zero_v;
+            sum_abs1 = zero_v;
+            sum_abs2 = zero_v;
+            sum_abs3 = zero_v;
+        }
+
+        for (i = (n >> 3); i--;) /* 8 elements per pass */
+        {
+            LD_SP8_INC(x, inc_x2, src0, src1, src2, src3, src4, src5, src6, src7);
+
+            sum_abs0 += AND_VEC_W(src0);
+            sum_abs1 += AND_VEC_W(src1);
+            sum_abs2 += AND_VEC_W(src2);
+            sum_abs3 += AND_VEC_W(src3);
+            sum_abs0 += AND_VEC_W(src4);
+            sum_abs1 += AND_VEC_W(src5);
+            sum_abs2 += AND_VEC_W(src6);
+            sum_abs3 += AND_VEC_W(src7);
+        }
+
+        if (n & 7) /* leftover: bits 4/2/1 of n select 4/2/1 further loads */
+        {
+            if ((n & 4) && (n & 2) && (n & 1))
+            {
+                LD_SP7_INC(x, inc_x2, src0, src1, src2, src3, src4, src5, src6);
+
+                sum_abs0 += AND_VEC_W(src0);
+                sum_abs1 += AND_VEC_W(src1);
+                sum_abs2 += AND_VEC_W(src2);
+                sum_abs3 += AND_VEC_W(src3);
+                sum_abs0 += AND_VEC_W(src4);
+                sum_abs1 += AND_VEC_W(src5);
+                sum_abs2 += AND_VEC_W(src6);
+            }
+            else if ((n & 4) && (n & 2))
+            {
+                LD_SP6_INC(x, inc_x2, src0, src1, src2, src3, src4, src5);
+
+                sum_abs0 += AND_VEC_W(src0);
+                sum_abs1 += AND_VEC_W(src1);
+                sum_abs2 += AND_VEC_W(src2);
+                sum_abs3 += AND_VEC_W(src3);
+                sum_abs0 += AND_VEC_W(src4);
+                sum_abs1 += AND_VEC_W(src5);
+            }
+            else if ((n & 4) && (n & 1))
+            {
+                LD_SP5_INC(x, inc_x2, src0, src1, src2, src3, src4);
+
+                sum_abs0 += AND_VEC_W(src0);
+                sum_abs1 += AND_VEC_W(src1);
+                sum_abs2 += AND_VEC_W(src2);
+                sum_abs3 += AND_VEC_W(src3);
+                sum_abs0 += AND_VEC_W(src4);
+            }
+            else if ((n & 2) && (n & 1))
+            {
+                LD_SP3_INC(x, inc_x2, src0, src1, src2);
+
+                sum_abs0 += AND_VEC_W(src0);
+                sum_abs1 += AND_VEC_W(src1);
+                sum_abs2 += AND_VEC_W(src2);
+            }
+            else if (n & 4)
+            {
+                LD_SP4_INC(x, inc_x2, src0, src1, src2, src3);
+
+                sum_abs0 += AND_VEC_W(src0);
+                sum_abs1 += AND_VEC_W(src1);
+                sum_abs2 += AND_VEC_W(src2);
+                sum_abs3 += AND_VEC_W(src3);
+            }
+            else if (n & 2)
+            {
+                LD_SP2_INC(x, inc_x2, src0, src1);
+
+                sum_abs0 += AND_VEC_W(src0);
+                sum_abs1 += AND_VEC_W(src1);
+            }
+            else if (n & 1)
+            {
+                src0 = LD_SP(x); x += inc_x2;
+
+                sum_abs0 += AND_VEC_W(src0);
+            }
+        }
+
+        sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
+        /* NOTE(review): only lanes 0 and 1 are kept below; if LD_SP reads a full 4-FLOAT vector, the load at the last element touches two FLOATs past it - confirm. */
+        sumf = sum_abs0[0] + sum_abs0[1];
+    }
+
+    return (sumf);
+}
diff --git a/kernel/mips/cdot_msa.c b/kernel/mips/cdot_msa.c
new file mode 100644
index 000000000..bf9f6b7e2
--- /dev/null
+++ b/kernel/mips/cdot_msa.c
@@ -0,0 +1,361 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2.
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+/* OP2/OP3/OP4 pick the signs of the scalar update: without CONJ re += xr*yr - xi*yi, im += xi*yr + xr*yi; with CONJ re += xr*yr + xi*yi, im -= xi*yr - xr*yi. */
+#if !defined(CONJ)
+    #define OP2 +=
+    #define OP3 -
+    #define OP4 +
+#else
+    #define OP2 -=
+    #define OP3 +
+    #define OP4 -
+#endif
+/* DOTn_KERNEL(OPR0, OPR1): accumulate n/4 re/im vector pairs into dot0 (real) and dot1 (imaginary); OPR0/OPR1 are pasted onto '=' to form the += / -= of the xi*yi and xi*yr terms. */
+#define DOT16_KERNEL(OPR0, OPR1)      \
+    dot0 += (vx0r * vy0r);            \
+    dot0 OPR0## = (vx0i * vy0i);      \
+    dot1 OPR1## = (vx0i * vy0r);      \
+    dot1 += (vx0r * vy0i);            \
+                                      \
+    dot0 += (vx1r * vy1r);            \
+    dot0 OPR0## = (vx1i * vy1i);      \
+    dot1 OPR1## = (vx1i * vy1r);      \
+    dot1 += (vx1r * vy1i);            \
+                                      \
+    dot0 += (vx2r * vy2r);            \
+    dot0 OPR0## = (vx2i * vy2i);      \
+    dot1 OPR1## = (vx2i * vy2r);      \
+    dot1 += (vx2r * vy2i);            \
+                                      \
+    dot0 += (vx3r * vy3r);            \
+    dot0 OPR0## = (vx3i * vy3i);      \
+    dot1 OPR1## = (vx3i * vy3r);      \
+    dot1 += (vx3r * vy3i);
+
+#define DOT12_KERNEL(OPR0, OPR1)      \
+    dot0 += (vx0r * vy0r);            \
+    dot0 OPR0## = (vx0i * vy0i);      \
+    dot1 OPR1## = (vx0i * vy0r);      \
+    dot1 += (vx0r * vy0i);            \
+                                      \
+    dot0 += (vx1r * vy1r);            \
+    dot0 OPR0## = (vx1i * vy1i);      \
+    dot1 OPR1## = (vx1i * vy1r);      \
+    dot1 += (vx1r * vy1i);            \
+                                      \
+    dot0 += (vx2r * vy2r);            \
+    dot0 OPR0## = (vx2i * vy2i);      \
+    dot1 OPR1## = (vx2i * vy2r);      \
+    dot1 += (vx2r * vy2i);
+
+#define DOT8_KERNEL(OPR0, OPR1)       \
+    dot0 += (vx0r * vy0r);            \
+    dot0 OPR0## = (vx0i * vy0i);      \
+    dot1 OPR1## = (vx0i * vy0r);      \
+    dot1 += (vx0r * vy0i);            \
+                                      \
+    dot0 += (vx1r * vy1r);            \
+    dot0 OPR0## = (vx1i * vy1i);      \
+    dot1 OPR1## = (vx1i * vy1r);      \
+    dot1 += (vx1r * vy1i);
+
+#define DOT4_KERNEL(OPR0, OPR1)       \
+    dot0 += (vx0r * vy0r);            \
+    dot0 OPR0## = (vx0i * vy0i);      \
+    dot1 OPR1## = (vx0i * vy0r);      \
+    dot1 += (vx0r * vy0i);
+
+/* Complex dot product of n elements of x and y (element steps inc_x, inc_y); returns 0 + 0i when n < 1. */
+/* cdotc - CONJ: the x operand is conjugated */
+/* cdotu - !CONJ: plain product; <complex.h> is only pulled in for non-MSVC builds, which return FLOAT _Complex */
+#ifndef _MSC_VER
+#include <complex.h>
+FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+#else
+OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+#endif
+{
+    BLASLONG i = 0;
+    FLOAT dot[2]; /* scalar totals: dot[0] real part, dot[1] imaginary part */
+    BLASLONG inc_x2;
+    BLASLONG inc_y2;
+    FLOAT x0, x1, x2, x3, x4, x5, x6, x7;
+    FLOAT y0, y1, y2, y3, y4, y5, y6, y7;
+    v4f32 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7;
+    v4f32 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7;
+    v4f32 vx0r, vx0i, vx1r, vx1i, vx2r, vx2i, vx3r, vx3i;
+    v4f32 vy0r, vy0i, vy1r, vy1i, vy2r, vy2i, vy3r, vy3i;
+    v4f32 dot0 = {0, 0, 0, 0}; /* vector accumulator, real part */
+    v4f32 dot1 = {0, 0, 0, 0}; /* vector accumulator, imaginary part */
+    openblas_complex_float result;
+
+    dot[0] = 0.0;
+    dot[1] = 0.0;
+
+    __real__(result) = 0.0;
+    __imag__(result) = 0.0;
+
+    if ( n < 1 ) return(result); /* empty input: result is still 0 + 0i */
+
+    if ((1 == inc_x) && (1 == inc_y)) /* both operands contiguous: vector path */
+    {
+        for (i = (n >> 4); i--;) /* 16 elements (8 loads per operand) per pass */
+        {
+            LD_SP8_INC(x, 4, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7);
+            LD_SP8_INC(y, 4, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7);
+
+            PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i); /* presumably splits interleaved re/im into separate vectors (macros_msa.h) - confirm */
+            PCKEVOD_W2_SP(vx3, vx2, vx1r, vx1i);
+            PCKEVOD_W2_SP(vx5, vx4, vx2r, vx2i);
+            PCKEVOD_W2_SP(vx7, vx6, vx3r, vx3i);
+
+            PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i);
+            PCKEVOD_W2_SP(vy3, vy2, vy1r, vy1i);
+            PCKEVOD_W2_SP(vy5, vy4, vy2r, vy2i);
+            PCKEVOD_W2_SP(vy7, vy6, vy3r, vy3i);
+
+        #if !defined(CONJ)
+            DOT16_KERNEL(-, +);
+        #else
+            DOT16_KERNEL(+, -);
+        #endif
+        }
+
+        if (n & 15) /* leftover: bits 8/4 of n go through the vector kernels, bits 2/1 through scalars */
+        {
+            if ((n & 8) && (n & 4))
+            {
+                LD_SP4_INC(x, 4, vx0, vx1, vx2, vx3);
+                LD_SP4_INC(y, 4, vy0, vy1, vy2, vy3);
+                LD_SP2_INC(x, 4, vx4, vx5);
+                LD_SP2_INC(y, 4, vy4, vy5);
+
+                PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i);
+                PCKEVOD_W2_SP(vx3, vx2, vx1r, vx1i);
+                PCKEVOD_W2_SP(vx5, vx4, vx2r, vx2i);
+
+                PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i);
+                PCKEVOD_W2_SP(vy3, vy2, vy1r, vy1i);
+                PCKEVOD_W2_SP(vy5, vy4, vy2r, vy2i);
+
+            #if !defined(CONJ)
+                DOT12_KERNEL(-, +);
+            #else
+                DOT12_KERNEL(+, -);
+            #endif
+            }
+            else if (n & 8)
+            {
+                LD_SP4_INC(x, 4, vx0, vx1, vx2, vx3);
+                LD_SP4_INC(y, 4, vy0, vy1, vy2, vy3);
+
+                PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i);
+                PCKEVOD_W2_SP(vx3, vx2, vx1r, vx1i);
+
+                PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i);
+                PCKEVOD_W2_SP(vy3, vy2, vy1r, vy1i);
+
+            #if !defined(CONJ)
+                DOT8_KERNEL(-, +);
+            #else
+                DOT8_KERNEL(+, -);
+            #endif
+            }
+            else if (n & 4)
+            {
+                LD_SP2_INC(x, 4, vx0, vx1);
+                LD_SP2_INC(y, 4, vy0, vy1);
+                PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i);
+                PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i);
+
+            #if !defined(CONJ)
+                DOT4_KERNEL(-, +);
+            #else
+                DOT4_KERNEL(+, -);
+            #endif
+            }
+
+            if ((n & 2) && (n & 1)) /* three trailing elements, scalar */
+            {
+                LD_GP6_INC(x, 1, x0, x1, x2, x3, x4, x5);
+                LD_GP6_INC(y, 1, y0, y1, y2, y3, y4, y5);
+
+                dot[0] += ( x0 * y0 OP3 x1 * y1 );
+                dot[1] OP2 ( x1 * y0 OP4 x0 * y1 );
+
+                dot[0] += ( x2 * y2 OP3 x3 * y3 );
+                dot[1] OP2 ( x3 * y2 OP4 x2 * y3 );
+
+                dot[0] += ( x4 * y4 OP3 x5 * y5 );
+                dot[1] OP2 ( x5 * y4 OP4 x4 * y5 );
+            }
+            else if (n & 2)
+            {
+                LD_GP4_INC(x, 1, x0, x1, x2, x3);
+                LD_GP4_INC(y, 1, y0, y1, y2, y3);
+
+                dot[0] += ( x0 * y0 OP3 x1 * y1 );
+                dot[1] OP2 ( x1 * y0 OP4 x0 * y1 );
+
+                dot[0] += ( x2 * y2 OP3 x3 * y3 );
+                dot[1] OP2 ( x3 * y2 OP4 x2 * y3 );
+            }
+            else if (n & 1)
+            {
+                LD_GP2_INC(x, 1, x0, x1);
+                LD_GP2_INC(y, 1, y0, y1);
+
+                dot[0] += ( x0 * y0 OP3 x1 * y1 );
+                dot[1] OP2 ( x1 * y0 OP4 x0 * y1 );
+            }
+        }
+
+        dot[0] += (dot0[0] + dot0[1] + dot0[2] + dot0[3]); /* fold the vector accumulators into the scalar totals */
+        dot[1] += (dot1[0] + dot1[1] + dot1[2] + dot1[3]);
+    }
+    else /* at least one operand strided: scalar path, four elements per pass */
+    {
+        inc_x2 = 2 * inc_x; /* pointer steps between consecutive elements, in FLOATs */
+        inc_y2 = 2 * inc_y;
+
+        for (i = (n >> 2); i--;)
+        {
+            x0 = *x;
+            x1 = *(x + 1);
+            x += inc_x2;
+            x2 = *x;
+            x3 = *(x + 1);
+            x += inc_x2;
+            x4 = *x;
+            x5 = *(x + 1);
+            x += inc_x2;
+            x6 = *x;
+            x7 = *(x + 1);
+            x += inc_x2;
+
+            y0 = *y;
+            y1 = *(y + 1);
+            y += inc_y2;
+            y2 = *y;
+            y3 = *(y + 1);
+            y += inc_y2;
+            y4 = *y;
+            y5 = *(y + 1);
+            y += inc_y2;
+            y6 = *y;
+            y7 = *(y + 1);
+            y += inc_y2;
+
+            dot[0] += ( x0 * y0 OP3 x1 * y1 );
+            dot[1] OP2 ( x1 * y0 OP4 x0 * y1 );
+
+            dot[0] += ( x2 * y2 OP3 x3 * y3 );
+            dot[1] OP2 ( x3 * y2 OP4 x2 * y3 );
+
+            dot[0] += ( x4 * y4 OP3 x5 * y5 );
+            dot[1] OP2 ( x5 * y4 OP4 x4 * y5 );
+
+            dot[0] += ( x6 * y6 OP3 x7 * y7 );
+            dot[1] OP2 ( x7 * y6 OP4 x6 * y7 );
+        }
+
+        if ((n & 2) && (n & 1)) /* three trailing elements */
+        {
+            x0 = *x;
+            x1 = *(x + 1);
+            x += inc_x2;
+            x2 = *x;
+            x3 = *(x + 1);
+            x += inc_x2;
+            x4 = *x;
+            x5 = *(x + 1);
+            x += inc_x2;
+
+            y0 = *y;
+            y1 = *(y + 1);
+            y += inc_y2;
+            y2 = *y;
+            y3 = *(y + 1);
+            y += inc_y2;
+            y4 = *y;
+            y5 = *(y + 1);
+            y += inc_y2;
+
+            dot[0] += ( x0 * y0 OP3 x1 * y1 );
+            dot[1] OP2 ( x1 * y0 OP4 x0 * y1 );
+
+            dot[0] += ( x2 * y2 OP3 x3 * y3 );
+            dot[1] OP2 ( x3 * y2 OP4 x2 * y3 );
+
+            dot[0] += ( x4 * y4 OP3 x5 * y5 );
+            dot[1] OP2 ( x5 * y4 OP4 x4 * y5 );
+        }
+        else if (n & 2)
+        {
+            x0 = *x;
+            x1 = *(x + 1);
+            x += inc_x2;
+            x2 = *x;
+            x3 = *(x + 1);
+            x += inc_x2;
+
+            y0 = *y;
+            y1 = *(y + 1);
+            y += inc_y2;
+            y2 = *y;
+            y3 = *(y + 1);
+            y += inc_y2;
+
+            dot[0] += ( x0 * y0 OP3 x1 * y1 );
+            dot[1] OP2 ( x1 * y0 OP4 x0 * y1 );
+
+            dot[0] += ( x2 * y2 OP3 x3 * y3 );
+            dot[1] OP2 ( x3 * y2 OP4 x2 * y3 );
+        }
+        else if (n & 1)
+        {
+            x0 = *x;
+            x1 = *(x + 1);
+            x += inc_x2;
+
+            y0 = *y;
+            y1 = *(y + 1);
+            y += inc_y2;
+
+            dot[0] += ( x0 * y0 OP3 x1 * y1 );
+            dot[1] OP2 ( x1 * y0 OP4 x0 * y1 );
+        }
+    }
+
+    __real__(result) = dot[0];
+    __imag__(result) = dot[1];
+
+    return(result);
+}
diff --git a/kernel/mips/cgemv_n_msa.c b/kernel/mips/cgemv_n_msa.c
new file mode 100644
index 000000000..f1879ba00
--- /dev/null
+++ b/kernel/mips/cgemv_n_msa.c
@@ -0,0 +1,611 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3.
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +#undef OP0 +#undef OP1 +#undef OP2 +#undef OP3 +#undef OP4 + +#if !defined(XCONJ) + #define OP3 -= + #define OP4 += +#else + #define OP3 += + #define OP4 -= +#endif + +#if !defined(CONJ) + #if !defined(XCONJ) + #define OP0 -= + #define OP1 += + #define OP2 += + #else + #define OP0 += + #define OP1 += + #define OP2 -= + #endif +#else + #if !defined(XCONJ) + #define OP0 += + #define OP1 -= + #define OP2 -= + #else + #define OP0 -= + #define OP1 -= + #define OP2 += + #endif +#endif + +#define CGEMV_N_8x4() \ + LD_SP4(pa0 + k, 4, t0, t1, t2, t3); \ + LD_SP4(pa1 + k, 4, t4, t5, t6, t7); \ + LD_SP4(pa2 + k, 4, t8, t9, t10, t11); \ + LD_SP4(pa3 + k, 4, t12, t13, t14, t15); \ + \ + PCKEVOD_W2_SP(t1, t0, src0r, src0i); \ + PCKEVOD_W2_SP(t3, t2, src1r, src1i); \ + PCKEVOD_W2_SP(t5, t4, src2r, src2i); \ + PCKEVOD_W2_SP(t7, t6, src3r, src3i); \ + PCKEVOD_W2_SP(t9, t8, src4r, src4i); \ + PCKEVOD_W2_SP(t11, 
t10, src5r, src5i); \ + PCKEVOD_W2_SP(t13, t12, src6r, src6i); \ + PCKEVOD_W2_SP(t15, t14, src7r, src7i); \ + \ + y0r += tp0r * src0r; \ + y1r += tp0r * src1r; \ + y0r += tp1r * src2r; \ + y1r += tp1r * src3r; \ + y0r += tp2r * src4r; \ + y1r += tp2r * src5r; \ + y0r += tp3r * src6r; \ + y1r += tp3r * src7r; \ + \ + y0r OP0 tp0i * src0i; \ + y1r OP0 tp0i * src1i; \ + y0r OP0 tp1i * src2i; \ + y1r OP0 tp1i * src3i; \ + y0r OP0 tp2i * src4i; \ + y1r OP0 tp2i * src5i; \ + y0r OP0 tp3i * src6i; \ + y1r OP0 tp3i * src7i; \ + \ + y0i OP1 tp0r * src0i; \ + y1i OP1 tp0r * src1i; \ + y0i OP1 tp1r * src2i; \ + y1i OP1 tp1r * src3i; \ + y0i OP1 tp2r * src4i; \ + y1i OP1 tp2r * src5i; \ + y0i OP1 tp3r * src6i; \ + y1i OP1 tp3r * src7i; \ + \ + y0i OP2 tp0i * src0r; \ + y1i OP2 tp0i * src1r; \ + y0i OP2 tp1i * src2r; \ + y1i OP2 tp1i * src3r; \ + y0i OP2 tp2i * src4r; \ + y1i OP2 tp2i * src5r; \ + y0i OP2 tp3i * src6r; \ + y1i OP2 tp3i * src7r; \ + +#define CGEMV_N_4x4() \ + LD_SP2(pa0 + k, 4, t0, t1); \ + LD_SP2(pa1 + k, 4, t4, t5); \ + LD_SP2(pa2 + k, 4, t8, t9); \ + LD_SP2(pa3 + k, 4, t12, t13); \ + \ + PCKEVOD_W2_SP(t1, t0, src0r, src0i); \ + PCKEVOD_W2_SP(t5, t4, src2r, src2i); \ + PCKEVOD_W2_SP(t9, t8, src4r, src4i); \ + PCKEVOD_W2_SP(t13, t12, src6r, src6i); \ + \ + y0r += tp0r * src0r; \ + y0r += tp1r * src2r; \ + y0r += tp2r * src4r; \ + y0r += tp3r * src6r; \ + \ + y0r OP0 tp0i * src0i; \ + y0r OP0 tp1i * src2i; \ + y0r OP0 tp2i * src4i; \ + y0r OP0 tp3i * src6i; \ + \ + y0i OP1 tp0r * src0i; \ + y0i OP1 tp1r * src2i; \ + y0i OP1 tp2r * src4i; \ + y0i OP1 tp3r * src6i; \ + \ + y0i OP2 tp0i * src0r; \ + y0i OP2 tp1i * src2r; \ + y0i OP2 tp2i * src4r; \ + y0i OP2 tp3i * src6r; \ + +#define CGEMV_N_1x4() \ + res0 = y[0 * inc_y2]; \ + res1 = y[0 * inc_y2 + 1]; \ + \ + res0 += temp0_r * pa0[k]; \ + res0 OP0 temp0_i * pa0[k + 1]; \ + res0 += temp1_r * pa1[k]; \ + res0 OP0 temp1_i * pa1[k + 1]; \ + res0 += temp2_r * pa2[k]; \ + res0 OP0 temp2_i * pa2[k + 1]; \ + res0 += 
temp3_r * pa3[k]; \ + res0 OP0 temp3_i * pa3[k + 1]; \ + \ + res1 OP1 temp0_r * pa0[k + 1]; \ + res1 OP2 temp0_i * pa0[k]; \ + res1 OP1 temp1_r * pa1[k + 1]; \ + res1 OP2 temp1_i * pa1[k]; \ + res1 OP1 temp2_r * pa2[k + 1]; \ + res1 OP2 temp2_i * pa2[k]; \ + res1 OP1 temp3_r * pa3[k + 1]; \ + res1 OP2 temp3_i * pa3[k]; \ + \ + y[0 * inc_y2] = res0; \ + y[0 * inc_y2 + 1] = res1; \ + +#define CGEMV_N_8x2() \ + LD_SP4(pa0 + k, 4, t0, t1, t2, t3); \ + LD_SP4(pa1 + k, 4, t4, t5, t6, t7); \ + \ + PCKEVOD_W2_SP(t1, t0, src0r, src0i); \ + PCKEVOD_W2_SP(t3, t2, src1r, src1i); \ + PCKEVOD_W2_SP(t5, t4, src2r, src2i); \ + PCKEVOD_W2_SP(t7, t6, src3r, src3i); \ + \ + y0r += tp0r * src0r; \ + y1r += tp0r * src1r; \ + y0r += tp1r * src2r; \ + y1r += tp1r * src3r; \ + \ + y0r OP0 tp0i * src0i; \ + y1r OP0 tp0i * src1i; \ + y0r OP0 tp1i * src2i; \ + y1r OP0 tp1i * src3i; \ + \ + y0i OP1 tp0r * src0i; \ + y1i OP1 tp0r * src1i; \ + y0i OP1 tp1r * src2i; \ + y1i OP1 tp1r * src3i; \ + \ + y0i OP2 tp0i * src0r; \ + y1i OP2 tp0i * src1r; \ + y0i OP2 tp1i * src2r; \ + y1i OP2 tp1i * src3r; \ + +#define CGEMV_N_4x2() \ + LD_SP2(pa0 + k, 4, t0, t1); \ + LD_SP2(pa1 + k, 4, t4, t5); \ + \ + PCKEVOD_W2_SP(t1, t0, src0r, src0i); \ + PCKEVOD_W2_SP(t5, t4, src2r, src2i); \ + \ + y0r += tp0r * src0r; \ + y0r += tp1r * src2r; \ + \ + y0r OP0 tp0i * src0i; \ + y0r OP0 tp1i * src2i; \ + \ + y0i OP1 tp0r * src0i; \ + y0i OP1 tp1r * src2i; \ + \ + y0i OP2 tp0i * src0r; \ + y0i OP2 tp1i * src2r; \ + +#define CGEMV_N_1x2() \ + res0 = y[0 * inc_y2]; \ + res1 = y[0 * inc_y2 + 1]; \ + \ + res0 += temp0_r * pa0[k]; \ + res0 OP0 temp0_i * pa0[k + 1]; \ + res0 += temp1_r * pa1[k]; \ + res0 OP0 temp1_i * pa1[k + 1]; \ + \ + res1 OP1 temp0_r * pa0[k + 1]; \ + res1 OP2 temp0_i * pa0[k]; \ + res1 OP1 temp1_r * pa1[k + 1]; \ + res1 OP2 temp1_i * pa1[k]; \ + \ + y[0 * inc_y2] = res0; \ + y[0 * inc_y2 + 1] = res1; \ + +#define CGEMV_N_1x1() \ + res0 = y[0 * inc_y2]; \ + res1 = y[0 * inc_y2 + 1]; \ + \ + res0 += 
temp_r * pa0[k]; \ + res0 OP0 temp_i * pa0[k + 1]; \ + \ + res1 OP1 temp_r * pa0[k + 1]; \ + res1 OP2 temp_i * pa0[k]; \ + \ + y[0 * inc_y2] = res0; \ + y[0 * inc_y2 + 1] = res1; \ + +#define CLOAD_X4_SCALE_VECTOR() \ + LD_SP2(x, 4, x0, x1); \ + \ + PCKEVOD_W2_SP(x1, x0, x0r, x0i); \ + \ + tp4r = alphar * x0r; \ + tp4r OP3 alphai * x0i; \ + tp4i = alphar * x0i; \ + tp4i OP4 alphai * x0r; \ + \ + SPLATI_W4_SP(tp4r, tp0r, tp1r, tp2r, tp3r); \ + SPLATI_W4_SP(tp4i, tp0i, tp1i, tp2i, tp3i); \ + +#define CLOAD_X4_SCALE_GP() \ + x0r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 0 * inc_x2))); \ + x0r = (v4f32) __msa_insert_w((v4i32) x0r, 1, *((int *) (x + 1 * inc_x2))); \ + x0r = (v4f32) __msa_insert_w((v4i32) x0r, 2, *((int *) (x + 2 * inc_x2))); \ + x0r = (v4f32) __msa_insert_w((v4i32) x0r, 3, *((int *) (x + 3 * inc_x2))); \ + x0i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 0 * inc_x2 + 1))); \ + x0i = (v4f32) __msa_insert_w((v4i32) x0i, 1, *((int *) (x + 1 * inc_x2 + 1))); \ + x0i = (v4f32) __msa_insert_w((v4i32) x0i, 2, *((int *) (x + 2 * inc_x2 + 1))); \ + x0i = (v4f32) __msa_insert_w((v4i32) x0i, 3, *((int *) (x + 3 * inc_x2 + 1))); \ + \ + tp4r = alphar * x0r; \ + tp4r OP3 alphai * x0i; \ + tp4i = alphar * x0i; \ + tp4i OP4 alphai * x0r; \ + \ + SPLATI_W4_SP(tp4r, tp0r, tp1r, tp2r, tp3r); \ + SPLATI_W4_SP(tp4i, tp0i, tp1i, tp2i, tp3i); \ + +#define CLOAD_X2_SCALE_GP() \ + temp0_r = alpha_r * x[0 * inc_x2]; \ + temp0_r OP3 alpha_i * x[0 * inc_x2 + 1]; \ + temp0_i = alpha_r * x[0 * inc_x2 + 1]; \ + temp0_i OP4 alpha_i * x[0 * inc_x2]; \ + \ + temp1_r = alpha_r * x[1 * inc_x2]; \ + temp1_r OP3 alpha_i * x[1 * inc_x2 + 1]; \ + temp1_i = alpha_r * x[1 * inc_x2 + 1]; \ + temp1_i OP4 alpha_i * x[1 * inc_x2]; \ + \ + tp0r = (v4f32) COPY_FLOAT_TO_VECTOR(temp0_r); \ + tp0i = (v4f32) COPY_FLOAT_TO_VECTOR(temp0_i); \ + tp1r = (v4f32) COPY_FLOAT_TO_VECTOR(temp1_r); \ + tp1i = (v4f32) COPY_FLOAT_TO_VECTOR(temp1_i); \ + +#define CLOAD_X1_SCALE_GP() \ + 
temp_r = alpha_r * x[0 * inc_x2]; \ + temp_r OP3 alpha_i * x[0 * inc_x2 + 1]; \ + temp_i = alpha_r * x[0 * inc_x2 + 1]; \ + temp_i OP4 alpha_i * x[0 * inc_x2]; \ + +#define CLOAD_Y8_VECTOR() \ + LD_SP4(y, 4, y0, y1, y2, y3); \ + PCKEVOD_W2_SP(y1, y0, y0r, y0i); \ + PCKEVOD_W2_SP(y3, y2, y1r, y1i); \ + +#define CLOAD_Y4_VECTOR() \ + LD_SP2(y, 4, y0, y1); \ + PCKEVOD_W2_SP(y1, y0, y0r, y0i); \ + +#define CSTORE_Y8_VECTOR() \ + ILVRL_W2_SP(y0i, y0r, y0, y1); \ + ILVRL_W2_SP(y1i, y1r, y2, y3); \ + ST_SP4(y0, y1, y2, y3, y, 4); \ + +#define CSTORE_Y4_VECTOR() \ + ILVRL_W2_SP(y0i, y0r, y0, y1); \ + ST_SP2(y0, y1, y, 4); \ + +#define CLOAD_Y8_GP() \ + y0r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 0 * inc_y2))); \ + y0r = (v4f32) __msa_insert_w((v4i32) y0r, 1, *((int *)(y + 1 * inc_y2))); \ + y0r = (v4f32) __msa_insert_w((v4i32) y0r, 2, *((int *)(y + 2 * inc_y2))); \ + y0r = (v4f32) __msa_insert_w((v4i32) y0r, 3, *((int *)(y + 3 * inc_y2))); \ + y1r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 4 * inc_y2))); \ + y1r = (v4f32) __msa_insert_w((v4i32) y1r, 1, *((int *)(y + 5 * inc_y2))); \ + y1r = (v4f32) __msa_insert_w((v4i32) y1r, 2, *((int *)(y + 6 * inc_y2))); \ + y1r = (v4f32) __msa_insert_w((v4i32) y1r, 3, *((int *)(y + 7 * inc_y2))); \ + y0i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 0 * inc_y2 + 1))); \ + y0i = (v4f32) __msa_insert_w((v4i32) y0i, 1, *((int *)(y + 1 * inc_y2 + 1))); \ + y0i = (v4f32) __msa_insert_w((v4i32) y0i, 2, *((int *)(y + 2 * inc_y2 + 1))); \ + y0i = (v4f32) __msa_insert_w((v4i32) y0i, 3, *((int *)(y + 3 * inc_y2 + 1))); \ + y1i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 4 * inc_y2 + 1))); \ + y1i = (v4f32) __msa_insert_w((v4i32) y1i, 1, *((int *)(y + 5 * inc_y2 + 1))); \ + y1i = (v4f32) __msa_insert_w((v4i32) y1i, 2, *((int *)(y + 6 * inc_y2 + 1))); \ + y1i = (v4f32) __msa_insert_w((v4i32) y1i, 3, *((int *)(y + 7 * inc_y2 + 1))); \ + +#define CLOAD_Y4_GP() \ + y0r = (v4f32) 
__msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 0 * inc_y2))); \ + y0r = (v4f32) __msa_insert_w((v4i32) y0r, 1, *((int *)(y + 1 * inc_y2))); \ + y0r = (v4f32) __msa_insert_w((v4i32) y0r, 2, *((int *)(y + 2 * inc_y2))); \ + y0r = (v4f32) __msa_insert_w((v4i32) y0r, 3, *((int *)(y + 3 * inc_y2))); \ + y0i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 0 * inc_y2 + 1))); \ + y0i = (v4f32) __msa_insert_w((v4i32) y0i, 1, *((int *)(y + 1 * inc_y2 + 1))); \ + y0i = (v4f32) __msa_insert_w((v4i32) y0i, 2, *((int *)(y + 2 * inc_y2 + 1))); \ + y0i = (v4f32) __msa_insert_w((v4i32) y0i, 3, *((int *)(y + 3 * inc_y2 + 1))); \ + +#define CSTORE_Y8_GP() \ + *((int *)(y + 0 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 0); \ + *((int *)(y + 1 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 1); \ + *((int *)(y + 2 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 2); \ + *((int *)(y + 3 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 3); \ + *((int *)(y + 4 * inc_y2)) = __msa_copy_s_w((v4i32) y1r, 0); \ + *((int *)(y + 5 * inc_y2)) = __msa_copy_s_w((v4i32) y1r, 1); \ + *((int *)(y + 6 * inc_y2)) = __msa_copy_s_w((v4i32) y1r, 2); \ + *((int *)(y + 7 * inc_y2)) = __msa_copy_s_w((v4i32) y1r, 3); \ + *((int *)(y + 0 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 0); \ + *((int *)(y + 1 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 1); \ + *((int *)(y + 2 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 2); \ + *((int *)(y + 3 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 3); \ + *((int *)(y + 4 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y1i, 0); \ + *((int *)(y + 5 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y1i, 1); \ + *((int *)(y + 6 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y1i, 2); \ + *((int *)(y + 7 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y1i, 3); \ + +#define CSTORE_Y4_GP() \ + *((int *)(y + 0 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 0); \ + *((int *)(y + 1 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 1); \ + *((int *)(y + 2 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 2); \ + *((int *)(y + 3 * inc_y2)) = 
__msa_copy_s_w((v4i32) y0r, 3); \ + *((int *)(y + 0 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 0); \ + *((int *)(y + 1 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 1); \ + *((int *)(y + 2 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 2); \ + *((int *)(y + 3 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 3); \ + +#define CGEMV_N_MSA() \ + for (j = (n >> 2); j--;) \ + { \ + CLOAD_X4_SCALE(); \ + \ + k = 0; \ + y = y_org; \ + \ + for (i = (m >> 3); i--;) \ + { \ + CLOAD_Y8() \ + CGEMV_N_8x4(); \ + CSTORE_Y8(); \ + \ + k += 2 * 8; \ + y += inc_y2 * 8; \ + } \ + \ + if (m & 4) \ + { \ + CLOAD_Y4(); \ + CGEMV_N_4x4(); \ + CSTORE_Y4(); \ + \ + k += 2 * 4; \ + y += inc_y2 * 4; \ + } \ + \ + if (m & 3) \ + { \ + temp0_r = tp4r[0]; \ + temp1_r = tp4r[1]; \ + temp2_r = tp4r[2]; \ + temp3_r = tp4r[3]; \ + \ + temp0_i = tp4i[0]; \ + temp1_i = tp4i[1]; \ + temp2_i = tp4i[2]; \ + temp3_i = tp4i[3]; \ + \ + for (i = (m & 3); i--;) \ + { \ + CGEMV_N_1x4(); \ + \ + k += 2; \ + y += inc_y2; \ + } \ + } \ + \ + pa0 += 4 * lda2; \ + pa1 += 4 * lda2; \ + pa2 += 4 * lda2; \ + pa3 += 4 * lda2; \ + \ + x += 4 * inc_x2; \ + } \ + \ + if (n & 2) \ + { \ + CLOAD_X2_SCALE(); \ + \ + k = 0; \ + y = y_org; \ + \ + for (i = (m >> 3); i--;) \ + { \ + CLOAD_Y8(); \ + CGEMV_N_8x2(); \ + CSTORE_Y8(); \ + \ + k += 2 * 8; \ + y += inc_y2 * 8; \ + } \ + \ + if (m & 4) \ + { \ + CLOAD_Y4(); \ + CGEMV_N_4x2(); \ + CSTORE_Y4(); \ + \ + k += 2 * 4; \ + y += inc_y2 * 4; \ + } \ + \ + for (i = (m & 3); i--;) \ + { \ + CGEMV_N_1x2(); \ + \ + k += 2; \ + y += inc_y2; \ + } \ + \ + pa0 += 2 * lda2; \ + pa1 += 2 * lda2; \ + \ + x += 2 * inc_x2; \ + } \ + \ + if (n & 1) \ + { \ + CLOAD_X1_SCALE(); \ + \ + k = 0; \ + y = y_org; \ + \ + for (i = m; i--;) \ + { \ + CGEMV_N_1x1(); \ + \ + k += 2; \ + y += inc_y2; \ + } \ + \ + pa0 += lda2; \ + x += inc_x2; \ + } \ + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, + FLOAT *A, BLASLONG lda2, FLOAT *x, BLASLONG inc_x2, FLOAT *y, + 
BLASLONG inc_y2, FLOAT *buffer) +{ + BLASLONG i, j, k; + FLOAT *y_org = y; + FLOAT *pa0, *pa1, *pa2, *pa3; + FLOAT temp_r, temp_i, res0, res1, temp0_r; + FLOAT temp0_i, temp1_r, temp1_i, temp2_r, temp2_i, temp3_r, temp3_i; + v4f32 alphar, alphai; + v4f32 x0, x1, y0, y1, y2, y3, x0r, x0i, y0r, y1r, y0i, y1i; + v4f32 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15; + v4f32 src0r, src1r, src2r, src3r, src4r, src5r, src6r, src7r; + v4f32 src0i, src1i, src2i, src3i, src4i, src5i, src6i, src7i; + v4f32 tp0r, tp1r, tp2r, tp3r, tp4r, tp0i, tp1i, tp2i, tp3i, tp4i; + + lda2 = 2 * lda2; + inc_x2 = 2 * inc_x2; + inc_y2 = 2 * inc_y2; + + pa0 = A; + pa1 = A + lda2; + pa2 = A + 2 * lda2; + pa3 = A + 3 * lda2; + + alphar = COPY_FLOAT_TO_VECTOR(alpha_r); + alphai = COPY_FLOAT_TO_VECTOR(alpha_i); + + if ((2 == inc_x2) && (2 == inc_y2)) + { + #define CLOAD_X4_SCALE CLOAD_X4_SCALE_VECTOR + #define CLOAD_X2_SCALE CLOAD_X2_SCALE_GP + #define CLOAD_X1_SCALE CLOAD_X1_SCALE_GP + #define CLOAD_Y8 CLOAD_Y8_VECTOR + #define CLOAD_Y4 CLOAD_Y4_VECTOR + #define CSTORE_Y8 CSTORE_Y8_VECTOR + #define CSTORE_Y4 CSTORE_Y4_VECTOR + + CGEMV_N_MSA(); + + #undef CLOAD_X4_SCALE + #undef CLOAD_X2_SCALE + #undef CLOAD_X1_SCALE + #undef CLOAD_Y8 + #undef CLOAD_Y4 + #undef CSTORE_Y8 + #undef CSTORE_Y4 + } + else if (2 == inc_x2) + { + #define CLOAD_X4_SCALE CLOAD_X4_SCALE_VECTOR + #define CLOAD_X2_SCALE CLOAD_X2_SCALE_GP + #define CLOAD_X1_SCALE CLOAD_X1_SCALE_GP + #define CLOAD_Y8 CLOAD_Y8_GP + #define CLOAD_Y4 CLOAD_Y4_GP + #define CSTORE_Y8 CSTORE_Y8_GP + #define CSTORE_Y4 CSTORE_Y4_GP + + CGEMV_N_MSA(); + + #undef CLOAD_X4_SCALE + #undef CLOAD_X2_SCALE + #undef CLOAD_X1_SCALE + #undef CLOAD_Y8 + #undef CLOAD_Y4 + #undef CSTORE_Y8 + #undef CSTORE_Y4 + } + else if (2 == inc_y2) + { + #define CLOAD_X4_SCALE CLOAD_X4_SCALE_GP + #define CLOAD_X2_SCALE CLOAD_X2_SCALE_GP + #define CLOAD_X1_SCALE CLOAD_X1_SCALE_GP + #define CLOAD_Y8 CLOAD_Y8_VECTOR + #define CLOAD_Y4 CLOAD_Y4_VECTOR + #define 
CSTORE_Y8 CSTORE_Y8_VECTOR + #define CSTORE_Y4 CSTORE_Y4_VECTOR + + CGEMV_N_MSA(); + + #undef CLOAD_X4_SCALE + #undef CLOAD_X2_SCALE + #undef CLOAD_X1_SCALE + #undef CLOAD_Y8 + #undef CLOAD_Y4 + #undef CSTORE_Y8 + #undef CSTORE_Y4 + } + else + { + #define CLOAD_X4_SCALE CLOAD_X4_SCALE_GP + #define CLOAD_X2_SCALE CLOAD_X2_SCALE_GP + #define CLOAD_X1_SCALE CLOAD_X1_SCALE_GP + #define CLOAD_Y8 CLOAD_Y8_GP + #define CLOAD_Y4 CLOAD_Y4_GP + #define CSTORE_Y8 CSTORE_Y8_GP + #define CSTORE_Y4 CSTORE_Y4_GP + + CGEMV_N_MSA(); + + #undef CLOAD_X4_SCALE + #undef CLOAD_X2_SCALE + #undef CLOAD_X1_SCALE + #undef CLOAD_Y8 + #undef CLOAD_Y4 + #undef CSTORE_Y8 + #undef CSTORE_Y4 + } + return(0); +} + +#undef OP0 +#undef OP1 +#undef OP2 +#undef OP3 +#undef OP4 diff --git a/kernel/mips/cgemv_t_msa.c b/kernel/mips/cgemv_t_msa.c new file mode 100644 index 000000000..b9620bfb9 --- /dev/null +++ b/kernel/mips/cgemv_t_msa.c @@ -0,0 +1,583 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +#undef OP0 +#undef OP1 +#undef OP2 + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + #define OP0 -= + #define OP1 += + #define OP2 += +#else + #define OP0 += + #define OP1 += + #define OP2 -= +#endif + +#define CGEMV_T_8x4() \ + LD_SP4(pa0 + k, 4, t0, t1, t2, t3); \ + LD_SP4(pa1 + k, 4, t4, t5, t6, t7); \ + LD_SP4(pa2 + k, 4, t8, t9, t10, t11); \ + LD_SP4(pa3 + k, 4, t12, t13, t14, t15); \ + \ + PCKEVOD_W2_SP(t1, t0, src0r, src0i); \ + PCKEVOD_W2_SP(t3, t2, src1r, src1i); \ + PCKEVOD_W2_SP(t5, t4, src2r, src2i); \ + PCKEVOD_W2_SP(t7, t6, src3r, src3i); \ + PCKEVOD_W2_SP(t9, t8, src4r, src4i); \ + PCKEVOD_W2_SP(t11, t10, src5r, src5i); \ + PCKEVOD_W2_SP(t13, t12, src6r, src6i); \ + PCKEVOD_W2_SP(t15, t14, src7r, src7i); \ + \ + tp0r += src0r * x0r; \ + tp0r += src1r * x1r; \ + tp0r OP0 src0i * x0i; \ + tp0r OP0 src1i * x1i; \ + \ + tp1r += src2r * x0r; \ + tp1r += src3r * x1r; \ + tp1r OP0 src2i * x0i; \ + tp1r OP0 src3i * x1i; \ + \ + tp2r += src4r * x0r; \ + tp2r += src5r * x1r; \ + tp2r OP0 src4i * x0i; \ + tp2r OP0 src5i * x1i; \ + \ + tp3r += 
src6r * x0r; \ + tp3r += src7r * x1r; \ + tp3r OP0 src6i * x0i; \ + tp3r OP0 src7i * x1i; \ + \ + tp0i OP1 src0r * x0i; \ + tp0i OP1 src1r * x1i; \ + tp0i OP2 src0i * x0r; \ + tp0i OP2 src1i * x1r; \ + \ + tp1i OP1 src2r * x0i; \ + tp1i OP1 src3r * x1i; \ + tp1i OP2 src2i * x0r; \ + tp1i OP2 src3i * x1r; \ + \ + tp2i OP1 src4r * x0i; \ + tp2i OP1 src5r * x1i; \ + tp2i OP2 src4i * x0r; \ + tp2i OP2 src5i * x1r; \ + \ + tp3i OP1 src6r * x0i; \ + tp3i OP1 src7r * x1i; \ + tp3i OP2 src6i * x0r; \ + tp3i OP2 src7i * x1r; \ + +#define CGEMV_T_8x2() \ + LD_SP4(pa0 + k, 4, t0, t1, t2, t3); \ + LD_SP4(pa1 + k, 4, t4, t5, t6, t7); \ + \ + PCKEVOD_W2_SP(t1, t0, src0r, src0i); \ + PCKEVOD_W2_SP(t3, t2, src1r, src1i); \ + PCKEVOD_W2_SP(t5, t4, src2r, src2i); \ + PCKEVOD_W2_SP(t7, t6, src3r, src3i); \ + \ + tp0r += src0r * x0r; \ + tp0r += src1r * x1r; \ + tp0r OP0 src0i * x0i; \ + tp0r OP0 src1i * x1i; \ + \ + tp1r += src2r * x0r; \ + tp1r += src3r * x1r; \ + tp1r OP0 src2i * x0i; \ + tp1r OP0 src3i * x1i; \ + \ + tp0i OP1 src0r * x0i; \ + tp0i OP1 src1r * x1i; \ + tp0i OP2 src0i * x0r; \ + tp0i OP2 src1i * x1r; \ + \ + tp1i OP1 src2r * x0i; \ + tp1i OP1 src3r * x1i; \ + tp1i OP2 src2i * x0r; \ + tp1i OP2 src3i * x1r; \ + +#define CGEMV_T_8x1() \ + LD_SP4(pa0 + k, 4, t0, t1, t2, t3); \ + \ + PCKEVOD_W2_SP(t1, t0, src0r, src0i); \ + PCKEVOD_W2_SP(t3, t2, src1r, src1i); \ + \ + tp0r += src0r * x0r; \ + tp0r += src1r * x1r; \ + tp0r OP0 src0i * x0i; \ + tp0r OP0 src1i * x1i; \ + \ + tp0i OP1 src0r * x0i; \ + tp0i OP1 src1r * x1i; \ + tp0i OP2 src0i * x0r; \ + tp0i OP2 src1i * x1r; \ + +#define CGEMV_T_4x4() \ + LD_SP2(pa0 + k, 4, t0, t1); \ + LD_SP2(pa1 + k, 4, t4, t5); \ + LD_SP2(pa2 + k, 4, t8, t9); \ + LD_SP2(pa3 + k, 4, t12, t13); \ + \ + PCKEVOD_W2_SP(t1, t0, src0r, src0i); \ + PCKEVOD_W2_SP(t5, t4, src2r, src2i); \ + PCKEVOD_W2_SP(t9, t8, src4r, src4i); \ + PCKEVOD_W2_SP(t13, t12, src6r, src6i); \ + \ + tp0r += src0r * x0r; \ + tp0r OP0 src0i * x0i; \ + \ + tp1r += src2r * 
x0r; \ + tp1r OP0 src2i * x0i; \ + \ + tp2r += src4r * x0r; \ + tp2r OP0 src4i * x0i; \ + \ + tp3r += src6r * x0r; \ + tp3r OP0 src6i * x0i; \ + \ + tp0i OP1 src0r * x0i; \ + tp0i OP2 src0i * x0r; \ + \ + tp1i OP1 src2r * x0i; \ + tp1i OP2 src2i * x0r; \ + \ + tp2i OP1 src4r * x0i; \ + tp2i OP2 src4i * x0r; \ + \ + tp3i OP1 src6r * x0i; \ + tp3i OP2 src6i * x0r; \ + +#define CGEMV_T_4x2() \ + LD_SP2(pa0 + k, 4, t0, t1); \ + LD_SP2(pa1 + k, 4, t4, t5); \ + \ + PCKEVOD_W2_SP(t1, t0, src0r, src0i); \ + PCKEVOD_W2_SP(t5, t4, src2r, src2i); \ + \ + tp0r += src0r * x0r; \ + tp0r OP0 src0i * x0i; \ + \ + tp1r += src2r * x0r; \ + tp1r OP0 src2i * x0i; \ + \ + tp0i OP1 src0r * x0i; \ + tp0i OP2 src0i * x0r; \ + \ + tp1i OP1 src2r * x0i; \ + tp1i OP2 src2i * x0r; \ + +#define CGEMV_T_4x1() \ + LD_SP2(pa0 + k, 4, t0, t1); \ + \ + PCKEVOD_W2_SP(t1, t0, src0r, src0i); \ + \ + tp0r += src0r * x0r; \ + tp0r OP0 src0i * x0i; \ + \ + tp0i OP1 src0r * x0i; \ + tp0i OP2 src0i * x0r; \ + +#define CGEMV_T_1x4() \ + temp0r += pa0[k + 0] * x[0 * inc_x2]; \ + temp0r OP0 pa0[k + 1] * x[0 * inc_x2 + 1]; \ + temp1r += pa1[k + 0] * x[0 * inc_x2]; \ + temp1r OP0 pa1[k + 1] * x[0 * inc_x2 + 1]; \ + temp2r += pa2[k + 0] * x[0 * inc_x2]; \ + temp2r OP0 pa2[k + 1] * x[0 * inc_x2 + 1]; \ + temp3r += pa3[k + 0] * x[0 * inc_x2]; \ + temp3r OP0 pa3[k + 1] * x[0 * inc_x2 + 1]; \ + \ + temp0i OP1 pa0[k + 0] * x[0 * inc_x2 + 1]; \ + temp0i OP2 pa0[k + 1] * x[0 * inc_x2]; \ + temp1i OP1 pa1[k + 0] * x[0 * inc_x2 + 1]; \ + temp1i OP2 pa1[k + 1] * x[0 * inc_x2]; \ + temp2i OP1 pa2[k + 0] * x[0 * inc_x2 + 1]; \ + temp2i OP2 pa2[k + 1] * x[0 * inc_x2]; \ + temp3i OP1 pa3[k + 0] * x[0 * inc_x2 + 1]; \ + temp3i OP2 pa3[k + 1] * x[0 * inc_x2]; \ + +#define CGEMV_T_1x2() \ + temp0r += pa0[k + 0] * x[0 * inc_x2]; \ + temp0r OP0 pa0[k + 1] * x[0 * inc_x2 + 1]; \ + temp1r += pa1[k + 0] * x[0 * inc_x2]; \ + temp1r OP0 pa1[k + 1] * x[0 * inc_x2 + 1]; \ + \ + temp0i OP1 pa0[k + 0] * x[0 * inc_x2 + 1]; \ + temp0i OP2 
pa0[k + 1] * x[0 * inc_x2]; \ + temp1i OP1 pa1[k + 0] * x[0 * inc_x2 + 1]; \ + temp1i OP2 pa1[k + 1] * x[0 * inc_x2]; \ + +#define CGEMV_T_1x1() \ + temp0r += pa0[k + 0] * x[0 * inc_x2]; \ + temp0r OP0 pa0[k + 1] * x[0 * inc_x2 + 1]; \ + \ + temp0i OP1 pa0[k + 0] * x[0 * inc_x2 + 1]; \ + temp0i OP2 pa0[k + 1] * x[0 * inc_x2]; \ + +#define CSCALE_STORE_Y4_GP() \ + res0r = y[0 * inc_y2]; \ + res1r = y[1 * inc_y2]; \ + res2r = y[2 * inc_y2]; \ + res3r = y[3 * inc_y2]; \ + \ + res0i = y[0 * inc_y2 + 1]; \ + res1i = y[1 * inc_y2 + 1]; \ + res2i = y[2 * inc_y2 + 1]; \ + res3i = y[3 * inc_y2 + 1]; \ + \ + res0r += alphar * temp0r; \ + res0r OP0 alphai * temp0i; \ + res1r += alphar * temp1r; \ + res1r OP0 alphai * temp1i; \ + res2r += alphar * temp2r; \ + res2r OP0 alphai * temp2i; \ + res3r += alphar * temp3r; \ + res3r OP0 alphai * temp3i; \ + \ + res0i OP1 alphar * temp0i; \ + res0i OP2 alphai * temp0r; \ + res1i OP1 alphar * temp1i; \ + res1i OP2 alphai * temp1r; \ + res2i OP1 alphar * temp2i; \ + res2i OP2 alphai * temp2r; \ + res3i OP1 alphar * temp3i; \ + res3i OP2 alphai * temp3r; \ + \ + y[0 * inc_y2] = res0r; \ + y[1 * inc_y2] = res1r; \ + y[2 * inc_y2] = res2r; \ + y[3 * inc_y2] = res3r; \ + \ + y[0 * inc_y2 + 1] = res0i; \ + y[1 * inc_y2 + 1] = res1i; \ + y[2 * inc_y2 + 1] = res2i; \ + y[3 * inc_y2 + 1] = res3i; \ + +#define CSCALE_STORE_Y2_GP() \ + res0r = y[0 * inc_y2]; \ + res1r = y[1 * inc_y2]; \ + \ + res0i = y[0 * inc_y2 + 1]; \ + res1i = y[1 * inc_y2 + 1]; \ + \ + res0r += alphar * temp0r; \ + res0r OP0 alphai * temp0i; \ + res1r += alphar * temp1r; \ + res1r OP0 alphai * temp1i; \ + \ + res0i OP1 alphar * temp0i; \ + res0i OP2 alphai * temp0r; \ + res1i OP1 alphar * temp1i; \ + res1i OP2 alphai * temp1r; \ + \ + y[0 * inc_y2] = res0r; \ + y[1 * inc_y2] = res1r; \ + \ + y[0 * inc_y2 + 1] = res0i; \ + y[1 * inc_y2 + 1] = res1i; \ + + +#define CSCALE_STORE_Y1_GP() \ + res0r = y[0 * inc_y2]; \ + res0i = y[0 * inc_y2 + 1]; \ + \ + res0r += alphar * temp0r; \ 
+ res0r OP0 alphai * temp0i; \ + \ + res0i OP1 alphar * temp0i; \ + res0i OP2 alphai * temp0r; \ + \ + y[0 * inc_y2] = res0r; \ + y[0 * inc_y2 + 1] = res0i; \ + +#define CLOAD_X8_VECTOR() \ + LD_SP4(x, 4, x0, x1, x2, x3); \ + PCKEVOD_W2_SP(x1, x0, x0r, x0i); \ + PCKEVOD_W2_SP(x3, x2, x1r, x1i); \ + +#define CLOAD_X4_VECTOR() \ + LD_SP2(x, 4, x0, x1); \ + PCKEVOD_W2_SP(x1, x0, x0r, x0i); \ + +#define CLOAD_X8_GP() \ + x0r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 0 * inc_x2))); \ + x0r = (v4f32) __msa_insert_w((v4i32) x0r, 1, *((int *) (x + 1 * inc_x2))); \ + x0r = (v4f32) __msa_insert_w((v4i32) x0r, 2, *((int *) (x + 2 * inc_x2))); \ + x0r = (v4f32) __msa_insert_w((v4i32) x0r, 3, *((int *) (x + 3 * inc_x2))); \ + x1r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 4 * inc_x2))); \ + x1r = (v4f32) __msa_insert_w((v4i32) x1r, 1, *((int *) (x + 5 * inc_x2))); \ + x1r = (v4f32) __msa_insert_w((v4i32) x1r, 2, *((int *) (x + 6 * inc_x2))); \ + x1r = (v4f32) __msa_insert_w((v4i32) x1r, 3, *((int *) (x + 7 * inc_x2))); \ + x0i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 0 * inc_x2 + 1))); \ + x0i = (v4f32) __msa_insert_w((v4i32) x0i, 1, *((int *) (x + 1 * inc_x2 + 1))); \ + x0i = (v4f32) __msa_insert_w((v4i32) x0i, 2, *((int *) (x + 2 * inc_x2 + 1))); \ + x0i = (v4f32) __msa_insert_w((v4i32) x0i, 3, *((int *) (x + 3 * inc_x2 + 1))); \ + x1i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 4 * inc_x2 + 1))); \ + x1i = (v4f32) __msa_insert_w((v4i32) x1i, 1, *((int *) (x + 5 * inc_x2 + 1))); \ + x1i = (v4f32) __msa_insert_w((v4i32) x1i, 2, *((int *) (x + 6 * inc_x2 + 1))); \ + x1i = (v4f32) __msa_insert_w((v4i32) x1i, 3, *((int *) (x + 7 * inc_x2 + 1))); \ + +#define CLOAD_X4_GP() \ + x0r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 0 * inc_x2))); \ + x0r = (v4f32) __msa_insert_w((v4i32) x0r, 1, *((int *) (x + 1 * inc_x2))); \ + x0r = (v4f32) __msa_insert_w((v4i32) x0r, 2, *((int *) (x + 2 * inc_x2))); \ + x0r = (v4f32) 
__msa_insert_w((v4i32) x0r, 3, *((int *) (x + 3 * inc_x2))); \ + x0i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 0 * inc_x2 + 1))); \ + x0i = (v4f32) __msa_insert_w((v4i32) x0i, 1, *((int *) (x + 1 * inc_x2 + 1))); \ + x0i = (v4f32) __msa_insert_w((v4i32) x0i, 2, *((int *) (x + 2 * inc_x2 + 1))); \ + x0i = (v4f32) __msa_insert_w((v4i32) x0i, 3, *((int *) (x + 3 * inc_x2 + 1))); \ + +#define CGEMV_T_MSA() \ + for (j = (n >> 2); j--;) \ + { \ + tp0r = tp1r = tp2r = tp3r = zero; \ + tp0i = tp1i = tp2i = tp3i = zero; \ + \ + k = 0; \ + x = srcx_org; \ + \ + for (i = (m >> 3); i--;) \ + { \ + CLOAD_X8() \ + CGEMV_T_8x4(); \ + \ + k += 2 * 8; \ + x += inc_x2 * 8; \ + } \ + \ + if (m & 4) \ + { \ + CLOAD_X4(); \ + \ + CGEMV_T_4x4(); \ + \ + k += 2 * 4; \ + x += inc_x2 * 4; \ + } \ + \ + TRANSPOSE4x4_SP_SP(tp0r, tp1r, tp2r, tp3r, \ + tp0r, tp1r, tp2r, tp3r); \ + TRANSPOSE4x4_SP_SP(tp0i, tp1i, tp2i, tp3i, \ + tp0i, tp1i, tp2i, tp3i); \ + \ + tp0r += tp1r; \ + tp0r += tp2r; \ + tp0r += tp3r; \ + tp0i += tp1i; \ + tp0i += tp2i; \ + tp0i += tp3i; \ + \ + temp0r = tp0r[0]; \ + temp1r = tp0r[1]; \ + temp2r = tp0r[2]; \ + temp3r = tp0r[3]; \ + temp0i = tp0i[0]; \ + temp1i = tp0i[1]; \ + temp2i = tp0i[2]; \ + temp3i = tp0i[3]; \ + \ + for (i = (m & 3); i--;) \ + { \ + CGEMV_T_1x4(); \ + \ + k += 2; \ + x += inc_x2; \ + } \ + \ + CSCALE_STORE_Y4_GP(); \ + \ + pa0 += 4 * lda2; \ + pa1 += 4 * lda2; \ + pa2 += 4 * lda2; \ + pa3 += 4 * lda2; \ + y += 4 * inc_y2; \ + } \ + \ + if (n & 2) \ + { \ + tp0r = tp1r = zero; \ + tp0i = tp1i = zero; \ + \ + k = 0; \ + x = srcx_org; \ + \ + for (i = (m >> 3); i--;) \ + { \ + CLOAD_X8(); \ + \ + CGEMV_T_8x2(); \ + \ + k += 2 * 8; \ + x += inc_x2 * 8; \ + } \ + \ + if (m & 4) \ + { \ + CLOAD_X4(); \ + \ + CGEMV_T_4x2(); \ + \ + k += 2 * 4; \ + x += inc_x2 * 4; \ + } \ + \ + TRANSPOSE4x4_SP_SP(tp0r, tp1r, tp0i, tp1i, \ + tp0r, tp1r, tp0i, tp1i); \ + \ + tp0r += tp1r; \ + tp0r += tp0i; \ + tp0r += tp1i; \ + \ + temp0r = tp0r[0]; \ + 
temp1r = tp0r[1]; \ + temp0i = tp0r[2]; \ + temp1i = tp0r[3]; \ + \ + for (i = (m & 3); i--;) \ + { \ + CGEMV_T_1x2(); \ + \ + k += 2; \ + x += inc_x2; \ + } \ + \ + CSCALE_STORE_Y2_GP(); \ + \ + pa0 += 2 * lda2; \ + pa1 += 2 * lda2; \ + y += 2 * inc_y2; \ + } \ + \ + if (n & 1) \ + { \ + tp0r = zero; \ + tp0i = zero; \ + \ + k = 0; \ + x = srcx_org; \ + \ + for (i = (m >> 3); i--;) \ + { \ + CLOAD_X8(); \ + \ + CGEMV_T_8x1(); \ + \ + k += 2 * 8; \ + x += inc_x2 * 8; \ + } \ + \ + if (m & 4) \ + { \ + CLOAD_X4(); \ + \ + CGEMV_T_4x1(); \ + \ + k += 2 * 4; \ + x += inc_x2 * 4; \ + } \ + \ + ILVRL_W2_SP(tp0i, tp0r, t0, t1); \ + \ + t0 += t1; \ + \ + temp0r = t0[0] + t0[2]; \ + temp0i = t0[1] + t0[3]; \ + \ + for (i = (m & 3); i--;) \ + { \ + CGEMV_T_1x1(); \ + \ + k += 2; \ + x += inc_x2; \ + } \ + \ + CSCALE_STORE_Y1_GP(); \ + \ + pa0 += lda2; \ + y += inc_y2; \ + } \ + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alphar, FLOAT alphai, + FLOAT *A, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, + BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i, j, k; + FLOAT *pa0, *pa1, *pa2, *pa3; + FLOAT *srcx_org = x; + FLOAT temp0r, temp0i, temp2r, temp2i, temp1r, temp1i, temp3r, temp3i; + FLOAT res0r, res0i, res2r, res2i, res1r, res1i, res3r, res3i; + BLASLONG inc_x2, inc_y2, lda2; + v4f32 zero = {0}; + v4f32 x0, x1, x2, x3, x0r, x1r, x0i, x1i; + v4f32 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15; + v4f32 src0r, src1r, src2r, src3r, src4r, src5r, src6r, src7r; + v4f32 src0i, src1i, src2i, src3i, src4i, src5i, src6i, src7i; + v4f32 tp0r, tp1r, tp2r, tp3r, tp0i, tp1i, tp2i, tp3i; + + lda2 = 2 * lda; + + pa0 = A; + pa1 = A + lda2; + pa2 = A + 2 * lda2; + pa3 = A + 3 * lda2; + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + if (2 == inc_x2) + { + #define CLOAD_X8 CLOAD_X8_VECTOR + #define CLOAD_X4 CLOAD_X4_VECTOR + + CGEMV_T_MSA(); + + #undef CLOAD_X8 + #undef CLOAD_X4 + } + else + { + #define CLOAD_X8 CLOAD_X8_GP + #define CLOAD_X4 CLOAD_X4_GP 
+ + CGEMV_T_MSA(); + + #undef CLOAD_X8 + #undef CLOAD_X4 + } + + return(0); +} + +#undef OP0 +#undef OP1 +#undef OP2 diff --git a/kernel/mips/dasum_msa.c b/kernel/mips/dasum_msa.c new file mode 100644 index 000000000..a3641cd50 --- /dev/null +++ b/kernel/mips/dasum_msa.c @@ -0,0 +1,278 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#include "common.h" +#include <math.h> +#include "macros_msa.h" + +#define AND_VEC_D(in) ((v2f64) ((v2i64) in & and_vec)) + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i; + FLOAT sumf = 0.0; + v2f64 src0, src1, src2, src3, src4, src5, src6, src7; + v2f64 sum_abs0, sum_abs1, sum_abs2, sum_abs3; + v2f64 zero_v = {0}; + v2i64 and_vec = {0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF}; + + if (n <= 0 || inc_x <= 0) return (sumf); + + if (1 == inc_x) + { + if (n > 15) + { + n -= 16; + + LD_DP8_INC(x, 2, src0, src1, src2, src3, src4, src5, src6, src7); + + sum_abs0 = AND_VEC_D(src0); + sum_abs1 = AND_VEC_D(src1); + sum_abs2 = AND_VEC_D(src2); + sum_abs3 = AND_VEC_D(src3); + sum_abs0 += AND_VEC_D(src4); + sum_abs1 += AND_VEC_D(src5); + sum_abs2 += AND_VEC_D(src6); + sum_abs3 += AND_VEC_D(src7); + } + else + { + sum_abs0 = zero_v; + sum_abs1 = zero_v; + sum_abs2 = zero_v; + sum_abs3 = zero_v; + } + + for (i = (n >> 4); i--;) + { + LD_DP8_INC(x, 2, src0, src1, src2, src3, src4, src5, src6, src7); + + sum_abs0 += AND_VEC_D(src0); + sum_abs1 += AND_VEC_D(src1); + sum_abs2 += AND_VEC_D(src2); + sum_abs3 += AND_VEC_D(src3); + sum_abs0 += AND_VEC_D(src4); + sum_abs1 += AND_VEC_D(src5); + sum_abs2 += AND_VEC_D(src6); + sum_abs3 += AND_VEC_D(src7); + } + + if (n & 15) + { + if ((n & 8) && (n & 4) && (n & 2)) + { + LD_DP7_INC(x, 2, src0, src1, src2, src3, src4, src5, src6); + + sum_abs0 += AND_VEC_D(src0); + sum_abs1 += AND_VEC_D(src1); + sum_abs2 += AND_VEC_D(src2); + sum_abs3 += AND_VEC_D(src3); + sum_abs0 += AND_VEC_D(src4); + sum_abs1 += AND_VEC_D(src5); + sum_abs2 += AND_VEC_D(src6); + } + else if ((n & 8) && (n & 4)) + { + LD_DP6_INC(x, 2, src0, src1, src2, src3, src4, src5); + + sum_abs0 += AND_VEC_D(src0); + sum_abs1 += AND_VEC_D(src1); + sum_abs2 += AND_VEC_D(src2); + sum_abs3 += AND_VEC_D(src3); + sum_abs0 += AND_VEC_D(src4); + sum_abs1 += AND_VEC_D(src5); + } + else if ((n & 8) && (n
& 2)) + { + LD_DP5_INC(x, 2, src0, src1, src2, src3, src4); + + sum_abs0 += AND_VEC_D(src0); + sum_abs1 += AND_VEC_D(src1); + sum_abs2 += AND_VEC_D(src2); + sum_abs3 += AND_VEC_D(src3); + sum_abs0 += AND_VEC_D(src4); + } + else if ((n & 4) && (n & 2)) + { + LD_DP3_INC(x, 2, src0, src1, src2); + + sum_abs0 += AND_VEC_D(src0); + sum_abs1 += AND_VEC_D(src1); + sum_abs2 += AND_VEC_D(src2); + } + else if (n & 8) + { + LD_DP4_INC(x, 2, src0, src1, src2, src3); + + sum_abs0 += AND_VEC_D(src0); + sum_abs1 += AND_VEC_D(src1); + sum_abs2 += AND_VEC_D(src2); + sum_abs3 += AND_VEC_D(src3); + } + else if (n & 4) + { + LD_DP2_INC(x, 2, src0, src1); + + sum_abs0 += AND_VEC_D(src0); + sum_abs1 += AND_VEC_D(src1); + } + else if (n & 2) + { + src0 = LD_DP(x); x += 2; + + sum_abs0 += AND_VEC_D(src0); + } + + sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; + + sumf = sum_abs0[0] + sum_abs0[1]; + + if (n & 1) + { + sumf += fabs(*x); + } + } + else + { + sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; + + sumf = sum_abs0[0] + sum_abs0[1]; + } + } + else + { + if (n > 8) + { + n -= 8; + + LD_DP8_INC(x, inc_x, src0, src1, src2, src3, src4, src5, src6, src7); + + sum_abs0 = AND_VEC_D(src0); + sum_abs1 = AND_VEC_D(src1); + sum_abs2 = AND_VEC_D(src2); + sum_abs3 = AND_VEC_D(src3); + sum_abs0 += AND_VEC_D(src4); + sum_abs1 += AND_VEC_D(src5); + sum_abs2 += AND_VEC_D(src6); + sum_abs3 += AND_VEC_D(src7); + } + else + { + sum_abs0 = zero_v; + sum_abs1 = zero_v; + sum_abs2 = zero_v; + sum_abs3 = zero_v; + } + + for (i = (n >> 3); i--;) + { + LD_DP8_INC(x, inc_x, src0, src1, src2, src3, src4, src5, src6, src7); + + sum_abs0 += AND_VEC_D(src0); + sum_abs1 += AND_VEC_D(src1); + sum_abs2 += AND_VEC_D(src2); + sum_abs3 += AND_VEC_D(src3); + sum_abs0 += AND_VEC_D(src4); + sum_abs1 += AND_VEC_D(src5); + sum_abs2 += AND_VEC_D(src6); + sum_abs3 += AND_VEC_D(src7); + } + + if (n & 7) + { + if ((n & 4) && (n & 2) && (n & 1)) + { + LD_DP7_INC(x, inc_x, src0, src1, src2, src3, src4, src5, src6); 
+ + sum_abs0 += AND_VEC_D(src0); + sum_abs1 += AND_VEC_D(src1); + sum_abs2 += AND_VEC_D(src2); + sum_abs3 += AND_VEC_D(src3); + sum_abs0 += AND_VEC_D(src4); + sum_abs1 += AND_VEC_D(src5); + sum_abs2 += AND_VEC_D(src6); + } + else if ((n & 4) && (n & 2)) + { + LD_DP6_INC(x, inc_x, src0, src1, src2, src3, src4, src5); + + sum_abs0 += AND_VEC_D(src0); + sum_abs1 += AND_VEC_D(src1); + sum_abs2 += AND_VEC_D(src2); + sum_abs3 += AND_VEC_D(src3); + sum_abs0 += AND_VEC_D(src4); + sum_abs1 += AND_VEC_D(src5); + } + else if ((n & 4) && (n & 1)) + { + LD_DP5_INC(x, inc_x, src0, src1, src2, src3, src4); + + sum_abs0 += AND_VEC_D(src0); + sum_abs1 += AND_VEC_D(src1); + sum_abs2 += AND_VEC_D(src2); + sum_abs3 += AND_VEC_D(src3); + sum_abs0 += AND_VEC_D(src4); + } + else if ((n & 2) && (n & 1)) + { + LD_DP3_INC(x, inc_x, src0, src1, src2); + + sum_abs0 += AND_VEC_D(src0); + sum_abs1 += AND_VEC_D(src1); + sum_abs2 += AND_VEC_D(src2); + } + else if (n & 4) + { + LD_DP4_INC(x, inc_x, src0, src1, src2, src3); + + sum_abs0 += AND_VEC_D(src0); + sum_abs1 += AND_VEC_D(src1); + sum_abs2 += AND_VEC_D(src2); + sum_abs3 += AND_VEC_D(src3); + } + else if (n & 2) + { + LD_DP2_INC(x, inc_x, src0, src1); + + sum_abs0 += AND_VEC_D(src0); + sum_abs1 += AND_VEC_D(src1); + } + else if (n & 1) + { + src0 = LD_DP(x); + + sum_abs0 += AND_VEC_D(src0); + } + } + + sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; + + sumf = sum_abs0[0]; + } + + return (sumf); +} diff --git a/kernel/mips/ddot_msa.c b/kernel/mips/ddot_msa.c new file mode 100644 index 000000000..b56e10135 --- /dev/null +++ b/kernel/mips/ddot_msa.c @@ -0,0 +1,189 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +/* Dot-product kernel: accumulates the products y[i] * x[i] over n elements and returns the sum. The scalar result is accumulated in a double; the declared return type is double when DSDOT is defined and FLOAT otherwise. */ +#if defined(DSDOT) +double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +#else +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +#endif +{ + BLASLONG i = 0; + double dot = 0.0; + FLOAT x0, x1, x2, x3, y0, y1, y2, y3; + v2f64 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7; + v2f64 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7; + v2f64 dot0 = {0, 0}; + + if (n < 0) return (dot); +/* n == 0 needs no special case: every loop and tail test below is skipped and 0.0 is returned. */ + if ((1 == inc_x) && (1 == inc_y)) + { /* unit strides: vector path consuming 16 elements per iteration; LD_DPk_INC is defined in macros_msa.h and presumably loads k vectors while advancing the pointer (confirm there) */ + for (i = (n >> 4); i--;) + { + LD_DP8_INC(x, 2, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7); + LD_DP8_INC(y, 2, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7); + + dot0 += (vy0 * vx0); + dot0 += (vy1 * vx1); + dot0 += (vy2 * vx2); + dot0 += (vy3 * vx3); + dot0 += (vy4 * vx4); + dot0 += (vy5 * vx5); + dot0 += (vy6 * vx6); + dot0 += (vy7 * vx7); + } + + if (n & 15) + { /* tail of up to 15 elements: bits 8, 4 and 2 of n select a single load of 7, 6, 5, 3, 4, 2 or 1 vectors; bit 1 is handled as a scalar afterwards */ + if ((n & 8) && (n & 4) && (n & 2)) + { + LD_DP7_INC(x, 2, vx0, vx1, vx2, vx3, vx4, vx5, vx6); + LD_DP7_INC(y, 2, vy0, vy1, vy2, vy3, vy4, vy5, vy6); + + dot0 += (vy0 * vx0); + dot0 += (vy1 * vx1); + dot0 += (vy2 * vx2); + dot0 += (vy3 * vx3); + dot0 += (vy4 * vx4); + dot0 += (vy5 * vx5); + dot0 += (vy6 * vx6); + } + else if ((n & 8) && (n & 4)) + { + LD_DP6_INC(x, 2, vx0, vx1, vx2, vx3, vx4, vx5); + LD_DP6_INC(y, 2, vy0, vy1, vy2, vy3, vy4, vy5); + + dot0 += (vy0 * vx0); + dot0 += (vy1 * vx1); + dot0 += (vy2 * vx2); + dot0 += (vy3 * vx3); + dot0 += (vy4 * vx4); + dot0 += (vy5 * vx5); + } + else if ((n & 8) && (n & 2)) + { + LD_DP5_INC(x, 2, vx0, vx1, vx2, vx3, vx4); + LD_DP5_INC(y, 2, vy0, vy1, vy2, vy3, vy4); + + dot0 += (vy0 * vx0); + dot0 += (vy1 * vx1); + dot0 += (vy2 * vx2); + dot0 += (vy3 * vx3); + dot0 += (vy4 * vx4); + } + else if ((n & 4) && (n & 2)) + { + LD_DP3_INC(x, 2, vx0, vx1, vx2); + LD_DP3_INC(y, 2, vy0, vy1, vy2); + + dot0 += (vy0 * vx0); + dot0 += (vy1 * vx1); + dot0 += (vy2 * 
vx2); + } + else if (n & 8) + { + LD_DP4_INC(x, 2, vx0, vx1, vx2, vx3); + LD_DP4_INC(y, 2, vy0, vy1, vy2, vy3); + + dot0 += (vy0 * vx0); + dot0 += (vy1 * vx1); + dot0 += (vy2 * vx2); + dot0 += (vy3 * vx3); + } + else if (n & 4) + { + LD_DP2_INC(x, 2, vx0, vx1); + LD_DP2_INC(y, 2, vy0, vy1); + + dot0 += (vy0 * vx0); + dot0 += (vy1 * vx1); + } + else if (n & 2) + { + vx0 = LD_DP(x); x += 2; + vy0 = LD_DP(y); y += 2; + + dot0 += (vy0 * vx0); + } + + if (n & 1) + { /* last odd element: scalar product added straight into dot */ + x0 = *x; + y0 = *y; + + dot += (y0 * x0); + } + } + + dot += dot0[0]; /* fold both lanes of the vector accumulator into the scalar sum */ + dot += dot0[1]; + } + else + { /* non-unit stride: scalar path, four elements per iteration; LD_GPk_INC is defined in macros_msa.h and presumably loads k scalars, stepping the pointer by the given increment each time (confirm there) */ + for (i = (n >> 2); i--;) + { + LD_GP4_INC(x, inc_x, x0, x1, x2, x3); + LD_GP4_INC(y, inc_y, y0, y1, y2, y3); + + dot += (y0 * x0); + dot += (y1 * x1); + dot += (y2 * x2); + dot += (y3 * x3); + } + + if ((n & 2) && (n & 1)) + { /* remaining 3, 2 or 1 elements */ + LD_GP3_INC(x, inc_x, x0, x1, x2); + LD_GP3_INC(y, inc_y, y0, y1, y2); + + dot += (y0 * x0); + dot += (y1 * x1); + dot += (y2 * x2); + } + else if (n & 2) + { + LD_GP2_INC(x, inc_x, x0, x1); + LD_GP2_INC(y, inc_y, y0, y1); + + dot += (y0 * x0); + dot += (y1 * x1); + } + else if (n & 1) + { + x0 = *x; + y0 = *y; + + dot += (y0 * x0); + } + } + + return (dot); +} diff --git a/kernel/mips/dgemv_n_msa.c b/kernel/mips/dgemv_n_msa.c new file mode 100644 index 000000000..09bb063ff --- /dev/null +++ b/kernel/mips/dgemv_n_msa.c @@ -0,0 +1,577 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" +/* DGEMV_N_RxC() kernels. Each one loads 4 (R = 8) or 2 (R = 4) vectors at offset k from each of the C pointers pa0, pa1, ..., multiplies them by the matching multiplier vector tp0, tp1, ..., and accumulates into y0..y3 (R = 8) or y0..y1 (R = 4). The macros take no arguments: paN, k, tpN, tN and yN must be variables in the expanding scope. The tN numbering keeps the 8x8 layout in every variant (t4 is always the first vector of pa1). */ +#define DGEMV_N_8x8() \ +{ \ + LD_DP4(pa0 + k, 2, t0, t1, t2, t3); \ + LD_DP4(pa1 + k, 2, t4, t5, t6, t7); \ + LD_DP4(pa2 + k, 2, t8, t9, t10, t11); \ + LD_DP4(pa3 + k, 2, t12, t13, t14, t15); \ + LD_DP4(pa4 + k, 2, t16, t17, t18, t19); \ + LD_DP4(pa5 + k, 2, t20, t21, t22, t23); \ + LD_DP4(pa6 + k, 2, t24, t25, t26, t27); \ + LD_DP4(pa7 + k, 2, t28, t29, t30, t31); \ + \ + y0 += tp0 * t0; \ + y1 += tp0 * t1; \ + y2 += tp0 * t2; \ + y3 += tp0 * t3; \ + \ + y0 += tp1 * t4; \ + y1 += tp1 * t5; \ + y2 += tp1 * t6; \ + y3 += tp1 * t7; \ + \ + y0 += tp2 * t8; \ + y1 += tp2 * t9; \ + y2 += tp2 * t10; \ + y3 += tp2 * t11; \ + \ + y0 += tp3 * t12; \ + y1 += tp3 * t13; \ + y2 += tp3 * t14; \ + y3 += tp3 * t15; \ + \ + y0 += tp4 * t16; \ + y1 += tp4 * t17; \ + y2 += tp4 * t18; \ + y3 += tp4 * t19; \ + \ + y0 += tp5 * t20; \ + y1 += tp5 * t21; \ + y2 += tp5 * t22; \ + y3 += tp5 * t23; \ + \ + y0 += tp6 * 
t24; \ + y1 += tp6 * t25; \ + y2 += tp6 * t26; \ + y3 += tp6 * t27; \ + \ + y0 += tp7 * t28; \ + y1 += tp7 * t29; \ + y2 += tp7 * t30; \ + y3 += tp7 * t31; \ +} +/* 4x8: two vectors from each of pa0..pa7, accumulated into y0 and y1. */ +#define DGEMV_N_4x8() \ +{ \ + LD_DP2(pa0 + k, 2, t0, t1); \ + LD_DP2(pa1 + k, 2, t4, t5); \ + LD_DP2(pa2 + k, 2, t8, t9); \ + LD_DP2(pa3 + k, 2, t12, t13); \ + LD_DP2(pa4 + k, 2, t16, t17); \ + LD_DP2(pa5 + k, 2, t20, t21); \ + LD_DP2(pa6 + k, 2, t24, t25); \ + LD_DP2(pa7 + k, 2, t28, t29); \ + \ + y0 += tp0 * t0; \ + y1 += tp0 * t1; \ + \ + y0 += tp1 * t4; \ + y1 += tp1 * t5; \ + \ + y0 += tp2 * t8; \ + y1 += tp2 * t9; \ + \ + y0 += tp3 * t12; \ + y1 += tp3 * t13; \ + \ + y0 += tp4 * t16; \ + y1 += tp4 * t17; \ + \ + y0 += tp5 * t20; \ + y1 += tp5 * t21; \ + \ + y0 += tp6 * t24; \ + y1 += tp6 * t25; \ + \ + y0 += tp7 * t28; \ + y1 += tp7 * t29; \ +} +/* 8x4: four vectors from each of pa0..pa3, accumulated into y0..y3. */ +#define DGEMV_N_8x4() \ +{ \ + LD_DP4(pa0 + k, 2, t0, t1, t2, t3); \ + LD_DP4(pa1 + k, 2, t4, t5, t6, t7); \ + LD_DP4(pa2 + k, 2, t8, t9, t10, t11); \ + LD_DP4(pa3 + k, 2, t12, t13, t14, t15); \ + \ + y0 += tp0 * t0; \ + y1 += tp0 * t1; \ + y2 += tp0 * t2; \ + y3 += tp0 * t3; \ + \ + y0 += tp1 * t4; \ + y1 += tp1 * t5; \ + y2 += tp1 * t6; \ + y3 += tp1 * t7; \ + \ + y0 += tp2 * t8; \ + y1 += tp2 * t9; \ + y2 += tp2 * t10; \ + y3 += tp2 * t11; \ + \ + y0 += tp3 * t12; \ + y1 += tp3 * t13; \ + y2 += tp3 * t14; \ + y3 += tp3 * t15; \ +} +/* 4x4: two vectors from each of pa0..pa3, accumulated into y0 and y1. */ +#define DGEMV_N_4x4() \ +{ \ + LD_DP2(pa0 + k, 2, t0, t1); \ + LD_DP2(pa1 + k, 2, t4, t5); \ + LD_DP2(pa2 + k, 2, t8, t9); \ + LD_DP2(pa3 + k, 2, t12, t13); \ + \ + y0 += tp0 * t0; \ + y1 += tp0 * t1; \ + \ + y0 += tp1 * t4; \ + y1 += tp1 * t5; \ + \ + y0 += tp2 * t8; \ + y1 += tp2 * t9; \ + \ + y0 += tp3 * t12; \ + y1 += tp3 * t13; \ +} +/* 8x2: four vectors from each of pa0 and pa1, accumulated into y0..y3. */ +#define DGEMV_N_8x2() \ +{ \ + LD_DP4(pa0 + k, 2, t0, t1, t2, t3); \ + LD_DP4(pa1 + k, 2, t4, t5, t6, t7); \ + \ + y0 += tp0 * t0; \ + y1 += tp0 * t1; \ + y2 += tp0 * t2; \ + y3 += tp0 * t3; \ + \ + y0 += tp1 * t4; \ + y1 += tp1 * t5; \ + y2 += tp1 * t6; \ + y3 += tp1 * t7; \ +} + 
+#define DGEMV_N_4x2() /* two vectors from each of pa0 and pa1 at offset k, scaled by tp0 and tp1 and accumulated into y0 and y1 */ \ +{ \ + LD_DP2(pa0 + k, 2, t0, t1); \ + LD_DP2(pa1 + k, 2, t4, t5); \ + \ + y0 += tp0 * t0; \ + y1 += tp0 * t1; \ + \ + y0 += tp1 * t4; \ + y1 += tp1 * t5; \ +} + +#define DLOAD_X8_SCALE_GP() /* strided x: temp0..temp7 receive alpha times eight x elements spaced inc_x apart, and each is copied into a vector tp0..tp7 by COPY_DOUBLE_TO_VECTOR (defined outside this file, presumably a broadcast to both lanes) */ \ + temp0 = alpha * x[0 * inc_x]; \ + temp1 = alpha * x[1 * inc_x]; \ + temp2 = alpha * x[2 * inc_x]; \ + temp3 = alpha * x[3 * inc_x]; \ + temp4 = alpha * x[4 * inc_x]; \ + temp5 = alpha * x[5 * inc_x]; \ + temp6 = alpha * x[6 * inc_x]; \ + temp7 = alpha * x[7 * inc_x]; \ + \ + tp0 = COPY_DOUBLE_TO_VECTOR(temp0); \ + tp1 = COPY_DOUBLE_TO_VECTOR(temp1); \ + tp2 = COPY_DOUBLE_TO_VECTOR(temp2); \ + tp3 = COPY_DOUBLE_TO_VECTOR(temp3); \ + tp4 = COPY_DOUBLE_TO_VECTOR(temp4); \ + tp5 = COPY_DOUBLE_TO_VECTOR(temp5); \ + tp6 = COPY_DOUBLE_TO_VECTOR(temp6); \ + tp7 = COPY_DOUBLE_TO_VECTOR(temp7); \ + +#define DLOAD_X4_SCALE_GP() /* four-element form of DLOAD_X8_SCALE_GP */ \ + temp0 = alpha * x[0 * inc_x]; \ + temp1 = alpha * x[1 * inc_x]; \ + temp2 = alpha * x[2 * inc_x]; \ + temp3 = alpha * x[3 * inc_x]; \ + \ + tp0 = COPY_DOUBLE_TO_VECTOR(temp0); \ + tp1 = COPY_DOUBLE_TO_VECTOR(temp1); \ + tp2 = COPY_DOUBLE_TO_VECTOR(temp2); \ + tp3 = COPY_DOUBLE_TO_VECTOR(temp3); \ + +#define DLOAD_X8_SCALE_VECTOR() /* contiguous x: load eight elements as four vectors, scale them by v_alpha, then let SPLATI_D2_DP (macros_msa.h) produce tp0..tp7, presumably one vector per source lane */ \ + LD_DP4(x, 2, x0, x1, x2, x3); \ + \ + x0 = x0 * v_alpha; \ + x1 = x1 * v_alpha; \ + x2 = x2 * v_alpha; \ + x3 = x3 * v_alpha; \ + \ + SPLATI_D2_DP(x0, tp0, tp1); \ + SPLATI_D2_DP(x1, tp2, tp3); \ + SPLATI_D2_DP(x2, tp4, tp5); \ + SPLATI_D2_DP(x3, tp6, tp7); \ + +#define DLOAD_X4_SCALE_VECTOR() /* four-element form of DLOAD_X8_SCALE_VECTOR */ \ + LD_DP2(x, 2, x0, x1); \ + \ + x0 = x0 * v_alpha; \ + x1 = x1 * v_alpha; \ + \ + SPLATI_D2_DP(x0, tp0, tp1); \ + SPLATI_D2_DP(x1, tp2, tp3); \ + +#define DLOAD_Y8_GP() /* strided y: assemble y0..y3 from eight elements spaced inc_y apart, moving each double as a raw 64-bit pattern via a long long cast. tp0 is only the starting operand of __msa_insert_d; lanes 0 and 1 of every result are both inserted. NOTE(review): the long long casts type-pun FLOAT storage, confirm the build is safe with respect to strict aliasing. */ \ + y0 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 0 * inc_y))); \ + y0 = (v2f64) __msa_insert_d((v2i64) y0, 1, *((long long *)(y + 1 * inc_y))); \ + y1 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 2 * inc_y))); \ + y1 = (v2f64) __msa_insert_d((v2i64) y1, 1, *((long long *)(y + 3 * inc_y))); \ + y2 = (v2f64) 
__msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 4 * inc_y))); \ + y2 = (v2f64) __msa_insert_d((v2i64) y2, 1, *((long long *)(y + 5 * inc_y))); \ + y3 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 6 * inc_y))); \ + y3 = (v2f64) __msa_insert_d((v2i64) y3, 1, *((long long *)(y + 7 * inc_y))); \ + +#define DLOAD_Y4_GP() /* four-element form of DLOAD_Y8_GP */ \ + y0 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 0 * inc_y))); \ + y0 = (v2f64) __msa_insert_d((v2i64) y0, 1, *((long long *)(y + 1 * inc_y))); \ + y1 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 2 * inc_y))); \ + y1 = (v2f64) __msa_insert_d((v2i64) y1, 1, *((long long *)(y + 3 * inc_y))); \ + +#define DLOAD_Y8_VECTOR() LD_DP4(y, 2, y0, y1, y2, y3); /* contiguous y: direct vector loads */ +#define DLOAD_Y4_VECTOR() LD_DP2(y, 2, y0, y1); + +#define DSTORE_Y8_GP() /* strided y store: copy each lane of y0..y3 out as a raw 64-bit value to elements spaced inc_y apart */ \ + *((long long *)(y + 0 * inc_y)) = __msa_copy_s_d((v2i64) y0, 0); \ + *((long long *)(y + 1 * inc_y)) = __msa_copy_s_d((v2i64) y0, 1); \ + *((long long *)(y + 2 * inc_y)) = __msa_copy_s_d((v2i64) y1, 0); \ + *((long long *)(y + 3 * inc_y)) = __msa_copy_s_d((v2i64) y1, 1); \ + *((long long *)(y + 4 * inc_y)) = __msa_copy_s_d((v2i64) y2, 0); \ + *((long long *)(y + 5 * inc_y)) = __msa_copy_s_d((v2i64) y2, 1); \ + *((long long *)(y + 6 * inc_y)) = __msa_copy_s_d((v2i64) y3, 0); \ + *((long long *)(y + 7 * inc_y)) = __msa_copy_s_d((v2i64) y3, 1); \ + +#define DSTORE_Y4_GP() /* four-element form of DSTORE_Y8_GP */ \ + *((long long *)(y + 0 * inc_y)) = __msa_copy_s_d((v2i64) y0, 0); \ + *((long long *)(y + 1 * inc_y)) = __msa_copy_s_d((v2i64) y0, 1); \ + *((long long *)(y + 2 * inc_y)) = __msa_copy_s_d((v2i64) y1, 0); \ + *((long long *)(y + 3 * inc_y)) = __msa_copy_s_d((v2i64) y1, 1); \ + +#define DSTORE_Y8_VECTOR() ST_DP4(y0, y1, y2, y3, y, 2); /* contiguous y: direct vector stores */ +#define DSTORE_Y4_VECTOR() ST_DP2(y0, y1, y, 2); + +#define DGEMV_N_MSA() /* Main loop nest. Walks the pa pointers in groups of 8 (n >> 3 times), then 4, 2 and 1 according to the bits of n; for each group y is rewound to y_org and updated over m rows in chunks of 8, then 4, then one scalar at a time for the last m & 3. Uses whichever DLOAD_X8_SCALE, DLOAD_X4_SCALE, DLOAD_Y8, DLOAD_Y4, DSTORE_Y8 and DSTORE_Y4 are defined at the point of expansion. */ \ + for (j = (n >> 3); j--;) \ + { \ + DLOAD_X8_SCALE(); \ + \ + k = 0; \ + y = y_org; \ + \ + for (i = (m >> 3); i--;) \ + { \ + DLOAD_Y8(); \ + DGEMV_N_8x8(); \ + DSTORE_Y8(); \ + \ + y += 8 * inc_y; \ + k 
+= 8; \ + } \ + \ + if (m & 4) \ + { \ + DLOAD_Y4(); \ + DGEMV_N_4x8(); \ + DSTORE_Y4(); \ + \ + y += 4 * inc_y; \ + k += 4; \ + } \ + \ + if (m & 3) /* scalar tail; temp0..temp7 are recomputed because DLOAD_X8_SCALE_VECTOR never sets them */ \ + { \ + temp0 = alpha * x[0 * inc_x]; \ + temp1 = alpha * x[1 * inc_x]; \ + temp2 = alpha * x[2 * inc_x]; \ + temp3 = alpha * x[3 * inc_x]; \ + temp4 = alpha * x[4 * inc_x]; \ + temp5 = alpha * x[5 * inc_x]; \ + temp6 = alpha * x[6 * inc_x]; \ + temp7 = alpha * x[7 * inc_x]; \ + \ + for (i = (m & 3); i--;) \ + { \ + temp = y[0]; \ + temp += temp0 * pa0[k]; \ + temp += temp1 * pa1[k]; \ + temp += temp2 * pa2[k]; \ + temp += temp3 * pa3[k]; \ + temp += temp4 * pa4[k]; \ + temp += temp5 * pa5[k]; \ + temp += temp6 * pa6[k]; \ + temp += temp7 * pa7[k]; \ + y[0] = temp; \ + \ + y += inc_y; \ + k++; \ + } \ + } \ + pa0 += 8 * lda; \ + pa1 += 8 * lda; \ + pa2 += 8 * lda; \ + pa3 += 8 * lda; \ + pa4 += 8 * lda; \ + pa5 += 8 * lda; \ + pa6 += 8 * lda; \ + pa7 += 8 * lda; \ + \ + x += 8 * inc_x; \ + } \ + \ + if (n & 4) \ + { \ + DLOAD_X4_SCALE(); \ + \ + k = 0; \ + y = y_org; \ + \ + for (i = (m >> 3); i--;) \ + { \ + DLOAD_Y8(); \ + DGEMV_N_8x4(); \ + DSTORE_Y8(); \ + \ + y += 8 * inc_y; \ + k += 8; \ + } \ + \ + if (m & 4) \ + { \ + DLOAD_Y4(); \ + DGEMV_N_4x4(); \ + DSTORE_Y4(); \ + \ + y += 4 * inc_y; \ + k += 4; \ + } \ + \ + if (m & 3) \ + { \ + temp0 = alpha * x[0 * inc_x]; \ + temp1 = alpha * x[1 * inc_x]; \ + temp2 = alpha * x[2 * inc_x]; \ + temp3 = alpha * x[3 * inc_x]; \ + \ + for (i = (m & 3); i--;) \ + { \ + temp = y[0]; \ + temp += temp0 * pa0[k]; \ + temp += temp1 * pa1[k]; \ + temp += temp2 * pa2[k]; \ + temp += temp3 * pa3[k]; \ + y[0] = temp; \ + \ + y += inc_y; \ + k++; \ + } \ + } \ + \ + pa0 += 4 * lda; \ + pa1 += 4 * lda; \ + pa2 += 4 * lda; \ + pa3 += 4 * lda; \ + \ + x += 4 * inc_x; \ + } \ + \ + if (n & 2) \ + { \ + temp0 = alpha * x[0 * inc_x]; \ + temp1 = alpha * x[1 * inc_x]; \ + \ + tp0 = COPY_DOUBLE_TO_VECTOR(temp0); \ + tp1 = COPY_DOUBLE_TO_VECTOR(temp1); \ + \ + k = 0; \ + y = y_org; \ + \ + 
for (i = (m >> 3); i--;) \ + { \ + DLOAD_Y8(); \ + DGEMV_N_8x2(); \ + DSTORE_Y8(); \ + \ + y += 8 * inc_y; \ + k += 8; \ + } \ + \ + if (m & 4) \ + { \ + DLOAD_Y4(); \ + DGEMV_N_4x2(); \ + DSTORE_Y4(); \ + \ + y += 4 * inc_y; \ + k += 4; \ + } \ + \ + if (m & 3) \ + { \ + temp0 = alpha * x[0 * inc_x]; \ + temp1 = alpha * x[1 * inc_x]; \ + \ + for (i = (m & 3); i--;) \ + { \ + temp = y[0]; \ + temp += temp0 * pa0[k]; \ + temp += temp1 * pa1[k]; \ + y[0] = temp; \ + \ + y += inc_y; \ + k++; \ + } \ + } \ + \ + pa0 += 2 * lda; \ + pa1 += 2 * lda; \ + \ + x += 2 * inc_x; \ + } \ + \ + if (n & 1) /* last single pointer: plain scalar update over all m rows */ \ + { \ + temp = alpha * x[0]; \ + \ + k = 0; \ + y = y_org; \ + \ + for (i = m; i--;) \ + { \ + y[0] += temp * pa0[k]; \ + y += inc_y; \ + k++; \ + } \ + } \ + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *A, + BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, + FLOAT *buffer) +{ /* Matrix-vector update: for i in [0, m) and j in [0, n), element i of y (spaced inc_y apart) is incremented by alpha times element j of x (spaced inc_x apart) times A[i, j], where consecutive j are lda elements apart in A. dummy1 and buffer are not used here. Always returns 0. */ + BLASLONG i, j, k; + FLOAT *y_org = y; + FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7; + FLOAT temp, temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; + v2f64 v_alpha; + v2f64 x0, x1, x2, x3, y0, y1, y2, y3; + v2f64 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15; + v2f64 t16, t17, t18, t19, t20, t21, t22, t23, t24, t25, t26, t27, t28, t29; + v2f64 t30, t31, tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7; + + v_alpha = COPY_DOUBLE_TO_VECTOR(alpha); + + pa0 = A; + pa1 = A + lda; + pa2 = A + 2 * lda; + pa3 = A + 3 * lda; + pa4 = A + 4 * lda; + pa5 = A + 5 * lda; + pa6 = A + 6 * lda; + pa7 = A + 7 * lda; + + if ((1 == inc_x) && (1 == inc_y)) + { /* DGEMV_N_MSA is expanded once per stride combination; the aliases below select vector helpers for an operand whose stride is 1 and element-wise (GP) helpers otherwise */ + #define DLOAD_X8_SCALE DLOAD_X8_SCALE_VECTOR + #define DLOAD_X4_SCALE DLOAD_X4_SCALE_VECTOR + #define DLOAD_Y8 DLOAD_Y8_VECTOR + #define DLOAD_Y4 DLOAD_Y4_VECTOR + #define DSTORE_Y8 DSTORE_Y8_VECTOR + #define DSTORE_Y4 DSTORE_Y4_VECTOR + + DGEMV_N_MSA(); + + #undef DLOAD_X8_SCALE + #undef DLOAD_X4_SCALE + #undef DLOAD_Y8 + #undef DLOAD_Y4 + #undef DSTORE_Y8 + #undef DSTORE_Y4 + } + else if (1 
== inc_y) + { + #define DLOAD_X8_SCALE DLOAD_X8_SCALE_GP + #define DLOAD_X4_SCALE DLOAD_X4_SCALE_GP + #define DLOAD_Y8 DLOAD_Y8_VECTOR + #define DLOAD_Y4 DLOAD_Y4_VECTOR + #define DSTORE_Y8 DSTORE_Y8_VECTOR + #define DSTORE_Y4 DSTORE_Y4_VECTOR + + DGEMV_N_MSA(); + + #undef DLOAD_X8_SCALE + #undef DLOAD_X4_SCALE + #undef DLOAD_Y8 + #undef DLOAD_Y4 + #undef DSTORE_Y8 + #undef DSTORE_Y4 + } + else if (1 == inc_x) + { + #define DLOAD_X8_SCALE DLOAD_X8_SCALE_VECTOR + #define DLOAD_X4_SCALE DLOAD_X4_SCALE_VECTOR + #define DLOAD_Y8 DLOAD_Y8_GP + #define DLOAD_Y4 DLOAD_Y4_GP + #define DSTORE_Y8 DSTORE_Y8_GP + #define DSTORE_Y4 DSTORE_Y4_GP + + DGEMV_N_MSA(); + + #undef DLOAD_X8_SCALE + #undef DLOAD_X4_SCALE + #undef DLOAD_Y8 + #undef DLOAD_Y4 + #undef DSTORE_Y8 + #undef DSTORE_Y4 + } + else + { + #define DLOAD_X8_SCALE DLOAD_X8_SCALE_GP + #define DLOAD_X4_SCALE DLOAD_X4_SCALE_GP + #define DLOAD_Y8 DLOAD_Y8_GP + #define DLOAD_Y4 DLOAD_Y4_GP + #define DSTORE_Y8 DSTORE_Y8_GP + #define DSTORE_Y4 DSTORE_Y4_GP + + DGEMV_N_MSA(); + + #undef DLOAD_X8_SCALE + #undef DLOAD_X4_SCALE + #undef DLOAD_Y8 + #undef DLOAD_Y4 + #undef DSTORE_Y8 + #undef DSTORE_Y4 + } + + return(0); +} diff --git a/kernel/mips/dgemv_t_msa.c b/kernel/mips/dgemv_t_msa.c new file mode 100644 index 000000000..f74cb2e66 --- /dev/null +++ b/kernel/mips/dgemv_t_msa.c @@ -0,0 +1,589 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" +/* DGEMV_T_CxR() kernels. Each one loads 4, 2 or 1 vectors (R = 8, 4, 2) at offset k from each of the C pointers pa0, pa1, ..., multiplies them lane by lane with the x vectors x0, x1, ..., and accumulates into one vector per pointer (tp0 for pa0, tp1 for pa1, ...). The macros take no arguments: paN, k, xN, tN and tpN must be variables in the expanding scope. */ +#define DGEMV_T_8x8() \ +{ \ + LD_DP4(pa0 + k, 2, t0, t1, t2, t3); \ + LD_DP4(pa1 + k, 2, t4, t5, t6, t7); \ + LD_DP4(pa2 + k, 2, t8, t9, t10, t11); \ + LD_DP4(pa3 + k, 2, t12, t13, t14, t15); \ + LD_DP4(pa4 + k, 2, t16, t17, t18, t19); \ + LD_DP4(pa5 + k, 2, t20, t21, t22, t23); \ + LD_DP4(pa6 + k, 2, t24, t25, t26, t27); \ + LD_DP4(pa7 + k, 2, t28, t29, t30, t31); \ + \ + tp0 += x0 * t0; \ + tp0 += x1 * t1; \ + tp0 += x2 * t2; \ + tp0 += x3 * t3; \ + \ + tp1 += x0 * t4; \ + tp1 += x1 * t5; \ + tp1 += x2 * t6; \ + tp1 += x3 * t7; \ + \ + tp2 += x0 * t8; \ + tp2 += x1 * t9; \ + tp2 += x2 * t10; \ + tp2 += x3 * t11; \ + \ + tp3 += x0 * t12; \ + tp3 += x1 * t13; \ + tp3 += x2 * t14; \ + tp3 += x3 * t15; \ + \ + tp4 += x0 * t16; \ + tp4 += x1 * t17; \ + tp4 += x2 * t18; \ + tp4 += x3 * t19; \ + \ + tp5 += x0 * t20; \ + tp5 += x1 * t21; \ + tp5 += x2 * t22; \ + tp5 += x3 * t23; \ + \ + tp6 += x0 * 
t24; \ + tp6 += x1 * t25; \ + tp6 += x2 * t26; \ + tp6 += x3 * t27; \ + \ + tp7 += x0 * t28; \ + tp7 += x1 * t29; \ + tp7 += x2 * t30; \ + tp7 += x3 * t31; \ +} +/* 8x4: two vectors from each of pa0..pa7, multiplied by x0 and x1. */ +#define DGEMV_T_8x4() \ +{ \ + LD_DP2(pa0 + k, 2, t0, t1); \ + LD_DP2(pa1 + k, 2, t4, t5); \ + LD_DP2(pa2 + k, 2, t8, t9); \ + LD_DP2(pa3 + k, 2, t12, t13); \ + LD_DP2(pa4 + k, 2, t16, t17); \ + LD_DP2(pa5 + k, 2, t20, t21); \ + LD_DP2(pa6 + k, 2, t24, t25); \ + LD_DP2(pa7 + k, 2, t28, t29); \ + \ + tp0 += x0 * t0; \ + tp0 += x1 * t1; \ + \ + tp1 += x0 * t4; \ + tp1 += x1 * t5; \ + \ + tp2 += x0 * t8; \ + tp2 += x1 * t9; \ + \ + tp3 += x0 * t12; \ + tp3 += x1 * t13; \ + \ + tp4 += x0 * t16; \ + tp4 += x1 * t17; \ + \ + tp5 += x0 * t20; \ + tp5 += x1 * t21; \ + \ + tp6 += x0 * t24; \ + tp6 += x1 * t25; \ + \ + tp7 += x0 * t28; \ + tp7 += x1 * t29; \ +} +/* 8x2: one vector from each of pa0..pa7, multiplied by x0. */ +#define DGEMV_T_8x2() \ +{ \ + t0 = LD_DP(pa0 + k); \ + t4 = LD_DP(pa1 + k); \ + t8 = LD_DP(pa2 + k); \ + t12 = LD_DP(pa3 + k); \ + t16 = LD_DP(pa4 + k); \ + t20 = LD_DP(pa5 + k); \ + t24 = LD_DP(pa6 + k); \ + t28 = LD_DP(pa7 + k); \ + \ + tp0 += x0 * t0; \ + tp1 += x0 * t4; \ + tp2 += x0 * t8; \ + tp3 += x0 * t12; \ + tp4 += x0 * t16; \ + tp5 += x0 * t20; \ + tp6 += x0 * t24; \ + tp7 += x0 * t28; \ +} +/* 4x8, 4x4, 4x2: the same three shapes restricted to pa0..pa3. */ +#define DGEMV_T_4x8() \ +{ \ + LD_DP4(pa0 + k, 2, t0, t1, t2, t3); \ + LD_DP4(pa1 + k, 2, t4, t5, t6, t7); \ + LD_DP4(pa2 + k, 2, t8, t9, t10, t11); \ + LD_DP4(pa3 + k, 2, t12, t13, t14, t15); \ + \ + tp0 += x0 * t0; \ + tp0 += x1 * t1; \ + tp0 += x2 * t2; \ + tp0 += x3 * t3; \ + \ + tp1 += x0 * t4; \ + tp1 += x1 * t5; \ + tp1 += x2 * t6; \ + tp1 += x3 * t7; \ + \ + tp2 += x0 * t8; \ + tp2 += x1 * t9; \ + tp2 += x2 * t10; \ + tp2 += x3 * t11; \ + \ + tp3 += x0 * t12; \ + tp3 += x1 * t13; \ + tp3 += x2 * t14; \ + tp3 += x3 * t15; \ +} + +#define DGEMV_T_4x4() \ +{ \ + LD_DP2(pa0 + k, 2, t0, t1); \ + LD_DP2(pa1 + k, 2, t4, t5); \ + LD_DP2(pa2 + k, 2, t8, t9); \ + LD_DP2(pa3 + k, 2, t12, t13); \ + \ + tp0 += x0 * t0; \ + tp0 += x1 * t1; \ + \ + tp1 += x0 * 
t4; \ + tp1 += x1 * t5; \ + \ + tp2 += x0 * t8; \ + tp2 += x1 * t9; \ + \ + tp3 += x0 * t12; \ + tp3 += x1 * t13; \ +} + +#define DGEMV_T_4x2() \ +{ \ + t0 = LD_DP(pa0 + k); \ + t4 = LD_DP(pa1 + k); \ + t8 = LD_DP(pa2 + k); \ + t12 = LD_DP(pa3 + k); \ + \ + tp0 += x0 * t0; \ + tp1 += x0 * t4; \ + tp2 += x0 * t8; \ + tp3 += x0 * t12; \ +} +/* 2x8, 2x4, 2x2: the same three shapes restricted to pa0 and pa1. */ +#define DGEMV_T_2x8() \ +{ \ + LD_DP4(pa0 + k, 2, t0, t1, t2, t3); \ + LD_DP4(pa1 + k, 2, t4, t5, t6, t7); \ + \ + tp0 += x0 * t0; \ + tp0 += x1 * t1; \ + tp0 += x2 * t2; \ + tp0 += x3 * t3; \ + \ + tp1 += x0 * t4; \ + tp1 += x1 * t5; \ + tp1 += x2 * t6; \ + tp1 += x3 * t7; \ +} + +#define DGEMV_T_2x4() \ +{ \ + LD_DP2(pa0 + k, 2, t0, t1); \ + LD_DP2(pa1 + k, 2, t4, t5); \ + \ + tp0 += x0 * t0; \ + tp0 += x1 * t1; \ + \ + tp1 += x0 * t4; \ + tp1 += x1 * t5; \ +} + +#define DGEMV_T_2x2() \ +{ \ + t0 = LD_DP(pa0 + k); \ + t4 = LD_DP(pa1 + k); \ + \ + tp0 += x0 * t0; \ + tp1 += x0 * t4; \ +} + +#define DLOAD_X8_GP() /* strided x: assemble x0..x3 from eight elements spaced inc_x apart, moving each double as a raw 64-bit pattern via a long long cast. tp0 is only the starting operand of __msa_insert_d and is not assigned here; lanes 0 and 1 of every result are both inserted. NOTE(review): the long long casts type-pun FLOAT storage, confirm the build is safe with respect to strict aliasing. */ \ + x0 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 0 * inc_x))); \ + x0 = (v2f64) __msa_insert_d((v2i64) x0, 1, *((long long *)(x + 1 * inc_x))); \ + x1 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 2 * inc_x))); \ + x1 = (v2f64) __msa_insert_d((v2i64) x1, 1, *((long long *)(x + 3 * inc_x))); \ + x2 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 4 * inc_x))); \ + x2 = (v2f64) __msa_insert_d((v2i64) x2, 1, *((long long *)(x + 5 * inc_x))); \ + x3 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 6 * inc_x))); \ + x3 = (v2f64) __msa_insert_d((v2i64) x3, 1, *((long long *)(x + 7 * inc_x))); \ + +#define DLOAD_X4_GP() /* four-element form of DLOAD_X8_GP */ \ + x0 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 0 * inc_x))); \ + x0 = (v2f64) __msa_insert_d((v2i64) x0, 1, *((long long *)(x + 1 * inc_x))); \ + x1 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 2 * inc_x))); \ + x1 = (v2f64) __msa_insert_d((v2i64) x1, 1, *((long long *)(x + 3 * inc_x))); \ + +#define DLOAD_X2_GP() /* two-element form of DLOAD_X8_GP */ \ + 
x0 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 0 * inc_x))); \ + x0 = (v2f64) __msa_insert_d((v2i64) x0, 1, *((long long *)(x + 1 * inc_x))); \ + +#define DLOAD_X8_VECTOR() LD_DP4(x, 2, x0, x1, x2, x3); /* contiguous x: direct vector loads */ +#define DLOAD_X4_VECTOR() LD_DP2(x, 2, x0, x1); +#define DLOAD_X2_VECTOR() x0 = LD_DP(x); + +#define DGEMV_T_MSA() /* Main loop nest. For each group of 8 (n >> 3 times), then 4, 2 and 1 pa pointers according to the bits of n: clear the vector accumulators, rewind x to srcx_org, accumulate products over m elements in chunks of 8, 4 and 2, reduce each accumulator to a scalar, add the odd element when m & 1, then increment the matching y entries by alpha times the sums. Uses whichever DLOAD_X8, DLOAD_X4 and DLOAD_X2 are defined at the point of expansion. */ \ + for (j = (n >> 3); j--;) \ + { \ + tp0 = zero; \ + tp1 = zero; \ + tp2 = zero; \ + tp3 = zero; \ + tp4 = zero; \ + tp5 = zero; \ + tp6 = zero; \ + tp7 = zero; \ + \ + k = 0; \ + x = srcx_org; \ + \ + for (i = (m >> 3); i--;) \ + { \ + DLOAD_X8(); \ + DGEMV_T_8x8(); \ + \ + x += 8 * inc_x; \ + k += 8; \ + } \ + \ + if (m & 4) \ + { \ + DLOAD_X4(); \ + DGEMV_T_8x4(); \ + \ + x += 4 * inc_x; \ + k += 4; \ + } \ + \ + if (m & 2) \ + { \ + DLOAD_X2(); \ + DGEMV_T_8x2(); \ + \ + x += 2 * inc_x; \ + k += 2; \ + } \ + \ + ILVRL_D2_DP(tp1, tp0, t0, t4); /* horizontal reduction: ILVRL_D2_DP and ADD2 come from macros_msa.h and presumably regroup lanes so that each lane of t0..t3 ends up holding one accumulator total (confirm there) */ \ + ILVRL_D2_DP(tp3, tp2, t1, t5); \ + ILVRL_D2_DP(tp5, tp4, t2, t6); \ + ILVRL_D2_DP(tp7, tp6, t3, t7); \ + ADD2(t0, t4, t1, t5, t0, t1); \ + ADD2(t2, t6, t3, t7, t2, t3); \ + \ + temp0 = t0[0]; \ + temp1 = t0[1]; \ + temp2 = t1[0]; \ + temp3 = t1[1]; \ + temp4 = t2[0]; \ + temp5 = t2[1]; \ + temp6 = t3[0]; \ + temp7 = t3[1]; \ + \ + if (m & 1) \ + { \ + temp0 += pa0[k] * x[0]; \ + temp1 += pa1[k] * x[0]; \ + temp2 += pa2[k] * x[0]; \ + temp3 += pa3[k] * x[0]; \ + temp4 += pa4[k] * x[0]; \ + temp5 += pa5[k] * x[0]; \ + temp6 += pa6[k] * x[0]; \ + temp7 += pa7[k] * x[0]; \ + \ + x += inc_x; \ + k++; \ + } \ + \ + res0 = y[0 * inc_y]; \ + res1 = y[1 * inc_y]; \ + res2 = y[2 * inc_y]; \ + res3 = y[3 * inc_y]; \ + res4 = y[4 * inc_y]; \ + res5 = y[5 * inc_y]; \ + res6 = y[6 * inc_y]; \ + res7 = y[7 * inc_y]; \ + \ + res0 += alpha * temp0; \ + res1 += alpha * temp1; \ + res2 += alpha * temp2; \ + res3 += alpha * temp3; \ + res4 += alpha * temp4; \ + res5 += alpha * temp5; \ + res6 += alpha * temp6; \ + res7 += alpha * temp7; \ + \ + y[0 * inc_y] = res0; \ + y[1 * inc_y] = res1; \ + y[2 * inc_y] 
= res2; \ + y[3 * inc_y] = res3; \ + y[4 * inc_y] = res4; \ + y[5 * inc_y] = res5; \ + y[6 * inc_y] = res6; \ + y[7 * inc_y] = res7; \ + \ + y += 8 * inc_y; \ + \ + pa0 += 8 * lda; \ + pa1 += 8 * lda; \ + pa2 += 8 * lda; \ + pa3 += 8 * lda; \ + pa4 += 8 * lda; \ + pa5 += 8 * lda; \ + pa6 += 8 * lda; \ + pa7 += 8 * lda; \ + } \ + \ + if (n & 4) \ + { \ + tp0 = zero; \ + tp1 = zero; \ + tp2 = zero; \ + tp3 = zero; \ + \ + k = 0; \ + x = srcx_org; \ + \ + for (i = (m >> 3); i--;) \ + { \ + DLOAD_X8(); \ + DGEMV_T_4x8(); \ + \ + x += 8 * inc_x; \ + k += 8; \ + } \ + \ + if (m & 4) \ + { \ + DLOAD_X4(); \ + DGEMV_T_4x4(); \ + \ + x += 4 * inc_x; \ + k += 4; \ + } \ + \ + if (m & 2) \ + { \ + DLOAD_X2(); \ + DGEMV_T_4x2(); \ + \ + x += 2 * inc_x; \ + k += 2; \ + } \ + \ + ILVRL_D2_DP(tp1, tp0, t0, t4); \ + ILVRL_D2_DP(tp3, tp2, t1, t5); \ + ADD2(t0, t4, t1, t5, t0, t1); \ + \ + temp0 = t0[0]; \ + temp1 = t0[1]; \ + temp2 = t1[0]; \ + temp3 = t1[1]; \ + \ + if (m & 1) \ + { \ + temp0 += pa0[k] * x[0]; \ + temp1 += pa1[k] * x[0]; \ + temp2 += pa2[k] * x[0]; \ + temp3 += pa3[k] * x[0]; \ + \ + x += inc_x; \ + k++; \ + } \ + \ + res0 = y[0 * inc_y]; \ + res1 = y[1 * inc_y]; \ + res2 = y[2 * inc_y]; \ + res3 = y[3 * inc_y]; \ + \ + res0 += alpha * temp0; \ + res1 += alpha * temp1; \ + res2 += alpha * temp2; \ + res3 += alpha * temp3; \ + \ + y[0 * inc_y] = res0; \ + y[1 * inc_y] = res1; \ + y[2 * inc_y] = res2; \ + y[3 * inc_y] = res3; \ + \ + y += 4 * inc_y; \ + \ + pa0 += 4 * lda; \ + pa1 += 4 * lda; \ + pa2 += 4 * lda; \ + pa3 += 4 * lda; \ + } \ + \ + if (n & 2) \ + { \ + tp0 = zero; \ + tp1 = zero; \ + \ + k = 0; \ + x = srcx_org; \ + \ + for (i = (m >> 3); i--;) \ + { \ + DLOAD_X8(); \ + DGEMV_T_2x8(); \ + \ + x += 8 * inc_x; \ + k += 8; \ + } \ + \ + if (m & 4) \ + { \ + DLOAD_X4(); \ + DGEMV_T_2x4(); \ + \ + x += 4 * inc_x; \ + k += 4; \ + } \ + \ + if (m & 2) \ + { \ + DLOAD_X2(); \ + DGEMV_T_2x2(); \ + \ + x += 2 * inc_x; \ + k += 2; \ + } \ + \ + ILVRL_D2_DP(tp1, 
tp0, t0, t4); \ + \ + t0 += t4; \ + \ + temp0 = t0[0]; \ + temp1 = t0[1]; \ + \ + if (m & 1) \ + { \ + temp0 += pa0[k] * x[0]; \ + temp1 += pa1[k] * x[0]; \ + x += inc_x; \ + k++; \ + } \ + \ + res0 = y[0 * inc_y]; \ + res1 = y[1 * inc_y]; \ + \ + res0 += alpha * temp0; \ + res1 += alpha * temp1; \ + \ + y[0 * inc_y] = res0; \ + y[1 * inc_y] = res1; \ + \ + y += 2 * inc_y; \ + \ + pa0 += 2 * lda; \ + pa1 += 2 * lda; \ + } \ + \ + if (n & 1) /* last single pointer: plain scalar accumulation over all m elements */ \ + { \ + temp0 = 0.0; \ + \ + k = 0; \ + x = srcx_org; \ + \ + for (i = m; i--;) \ + { \ + temp0 += pa0[k] * x[0]; \ + x += inc_x; \ + k++; \ + } \ + \ + y[0] += alpha * temp0; \ + y += inc_y; \ + pa0 += lda; \ + } + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *A, + BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, + FLOAT *buffer) +{ /* Transposed matrix-vector update: for each j in [0, n), element j of y (spaced inc_y apart) is incremented by alpha times the sum over i in [0, m) of A[i, j] times element i of x (spaced inc_x apart), where consecutive j are lda elements apart in A. dummy1 and buffer are not used here. Always returns 0. */ + BLASLONG i, j, k; + FLOAT *srcx_org = x; + FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7; + FLOAT temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; + FLOAT res0, res1, res2, res3, res4, res5, res6, res7; + v2f64 x0, x1, x2, x3; + v2f64 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15; + v2f64 t16, t17, t18, t19, t20, t21, t22, t23, t24, t25, t26, t27, t28, t29; + v2f64 t30, t31, tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7; + v2f64 zero = {0}; + + pa0 = A + 0 * lda; + pa1 = A + 1 * lda; + pa2 = A + 2 * lda; + pa3 = A + 3 * lda; + pa4 = A + 4 * lda; + pa5 = A + 5 * lda; + pa6 = A + 6 * lda; + pa7 = A + 7 * lda; + + if (1 == inc_x) + { /* contiguous x uses vector loads; any other stride gathers x element by element */ + #define DLOAD_X8 DLOAD_X8_VECTOR + #define DLOAD_X4 DLOAD_X4_VECTOR + #define DLOAD_X2 DLOAD_X2_VECTOR + + DGEMV_T_MSA(); + + #undef DLOAD_X8 + #undef DLOAD_X4 + #undef DLOAD_X2 + } + else + { + #define DLOAD_X8 DLOAD_X8_GP + #define DLOAD_X4 DLOAD_X4_GP + #define DLOAD_X2 DLOAD_X2_GP + + DGEMV_T_MSA(); + + #undef DLOAD_X8 + #undef DLOAD_X4 + #undef DLOAD_X2 + } + + return(0); +} diff --git a/kernel/mips/sasum_msa.c b/kernel/mips/sasum_msa.c new file mode 100644 index 000000000..e968f8307 --- 
/dev/null +++ b/kernel/mips/sasum_msa.c @@ -0,0 +1,333 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#include "common.h" +#include <math.h> +#include "macros_msa.h" + /* AND_VEC_W clears the sign bit of every 32-bit lane (bitwise AND with and_vec), giving the absolute value of each float lane. It expects a v4i32 named and_vec in the caller's scope. */ +#define AND_VEC_W(in) ((v4f32) ((v4i32) in & and_vec)) + /* Returns the sum of |x[i * inc_x]| for i = 0..n-1, or 0 when n <= 0 or inc_x <= 0. Vector paths take absolute values by masking the sign bit; scalar tails use fabsf from <math.h>. */ +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i = 0; + FLOAT data0, data1, data2, sumf = 0.0; + v4f32 src0, src1, src2, src3, src4, src5, src6, src7; + v4f32 sum_abs0, sum_abs1, sum_abs2, sum_abs3; + v4f32 zero_v = {0}; + v4i32 and_vec = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF}; + + if (n <= 0 || inc_x <= 0) return (sumf); + + if (1 == inc_x) + { /* unit stride: four vector accumulators, 32 elements per step */ + if (n > 31) + { /* peel the first 32 elements so the accumulators start from data instead of zero */ + n -= 32; + + LD_SP8_INC(x, 4, src0, src1, src2, src3, src4, src5, src6, src7); + + sum_abs0 = AND_VEC_W(src0); + sum_abs1 = AND_VEC_W(src1); + sum_abs2 = AND_VEC_W(src2); + sum_abs3 = AND_VEC_W(src3); + sum_abs0 += AND_VEC_W(src4); + sum_abs1 += AND_VEC_W(src5); + sum_abs2 += AND_VEC_W(src6); + sum_abs3 += AND_VEC_W(src7); + } + else + { + sum_abs0 = zero_v; + sum_abs1 = zero_v; + sum_abs2 = zero_v; + sum_abs3 = zero_v; + } + + for (i = 0; i < (n >> 5); i++) + { + LD_SP8_INC(x, 4, src0, src1, src2, src3, src4, src5, src6, src7); + + sum_abs0 += AND_VEC_W(src0); + sum_abs1 += AND_VEC_W(src1); + sum_abs2 += AND_VEC_W(src2); + sum_abs3 += AND_VEC_W(src3); + sum_abs0 += AND_VEC_W(src4); + sum_abs1 += AND_VEC_W(src5); + sum_abs2 += AND_VEC_W(src6); + sum_abs3 += AND_VEC_W(src7); + } + + if (n & 31) + { /* remainder: bits 16/8/4 of n select how many whole vectors are left; every branch then folds the four accumulators into sumf */ + if ((n & 16) && (n & 8) && (n & 4)) + { + LD_SP7_INC(x, 4, src0, src1, src2, src3, src4, src5, src6); + + sum_abs0 += AND_VEC_W(src0); + sum_abs1 += AND_VEC_W(src1); + sum_abs2 += AND_VEC_W(src2); + sum_abs3 += AND_VEC_W(src3); + sum_abs0 += AND_VEC_W(src4); + sum_abs1 += AND_VEC_W(src5); + sum_abs2 += AND_VEC_W(src6); + + sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; + + sumf += sum_abs0[0]; + sumf += sum_abs0[1]; + sumf += sum_abs0[2]; + sumf += sum_abs0[3]; + } + else if ((n & 16) && (n & 8)) + { + LD_SP6_INC(x, 4, src0, src1, src2, src3, src4, src5); + + sum_abs0 += 
AND_VEC_W(src0); + sum_abs1 += AND_VEC_W(src1); + sum_abs2 += AND_VEC_W(src2); + sum_abs3 += AND_VEC_W(src3); + sum_abs0 += AND_VEC_W(src4); + sum_abs1 += AND_VEC_W(src5); + + sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; + + sumf += sum_abs0[0]; + sumf += sum_abs0[1]; + sumf += sum_abs0[2]; + sumf += sum_abs0[3]; + } + else if ((n & 16) && (n & 4)) + { + LD_SP5_INC(x, 4, src0, src1, src2, src3, src4); + + sum_abs0 += AND_VEC_W(src0); + sum_abs1 += AND_VEC_W(src1); + sum_abs2 += AND_VEC_W(src2); + sum_abs3 += AND_VEC_W(src3); + sum_abs0 += AND_VEC_W(src4); + + sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; + + sumf += sum_abs0[0]; + sumf += sum_abs0[1]; + sumf += sum_abs0[2]; + sumf += sum_abs0[3]; + } + else if ((n & 8) && (n & 4)) + { + LD_SP3_INC(x, 4, src0, src1, src2); + + sum_abs0 += AND_VEC_W(src0); + sum_abs1 += AND_VEC_W(src1); + sum_abs2 += AND_VEC_W(src2); + + sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; + + sumf += sum_abs0[0]; + sumf += sum_abs0[1]; + sumf += sum_abs0[2]; + sumf += sum_abs0[3]; + } + else if (n & 16) + { + LD_SP4_INC(x, 4, src0, src1, src2, src3); + + sum_abs0 += AND_VEC_W(src0); + sum_abs1 += AND_VEC_W(src1); + sum_abs2 += AND_VEC_W(src2); + sum_abs3 += AND_VEC_W(src3); + + sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; + + sumf += sum_abs0[0]; + sumf += sum_abs0[1]; + sumf += sum_abs0[2]; + sumf += sum_abs0[3]; + } + else if (n & 8) + { + LD_SP2_INC(x, 4, src0, src1); + + sum_abs0 += AND_VEC_W(src0); + sum_abs1 += AND_VEC_W(src1); + + sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; + + sumf += sum_abs0[0]; + sumf += sum_abs0[1]; + sumf += sum_abs0[2]; + sumf += sum_abs0[3]; + } + else if (n & 4) + { + src0 = LD_SP(x); x += 4; + + sum_abs0 += AND_VEC_W(src0); + + sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; + + sumf += sum_abs0[0]; + sumf += sum_abs0[1]; + sumf += sum_abs0[2]; + sumf += sum_abs0[3]; + } + else + { /* fewer than 4 elements left: only fold the accumulators */ + sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; + + sumf += 
sum_abs0[0]; + sumf += sum_abs0[1]; + sumf += sum_abs0[2]; + sumf += sum_abs0[3]; + } + + if (n & 2) + { /* scalar tail: last 1..3 elements via fabsf */ + sumf += fabsf(*(x + 0)); + sumf += fabsf(*(x + 1)); + x += 2; + } + + if (n & 1) + { + sumf += fabsf(*(x + 0)); + } + } + else + { /* n is a multiple of 32: nothing left to load, just fold the accumulators */ + sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; + + sumf += sum_abs0[0]; + sumf += sum_abs0[1]; + sumf += sum_abs0[2]; + sumf += sum_abs0[3]; + } + } + else + { /* strided x: gather 4 elements per vector with __msa_insert_w; only sum_abs0 and sum_abs1 are used on this path */ + if (n > 8) + { /* peel the first 8 elements to seed the two accumulators */ + n -= 8; + + src0 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x)); + x += inc_x; + src0 = (v4f32) __msa_insert_w((v4i32) src0, 1, *((int *) x)); + x += inc_x; + src0 = (v4f32) __msa_insert_w((v4i32) src0, 2, *((int *) x)); + x += inc_x; + src0 = (v4f32) __msa_insert_w((v4i32) src0, 3, *((int *) x)); + x += inc_x; + src4 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x)); + x += inc_x; + src4 = (v4f32) __msa_insert_w((v4i32) src4, 1, *((int *) x)); + x += inc_x; + src4 = (v4f32) __msa_insert_w((v4i32) src4, 2, *((int *) x)); + x += inc_x; + src4 = (v4f32) __msa_insert_w((v4i32) src4, 3, *((int *) x)); + x += inc_x; + + sum_abs0 = AND_VEC_W(src0); + sum_abs1 = AND_VEC_W(src4); + } + else + { + sum_abs0 = zero_v; + sum_abs1 = zero_v; + } + + for (i = (n >> 3); i--;) + { + src0 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x)); + x += inc_x; + src0 = (v4f32) __msa_insert_w((v4i32) src0, 1, *((int *) x)); + x += inc_x; + src0 = (v4f32) __msa_insert_w((v4i32) src0, 2, *((int *) x)); + x += inc_x; + src0 = (v4f32) __msa_insert_w((v4i32) src0, 3, *((int *) x)); + x += inc_x; + src4 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x)); + x += inc_x; + src4 = (v4f32) __msa_insert_w((v4i32) src4, 1, *((int *) x)); + x += inc_x; + src4 = (v4f32) __msa_insert_w((v4i32) src4, 2, *((int *) x)); + x += inc_x; + src4 = (v4f32) __msa_insert_w((v4i32) src4, 3, *((int *) x)); + x += inc_x; + + sum_abs0 += AND_VEC_W(src0); + sum_abs1 += AND_VEC_W(src4); + } + + if (n & 4) + { + src0 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) 
x)); + x += inc_x; + src0 = (v4f32) __msa_insert_w((v4i32) src0, 1, *((int *) x)); + x += inc_x; + src0 = (v4f32) __msa_insert_w((v4i32) src0, 2, *((int *) x)); + x += inc_x; + src0 = (v4f32) __msa_insert_w((v4i32) src0, 3, *((int *) x)); + x += inc_x; + + sum_abs0 += AND_VEC_W(src0); + } + + sum_abs0 += sum_abs1; + + sumf += sum_abs0[0]; + sumf += sum_abs0[1]; + sumf += sum_abs0[2]; + sumf += sum_abs0[3]; + + if ((n & 2) && (n & 1)) + { /* scalar tail: 3, 2 or 1 remaining strided elements */ + data0 = fabsf(*x); x += inc_x; + data1 = fabsf(*x); x += inc_x; + data2 = fabsf(*x); + + sumf += data0; + sumf += data1; + sumf += data2; + } + else if (n & 2) + { + data0 = fabsf(*x); x += inc_x; + data1 = fabsf(*x); + + sumf += data0; + sumf += data1; + } + else if (n & 1) + { + data0 = fabsf(*x); + + sumf += data0; + } + } + + return (sumf); +} diff --git a/kernel/mips/sdot_msa.c b/kernel/mips/sdot_msa.c new file mode 100644 index 000000000..1997ec5a0 --- /dev/null +++ b/kernel/mips/sdot_msa.c @@ -0,0 +1,208 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +/* Dot product: returns the sum of x[i * inc_x] * y[i * inc_y] for i = 0..n-1. The return type is double when DSDOT is defined and FLOAT otherwise; x and y are FLOAT in both cases. */ +#if defined(DSDOT) +double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +#else +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +#endif +{ + BLASLONG i = 0; + double dot = 0.0; /* scalar accumulator and final result */ + float x0, x1, x2, x3, y0, y1, y2, y3; + v4f32 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7; + v4f32 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7; + v4f32 dot0 = {0, 0, 0, 0}; /* vector accumulator, folded into dot at the end of the unit-stride path */ + + if (n < 0) return (dot); /* n == 0 is not special-cased: no loop or tail branch runs and 0.0 is returned */ + + if ((1 == inc_x) && (1 == inc_y)) + { /* both strides 1: 32 elements (8 vectors of 4) per iteration */ + for (i = (n >> 5); i--;) + { + LD_SP8_INC(x, 4, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7); + LD_SP8_INC(y, 4, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7); + + dot0 += (vy0 * vx0); + dot0 += (vy1 * vx1); + dot0 += (vy2 * vx2); + dot0 += (vy3 * vx3); + dot0 += (vy4 * vx4); + dot0 += (vy5 * vx5); + dot0 += (vy6 * vx6); + dot0 += (vy7 * vx7); + } + + if (n & 31) + { /* remainder: bits 16/8/4 of n select 7..1 whole vectors, bits 2/1 select the scalar tail */ + if ((n & 16) && (n & 8) && (n & 4)) + { + LD_SP7_INC(x, 4, vx0, vx1, vx2, vx3, vx4, vx5, vx6); + LD_SP7_INC(y, 4, vy0, vy1, vy2, vy3, vy4, vy5, vy6); + + dot0 += (vy0 * vx0); + dot0 += (vy1 * vx1); + dot0 += (vy2 * vx2); + dot0 += (vy3 * 
vx3); + dot0 += (vy4 * vx4); + dot0 += (vy5 * vx5); + dot0 += (vy6 * vx6); + } + else if ((n & 16) && (n & 8)) + { + LD_SP6_INC(x, 4, vx0, vx1, vx2, vx3, vx4, vx5); + LD_SP6_INC(y, 4, vy0, vy1, vy2, vy3, vy4, vy5); + + dot0 += (vy0 * vx0); + dot0 += (vy1 * vx1); + dot0 += (vy2 * vx2); + dot0 += (vy3 * vx3); + dot0 += (vy4 * vx4); + dot0 += (vy5 * vx5); + } + else if ((n & 16) && (n & 4)) + { + LD_SP5_INC(x, 4, vx0, vx1, vx2, vx3, vx4); + LD_SP5_INC(y, 4, vy0, vy1, vy2, vy3, vy4); + + dot0 += (vy0 * vx0); + dot0 += (vy1 * vx1); + dot0 += (vy2 * vx2); + dot0 += (vy3 * vx3); + dot0 += (vy4 * vx4); + } + else if ((n & 8) && (n & 4)) + { + LD_SP3_INC(x, 4, vx0, vx1, vx2); + LD_SP3_INC(y, 4, vy0, vy1, vy2); + + dot0 += (vy0 * vx0); + dot0 += (vy1 * vx1); + dot0 += (vy2 * vx2); + } + else if (n & 16) + { + LD_SP4_INC(x, 4, vx0, vx1, vx2, vx3); + LD_SP4_INC(y, 4, vy0, vy1, vy2, vy3); + + dot0 += (vy0 * vx0); + dot0 += (vy1 * vx1); + dot0 += (vy2 * vx2); + dot0 += (vy3 * vx3); + } + else if (n & 8) + { + LD_SP2_INC(x, 4, vx0, vx1); + LD_SP2_INC(y, 4, vy0, vy1); + + dot0 += (vy0 * vx0); + dot0 += (vy1 * vx1); + } + else if (n & 4) + { + vx0 = LD_SP(x); x += 4; + vy0 = LD_SP(y); y += 4; + + dot0 += (vy0 * vx0); + } + + if ((n & 2) && (n & 1)) + { /* scalar tail: operands are float, so each product is formed in float before being added to the double accumulator */ + LD_GP3_INC(x, 1, x0, x1, x2); + LD_GP3_INC(y, 1, y0, y1, y2); + + dot += (y0 * x0); + dot += (y1 * x1); + dot += (y2 * x2); + } + else if (n & 2) + { + LD_GP2_INC(x, 1, x0, x1); + LD_GP2_INC(y, 1, y0, y1); + + dot += (y0 * x0); + dot += (y1 * x1); + } + else if (n & 1) + { + x0 = *x; + y0 = *y; + + dot += (y0 * x0); + } + } + + dot += dot0[0]; + dot += dot0[1]; + dot += dot0[2]; + dot += dot0[3]; + } + else + { /* at least one stride is not 1: scalar path, 4 elements per iteration */ + for (i = (n >> 2); i--;) + { + LD_GP4_INC(x, inc_x, x0, x1, x2, x3); + LD_GP4_INC(y, inc_y, y0, y1, y2, y3); + + dot += (y0 * x0); + dot += (y1 * x1); + dot += (y2 * x2); + dot += (y3 * x3); + } + + if ((n & 2) && (n & 1)) + { + LD_GP3_INC(x, inc_x, x0, x1, x2); + LD_GP3_INC(y, inc_y, y0, y1, y2); + + dot += (y0 * x0); + 
dot += (y1 * x1); + dot += (y2 * x2); + } + else if (n & 2) + { + LD_GP2_INC(x, inc_x, x0, x1); + LD_GP2_INC(y, inc_y, y0, y1); + + dot += (y0 * x0); + dot += (y1 * x1); + } + else if (n & 1) + { + x0 = *x; + y0 = *y; + + dot += (y0 * x0); + } + } + + return (dot); +} diff --git a/kernel/mips/sgemv_n_msa.c b/kernel/mips/sgemv_n_msa.c new file mode 100644 index 000000000..ae6e6558f --- /dev/null +++ b/kernel/mips/sgemv_n_msa.c @@ -0,0 +1,515 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED.
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + /* SGEMV_N_RxC(): accumulate into R elements of y (y0, or y0 and y1) the contribution of C slices pa0..pa(C-1) at offset k, each slice scaled by its splatted alpha * x value tp0..tp(C-1). */ +#define SGEMV_N_8x8() \ +{ \ + LD_SP2(pa0 + k, 4, t0, t1); \ + LD_SP2(pa1 + k, 4, t2, t3); \ + LD_SP2(pa2 + k, 4, t4, t5); \ + LD_SP2(pa3 + k, 4, t6, t7); \ + LD_SP2(pa4 + k, 4, t8, t9); \ + LD_SP2(pa5 + k, 4, t10, t11); \ + LD_SP2(pa6 + k, 4, t12, t13); \ + LD_SP2(pa7 + k, 4, t14, t15); \ + \ + y0 += tp0 * t0; \ + y1 += tp0 * t1; \ + \ + y0 += tp1 * t2; \ + y1 += tp1 * t3; \ + \ + y0 += tp2 * t4; \ + y1 += tp2 * t5; \ + \ + y0 += tp3 * t6; \ + y1 += tp3 * t7; \ + \ + y0 += tp4 * t8; \ + y1 += tp4 * t9; \ + \ + y0 += tp5 * t10; \ + y1 += tp5 * t11; \ + \ + y0 += tp6 * t12; \ + y1 += tp6 * t13; \ + \ + y0 += tp7 * t14; \ + y1 += tp7 * t15; \ +} + +#define SGEMV_N_4x8() \ +{ \ + t0 = LD_SP(pa0 + k); \ + t2 = LD_SP(pa1 + k); \ + t4 = LD_SP(pa2 + k); \ + t6 = LD_SP(pa3 + k); \ + t8 = LD_SP(pa4 + k); \ + t10 = LD_SP(pa5 + k); \ + t12 = LD_SP(pa6 + k); \ + t14 = LD_SP(pa7 + k); \ + \ + y0 += tp0 * t0; \ + y0 += tp1 * t2; \ + y0 += tp2 * t4; \ + y0 += tp3 * t6; \ + y0 += tp4 * t8; \ + y0 += tp5 * t10; \ + y0 += tp6 * t12; \ + y0 += tp7 * t14; \ +} + +#define SGEMV_N_8x4() \ +{ \ + LD_SP2(pa0 + k, 4, t0, t1); \ + LD_SP2(pa1 + k, 4, t2, t3); \ + LD_SP2(pa2 + k, 4, t4, t5); \ + LD_SP2(pa3 + k, 4, t6, t7); \ + \ + y0 += tp0 * t0; \ + y1 += tp0 * t1; \ + \ + y0 += tp1 * t2; \ + y1 += tp1 * t3; \ + \ + y0 += tp2 * t4; \ + y1 += tp2 * 
t5; \ + \ + y0 += tp3 * t6; \ + y1 += tp3 * t7; \ +} + +#define SGEMV_N_4x4() \ +{ \ + t0 = LD_SP(pa0 + k); \ + t2 = LD_SP(pa1 + k); \ + t4 = LD_SP(pa2 + k); \ + t6 = LD_SP(pa3 + k); \ + \ + y0 += tp0 * t0; \ + y0 += tp1 * t2; \ + y0 += tp2 * t4; \ + y0 += tp3 * t6; \ +} + +#define SGEMV_N_8x2() \ +{ \ + LD_SP2(pa0 + k, 4, t0, t1); \ + LD_SP2(pa1 + k, 4, t2, t3); \ + \ + y0 += tp0 * t0; \ + y1 += tp0 * t1; \ + \ + y0 += tp1 * t2; \ + y1 += tp1 * t3; \ +} + +#define SGEMV_N_4x2() \ +{ \ + t0 = LD_SP(pa0 + k); \ + t2 = LD_SP(pa1 + k); \ + \ + y0 += tp0 * t0; \ + y0 += tp1 * t2; \ +} + +#define SLOAD_X8_SCALE_GP() /* strided x: scale 8 scalars by alpha, then splat each into tp0..tp7 */ \ + temp0 = alpha * x[0 * inc_x]; \ + temp1 = alpha * x[1 * inc_x]; \ + temp2 = alpha * x[2 * inc_x]; \ + temp3 = alpha * x[3 * inc_x]; \ + temp4 = alpha * x[4 * inc_x]; \ + temp5 = alpha * x[5 * inc_x]; \ + temp6 = alpha * x[6 * inc_x]; \ + temp7 = alpha * x[7 * inc_x]; \ + \ + tp0 = COPY_FLOAT_TO_VECTOR(temp0); \ + tp1 = COPY_FLOAT_TO_VECTOR(temp1); \ + tp2 = COPY_FLOAT_TO_VECTOR(temp2); \ + tp3 = COPY_FLOAT_TO_VECTOR(temp3); \ + tp4 = COPY_FLOAT_TO_VECTOR(temp4); \ + tp5 = COPY_FLOAT_TO_VECTOR(temp5); \ + tp6 = COPY_FLOAT_TO_VECTOR(temp6); \ + tp7 = COPY_FLOAT_TO_VECTOR(temp7); \ + +#define SLOAD_X4_SCALE_GP() \ + temp0 = alpha * x[0 * inc_x]; \ + temp1 = alpha * x[1 * inc_x]; \ + temp2 = alpha * x[2 * inc_x]; \ + temp3 = alpha * x[3 * inc_x]; \ + \ + tp0 = COPY_FLOAT_TO_VECTOR(temp0); \ + tp1 = COPY_FLOAT_TO_VECTOR(temp1); \ + tp2 = COPY_FLOAT_TO_VECTOR(temp2); \ + tp3 = COPY_FLOAT_TO_VECTOR(temp3); \ + +#define SLOAD_X8_SCALE_VECTOR() /* unit-stride x: load 8 values, multiply by v_alpha, splat each lane into tp0..tp7 */ \ + LD_SP2(x, 4, x0, x1); \ + \ + x0 = x0 * v_alpha; \ + x1 = x1 * v_alpha; \ + \ + SPLATI_W4_SP(x0, tp0, tp1, tp2, tp3); \ + SPLATI_W4_SP(x1, tp4, tp5, tp6, tp7); \ + +#define SLOAD_X4_SCALE_VECTOR() \ + x0 = LD_SP(x); \ + x0 = x0 * v_alpha; \ + SPLATI_W4_SP(x0, tp0, tp1, tp2, tp3); \ + +#define SLOAD_Y8_GP() /* strided y gather; tp0 is only the initial source vector, all four lanes of y0 and y1 are overwritten */ \ + y0 = (v4f32) __msa_insert_w((v4i32) tp0, 0, *((int *)(y + 0 * inc_y))); \ + y0 = (v4f32) 
__msa_insert_w((v4i32) y0, 1, *((int *)(y + 1 * inc_y))); \ + y0 = (v4f32) __msa_insert_w((v4i32) y0, 2, *((int *)(y + 2 * inc_y))); \ + y0 = (v4f32) __msa_insert_w((v4i32) y0, 3, *((int *)(y + 3 * inc_y))); \ + y1 = (v4f32) __msa_insert_w((v4i32) tp0, 0, *((int *)(y + 4 * inc_y))); \ + y1 = (v4f32) __msa_insert_w((v4i32) y1, 1, *((int *)(y + 5 * inc_y))); \ + y1 = (v4f32) __msa_insert_w((v4i32) y1, 2, *((int *)(y + 6 * inc_y))); \ + y1 = (v4f32) __msa_insert_w((v4i32) y1, 3, *((int *)(y + 7 * inc_y))); \ + +#define SLOAD_Y4_GP() \ + y0 = (v4f32) __msa_insert_w((v4i32) tp0, 0, *((int *)(y + 0 * inc_y))); \ + y0 = (v4f32) __msa_insert_w((v4i32) y0, 1, *((int *)(y + 1 * inc_y))); \ + y0 = (v4f32) __msa_insert_w((v4i32) y0, 2, *((int *)(y + 2 * inc_y))); \ + y0 = (v4f32) __msa_insert_w((v4i32) y0, 3, *((int *)(y + 3 * inc_y))); \ + +#define SLOAD_Y8_VECTOR() LD_SP2(y, 4, y0, y1); +#define SLOAD_Y4_VECTOR() y0 = LD_SP(y); + +#define SSTORE_Y8_GP() /* strided y scatter: write each lane of y0 and y1 back as a 32-bit word */ \ + *((int *)(y + 0 * inc_y)) = __msa_copy_s_w((v4i32) y0, 0); \ + *((int *)(y + 1 * inc_y)) = __msa_copy_s_w((v4i32) y0, 1); \ + *((int *)(y + 2 * inc_y)) = __msa_copy_s_w((v4i32) y0, 2); \ + *((int *)(y + 3 * inc_y)) = __msa_copy_s_w((v4i32) y0, 3); \ + *((int *)(y + 4 * inc_y)) = __msa_copy_s_w((v4i32) y1, 0); \ + *((int *)(y + 5 * inc_y)) = __msa_copy_s_w((v4i32) y1, 1); \ + *((int *)(y + 6 * inc_y)) = __msa_copy_s_w((v4i32) y1, 2); \ + *((int *)(y + 7 * inc_y)) = __msa_copy_s_w((v4i32) y1, 3); \ + +#define SSTORE_Y4_GP() \ + *((int *)(y + 0 * inc_y)) = __msa_copy_s_w((v4i32) y0, 0); \ + *((int *)(y + 1 * inc_y)) = __msa_copy_s_w((v4i32) y0, 1); \ + *((int *)(y + 2 * inc_y)) = __msa_copy_s_w((v4i32) y0, 2); \ + *((int *)(y + 3 * inc_y)) = __msa_copy_s_w((v4i32) y0, 3); \ + +#define SSTORE_Y8_VECTOR() ST_SP2(y0, y1, y, 4); +#define SSTORE_Y4_VECTOR() ST_SP(y0, y); + /* SGEMV_N_MSA(): walks the n slices of A in groups of 8, then 4, 2 and 1. For each group it rewinds y to y_org and sweeps the m entries in steps of 8, then 4, then a scalar tail of m & 3. The SLOAD_X*_SCALE, SLOAD_Y* and SSTORE_Y* hooks must be defined by the caller before expansion. */ +#define SGEMV_N_MSA() \ + for (j = (n >> 3); j--;) \ + { \ + SLOAD_X8_SCALE(); \ + \ + k = 0; \ + y = y_org; \ + \ + for (i = (m >> 3); i--;) \ + { \ + 
SLOAD_Y8(); \ + SGEMV_N_8x8(); \ + SSTORE_Y8(); \ + \ + y += 8 * inc_y; \ + k += 8; \ + } \ + \ + if (m & 4) \ + { \ + SLOAD_Y4(); \ + SGEMV_N_4x8(); \ + SSTORE_Y4(); \ + \ + y += 4 * inc_y; \ + k += 4; \ + } \ + \ + if (m & 3) /* scalar tail over the last m & 3 entries */ \ + { \ + temp0 = alpha * x[0 * inc_x]; \ + temp1 = alpha * x[1 * inc_x]; \ + temp2 = alpha * x[2 * inc_x]; \ + temp3 = alpha * x[3 * inc_x]; \ + temp4 = alpha * x[4 * inc_x]; \ + temp5 = alpha * x[5 * inc_x]; \ + temp6 = alpha * x[6 * inc_x]; \ + temp7 = alpha * x[7 * inc_x]; \ + \ + for (i = (m & 3); i--;) \ + { \ + temp = y[0]; \ + temp += temp0 * pa0[k]; \ + temp += temp1 * pa1[k]; \ + temp += temp2 * pa2[k]; \ + temp += temp3 * pa3[k]; \ + temp += temp4 * pa4[k]; \ + temp += temp5 * pa5[k]; \ + temp += temp6 * pa6[k]; \ + temp += temp7 * pa7[k]; \ + y[0] = temp; \ + \ + y += inc_y; \ + k++; \ + } \ + } \ + pa0 += 8 * lda; \ + pa1 += 8 * lda; \ + pa2 += 8 * lda; \ + pa3 += 8 * lda; \ + pa4 += 8 * lda; \ + pa5 += 8 * lda; \ + pa6 += 8 * lda; \ + pa7 += 8 * lda; \ + \ + x += 8 * inc_x; \ + } \ + \ + if (n & 4) \ + { \ + SLOAD_X4_SCALE(); \ + \ + k = 0; \ + y = y_org; \ + \ + for (i = (m >> 3); i--;) \ + { \ + SLOAD_Y8(); \ + SGEMV_N_8x4(); \ + SSTORE_Y8(); \ + \ + y += 8 * inc_y; \ + k += 8; \ + } \ + \ + if (m & 4) \ + { \ + SLOAD_Y4(); \ + SGEMV_N_4x4(); \ + SSTORE_Y4(); \ + \ + y += 4 * inc_y; \ + k += 4; \ + } \ + \ + if (m & 3) \ + { \ + temp0 = alpha * x[0 * inc_x]; \ + temp1 = alpha * x[1 * inc_x]; \ + temp2 = alpha * x[2 * inc_x]; \ + temp3 = alpha * x[3 * inc_x]; \ + \ + for (i = (m & 3); i--;) \ + { \ + temp = y[0]; \ + temp += temp0 * pa0[k]; \ + temp += temp1 * pa1[k]; \ + temp += temp2 * pa2[k]; \ + temp += temp3 * pa3[k]; \ + y[0] = temp; \ + \ + y += inc_y; \ + k++; \ + } \ + } \ + \ + pa0 += 4 * lda; \ + pa1 += 4 * lda; \ + pa2 += 4 * lda; \ + pa3 += 4 * lda; \ + \ + x += 4 * inc_x; \ + } \ + \ + if (n & 2) \ + { \ + temp0 = alpha * x[0 * inc_x]; \ + temp1 = alpha * x[1 * inc_x]; \ + \ + tp0 = COPY_FLOAT_TO_VECTOR(temp0); \ 
+ tp1 = COPY_FLOAT_TO_VECTOR(temp1); \ + \ + k = 0; \ + y = y_org; \ + \ + for (i = (m >> 3); i--;) \ + { \ + SLOAD_Y8(); \ + SGEMV_N_8x2(); \ + SSTORE_Y8(); \ + \ + y += 8 * inc_y; \ + k += 8; \ + } \ + \ + if (m & 4) \ + { \ + SLOAD_Y4(); \ + SGEMV_N_4x2(); \ + SSTORE_Y4(); \ + \ + y += 4 * inc_y; \ + k += 4; \ + } \ + \ + if (m & 3) \ + { \ + temp0 = alpha * x[0 * inc_x]; \ + temp1 = alpha * x[1 * inc_x]; \ + \ + for (i = (m & 3); i--;) \ + { \ + temp = y[0]; \ + temp += temp0 * pa0[k]; \ + temp += temp1 * pa1[k]; \ + y[0] = temp; \ + \ + y += inc_y; \ + k++; \ + } \ + } \ + \ + pa0 += 2 * lda; \ + pa1 += 2 * lda; \ + \ + x += 2 * inc_x; \ + } \ + \ + if (n & 1) /* last single slice: plain scalar loop over all m entries */ \ + { \ + temp = alpha * x[0]; \ + \ + k = 0; \ + y = y_org; \ + \ + for (i = m; i--;) \ + { \ + y[0] += temp * pa0[k]; \ + \ + y += inc_y; \ + k++; \ + } \ + } \ + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *A, + BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, + FLOAT *buffer) +{ /* Updates y in place: y[i * inc_y] += alpha * sum over j in [0, n) of A[j * lda + i] * x[j * inc_x], for i in [0, m). Picks one of four SGEMV_N_MSA expansions from the (inc_x, inc_y) unit-stride combination. Always returns 0; dummy1 and buffer are not referenced. */ + BLASLONG i, j, k; + FLOAT *y_org = y; /* start of y; the macro rewinds to it for every slice group */ + FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7; + FLOAT temp, temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; + v4f32 v_alpha, x0, x1, y0, y1; + v4f32 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15; + v4f32 tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7; + + v_alpha = COPY_FLOAT_TO_VECTOR(alpha); + + pa0 = A; + pa1 = A + lda; + pa2 = A + 2 * lda; + pa3 = A + 3 * lda; + pa4 = A + 4 * lda; + pa5 = A + 5 * lda; + pa6 = A + 6 * lda; + pa7 = A + 7 * lda; + + if ((1 == inc_x) && (1 == inc_y)) + { /* x and y both unit stride: vector x loads, vector y loads/stores */ + #define SLOAD_X8_SCALE SLOAD_X8_SCALE_VECTOR + #define SLOAD_X4_SCALE SLOAD_X4_SCALE_VECTOR + #define SLOAD_Y8 SLOAD_Y8_VECTOR + #define SLOAD_Y4 SLOAD_Y4_VECTOR + #define SSTORE_Y8 SSTORE_Y8_VECTOR + #define SSTORE_Y4 SSTORE_Y4_VECTOR + + SGEMV_N_MSA(); + + #undef SLOAD_X8_SCALE + #undef SLOAD_X4_SCALE + #undef SLOAD_Y8 + #undef SLOAD_Y4 + #undef SSTORE_Y8 + #undef SSTORE_Y4 + } + else if (1 == inc_y) + { /* only y unit stride: scalar x loads, vector y loads/stores */ + #define 
SLOAD_X8_SCALE SLOAD_X8_SCALE_GP + #define SLOAD_X4_SCALE SLOAD_X4_SCALE_GP + #define SLOAD_Y8 SLOAD_Y8_VECTOR + #define SLOAD_Y4 SLOAD_Y4_VECTOR + #define SSTORE_Y8 SSTORE_Y8_VECTOR + #define SSTORE_Y4 SSTORE_Y4_VECTOR + + SGEMV_N_MSA(); + + #undef SLOAD_X8_SCALE + #undef SLOAD_X4_SCALE + #undef SLOAD_Y8 + #undef SLOAD_Y4 + #undef SSTORE_Y8 + #undef SSTORE_Y4 + } + else if (1 == inc_x) + { /* only x unit stride: vector x loads, gathered/scattered y */ + #define SLOAD_X8_SCALE SLOAD_X8_SCALE_VECTOR + #define SLOAD_X4_SCALE SLOAD_X4_SCALE_VECTOR + #define SLOAD_Y8 SLOAD_Y8_GP + #define SLOAD_Y4 SLOAD_Y4_GP + #define SSTORE_Y8 SSTORE_Y8_GP + #define SSTORE_Y4 SSTORE_Y4_GP + + SGEMV_N_MSA(); + + #undef SLOAD_X8_SCALE + #undef SLOAD_X4_SCALE + #undef SLOAD_Y8 + #undef SLOAD_Y4 + #undef SSTORE_Y8 + #undef SSTORE_Y4 + } + else + { /* neither unit stride: scalar x loads, gathered/scattered y */ + #define SLOAD_X8_SCALE SLOAD_X8_SCALE_GP + #define SLOAD_X4_SCALE SLOAD_X4_SCALE_GP + #define SLOAD_Y8 SLOAD_Y8_GP + #define SLOAD_Y4 SLOAD_Y4_GP + #define SSTORE_Y8 SSTORE_Y8_GP + #define SSTORE_Y4 SSTORE_Y4_GP + + SGEMV_N_MSA(); + + #undef SLOAD_X8_SCALE + #undef SLOAD_X4_SCALE + #undef SLOAD_Y8 + #undef SLOAD_Y4 + #undef SSTORE_Y8 + #undef SSTORE_Y4 + } + + return(0); +} diff --git a/kernel/mips/sgemv_t_msa.c b/kernel/mips/sgemv_t_msa.c new file mode 100644 index 000000000..1c7f2998f --- /dev/null +++ b/kernel/mips/sgemv_t_msa.c @@ -0,0 +1,463 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3.
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +#define SGEMV_T_8x8() \ +{ \ + LD_SP2(pa0 + k, 4, t0, t1); \ + LD_SP2(pa1 + k, 4, t2, t3); \ + LD_SP2(pa2 + k, 4, t4, t5); \ + LD_SP2(pa3 + k, 4, t6, t7); \ + LD_SP2(pa4 + k, 4, t8, t9); \ + LD_SP2(pa5 + k, 4, t10, t11); \ + LD_SP2(pa6 + k, 4, t12, t13); \ + LD_SP2(pa7 + k, 4, t14, t15); \ + \ + tp0 += x0 * t0; \ + tp0 += x1 * t1; \ + \ + tp1 += x0 * t2; \ + tp1 += x1 * t3; \ + \ + tp2 += x0 * t4; \ + tp2 += x1 * t5; \ + \ + tp3 += x0 * t6; \ + tp3 += x1 * t7; \ + \ + tp4 += x0 * t8; \ + tp4 += x1 * t9; \ + \ + tp5 += x0 * t10; \ + tp5 += x1 * t11; \ + \ + tp6 += x0 * t12; \ + tp6 += x1 * t13; \ + \ + tp7 += x0 * t14; \ + tp7 += x1 * t15; \ +} + +#define SGEMV_T_8x4() \ +{ \ + t0 = LD_SP(pa0 + k); \ + t2 = LD_SP(pa1 + k); \ + t4 = LD_SP(pa2 + k); \ + t6 = LD_SP(pa3 + k); \ + t8 = LD_SP(pa4 + k); \ + t10 = LD_SP(pa5 + k); \ + t12 = LD_SP(pa6 + k); \ + t14 = LD_SP(pa7 + k); \ + \ + tp0 += x0 * t0; \ 
+ tp1 += x0 * t2; \ + tp2 += x0 * t4; \ + tp3 += x0 * t6; \ + tp4 += x0 * t8; \ + tp5 += x0 * t10; \ + tp6 += x0 * t12; \ + tp7 += x0 * t14; \ +} + +#define SGEMV_T_4x8() \ +{ \ + LD_SP2(pa0 + k, 4, t0, t1); \ + LD_SP2(pa1 + k, 4, t2, t3); \ + LD_SP2(pa2 + k, 4, t4, t5); \ + LD_SP2(pa3 + k, 4, t6, t7); \ + \ + tp0 += x0 * t0; \ + tp0 += x1 * t1; \ + \ + tp1 += x0 * t2; \ + tp1 += x1 * t3; \ + \ + tp2 += x0 * t4; \ + tp2 += x1 * t5; \ + \ + tp3 += x0 * t6; \ + tp3 += x1 * t7; \ +} + +#define SGEMV_T_4x4() \ +{ \ + t0 = LD_SP(pa0 + k); \ + t2 = LD_SP(pa1 + k); \ + t4 = LD_SP(pa2 + k); \ + t6 = LD_SP(pa3 + k); \ + \ + tp0 += x0 * t0; \ + tp1 += x0 * t2; \ + tp2 += x0 * t4; \ + tp3 += x0 * t6; \ +} + +#define SGEMV_T_2x8() \ +{ \ + LD_SP2(pa0 + k, 4, t0, t1); \ + LD_SP2(pa1 + k, 4, t2, t3); \ + \ + tp0 += x0 * t0; \ + tp0 += x1 * t1; \ + \ + tp1 += x0 * t2; \ + tp1 += x1 * t3; \ +} + +#define SGEMV_T_2x4() \ +{ \ + t0 = LD_SP(pa0 + k); \ + t2 = LD_SP(pa1 + k); \ + \ + tp0 += x0 * t0; \ + tp1 += x0 * t2; \ +} + +#define SLOAD_X8_GP() \ + x0 = (v4f32) __msa_insert_w((v4i32) tp0, 0, *((int *)(x + 0 * inc_x))); \ + x0 = (v4f32) __msa_insert_w((v4i32) x0, 1, *((int *)(x + 1 * inc_x))); \ + x0 = (v4f32) __msa_insert_w((v4i32) x0, 2, *((int *)(x + 2 * inc_x))); \ + x0 = (v4f32) __msa_insert_w((v4i32) x0, 3, *((int *)(x + 3 * inc_x))); \ + x1 = (v4f32) __msa_insert_w((v4i32) tp0, 0, *((int *)(x + 4 * inc_x))); \ + x1 = (v4f32) __msa_insert_w((v4i32) x1, 1, *((int *)(x + 5 * inc_x))); \ + x1 = (v4f32) __msa_insert_w((v4i32) x1, 2, *((int *)(x + 6 * inc_x))); \ + x1 = (v4f32) __msa_insert_w((v4i32) x1, 3, *((int *)(x + 7 * inc_x))); \ + +#define SLOAD_X4_GP() \ + x0 = (v4f32) __msa_insert_w((v4i32) tp0, 0, *((int *)(x + 0 * inc_x))); \ + x0 = (v4f32) __msa_insert_w((v4i32) x0, 1, *((int *)(x + 1 * inc_x))); \ + x0 = (v4f32) __msa_insert_w((v4i32) x0, 2, *((int *)(x + 2 * inc_x))); \ + x0 = (v4f32) __msa_insert_w((v4i32) x0, 3, *((int *)(x + 3 * inc_x))); \ + +#define 
SLOAD_X8_VECTOR() LD_SP2(x, 4, x0, x1); +#define SLOAD_X4_VECTOR() x0 = LD_SP(x); + +#define SGEMV_T_MSA() \ + for (j = (n >> 3); j--;) \ + { \ + tp0 = zero; \ + tp1 = zero; \ + tp2 = zero; \ + tp3 = zero; \ + tp4 = zero; \ + tp5 = zero; \ + tp6 = zero; \ + tp7 = zero; \ + \ + k = 0; \ + x = srcx_org; \ + \ + for (i = (m >> 3); i--;) \ + { \ + SLOAD_X8(); \ + SGEMV_T_8x8(); \ + \ + x += 8 * inc_x; \ + k += 8; \ + } \ + \ + if (m & 4) \ + { \ + SLOAD_X4(); \ + SGEMV_T_8x4(); \ + \ + x += 4 * inc_x; \ + k += 4; \ + } \ + \ + TRANSPOSE4x4_SP_SP(tp0, tp1, tp2, tp3, \ + tp0, tp1, tp2, tp3); \ + TRANSPOSE4x4_SP_SP(tp4, tp5, tp6, tp7, \ + tp4, tp5, tp6, tp7); \ + tp0 += tp1; \ + tp0 += tp2; \ + tp0 += tp3; \ + tp4 += tp5; \ + tp4 += tp6; \ + tp4 += tp7; \ + \ + temp0 = tp0[0]; \ + temp1 = tp0[1]; \ + temp2 = tp0[2]; \ + temp3 = tp0[3]; \ + temp4 = tp4[0]; \ + temp5 = tp4[1]; \ + temp6 = tp4[2]; \ + temp7 = tp4[3]; \ + \ + for (i = (m & 3); i--;) \ + { \ + temp0 += pa0[k] * x[0]; \ + temp1 += pa1[k] * x[0]; \ + temp2 += pa2[k] * x[0]; \ + temp3 += pa3[k] * x[0]; \ + temp4 += pa4[k] * x[0]; \ + temp5 += pa5[k] * x[0]; \ + temp6 += pa6[k] * x[0]; \ + temp7 += pa7[k] * x[0]; \ + \ + x += inc_x; \ + k++; \ + } \ + \ + res0 = y[0 * inc_y]; \ + res1 = y[1 * inc_y]; \ + res2 = y[2 * inc_y]; \ + res3 = y[3 * inc_y]; \ + res4 = y[4 * inc_y]; \ + res5 = y[5 * inc_y]; \ + res6 = y[6 * inc_y]; \ + res7 = y[7 * inc_y]; \ + \ + res0 += alpha * temp0; \ + res1 += alpha * temp1; \ + res2 += alpha * temp2; \ + res3 += alpha * temp3; \ + res4 += alpha * temp4; \ + res5 += alpha * temp5; \ + res6 += alpha * temp6; \ + res7 += alpha * temp7; \ + \ + y[0 * inc_y] = res0; \ + y[1 * inc_y] = res1; \ + y[2 * inc_y] = res2; \ + y[3 * inc_y] = res3; \ + y[4 * inc_y] = res4; \ + y[5 * inc_y] = res5; \ + y[6 * inc_y] = res6; \ + y[7 * inc_y] = res7; \ + \ + y += 8 * inc_y; \ + \ + pa0 += 8 * lda; \ + pa1 += 8 * lda; \ + pa2 += 8 * lda; \ + pa3 += 8 * lda; \ + pa4 += 8 * lda; \ + pa5 += 8 * lda; \ + 
pa6 += 8 * lda; \ + pa7 += 8 * lda; \ + } \ + \ + if (n & 4) \ + { \ + tp0 = zero; \ + tp1 = zero; \ + tp2 = zero; \ + tp3 = zero; \ + \ + k = 0; \ + x = srcx_org; \ + \ + for (i = (m >> 3); i--;) \ + { \ + SLOAD_X8(); \ + SGEMV_T_4x8(); \ + \ + x += 8 * inc_x; \ + k += 8; \ + } \ + \ + if (m & 4) \ + { \ + SLOAD_X4(); \ + SGEMV_T_4x4(); \ + \ + x += 4 * inc_x; \ + k += 4; \ + } \ + \ + TRANSPOSE4x4_SP_SP(tp0, tp1, tp2, tp3, \ + tp0, tp1, tp2, tp3); \ + tp0 += tp1; \ + tp0 += tp2; \ + tp0 += tp3; \ + \ + temp0 = tp0[0]; \ + temp1 = tp0[1]; \ + temp2 = tp0[2]; \ + temp3 = tp0[3]; \ + \ + for (i = (m & 3); i--;) \ + { \ + temp0 += pa0[k] * x[0]; \ + temp1 += pa1[k] * x[0]; \ + temp2 += pa2[k] * x[0]; \ + temp3 += pa3[k] * x[0]; \ + \ + x += inc_x; \ + k++; \ + } \ + \ + res0 = y[0 * inc_y]; \ + res1 = y[1 * inc_y]; \ + res2 = y[2 * inc_y]; \ + res3 = y[3 * inc_y]; \ + \ + res0 += alpha * temp0; \ + res1 += alpha * temp1; \ + res2 += alpha * temp2; \ + res3 += alpha * temp3; \ + \ + y[0 * inc_y] = res0; \ + y[1 * inc_y] = res1; \ + y[2 * inc_y] = res2; \ + y[3 * inc_y] = res3; \ + \ + y += 4 * inc_y; \ + \ + pa0 += 4 * lda; \ + pa1 += 4 * lda; \ + pa2 += 4 * lda; \ + pa3 += 4 * lda; \ + } \ + \ + if (n & 2) \ + { \ + tp0 = zero; \ + tp1 = zero; \ + \ + k = 0; \ + x = srcx_org; \ + \ + for (i = (m >> 3); i--;) \ + { \ + SLOAD_X8(); \ + SGEMV_T_2x8(); \ + \ + x += 8 * inc_x; \ + k += 8; \ + } \ + \ + if (m & 4) \ + { \ + SLOAD_X4(); \ + SGEMV_T_2x4(); \ + \ + x += 4 * inc_x; \ + k += 4; \ + } \ + \ + ILVRL_W2_SP(tp1, tp0, tp2, tp3); \ + \ + tp2 += tp3; \ + \ + temp0 = tp2[0] + tp2[2]; \ + temp1 = tp2[1] + tp2[3]; \ + \ + for (i = (m & 3); i--;) \ + { \ + temp0 += pa0[k] * x[0]; \ + temp1 += pa1[k] * x[0]; \ + \ + x += inc_x; \ + k++; \ + } \ + \ + res0 = y[0 * inc_y]; \ + res1 = y[1 * inc_y]; \ + \ + res0 += alpha * temp0; \ + res1 += alpha * temp1; \ + \ + y[0 * inc_y] = res0; \ + y[1 * inc_y] = res1; \ + \ + y += 2 * inc_y; \ + \ + pa0 += 2 * lda; \ + pa1 += 2 * lda; 
\ + } \ + \ + if (n & 1) /* one column left over: plain scalar loop over all m entries */ \ + { \ + temp0 = 0.0; \ + \ + k = 0; \ + x = srcx_org; \ + \ + for (i = m; i--;) \ + { \ + temp0 += pa0[k] * x[0]; \ + \ + x += inc_x; \ + k++; \ + } \ + \ + y[0] += alpha * temp0; \ + y += inc_y; \ + pa0 += lda; \ + } + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *A, + BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, + FLOAT *buffer) +{ /* Entry point built around SGEMV_T_MSA(), using v4f32 vectors. As the visible macro tail shows, each column of A (columns are lda apart) is reduced against x (stride inc_x) over m entries and alpha times that sum is added to one entry of y (stride inc_y). dummy1 and buffer are not referenced in the code visible here. Always returns 0. */ + BLASLONG i, j, k; + FLOAT *srcx_org = x; /* start of x; the macro resets x to this before each group of columns */ + FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7; + FLOAT temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; + FLOAT res0, res1, res2, res3, res4, res5, res6, res7; + v4f32 x0, x1; + v4f32 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15; + v4f32 tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7; + v4f32 zero = {0}; + + pa0 = A + 0 * lda; /* pa0..pa7 address eight consecutive columns of A */ + pa1 = A + 1 * lda; + pa2 = A + 2 * lda; + pa3 = A + 3 * lda; + pa4 = A + 4 * lda; + pa5 = A + 5 * lda; + pa6 = A + 6 * lda; + pa7 = A + 7 * lda; + + if (1 == inc_x) + { /* unit stride in x: expand the kernel with the _VECTOR x-load macros */ + #define SLOAD_X8 SLOAD_X8_VECTOR + #define SLOAD_X4 SLOAD_X4_VECTOR + + SGEMV_T_MSA(); + + #undef SLOAD_X8 + #undef SLOAD_X4 + } + else + { /* any other stride: expand the same kernel with the _GP x-load macros (defined outside this view) */ + #define SLOAD_X8 SLOAD_X8_GP + #define SLOAD_X4 SLOAD_X4_GP + + SGEMV_T_MSA(); + + #undef SLOAD_X8 + #undef SLOAD_X4 + } + + return(0); +} diff --git a/kernel/mips/zasum_msa.c b/kernel/mips/zasum_msa.c new file mode 100644 index 000000000..c84d48ecb --- /dev/null +++ b/kernel/mips/zasum_msa.c @@ -0,0 +1,170 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2.
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#include "common.h" +#include <math.h> /* header name restored: the directive had lost its operand and would not preprocess. NOTE(review): confirm the name against the upstream file. */ +#include "macros_msa.h" + +#define AND_VEC_D(in) ((v2f64) ((v2i64) in & and_vec)) /* masks every lane with and_vec; with the 0x7FFF... mask set up in CNAME this clears the sign bit, i.e. takes the absolute value */ + +#define PROCESS_ZD(inc_val) /* adds up the absolute values of n two-double vectors read from x, stepping by inc_val, and leaves the total in sumf; uses i, n, x, src0..src7, sum_abs0..sum_abs3, zero_v and and_vec from the caller */ \ + if (n > 8) /* peel the first eight vectors so the four accumulators start from loaded data */ \ + { \ + n -= 8; \ + \ + LD_DP8_INC(x, inc_val, src0, src1, src2, \ + src3, src4, src5, src6, src7); \ + \ + sum_abs0 = AND_VEC_D(src0); \ + sum_abs1 = AND_VEC_D(src1); \ + sum_abs2 = AND_VEC_D(src2); \ + sum_abs3 = AND_VEC_D(src3); \ + sum_abs0 += AND_VEC_D(src4); \ + sum_abs1 += AND_VEC_D(src5); \ + sum_abs2 += AND_VEC_D(src6); \ + sum_abs3 += AND_VEC_D(src7); \ + } \ + else \ + { \ + sum_abs0 = zero_v; \ + sum_abs1 = zero_v; \ + sum_abs2 = zero_v; \ + sum_abs3 = zero_v; \ + } \ + \ + for (i = (n >> 3); i--;) /* full groups of eight vectors */ \ + { \ + LD_DP8_INC(x, inc_val, src0, src1, src2, \ + src3, src4, src5, src6, src7); \ + \ + sum_abs0 += AND_VEC_D(src0); \ + sum_abs1 += AND_VEC_D(src1); \ + sum_abs2 += AND_VEC_D(src2); \ + sum_abs3 += AND_VEC_D(src3); \ + sum_abs0 += AND_VEC_D(src4); \ + sum_abs1 += AND_VEC_D(src5); \ + sum_abs2 += AND_VEC_D(src6); \ + sum_abs3 += AND_VEC_D(src7); \ + } \ + \ + if (n & 7) /* remainder of 1..7 vectors, one explicit case per count */ \ + { \ + if ((n & 4) && (n & 2) && (n & 1)) \ + { \ + LD_DP7_INC(x, inc_val, src0, src1, src2, \ + src3, src4, src5, src6); \ + \ + sum_abs0 += AND_VEC_D(src0); \ + sum_abs1 += AND_VEC_D(src1); \ + sum_abs2 += AND_VEC_D(src2); \ + sum_abs3 += AND_VEC_D(src3); \ + sum_abs0 += AND_VEC_D(src4); \ + sum_abs1 += AND_VEC_D(src5); \ + sum_abs2 += AND_VEC_D(src6); \ + } \ + else if ((n & 4) && (n & 2)) \ + { \ + LD_DP6_INC(x, inc_val, src0, src1, src2, \ + src3, src4, src5); \ + \ + sum_abs0 += AND_VEC_D(src0); \ + sum_abs1 += AND_VEC_D(src1); \ + sum_abs2 += AND_VEC_D(src2); \ + sum_abs3 += AND_VEC_D(src3); \ + sum_abs0 += AND_VEC_D(src4); \ + sum_abs1 += AND_VEC_D(src5); \ + } \ + else if ((n & 4) && (n & 1)) \ + { \ + LD_DP5_INC(x, inc_val, src0, src1, src2, \ + src3, src4); \ + \ + sum_abs0 += AND_VEC_D(src0); \ + sum_abs1 += AND_VEC_D(src1); \ +
sum_abs2 += AND_VEC_D(src2); \ + sum_abs3 += AND_VEC_D(src3); \ + sum_abs0 += AND_VEC_D(src4); \ + } \ + else if ((n & 2) && (n & 1)) \ + { \ + LD_DP3_INC(x, inc_val, src0, src1, src2); \ + \ + sum_abs0 += AND_VEC_D(src0); \ + sum_abs1 += AND_VEC_D(src1); \ + sum_abs2 += AND_VEC_D(src2); \ + } \ + else if (n & 4) \ + { \ + LD_DP4_INC(x, inc_val, src0, src1, src2, \ + src3); \ + \ + sum_abs0 += AND_VEC_D(src0); \ + sum_abs1 += AND_VEC_D(src1); \ + sum_abs2 += AND_VEC_D(src2); \ + sum_abs3 += AND_VEC_D(src3); \ + } \ + else if (n & 2) \ + { \ + LD_DP2_INC(x, inc_val, src0, src1); \ + \ + sum_abs0 += AND_VEC_D(src0); \ + sum_abs1 += AND_VEC_D(src1); \ + } \ + else if (n & 1) \ + { \ + src0 = LD_DP(x); \ + \ + sum_abs0 += AND_VEC_D(src0); \ + } \ + } \ + \ + sum_abs0 += sum_abs1 + sum_abs2 + sum_abs3; /* fold the four accumulators, then the two lanes */ \ + sumf = sum_abs0[0] + sum_abs0[1]; + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ /* Returns the sum of the absolute values of both doubles of n two-double elements read from x; returns 0.0 when n or inc_x is not positive. */ + BLASLONG i; + FLOAT sumf = 0.0; + v2f64 src0, src1, src2, src3, src4, src5, src6, src7; + v2f64 sum_abs0, sum_abs1, sum_abs2, sum_abs3; + v2f64 zero_v = {0}; + v2i64 and_vec = {0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF}; /* every bit set except the sign bit of each 64-bit lane */ + + if (n <= 0 || inc_x <= 0) return (sumf); + + if (1 == inc_x) + { + PROCESS_ZD(2); /* adjacent elements: step two doubles per load */ + } + else + { + inc_x *= 2; /* element stride converted to a stride in doubles */ + PROCESS_ZD(inc_x); + } + + return (sumf); +} diff --git a/kernel/mips/zdot_msa.c b/kernel/mips/zdot_msa.c new file mode 100644 index 000000000..b94509392 --- /dev/null +++ b/kernel/mips/zdot_msa.c @@ -0,0 +1,227 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2.
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +#if !defined(CONJ) /* OP2, OP3 and OP4 are defined here but never referenced in this file */ + #define OP2 += + #define OP3 - + #define OP4 + +#else + #define OP2 -= + #define OP3 + + #define OP4 - +#endif + +#define DOT16_KERNEL(OPR0, OPR1) /* accumulates four real/imag vector pairs into dot0 (real) and dot1 (imag); OPR0 and OPR1 are pasted onto = to choose the sign of the imag-by-imag and imag-by-real terms */ \ + dot0 += (vx0r * vy0r); \ + dot0 OPR0## = (vx0i * vy0i); \ + dot1 OPR1## = (vx0i * vy0r); \ + dot1 += (vx0r * vy0i); \ + \ + dot0 += (vx1r * vy1r); \ + dot0 OPR0## = (vx1i * vy1i); \ + dot1 OPR1## = (vx1i * vy1r); \ + dot1 += (vx1r * vy1i); \ + \ + dot0 += (vx2r * vy2r); \ + dot0 OPR0## = (vx2i * vy2i); \ + dot1 OPR1## = (vx2i * vy2r); \ + dot1 += (vx2r * vy2i); \ + \ + dot0 += (vx3r * vy3r); \ + dot0 OPR0## = (vx3i * vy3i); \ + dot1 OPR1## = (vx3i * vy3r); \ + dot1 += (vx3r * vy3i); + +#define DOT12_KERNEL(OPR0, OPR1) /* same as DOT16_KERNEL for three vector pairs */ \ + dot0 += (vx0r * vy0r); \ + dot0 OPR0## = (vx0i * vy0i); \ + dot1 OPR1## = (vx0i * vy0r); \ + dot1 += (vx0r * vy0i); \ + \ + dot0 += (vx1r * vy1r); \ + dot0 OPR0## = (vx1i * vy1i); \ + dot1 OPR1## = (vx1i * vy1r); \ + dot1 += (vx1r * vy1i); \ + \ + dot0 += (vx2r * vy2r); \ + dot0 OPR0## = (vx2i * vy2i); \ + dot1 OPR1## = (vx2i * vy2r); \ + dot1 += (vx2r * vy2i); + +#define DOT8_KERNEL(OPR0, OPR1) /* same for two vector pairs */ \ + dot0 += (vx0r * vy0r); \ + dot0 OPR0## = (vx0i * vy0i); \ + dot1 OPR1## = (vx0i * vy0r); \ + dot1 += (vx0r * vy0i); \ + \ + dot0 += (vx1r * vy1r); \ + dot0 OPR0## = (vx1i * vy1i); \ + dot1 OPR1## = (vx1i * vy1r); \ + dot1 += (vx1r * vy1i); + +#define DOT4_KERNEL(OPR0, OPR1) /* same for one vector pair */ \ + dot0 += (vx0r * vy0r); \ + dot0 OPR0## = (vx0i * vy0i); \ + dot1 OPR1## = (vx0i * vy0r); \ + dot1 += (vx0r * vy0i); + +/* returns a double-complex value; x and y hold doubles */ +/* zdotc - CONJ */ +/* zdotu - !CONJ */ +#ifndef _MSC_VER +#include <complex.h> +FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +#else +OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +#endif +{ /* Complex dot product of n element pairs. Every element is a (real, imaginary) pair of doubles. Without CONJ the real part sums xr*yr minus xi*yi and the imaginary part sums xi*yr plus xr*yi; with CONJ the two signs flip, which is conj(x) times y. Returns zero when n is less than 1. */ + BLASLONG i = 0; + FLOAT dot[2]; + BLASLONG inc_x2; + BLASLONG
inc_y2; + v2f64 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7; + v2f64 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7; + v2f64 vx0r, vx0i, vx1r, vx1i, vx2r, vx2i, vx3r, vx3i; + v2f64 vy0r, vy0i, vy1r, vy1i, vy2r, vy2i, vy3r, vy3i; + v2f64 dot0 = {0, 0}; + v2f64 dot1 = {0, 0}; + v2f64 zero = {0, 0}; + openblas_complex_double result; + + dot[0] = 0.0; + dot[1] = 0.0; + + __real__(result) = 0.0; + __imag__(result) = 0.0; + + if ( n < 1 ) return(result); + + inc_x2 = 2 * inc_x; /* strides in doubles: two doubles per complex element */ + inc_y2 = 2 * inc_y; + + for (i = (n >> 3); i--;) /* eight complex elements per pass */ + { + LD_DP8_INC(x, inc_x2, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7); + LD_DP8_INC(y, inc_y2, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7); + + PCKEVOD_D2_DP(vx1, vx0, vx0r, vx0i); /* split two loaded elements into one real vector and one imaginary vector */ + PCKEVOD_D2_DP(vx3, vx2, vx1r, vx1i); + PCKEVOD_D2_DP(vx5, vx4, vx2r, vx2i); + PCKEVOD_D2_DP(vx7, vx6, vx3r, vx3i); + + PCKEVOD_D2_DP(vy1, vy0, vy0r, vy0i); + PCKEVOD_D2_DP(vy3, vy2, vy1r, vy1i); + PCKEVOD_D2_DP(vy5, vy4, vy2r, vy2i); + PCKEVOD_D2_DP(vy7, vy6, vy3r, vy3i); + + #if !defined(CONJ) + DOT16_KERNEL(-, +); + #else + DOT16_KERNEL(+, -); + #endif + } + + if (n & 7) /* remainder: 6, 4 or 2 elements first, then a possible single element */ + { + if ((n & 4) && (n & 2)) + { + LD_DP4_INC(x, inc_x2, vx0, vx1, vx2, vx3); + LD_DP4_INC(y, inc_y2, vy0, vy1, vy2, vy3); + LD_DP2_INC(x, inc_x2, vx4, vx5); + LD_DP2_INC(y, inc_y2, vy4, vy5); + + PCKEVOD_D2_DP(vx1, vx0, vx0r, vx0i); + PCKEVOD_D2_DP(vx3, vx2, vx1r, vx1i); + PCKEVOD_D2_DP(vx5, vx4, vx2r, vx2i); + + PCKEVOD_D2_DP(vy1, vy0, vy0r, vy0i); + PCKEVOD_D2_DP(vy3, vy2, vy1r, vy1i); + PCKEVOD_D2_DP(vy5, vy4, vy2r, vy2i); + + #if !defined(CONJ) + DOT12_KERNEL(-, +); + #else + DOT12_KERNEL(+, -); + #endif + } + else if (n & 4) + { + LD_DP4_INC(x, inc_x2, vx0, vx1, vx2, vx3); + LD_DP4_INC(y, inc_y2, vy0, vy1, vy2, vy3); + + PCKEVOD_D2_DP(vx1, vx0, vx0r, vx0i); + PCKEVOD_D2_DP(vx3, vx2, vx1r, vx1i); + + PCKEVOD_D2_DP(vy1, vy0, vy0r, vy0i); + PCKEVOD_D2_DP(vy3, vy2, vy1r, vy1i); + + #if !defined(CONJ) + DOT8_KERNEL(-, +); + #else + DOT8_KERNEL(+, -); + #endif + } + else if (n & 2) + { + LD_DP2_INC(x, inc_x2, vx0,
vx1); + LD_DP2_INC(y, inc_y2, vy0, vy1); + PCKEVOD_D2_DP(vx1, vx0, vx0r, vx0i); + PCKEVOD_D2_DP(vy1, vy0, vy0r, vy0i); + + #if !defined(CONJ) + DOT4_KERNEL(-, +); + #else + DOT4_KERNEL(+, -); + #endif + } + + if (n & 1) + { + vx0 = LD_DP(x); + vy0 = LD_DP(y); + PCKEVOD_D2_DP(zero, vx0, vx0r, vx0i); /* pair the single element with a zero vector so the unused lane adds nothing */ + PCKEVOD_D2_DP(zero, vy0, vy0r, vy0i); + + #if !defined(CONJ) + DOT4_KERNEL(-, +); + #else + DOT4_KERNEL(+, -); + #endif + } + } + + dot[0] += (dot0[0] + dot0[1]); /* fold the two lanes of each accumulator */ + dot[1] += (dot1[0] + dot1[1]); + + __real__(result) = dot[0]; + __imag__(result) = dot[1]; + + return(result); +} diff --git a/kernel/mips/zgemv_n_msa.c b/kernel/mips/zgemv_n_msa.c new file mode 100644 index 000000000..aadc610aa --- /dev/null +++ b/kernel/mips/zgemv_n_msa.c @@ -0,0 +1,667 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED.
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +#undef OP0 +#undef OP1 +#undef OP2 +#undef OP3 +#undef OP4 + +#if !defined(XCONJ) + #define OP3 -= + #define OP4 += +#else + #define OP3 += + #define OP4 -= +#endif + +#if !defined(CONJ) + #if !defined(XCONJ) + #define OP0 -= + #define OP1 += + #define OP2 += + #else + #define OP0 += + #define OP1 += + #define OP2 -= + #endif +#else + #if !defined(XCONJ) + #define OP0 += + #define OP1 -= + #define OP2 -= + #else + #define OP0 -= + #define OP1 -= + #define OP2 += + #endif +#endif + +#define ZGEMV_N_4x4() \ + LD_DP4(pa0 + k, 2, t0, t1, t2, t3); \ + LD_DP4(pa1 + k, 2, t4, t5, t6, t7); \ + LD_DP4(pa2 + k, 2, t8, t9, t10, t11); \ + LD_DP4(pa3 + k, 2, t12, t13, t14, t15); \ + \ + PCKEVOD_D2_DP(t1, t0, src0r, src0i); \ + PCKEVOD_D2_DP(t3, t2, src1r, src1i); \ + PCKEVOD_D2_DP(t5, t4, src2r, src2i); \ + PCKEVOD_D2_DP(t7, t6, src3r, src3i); \ + PCKEVOD_D2_DP(t9, t8, src4r, src4i); \ + PCKEVOD_D2_DP(t11, t10, src5r, src5i); \ + PCKEVOD_D2_DP(t13, t12, src6r, src6i); \ + PCKEVOD_D2_DP(t15, t14, src7r, src7i); \ + \ + y0r += tp0r * src0r; \ + y1r += tp0r * src1r; \ + y0r += tp1r * src2r; \ + y1r += tp1r * src3r; \ + y0r += tp2r * src4r; \ + y1r += tp2r * src5r; \ + y0r += tp3r * src6r; \ + y1r += tp3r * src7r; \ + \ + y0r OP0 tp0i * src0i; \ + y1r OP0 tp0i * src1i; \ + y0r OP0 tp1i * src2i; \ + y1r OP0 tp1i * src3i; \ + y0r OP0 tp2i * 
src4i; \ + y1r OP0 tp2i * src5i; \ + y0r OP0 tp3i * src6i; \ + y1r OP0 tp3i * src7i; \ + \ + y0i OP1 tp0r * src0i; \ + y1i OP1 tp0r * src1i; \ + y0i OP1 tp1r * src2i; \ + y1i OP1 tp1r * src3i; \ + y0i OP1 tp2r * src4i; \ + y1i OP1 tp2r * src5i; \ + y0i OP1 tp3r * src6i; \ + y1i OP1 tp3r * src7i; \ + \ + y0i OP2 tp0i * src0r; \ + y1i OP2 tp0i * src1r; \ + y0i OP2 tp1i * src2r; \ + y1i OP2 tp1i * src3r; \ + y0i OP2 tp2i * src4r; \ + y1i OP2 tp2i * src5r; \ + y0i OP2 tp3i * src6r; \ + y1i OP2 tp3i * src7r; \ + +#define ZGEMV_N_2x4() \ + LD_DP2(pa0 + k, 2, t0, t1); \ + LD_DP2(pa1 + k, 2, t4, t5); \ + LD_DP2(pa2 + k, 2, t8, t9); \ + LD_DP2(pa3 + k, 2, t12, t13); \ + \ + PCKEVOD_D2_DP(t1, t0, src0r, src0i); \ + PCKEVOD_D2_DP(t5, t4, src2r, src2i); \ + PCKEVOD_D2_DP(t9, t8, src4r, src4i); \ + PCKEVOD_D2_DP(t13, t12, src6r, src6i); \ + \ + y0r += tp0r * src0r; \ + y0r += tp1r * src2r; \ + y0r += tp2r * src4r; \ + y0r += tp3r * src6r; \ + \ + y0r OP0 tp0i * src0i; \ + y0r OP0 tp1i * src2i; \ + y0r OP0 tp2i * src4i; \ + y0r OP0 tp3i * src6i; \ + \ + y0i OP1 tp0r * src0i; \ + y0i OP1 tp1r * src2i; \ + y0i OP1 tp2r * src4i; \ + y0i OP1 tp3r * src6i; \ + \ + y0i OP2 tp0i * src0r; \ + y0i OP2 tp1i * src2r; \ + y0i OP2 tp2i * src4r; \ + y0i OP2 tp3i * src6r; \ + +#define ZGEMV_N_1x4() \ + res0 = y[0 * inc_y2]; \ + res1 = y[0 * inc_y2 + 1]; \ + \ + res0 += temp0_r * pa0[k]; \ + res0 OP0 temp0_i * pa0[k + 1]; \ + res0 += temp1_r * pa1[k]; \ + res0 OP0 temp1_i * pa1[k + 1]; \ + res0 += temp2_r * pa2[k]; \ + res0 OP0 temp2_i * pa2[k + 1]; \ + res0 += temp3_r * pa3[k]; \ + res0 OP0 temp3_i * pa3[k + 1]; \ + \ + res1 OP1 temp0_r * pa0[k + 1]; \ + res1 OP2 temp0_i * pa0[k]; \ + res1 OP1 temp1_r * pa1[k + 1]; \ + res1 OP2 temp1_i * pa1[k]; \ + res1 OP1 temp2_r * pa2[k + 1]; \ + res1 OP2 temp2_i * pa2[k]; \ + res1 OP1 temp3_r * pa3[k + 1]; \ + res1 OP2 temp3_i * pa3[k]; \ + \ + y[0 * inc_y2] = res0; \ + y[0 * inc_y2 + 1] = res1; \ + +#define ZGEMV_N_4x2() \ + LD_DP4(pa0 + k, 2, t0, t1, 
t2, t3); \ + LD_DP4(pa1 + k, 2, t4, t5, t6, t7); \ + \ + PCKEVOD_D2_DP(t1, t0, src0r, src0i); \ + PCKEVOD_D2_DP(t3, t2, src1r, src1i); \ + PCKEVOD_D2_DP(t5, t4, src2r, src2i); \ + PCKEVOD_D2_DP(t7, t6, src3r, src3i); \ + \ + y0r += tp0r * src0r; \ + y1r += tp0r * src1r; \ + y0r += tp1r * src2r; \ + y1r += tp1r * src3r; \ + \ + y0r OP0 tp0i * src0i; \ + y1r OP0 tp0i * src1i; \ + y0r OP0 tp1i * src2i; \ + y1r OP0 tp1i * src3i; \ + \ + y0i OP1 tp0r * src0i; \ + y1i OP1 tp0r * src1i; \ + y0i OP1 tp1r * src2i; \ + y1i OP1 tp1r * src3i; \ + \ + y0i OP2 tp0i * src0r; \ + y1i OP2 tp0i * src1r; \ + y0i OP2 tp1i * src2r; \ + y1i OP2 tp1i * src3r; \ + +#define ZGEMV_N_2x2() \ + LD_DP2(pa0 + k, 2, t0, t1); \ + LD_DP2(pa1 + k, 2, t4, t5); \ + \ + PCKEVOD_D2_DP(t1, t0, src0r, src0i); \ + PCKEVOD_D2_DP(t5, t4, src2r, src2i); \ + \ + y0r += tp0r * src0r; \ + y0r += tp1r * src2r; \ + \ + y0r OP0 tp0i * src0i; \ + y0r OP0 tp1i * src2i; \ + \ + y0i OP1 tp0r * src0i; \ + y0i OP1 tp1r * src2i; \ + \ + y0i OP2 tp0i * src0r; \ + y0i OP2 tp1i * src2r; \ + +#define ZGEMV_N_1x2() \ + res0 = y[0 * inc_y2]; \ + res1 = y[0 * inc_y2 + 1]; \ + \ + res0 += temp0_r * pa0[k]; \ + res0 OP0 temp0_i * pa0[k + 1]; \ + res0 += temp1_r * pa1[k]; \ + res0 OP0 temp1_i * pa1[k + 1]; \ + \ + res1 OP1 temp0_r * pa0[k + 1]; \ + res1 OP2 temp0_i * pa0[k]; \ + res1 OP1 temp1_r * pa1[k + 1]; \ + res1 OP2 temp1_i * pa1[k]; \ + \ + y[0 * inc_y2] = res0; \ + y[0 * inc_y2 + 1] = res1; \ + +#define ZGEMV_N_4x1() \ + LD_DP4(pa0 + k, 2, t0, t1, t2, t3); \ + \ + PCKEVOD_D2_DP(t1, t0, src0r, src0i); \ + PCKEVOD_D2_DP(t3, t2, src1r, src1i); \ + \ + y0r += tp0r * src0r; \ + y1r += tp0r * src1r; \ + \ + y0r OP0 tp0i * src0i; \ + y1r OP0 tp0i * src1i; \ + \ + y0i OP1 tp0r * src0i; \ + y1i OP1 tp0r * src1i; \ + \ + y0i OP2 tp0i * src0r; \ + y1i OP2 tp0i * src1r; \ + +#define ZGEMV_N_2x1() \ + LD_DP2(pa0 + k, 2, t0, t1); \ + \ + PCKEVOD_D2_DP(t1, t0, src0r, src0i); \ + \ + y0r += tp0r * src0r; \ + y0r OP0 tp0i * src0i; \ + y0i 
OP1 tp0r * src0i; \ + y0i OP2 tp0i * src0r; \ + +#define ZGEMV_N_1x1() \ + res0 = y[0 * inc_y2]; \ + res1 = y[0 * inc_y2 + 1]; \ + \ + res0 += temp0_r * pa0[k]; \ + res0 OP0 temp0_i * pa0[k + 1]; \ + \ + res1 OP1 temp0_r * pa0[k + 1]; \ + res1 OP2 temp0_i * pa0[k]; \ + \ + y[0 * inc_y2] = res0; \ + y[0 * inc_y2 + 1] = res1; \ + +#define ZLOAD_X4_SCALE_VECTOR() \ + LD_DP4(x, 2, x0, x1, x2, x3); \ + \ + PCKEVOD_D2_DP(x1, x0, x0r, x0i); \ + PCKEVOD_D2_DP(x3, x2, x1r, x1i); \ + \ + tp4r = alphar * x0r; \ + tp4r OP3 alphai * x0i; \ + tp4i = alphar * x0i; \ + tp4i OP4 alphai * x0r; \ + \ + tp5r = alphar * x1r; \ + tp5r OP3 alphai * x1i; \ + tp5i = alphar * x1i; \ + tp5i OP4 alphai * x1r; \ + \ + SPLATI_D2_DP(tp4r, tp0r, tp1r); \ + SPLATI_D2_DP(tp5r, tp2r, tp3r); \ + SPLATI_D2_DP(tp4i, tp0i, tp1i); \ + SPLATI_D2_DP(tp5i, tp2i, tp3i); \ + +#define ZLOAD_X2_SCALE_VECTOR() \ + LD_DP2(x, 2, x0, x1); \ + \ + PCKEVOD_D2_DP(x1, x0, x0r, x0i); \ + \ + tp4r = alphar * x0r; \ + tp4r OP3 alphai * x0i; \ + tp4i = alphar * x0i; \ + tp4i OP4 alphai * x0r; \ + \ + SPLATI_D2_DP(tp4r, tp0r, tp1r); \ + SPLATI_D2_DP(tp4i, tp0i, tp1i); \ + +#define ZLOAD_X4_SCALE_GP() \ + x0r = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *)(x + 0 * inc_x2))); \ + x0r = (v2f64) __msa_insert_d((v2i64) x0r, 1, *((long long *)(x + 1 * inc_x2))); \ + x1r = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *)(x + 2 * inc_x2))); \ + x1r = (v2f64) __msa_insert_d((v2i64) x1r, 1, *((long long *)(x + 3 * inc_x2))); \ + x0i = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *)(x + 0 * inc_x2 + 1))); \ + x0i = (v2f64) __msa_insert_d((v2i64) x0i, 1, *((long long *)(x + 1 * inc_x2 + 1))); \ + x1i = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *)(x + 2 * inc_x2 + 1))); \ + x1i = (v2f64) __msa_insert_d((v2i64) x1i, 1, *((long long *)(x + 3 * inc_x2 + 1))); \ + \ + tp4r = alphar * x0r; \ + tp4r OP3 alphai * x0i; \ + tp4i = alphar * x0i; \ + tp4i OP4 alphai * x0r; \ + \ + tp5r = alphar * x1r; \ + tp5r 
OP3 alphai * x1i; \ + tp5i = alphar * x1i; \ + tp5i OP4 alphai * x1r; \ + \ + SPLATI_D2_DP(tp4r, tp0r, tp1r); \ + SPLATI_D2_DP(tp5r, tp2r, tp3r); \ + SPLATI_D2_DP(tp4i, tp0i, tp1i); \ + SPLATI_D2_DP(tp5i, tp2i, tp3i); \ + +#define ZLOAD_X2_SCALE_GP() \ + x0r = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *)(x + 0 * inc_x2))); \ + x0r = (v2f64) __msa_insert_d((v2i64) x0r, 1, *((long long *)(x + 1 * inc_x2))); \ + x0i = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *)(x + 0 * inc_x2 + 1))); \ + x0i = (v2f64) __msa_insert_d((v2i64) x0i, 1, *((long long *)(x + 1 * inc_x2 + 1))); \ + \ + tp4r = alphar * x0r; \ + tp4r OP3 alphai * x0i; \ + tp4i = alphar * x0i; \ + tp4i OP4 alphai * x0r; \ + \ + SPLATI_D2_DP(tp4r, tp0r, tp1r); \ + SPLATI_D2_DP(tp4i, tp0i, tp1i); \ + +#define ZLOAD_X1_SCALE_GP() \ + temp0_r = alpha_r * x[0 * inc_x2]; \ + temp0_r OP3 alpha_i * x[0 * inc_x2 + 1]; \ + temp0_i = alpha_r * x[0 * inc_x2 + 1]; \ + temp0_i OP4 alpha_i * x[0 * inc_x2]; \ + \ + tp0r = (v2f64) COPY_DOUBLE_TO_VECTOR(temp0_r); \ + tp0i = (v2f64) COPY_DOUBLE_TO_VECTOR(temp0_i); \ + +#define ZLOAD_Y4_VECTOR() \ + LD_DP4(y, 2, y0, y1, y2, y3); \ + PCKEVOD_D2_DP(y1, y0, y0r, y0i); \ + PCKEVOD_D2_DP(y3, y2, y1r, y1i); \ + +#define ZLOAD_Y2_VECTOR() \ + LD_DP2(y, 2, y0, y1); \ + PCKEVOD_D2_DP(y1, y0, y0r, y0i); \ + +#define ZSTORE_Y4_VECTOR() \ + ILVRL_D2_DP(y0i, y0r, y0, y1); \ + ILVRL_D2_DP(y1i, y1r, y2, y3); \ + ST_DP4(y0, y1, y2, y3, y, 2); \ + +#define ZSTORE_Y2_VECTOR() \ + ILVRL_D2_DP(y0i, y0r, y0, y1); \ + ST_DP2(y0, y1, y, 2); \ + +#define ZLOAD_Y4_GP() \ + y0r = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *)(y + 0 * inc_y2))); \ + y0r = (v2f64) __msa_insert_d((v2i64) y0r, 1, *((long long *)(y + 1 * inc_y2))); \ + y1r = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *)(y + 2 * inc_y2))); \ + y1r = (v2f64) __msa_insert_d((v2i64) y1r, 1, *((long long *)(y + 3 * inc_y2))); \ + y0i = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *)(y + 0 * inc_y2 + 
1))); \ + y0i = (v2f64) __msa_insert_d((v2i64) y0i, 1, *((long long *)(y + 1 * inc_y2 + 1))); \ + y1i = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *)(y + 2 * inc_y2 + 1))); \ + y1i = (v2f64) __msa_insert_d((v2i64) y1i, 1, *((long long *)(y + 3 * inc_y2 + 1))); \ + +#define ZLOAD_Y2_GP() \ + y0r = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *)(y + 0 * inc_y2))); \ + y0r = (v2f64) __msa_insert_d((v2i64) y0r, 1, *((long long *)(y + 1 * inc_y2))); \ + y0i = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *)(y + 0 * inc_y2 + 1))); \ + y0i = (v2f64) __msa_insert_d((v2i64) y0i, 1, *((long long *)(y + 1 * inc_y2 + 1))); \ + +#define ZSTORE_Y4_GP() \ + *((long long *)(y + 0 * inc_y2)) = __msa_copy_s_d((v2i64) y0r, 0); \ + *((long long *)(y + 1 * inc_y2)) = __msa_copy_s_d((v2i64) y0r, 1); \ + *((long long *)(y + 2 * inc_y2)) = __msa_copy_s_d((v2i64) y1r, 0); \ + *((long long *)(y + 3 * inc_y2)) = __msa_copy_s_d((v2i64) y1r, 1); \ + *((long long *)(y + 0 * inc_y2 + 1)) = __msa_copy_s_d((v2i64) y0i, 0); \ + *((long long *)(y + 1 * inc_y2 + 1)) = __msa_copy_s_d((v2i64) y0i, 1); \ + *((long long *)(y + 2 * inc_y2 + 1)) = __msa_copy_s_d((v2i64) y1i, 0); \ + *((long long *)(y + 3 * inc_y2 + 1)) = __msa_copy_s_d((v2i64) y1i, 1); \ + +#define ZSTORE_Y2_GP() \ + *((long long *)(y + 0 * inc_y2)) = __msa_copy_s_d((v2i64) y0r, 0); \ + *((long long *)(y + 1 * inc_y2)) = __msa_copy_s_d((v2i64) y0r, 1); \ + *((long long *)(y + 0 * inc_y2 + 1)) = __msa_copy_s_d((v2i64) y0i, 0); \ + *((long long *)(y + 1 * inc_y2 + 1)) = __msa_copy_s_d((v2i64) y0i, 1); \ + +#define ZGEMV_N_MSA() \ + for (j = (n >> 2); j--;) \ + { \ + ZLOAD_X4_SCALE() \ + \ + k = 0; \ + y = y_org; \ + \ + for (i = (m >> 2); i--;) \ + { \ + ZLOAD_Y4() \ + ZGEMV_N_4x4() \ + ZSTORE_Y4() \ + \ + k += 2 * 4; \ + y += inc_y2 * 4; \ + } \ + \ + if (m & 2) \ + { \ + ZLOAD_Y2() \ + ZGEMV_N_2x4() \ + ZSTORE_Y2() \ + \ + k += 2 * 2; \ + y += inc_y2 * 2; \ + } \ + \ + if (m & 1) \ + { \ + temp0_r = tp4r[0]; \ + 
temp1_r = tp4r[1]; \ + temp2_r = tp5r[0]; \ + temp3_r = tp5r[1]; \ + \ + temp0_i = tp4i[0]; \ + temp1_i = tp4i[1]; \ + temp2_i = tp5i[0]; \ + temp3_i = tp5i[1]; \ + \ + ZGEMV_N_1x4() \ + k += 2; \ + y += inc_y2; \ + } \ + \ + pa0 += 4 * lda2; \ + pa1 += 4 * lda2; \ + pa2 += 4 * lda2; \ + pa3 += 4 * lda2; \ + \ + x += 4 * inc_x2; \ + } \ + \ + if (n & 2) /* two columns left over */ \ + { \ + ZLOAD_X2_SCALE() \ + \ + k = 0; \ + y = y_org; \ + \ + for (i = (m >> 2); i--;) \ + { \ + ZLOAD_Y4() \ + ZGEMV_N_4x2() \ + ZSTORE_Y4() \ + \ + k += 2 * 4; \ + y += inc_y2 * 4; \ + } \ + \ + if (m & 2) \ + { \ + ZLOAD_Y2() \ + ZGEMV_N_2x2() \ + ZSTORE_Y2() \ + \ + k += 2 * 2; \ + y += inc_y2 * 2; \ + } \ + \ + if (m & 1) \ + { \ + temp0_r = tp4r[0]; \ + temp1_r = tp4r[1]; \ + \ + temp0_i = tp4i[0]; \ + temp1_i = tp4i[1]; \ + \ + ZGEMV_N_1x2() \ + \ + k += 2; \ + y += inc_y2; \ + } \ + \ + pa0 += 2 * lda2; \ + pa1 += 2 * lda2; \ + \ + x += 2 * inc_x2; \ + } \ + \ + if (n & 1) /* one column left over */ \ + { \ + ZLOAD_X1_SCALE() \ + \ + k = 0; \ + y = y_org; \ + \ + for (i = (m >> 2); i--;) \ + { \ + ZLOAD_Y4() \ + ZGEMV_N_4x1() \ + ZSTORE_Y4() \ + \ + k += 2 * 4; \ + y += inc_y2 * 4; \ + } \ + \ + if (m & 2) \ + { \ + ZLOAD_Y2() \ + ZGEMV_N_2x1() \ + ZSTORE_Y2() \ + \ + k += 2 * 2; \ + y += inc_y2 * 2; \ + } \ + \ + if (m & 1) \ + { \ + ZGEMV_N_1x1() \ + \ + k += 2; \ + y += inc_y2; \ + } \ + \ + pa0 += lda2; \ + x += inc_x2; \ + } \ + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, + FLOAT *A, BLASLONG lda2, FLOAT *x, BLASLONG inc_x2, FLOAT *y, + BLASLONG inc_y2, FLOAT *buffer) +{ /* Entry point built around ZGEMV_N_MSA(): it walks the columns of A in groups of four, two and one, and for each group updates all m entries of y from x scaled by (alpha_r, alpha_i). Sign conventions come from the OP0..OP4 macros selected by CONJ and XCONJ. dummy1 and buffer are not referenced in the code visible here. Always returns 0. */ + BLASLONG i, j, k; + FLOAT *y_org = y; /* start of y; the macro resets y to this before each column group */ + FLOAT *pa0, *pa1, *pa2, *pa3; + FLOAT temp0_r, temp1_r, temp2_r, temp3_r, temp0_i, temp1_i, temp2_i; + FLOAT temp3_i, res0, res1; + v2f64 alphar, alphai; + v2f64 x0, x1, x2, x3, y0, y1, y2, y3; + v2f64 x0r, x1r, x0i, x1i, y0r, y1r, y0i, y1i; + v2f64 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15; + v2f64 src0r, src1r, src2r, src3r, src4r, src5r, src6r, src7r;
+ v2f64 src0i, src1i, src2i, src3i, src4i, src5i, src6i, src7i; + v2f64 tp0r, tp1r, tp2r, tp3r, tp4r, tp5r, tp0i, tp1i, tp2i, tp3i, tp4i, tp5i; + + lda2 = 2 * lda2; /* all three strides are doubled on entry and used as FLOAT offsets from here on; the scalar macros read the real part at offset 0 and the imaginary part at offset 1 */ + inc_x2 = 2 * inc_x2; + inc_y2 = 2 * inc_y2; + + pa0 = A; /* pa0..pa3 address four consecutive columns of A */ + pa1 = A + lda2; + pa2 = A + 2 * lda2; + pa3 = A + 3 * lda2; + + alphar = COPY_DOUBLE_TO_VECTOR(alpha_r); + alphai = COPY_DOUBLE_TO_VECTOR(alpha_i); + + if ((2 == inc_x2) && (2 == inc_y2)) + { /* x and y both have adjacent elements: _VECTOR loads and stores for both; the single-element x load is always the _GP form */ + #define ZLOAD_X4_SCALE ZLOAD_X4_SCALE_VECTOR + #define ZLOAD_X2_SCALE ZLOAD_X2_SCALE_VECTOR + #define ZLOAD_X1_SCALE ZLOAD_X1_SCALE_GP + #define ZLOAD_Y4 ZLOAD_Y4_VECTOR + #define ZLOAD_Y2 ZLOAD_Y2_VECTOR + #define ZSTORE_Y4 ZSTORE_Y4_VECTOR + #define ZSTORE_Y2 ZSTORE_Y2_VECTOR + + ZGEMV_N_MSA(); + + #undef ZLOAD_X4_SCALE + #undef ZLOAD_X2_SCALE + #undef ZLOAD_X1_SCALE + #undef ZLOAD_Y4 + #undef ZLOAD_Y2 + #undef ZSTORE_Y4 + #undef ZSTORE_Y2 + } + else if (2 == inc_x2) + { /* only x has adjacent elements: _VECTOR x loads, per-element _GP y loads and stores */ + #define ZLOAD_X4_SCALE ZLOAD_X4_SCALE_VECTOR + #define ZLOAD_X2_SCALE ZLOAD_X2_SCALE_VECTOR + #define ZLOAD_X1_SCALE ZLOAD_X1_SCALE_GP + #define ZLOAD_Y4 ZLOAD_Y4_GP + #define ZLOAD_Y2 ZLOAD_Y2_GP + #define ZSTORE_Y4 ZSTORE_Y4_GP + #define ZSTORE_Y2 ZSTORE_Y2_GP + + ZGEMV_N_MSA(); + + #undef ZLOAD_X4_SCALE + #undef ZLOAD_X2_SCALE + #undef ZLOAD_X1_SCALE + #undef ZLOAD_Y4 + #undef ZLOAD_Y2 + #undef ZSTORE_Y4 + #undef ZSTORE_Y2 + } + else if (2 == inc_y2) + { /* only y has adjacent elements: per-element _GP x loads, _VECTOR y loads and stores */ + #define ZLOAD_X4_SCALE ZLOAD_X4_SCALE_GP + #define ZLOAD_X2_SCALE ZLOAD_X2_SCALE_GP + #define ZLOAD_X1_SCALE ZLOAD_X1_SCALE_GP + #define ZLOAD_Y4 ZLOAD_Y4_VECTOR + #define ZLOAD_Y2 ZLOAD_Y2_VECTOR + #define ZSTORE_Y4 ZSTORE_Y4_VECTOR + #define ZSTORE_Y2 ZSTORE_Y2_VECTOR + + ZGEMV_N_MSA(); + + #undef ZLOAD_X4_SCALE + #undef ZLOAD_X2_SCALE + #undef ZLOAD_X1_SCALE + #undef ZLOAD_Y4 + #undef ZLOAD_Y2 + #undef ZSTORE_Y4 + #undef ZSTORE_Y2 + } + else + { /* neither x nor y has adjacent elements: _GP forms throughout */ + #define ZLOAD_X4_SCALE ZLOAD_X4_SCALE_GP + #define ZLOAD_X2_SCALE ZLOAD_X2_SCALE_GP + #define ZLOAD_X1_SCALE ZLOAD_X1_SCALE_GP + #define ZLOAD_Y4 ZLOAD_Y4_GP + #define ZLOAD_Y2
ZLOAD_Y2_GP + #define ZSTORE_Y4 ZSTORE_Y4_GP + #define ZSTORE_Y2 ZSTORE_Y2_GP + + ZGEMV_N_MSA(); + + #undef ZLOAD_X4_SCALE + #undef ZLOAD_X2_SCALE + #undef ZLOAD_X1_SCALE + #undef ZLOAD_Y4 + #undef ZLOAD_Y2 + #undef ZSTORE_Y4 + #undef ZSTORE_Y2 + } + return(0); +} + +#undef OP0 +#undef OP1 +#undef OP2 +#undef OP3 +#undef OP4 diff --git a/kernel/mips/zgemv_t_msa.c b/kernel/mips/zgemv_t_msa.c new file mode 100644 index 000000000..b2147b045 --- /dev/null +++ b/kernel/mips/zgemv_t_msa.c @@ -0,0 +1,544 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED.
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +#undef OP0 +#undef OP1 +#undef OP2 +#undef OP3 +#undef OP4 + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + #define OP0 -= + #define OP1 += + #define OP2 += +#else + #define OP0 += + #define OP1 += + #define OP2 -= +#endif + +#define ZGEMV_T_4x4() \ + LD_DP4(pa0 + k, 2, t0, t1, t2, t3); \ + LD_DP4(pa1 + k, 2, t4, t5, t6, t7); \ + LD_DP4(pa2 + k, 2, t8, t9, t10, t11); \ + LD_DP4(pa3 + k, 2, t12, t13, t14, t15); \ + \ + PCKEVOD_D2_DP(t1, t0, src0r, src0i); \ + PCKEVOD_D2_DP(t3, t2, src1r, src1i); \ + PCKEVOD_D2_DP(t5, t4, src2r, src2i); \ + PCKEVOD_D2_DP(t7, t6, src3r, src3i); \ + PCKEVOD_D2_DP(t9, t8, src4r, src4i); \ + PCKEVOD_D2_DP(t11, t10, src5r, src5i); \ + PCKEVOD_D2_DP(t13, t12, src6r, src6i); \ + PCKEVOD_D2_DP(t15, t14, src7r, src7i); \ + \ + tp0r += src0r * x0r; \ + tp0r += src1r * x1r; \ + tp0r OP0 src0i * x0i; \ + tp0r OP0 src1i * x1i; \ + \ + tp1r += src2r * x0r; \ + tp1r += src3r * x1r; \ + tp1r OP0 src2i * x0i; \ + tp1r OP0 src3i * x1i; \ + \ + tp2r += src4r * x0r; \ + tp2r += src5r * x1r; \ + tp2r OP0 src4i * x0i; \ + tp2r OP0 src5i * x1i; \ + \ + tp3r += src6r * x0r; \ + tp3r += src7r * x1r; \ + tp3r OP0 src6i * x0i; \ + tp3r OP0 src7i * x1i; \ + \ + tp0i OP1 src0r * x0i; \ + tp0i OP1 src1r * x1i; \ + tp0i OP2 src0i * x0r; \ + tp0i OP2 src1i * x1r; \ + \ + tp1i OP1 src2r * 
x0i; \ + tp1i OP1 src3r * x1i; \ + tp1i OP2 src2i * x0r; \ + tp1i OP2 src3i * x1r; \ + \ + tp2i OP1 src4r * x0i; \ + tp2i OP1 src5r * x1i; \ + tp2i OP2 src4i * x0r; \ + tp2i OP2 src5i * x1r; \ + \ + tp3i OP1 src6r * x0i; \ + tp3i OP1 src7r * x1i; \ + tp3i OP2 src6i * x0r; \ + tp3i OP2 src7i * x1r; \ + +#define ZGEMV_T_4x2() \ + LD_DP4(pa0 + k, 2, t0, t1, t2, t3); \ + LD_DP4(pa1 + k, 2, t4, t5, t6, t7); \ + \ + PCKEVOD_D2_DP(t1, t0, src0r, src0i); \ + PCKEVOD_D2_DP(t3, t2, src1r, src1i); \ + PCKEVOD_D2_DP(t5, t4, src2r, src2i); \ + PCKEVOD_D2_DP(t7, t6, src3r, src3i); \ + \ + tp0r += src0r * x0r; \ + tp0r += src1r * x1r; \ + tp0r OP0 src0i * x0i; \ + tp0r OP0 src1i * x1i; \ + \ + tp1r += src2r * x0r; \ + tp1r += src3r * x1r; \ + tp1r OP0 src2i * x0i; \ + tp1r OP0 src3i * x1i; \ + \ + tp0i OP1 src0r * x0i; \ + tp0i OP1 src1r * x1i; \ + tp0i OP2 src0i * x0r; \ + tp0i OP2 src1i * x1r; \ + \ + tp1i OP1 src2r * x0i; \ + tp1i OP1 src3r * x1i; \ + tp1i OP2 src2i * x0r; \ + tp1i OP2 src3i * x1r; \ + +#define ZGEMV_T_4x1() \ + LD_DP4(pa0 + k, 2, t0, t1, t2, t3); \ + \ + PCKEVOD_D2_DP(t1, t0, src0r, src0i); \ + PCKEVOD_D2_DP(t3, t2, src1r, src1i); \ + \ + tp0r += src0r * x0r; \ + tp0r += src1r * x1r; \ + tp0r OP0 src0i * x0i; \ + tp0r OP0 src1i * x1i; \ + \ + tp0i OP1 src0r * x0i; \ + tp0i OP1 src1r * x1i; \ + tp0i OP2 src0i * x0r; \ + tp0i OP2 src1i * x1r; \ + +#define ZGEMV_T_2x4() \ + LD_DP2(pa0 + k, 2, t0, t1); \ + LD_DP2(pa1 + k, 2, t4, t5); \ + LD_DP2(pa2 + k, 2, t8, t9); \ + LD_DP2(pa3 + k, 2, t12, t13); \ + \ + PCKEVOD_D2_DP(t1, t0, src0r, src0i); \ + PCKEVOD_D2_DP(t5, t4, src2r, src2i); \ + PCKEVOD_D2_DP(t9, t8, src4r, src4i); \ + PCKEVOD_D2_DP(t13, t12, src6r, src6i); \ + \ + tp0r += src0r * x0r; \ + tp0r OP0 src0i * x0i; \ + \ + tp1r += src2r * x0r; \ + tp1r OP0 src2i * x0i; \ + \ + tp2r += src4r * x0r; \ + tp2r OP0 src4i * x0i; \ + \ + tp3r += src6r * x0r; \ + tp3r OP0 src6i * x0i; \ + \ + tp0i OP1 src0r * x0i; \ + tp0i OP2 src0i * x0r; \ + \ + tp1i OP1 src2r * 
x0i; \ + tp1i OP2 src2i * x0r; \ + \ + tp2i OP1 src4r * x0i; \ + tp2i OP2 src4i * x0r; \ + \ + tp3i OP1 src6r * x0i; \ + tp3i OP2 src6i * x0r; \ + +#define ZGEMV_T_2x2() \ + LD_DP2(pa0 + k, 2, t0, t1); \ + LD_DP2(pa1 + k, 2, t4, t5); \ + \ + PCKEVOD_D2_DP(t1, t0, src0r, src0i); \ + PCKEVOD_D2_DP(t5, t4, src2r, src2i); \ + \ + tp0r += src0r * x0r; \ + tp0r OP0 src0i * x0i; \ + \ + tp1r += src2r * x0r; \ + tp1r OP0 src2i * x0i; \ + \ + tp0i OP1 src0r * x0i; \ + tp0i OP2 src0i * x0r; \ + \ + tp1i OP1 src2r * x0i; \ + tp1i OP2 src2i * x0r; \ + +#define ZGEMV_T_2x1() \ + LD_DP2(pa0 + k, 2, t0, t1); \ + \ + PCKEVOD_D2_DP(t1, t0, src0r, src0i); \ + \ + tp0r += src0r * x0r; \ + tp0r OP0 src0i * x0i; \ + \ + tp0i OP1 src0r * x0i; \ + tp0i OP2 src0i * x0r; \ + +#define ZGEMV_T_1x4() \ + temp0r += pa0[k + 0] * x[0 * inc_x2]; \ + temp0r OP0 pa0[k + 1] * x[0 * inc_x2 + 1]; \ + temp1r += pa1[k + 0] * x[0 * inc_x2]; \ + temp1r OP0 pa1[k + 1] * x[0 * inc_x2 + 1]; \ + temp2r += pa2[k + 0] * x[0 * inc_x2]; \ + temp2r OP0 pa2[k + 1] * x[0 * inc_x2 + 1]; \ + temp3r += pa3[k + 0] * x[0 * inc_x2]; \ + temp3r OP0 pa3[k + 1] * x[0 * inc_x2 + 1]; \ + \ + temp0i OP1 pa0[k + 0] * x[0 * inc_x2 + 1]; \ + temp0i OP2 pa0[k + 1] * x[0 * inc_x2]; \ + temp1i OP1 pa1[k + 0] * x[0 * inc_x2 + 1]; \ + temp1i OP2 pa1[k + 1] * x[0 * inc_x2]; \ + temp2i OP1 pa2[k + 0] * x[0 * inc_x2 + 1]; \ + temp2i OP2 pa2[k + 1] * x[0 * inc_x2]; \ + temp3i OP1 pa3[k + 0] * x[0 * inc_x2 + 1]; \ + temp3i OP2 pa3[k + 1] * x[0 * inc_x2]; \ + +#define ZGEMV_T_1x2() \ + temp0r += pa0[k + 0] * x[0 * inc_x2]; \ + temp0r OP0 pa0[k + 1] * x[0 * inc_x2 + 1]; \ + temp1r += pa1[k + 0] * x[0 * inc_x2]; \ + temp1r OP0 pa1[k + 1] * x[0 * inc_x2 + 1]; \ + \ + temp0i OP1 pa0[k + 0] * x[0 * inc_x2 + 1]; \ + temp0i OP2 pa0[k + 1] * x[0 * inc_x2]; \ + temp1i OP1 pa1[k + 0] * x[0 * inc_x2 + 1]; \ + temp1i OP2 pa1[k + 1] * x[0 * inc_x2]; \ + +#define ZGEMV_T_1x1() \ + temp0r += pa0[k + 0] * x[0 * inc_x2]; \ + temp0r OP0 pa0[k + 1] * x[0 * 
inc_x2 + 1]; \ + \ + temp0i OP1 pa0[k + 0] * x[0 * inc_x2 + 1]; \ + temp0i OP2 pa0[k + 1] * x[0 * inc_x2]; \ + +#define ZSCALE_STORE_Y4_GP() \ + res0r = y[0 * inc_y2]; \ + res1r = y[1 * inc_y2]; \ + res2r = y[2 * inc_y2]; \ + res3r = y[3 * inc_y2]; \ + \ + res0i = y[0 * inc_y2 + 1]; \ + res1i = y[1 * inc_y2 + 1]; \ + res2i = y[2 * inc_y2 + 1]; \ + res3i = y[3 * inc_y2 + 1]; \ + \ + res0r += alphar * temp0r; \ + res0r OP0 alphai * temp0i; \ + res1r += alphar * temp1r; \ + res1r OP0 alphai * temp1i; \ + res2r += alphar * temp2r; \ + res2r OP0 alphai * temp2i; \ + res3r += alphar * temp3r; \ + res3r OP0 alphai * temp3i; \ + \ + res0i OP1 alphar * temp0i; \ + res0i OP2 alphai * temp0r; \ + res1i OP1 alphar * temp1i; \ + res1i OP2 alphai * temp1r; \ + res2i OP1 alphar * temp2i; \ + res2i OP2 alphai * temp2r; \ + res3i OP1 alphar * temp3i; \ + res3i OP2 alphai * temp3r; \ + \ + y[0 * inc_y2] = res0r; \ + y[1 * inc_y2] = res1r; \ + y[2 * inc_y2] = res2r; \ + y[3 * inc_y2] = res3r; \ + \ + y[0 * inc_y2 + 1] = res0i; \ + y[1 * inc_y2 + 1] = res1i; \ + y[2 * inc_y2 + 1] = res2i; \ + y[3 * inc_y2 + 1] = res3i; \ + +#define ZSCALE_STORE_Y2_GP() \ + res0r = y[0 * inc_y2]; \ + res1r = y[1 * inc_y2]; \ + \ + res0i = y[0 * inc_y2 + 1]; \ + res1i = y[1 * inc_y2 + 1]; \ + \ + res0r += alphar * temp0r; \ + res0r OP0 alphai * temp0i; \ + res1r += alphar * temp1r; \ + res1r OP0 alphai * temp1i; \ + \ + res0i OP1 alphar * temp0i; \ + res0i OP2 alphai * temp0r; \ + res1i OP1 alphar * temp1i; \ + res1i OP2 alphai * temp1r; \ + \ + y[0 * inc_y2] = res0r; \ + y[1 * inc_y2] = res1r; \ + \ + y[0 * inc_y2 + 1] = res0i; \ + y[1 * inc_y2 + 1] = res1i; \ + +#define ZSCALE_STORE_Y1_GP() \ + res0r = y[0 * inc_y2]; \ + res0i = y[0 * inc_y2 + 1]; \ + \ + res0r += alphar * temp0r; \ + res0r OP0 alphai * temp0i; \ + \ + res0i OP1 alphar * temp0i; \ + res0i OP2 alphai * temp0r; \ + \ + y[0 * inc_y2] = res0r; \ + y[0 * inc_y2 + 1] = res0i; \ + +#define ZLOAD_X4_VECTOR() \ + LD_DP4(x, 2, x0, x1, x2, x3); 
\ + PCKEVOD_D2_DP(x1, x0, x0r, x0i); \ + PCKEVOD_D2_DP(x3, x2, x1r, x1i); \ + +#define ZLOAD_X2_VECTOR() \ + LD_DP2(x, 2, x0, x1); \ + PCKEVOD_D2_DP(x1, x0, x0r, x0i); \ + +#define ZLOAD_X4_GP() \ + x0r = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *) (x + 0 * inc_x2))); \ + x0r = (v2f64) __msa_insert_d((v2i64) x0r, 1, *((long long *) (x + 1 * inc_x2))); \ + x1r = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *) (x + 2 * inc_x2))); \ + x1r = (v2f64) __msa_insert_d((v2i64) x1r, 1, *((long long *) (x + 3 * inc_x2))); \ + x0i = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *) (x + 0 * inc_x2 + 1))); \ + x0i = (v2f64) __msa_insert_d((v2i64) x0i, 1, *((long long *) (x + 1 * inc_x2 + 1))); \ + x1i = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *) (x + 2 * inc_x2 + 1))); \ + x1i = (v2f64) __msa_insert_d((v2i64) x1i, 1, *((long long *) (x + 3 * inc_x2 + 1))); \ + +#define ZLOAD_X2_GP() \ + x0r = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *) (x + 0 * inc_x2))); \ + x0r = (v2f64) __msa_insert_d((v2i64) x0r, 1, *((long long *) (x + 1 * inc_x2))); \ + x0i = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *) (x + 0 * inc_x2 + 1))); \ + x0i = (v2f64) __msa_insert_d((v2i64) x0i, 1, *((long long *) (x + 1 * inc_x2 + 1))); \ + +#define ZGEMV_T_MSA() \ + for (j = (n >> 2); j--;) \ + { \ + tp0r = tp1r = tp2r = tp3r = zero; \ + tp0i = tp1i = tp2i = tp3i = zero; \ + \ + k = 0; \ + x = srcx_org; \ + \ + for (i = (m >> 2); i--;) \ + { \ + ZLOAD_X4(); \ + ZGEMV_T_4x4(); \ + \ + k += 2 * 4; \ + x += inc_x2 * 4; \ + } \ + \ + if (m & 2) \ + { \ + ZLOAD_X2(); \ + ZGEMV_T_2x4(); \ + \ + k += 2 * 2; \ + x += inc_x2 * 2; \ + } \ + \ + temp0r = tp0r[0] + tp0r[1]; \ + temp1r = tp1r[0] + tp1r[1]; \ + temp2r = tp2r[0] + tp2r[1]; \ + temp3r = tp3r[0] + tp3r[1]; \ + temp0i = tp0i[0] + tp0i[1]; \ + temp1i = tp1i[0] + tp1i[1]; \ + temp2i = tp2i[0] + tp2i[1]; \ + temp3i = tp3i[0] + tp3i[1]; \ + \ + if (m & 1) \ + { \ + ZGEMV_T_1x4(); \ + \ + k += 2; \ + x += 
inc_x2; \ + } \ + \ + ZSCALE_STORE_Y4_GP(); \ + \ + pa0 += 4 * lda2; \ + pa1 += 4 * lda2; \ + pa2 += 4 * lda2; \ + pa3 += 4 * lda2; \ + y += 4 * inc_y2; \ + } \ + \ + if (n & 2) \ + { \ + tp0r = tp1r = zero; \ + tp0i = tp1i = zero; \ + \ + k = 0; \ + x = srcx_org; \ + \ + for (i = (m >> 2); i--;) \ + { \ + ZLOAD_X4(); \ + ZGEMV_T_4x2(); \ + \ + k += 2 * 4; \ + x += inc_x2 * 4; \ + } \ + \ + if (m & 2) \ + { \ + ZLOAD_X2(); \ + ZGEMV_T_2x2(); \ + \ + k += 2 * 2; \ + x += inc_x2 * 2; \ + } \ + \ + temp0r = tp0r[0] + tp0r[1]; \ + temp1r = tp1r[0] + tp1r[1]; \ + temp0i = tp0i[0] + tp0i[1]; \ + temp1i = tp1i[0] + tp1i[1]; \ + \ + if (m & 1) \ + { \ + ZGEMV_T_1x2(); \ + \ + k += 2; \ + x += inc_x2; \ + } \ + \ + ZSCALE_STORE_Y2_GP(); \ + \ + pa0 += 2 * lda2; \ + pa1 += 2 * lda2; \ + y += 2 * inc_y2; \ + } \ + \ + if (n & 1) \ + { \ + tp0r = zero; \ + tp0i = zero; \ + \ + k = 0; \ + x = srcx_org; \ + \ + for (i = (m >> 2); i--;) \ + { \ + ZLOAD_X4(); \ + ZGEMV_T_4x1(); \ + \ + k += 2 * 4; \ + x += inc_x2 * 4; \ + } \ + \ + if (m & 2) \ + { \ + ZLOAD_X2(); \ + ZGEMV_T_2x1(); \ + \ + k += 2 * 2; \ + x += inc_x2 * 2; \ + } \ + \ + temp0r = tp0r[0] + tp0r[1]; \ + temp0i = tp0i[0] + tp0i[1]; \ + \ + if (m & 1) \ + { \ + ZGEMV_T_1x1(); \ + \ + k += 2; \ + x += inc_x2; \ + } \ + \ + ZSCALE_STORE_Y1_GP(); \ + \ + pa0 += lda2; \ + y += inc_y2; \ + } \ + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alphar, FLOAT alphai, + FLOAT *A, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, + BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i, j, k; + BLASLONG inc_x2, inc_y2, lda2; + FLOAT *pa0, *pa1, *pa2, *pa3; + FLOAT *srcx_org = x; + FLOAT temp0r, temp0i, temp2r, temp2i, temp1r, temp1i, temp3r, temp3i; + FLOAT res0r, res0i, res2r, res2i, res1r, res1i, res3r, res3i; + v2f64 zero = {0}; + v2f64 x0, x1, x2, x3, x0r, x1r, x0i, x1i; + v2f64 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15; + v2f64 src0r, src1r, src2r, src3r, src4r, src5r, src6r, src7r; + v2f64 
src0i, src1i, src2i, src3i, src4i, src5i, src6i, src7i; + v2f64 tp0r, tp1r, tp2r, tp3r, tp0i, tp1i, tp2i, tp3i; + + lda2 = 2 * lda; + + pa0 = A; + pa1 = A + lda2; + pa2 = A + 2 * lda2; + pa3 = A + 3 * lda2; + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + if (2 == inc_x2) + { + #define ZLOAD_X4 ZLOAD_X4_VECTOR + #define ZLOAD_X2 ZLOAD_X2_VECTOR + + ZGEMV_T_MSA(); + + #undef ZLOAD_X4 + #undef ZLOAD_X2 + } + else + { + #define ZLOAD_X4 ZLOAD_X4_GP + #define ZLOAD_X2 ZLOAD_X2_GP + + ZGEMV_T_MSA(); + + #undef ZLOAD_X4 + #undef ZLOAD_X2 + } + + return(0); +} + +#undef OP0 +#undef OP1 +#undef OP2 diff --git a/kernel/mips64/KERNEL.P6600 b/kernel/mips64/KERNEL.P6600 new file mode 100644 index 000000000..abf44814a --- /dev/null +++ b/kernel/mips64/KERNEL.P6600 @@ -0,0 +1 @@ +include $(KERNELDIR)/../mips/KERNEL.P5600 diff --git a/param.h b/param.h index 1a0cc6180..555829d45 100644 --- a/param.h +++ b/param.h @@ -2174,7 +2174,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SYMV_P 16 #endif -#if defined(I6400) +#if defined(I6400) || defined(P6600) #define SNUMOPT 2 #define DNUMOPT 2 @@ -2190,7 +2190,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CGEMM_DEFAULT_UNROLL_M 8 #define CGEMM_DEFAULT_UNROLL_N 4 - + #define ZGEMM_DEFAULT_UNROLL_M 4 #define ZGEMM_DEFAULT_UNROLL_N 4 From 22b9c2747dec5d7b6749daaf6961b94efbdd07ed Mon Sep 17 00:00:00 2001 From: Shivraj Patil Date: Fri, 22 Jul 2016 18:45:06 +0530 Subject: [PATCH 63/70] P6600/I6400 Build fix. 
Reverted the changes which was done to support for MIPS n32 ABI Signed-off-by: Shivraj Patil --- Makefile.system | 27 ++++++++++++++++++--------- c_check | 13 ++++--------- f_check | 7 +------ 3 files changed, 23 insertions(+), 24 deletions(-) diff --git a/Makefile.system b/Makefile.system index 1c48a251c..e3abb5723 100644 --- a/Makefile.system +++ b/Makefile.system @@ -502,15 +502,12 @@ endif ifdef NO_BINARY_MODE -ifeq ($(ARCH), $(filter $(ARCH),mips64)) +ifeq ($(ARCH), $(filter $(ARCH),mips64 mips)) ifdef BINARY64 CCOMMON_OPT += -mabi=64 else -CCOMMON_OPT += -mabi=n32 -endif -BINARY_DEFINED = 1 -else ifeq ($(ARCH), $(filter $(ARCH),mips)) CCOMMON_OPT += -mabi=32 +endif BINARY_DEFINED = 1 endif @@ -602,15 +599,13 @@ ifneq ($(NO_LAPACK), 1) EXTRALIB += -lgfortran endif ifdef NO_BINARY_MODE -ifeq ($(ARCH), $(filter $(ARCH),mips64)) +ifeq ($(ARCH), $(filter $(ARCH),mips64 mips)) ifdef BINARY64 FCOMMON_OPT += -mabi=64 else -FCOMMON_OPT += -mabi=n32 -endif -else ifeq ($(ARCH), $(filter $(ARCH),mips)) FCOMMON_OPT += -mabi=32 endif +endif else ifdef BINARY64 FCOMMON_OPT += -m64 @@ -693,6 +688,20 @@ endif endif endif +ifeq ($(filter $(ARCH),mips64 mips)) +ifndef BINARY64 +FCOMMON_OPT += -m32 +else +FCOMMON_OPT += -m64 +endif +else +ifdef BINARY64 +FCOMMON_OPT += -mabi=64 +else +FCOMMON_OPT += -mabi=32 +endif +endif + ifeq ($(USE_OPENMP), 1) FCOMMON_OPT += -mp endif diff --git a/c_check b/c_check index 9f457dfec..4569d842a 100644 --- a/c_check +++ b/c_check @@ -79,15 +79,10 @@ if ($os eq "AIX") { $defined = 1; } -if ($architecture eq "mips") { - $compiler_name .= " -mabi=32"; - $defined = 1; -} - -if ($architecture eq "mips64") { - $compiler_name .= " -mabi=n32" if ($binary eq "32"); - $compiler_name .= " -mabi=64" if ($binary eq "64"); - $defined = 1; +if (($architecture eq "mips") || ($architecture eq "mips64")) { + $compiler_name .= " -mabi=32" if ($binary eq "32"); + $compiler_name .= " -mabi=64" if ($binary eq "64"); + $defined = 1; } if (($architecture eq "arm") || 
($architecture eq "arm64")) { diff --git a/f_check b/f_check index c7ad964e0..171c91f95 100644 --- a/f_check +++ b/f_check @@ -223,12 +223,7 @@ if (!$?) { } #For gfortran MIPS if ($?) { - $mips_data = `$compiler_bin -E -dM - < /dev/null`; - if ($mips_data =~ /_MIPS_ISA_MIPS64/) { - $link = `$compiler $openmp -mabi=n32 -v ftest2.f 2>&1 && rm -f a.out a.exe`; - } else { - $link = `$compiler $openmp -mabi=32 -v ftest2.f 2>&1 && rm -f a.out a.exe`; - } + $link = `$compiler $openmp -mabi=32 -v ftest2.f 2>&1 && rm -f a.out a.exe`; } $binary = "" if ($?); } From c54a29bb4837fa9f1c4be6159bf6cad96352e553 Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Mon, 25 Jul 2016 14:33:25 +0530 Subject: [PATCH 64/70] Cortex A57: Improvements to DGEMM 8x4 kernel --- kernel/arm64/dgemm_kernel_8x4.S | 191 +++++++++++++++++++++++++------- 1 file changed, 151 insertions(+), 40 deletions(-) diff --git a/kernel/arm64/dgemm_kernel_8x4.S b/kernel/arm64/dgemm_kernel_8x4.S index f3c3d5c35..3fd74fc3b 100644 --- a/kernel/arm64/dgemm_kernel_8x4.S +++ b/kernel/arm64/dgemm_kernel_8x4.S @@ -339,7 +339,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stp q0, q1, [pCRow0] add pCRow0, pCRow0, #32 - prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] ldp q2, q3, [pCRow0] fmla v2.2d, v18.2d, alphaV0 @@ -356,7 +355,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stp q4, q5, [pCRow1] add pCRow1, pCRow1, #32 - prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] ldp q6, q7, [pCRow1] fmla v6.2d, v22.2d, alphaV0 @@ -373,7 +371,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stp q0, q1, [pCRow2] add pCRow2, pCRow2, #32 - prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] ldp q2, q3, [pCRow2] fmla v2.2d, v26.2d, alphaV0 @@ -390,7 +387,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
stp q4, q5, [pCRow3] add pCRow3, pCRow3, #32 - prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] ldp q6, q7, [pCRow3] fmla v6.2d, v30.2d, alphaV0 @@ -434,33 +430,38 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE4x4 fmov alpha0, alpha + ld1 {v8.2d, v9.2d}, [pCRow0] fmla v8.2d, v16.2d, alphaV0 fmla v9.2d, v17.2d, alphaV0 st1 {v8.2d, v9.2d}, [pCRow0] - add pCRow1, pCRow0, LDC + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + add pCRow0, pCRow0, #32 ld1 {v12.2d, v13.2d}, [pCRow1] fmla v12.2d, v20.2d, alphaV0 fmla v13.2d, v21.2d, alphaV0 st1 {v12.2d, v13.2d}, [pCRow1] - add pCRow2, pCRow1, LDC + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + add pCRow1, pCRow1, #32 ld1 {v8.2d, v9.2d}, [pCRow2] fmla v8.2d, v24.2d, alphaV0 fmla v9.2d, v25.2d, alphaV0 st1 {v8.2d, v9.2d}, [pCRow2] - add pCRow1, pCRow2, LDC + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + add pCRow2, pCRow2, #32 - ld1 {v12.2d, v13.2d}, [pCRow1] + ld1 {v12.2d, v13.2d}, [pCRow3] fmla v12.2d, v28.2d, alphaV0 fmla v13.2d, v29.2d, alphaV0 - st1 {v12.2d, v13.2d}, [pCRow1] + st1 {v12.2d, v13.2d}, [pCRow3] - add pCRow0, pCRow0, #32 + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + add pCRow3, pCRow3, #32 .endm /******************************************************************************/ @@ -487,29 +488,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro SAVE2x4 fmov alpha0, alpha + ld1 {v8.2d}, [pCRow0] fmla v8.2d, v16.2d, alphaV0 st1 {v8.2d}, [pCRow0] - add pCRow1, pCRow0, LDC + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + add pCRow0, pCRow0, #16 ld1 {v12.2d}, [pCRow1] fmla v12.2d, v20.2d, alphaV0 st1 {v12.2d}, [pCRow1] - add pCRow2, pCRow1, LDC + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + add pCRow1, pCRow1, #16 ld1 {v8.2d}, [pCRow2] fmla v8.2d, v24.2d, alphaV0 st1 {v8.2d}, [pCRow2] - add pCRow1, pCRow2, LDC + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + add pCRow2, pCRow2, #16 - ld1 {v12.2d}, [pCRow1] + ld1 {v12.2d}, [pCRow3] fmla v12.2d, v28.2d, alphaV0 - st1 {v12.2d}, [pCRow1] + st1 {v12.2d}, [pCRow3] - add pCRow0, pCRow0, #16 + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + add pCRow3, pCRow3, #16 .endm /******************************************************************************/ @@ -532,7 +538,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE1x4 fmov alpha0, alpha - add pCRow1, pCRow0, LDC ld1 {v8.d}[0], [pCRow0] ld1 {v8.d}[1], [pCRow1] @@ -540,16 +545,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. st1 {v8.d}[0], [pCRow0] st1 {v8.d}[1], [pCRow1] - add pCRow2, pCRow1, LDC - add pCRow1, pCRow2, LDC + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + add pCRow0, pCRow0, #8 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + add pCRow1, pCRow1, #8 ld1 {v12.d}[0], [pCRow2] - ld1 {v12.d}[1], [pCRow1] + ld1 {v12.d}[1], [pCRow3] fmla v12.2d, v20.2d, alphaV0 st1 {v12.d}[0], [pCRow2] - st1 {v12.d}[1], [pCRow1] + st1 {v12.d}[1], [pCRow3] - add pCRow0, pCRow0, #8 + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + add pCRow2, pCRow2, #8 + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + add pCRow3, pCRow3, #8 .endm /******************************************************************************/ @@ -578,6 +588,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
fmla v18.2d, v2.2d, v8.d[0] fmla v19.2d, v3.2d, v8.d[0] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla v20.2d, v0.2d, v8.d[1] fmla v21.2d, v1.2d, v8.d[1] fmla v22.2d, v2.2d, v8.d[1] @@ -586,7 +598,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE8x2 fmov alpha0, alpha - add pCRow1, pCRow0, LDC ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] fmla v0.2d, v16.2d, alphaV0 @@ -595,6 +606,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmla v3.2d, v19.2d, alphaV0 st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + add pCRow0, pCRow0, #64 + ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] fmla v4.2d, v20.2d, alphaV0 fmla v5.2d, v21.2d, alphaV0 @@ -602,7 +616,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmla v7.2d, v23.2d, alphaV0 st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] - add pCRow0, pCRow0, #64 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + add pCRow1, pCRow1, #64 .endm /******************************************************************************/ @@ -628,19 +643,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE4x2 fmov alpha0, alpha + ld1 {v8.2d, v9.2d}, [pCRow0] fmla v8.2d, v16.2d, alphaV0 fmla v9.2d, v17.2d, alphaV0 st1 {v8.2d, v9.2d}, [pCRow0] - add pCRow1, pCRow0, LDC + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + add pCRow0, pCRow0, #32 ld1 {v12.2d, v13.2d}, [pCRow1] fmla v12.2d, v20.2d, alphaV0 fmla v13.2d, v21.2d, alphaV0 st1 {v12.2d, v13.2d}, [pCRow1] - add pCRow0, pCRow0, #32 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + add pCRow1, pCRow1, #32 .endm /******************************************************************************/ @@ -663,17 +681,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro SAVE2x2 fmov alpha0, alpha + ld1 {v8.2d}, [pCRow0] fmla v8.2d, v16.2d, alphaV0 st1 {v8.2d}, [pCRow0] - add pCRow1 , pCRow0, LDC + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + add pCRow0, pCRow0, #16 ld1 {v12.2d}, [pCRow1] fmla v12.2d, v20.2d, alphaV0 st1 {v12.2d}, [pCRow1] - add pCRow0, pCRow0, #16 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + add pCRow1, pCRow1, #16 .endm /******************************************************************************/ @@ -694,7 +715,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE1x2 fmov alpha0, alpha - add pCRow1 , pCRow0, LDC ld1 {v8.d}[0], [pCRow0] ld1 {v8.d}[1], [pCRow1] @@ -702,7 +722,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. st1 {v8.d}[0], [pCRow0] st1 {v8.d}[1], [pCRow1] + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] add pCRow0, pCRow0, #8 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + add pCRow1, pCRow1, #8 .endm /******************************************************************************/ @@ -726,12 +749,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmla v16.2d, v0.2d, v8.d[0] fmla v17.2d, v1.2d, v8.d[0] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] fmla v18.2d, v2.2d, v8.d[0] fmla v19.2d, v3.2d, v8.d[0] .endm .macro SAVE8x1 fmov alpha0, alpha + ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] fmla v0.2d, v16.2d, alphaV0 fmla v1.2d, v17.2d, alphaV0 @@ -739,6 +764,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmla v3.2d, v19.2d, alphaV0 st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] add pCRow0, pCRow0, #64 .endm @@ -763,11 +789,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro SAVE4x1 fmov alpha0, alpha + ld1 {v8.2d, v9.2d}, [pCRow0] fmla v8.2d, v16.2d, alphaV0 fmla v9.2d, v17.2d, alphaV0 st1 {v8.2d, v9.2d}, [pCRow0] + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] add pCRow0, pCRow0, #32 .endm @@ -790,10 +818,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE2x1 fmov alpha0, alpha + ld1 {v8.2d}, [pCRow0] fmla v8.2d, v16.2d, alphaV0 st1 {v8.2d}, [pCRow0] + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] add pCRow0, pCRow0, #16 .endm @@ -819,6 +849,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmadd d8, d16, alpha0, d8 str d8, [pCRow0] + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] add pCRow0, pCRow0, #8 .endm @@ -858,6 +889,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /******************************************************************************/ + .align 5 dgemm_kernel_L4_BEGIN: mov pCRow0, pC add pCRow1, pCRow0, LDC @@ -989,17 +1021,26 @@ dgemm_kernel_L4_M4_20: cmp counterL , #0 ble dgemm_kernel_L4_M4_40 + .align 5 dgemm_kernel_L4_M4_22: KERNEL4x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL4x4_SUB + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] KERNEL4x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL4x4_SUB + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] KERNEL4x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL4x4_SUB + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] KERNEL4x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL4x4_SUB + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] subs counterL, counterL, #1 bgt dgemm_kernel_L4_M4_22 @@ -1012,6 +1053,8 @@ dgemm_kernel_L4_M4_40: dgemm_kernel_L4_M4_42: KERNEL4x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] subs counterL, counterL, #1 bgt dgemm_kernel_L4_M4_42 @@ -1022,7 +1065,6 @@ dgemm_kernel_L4_M4_100: dgemm_kernel_L4_M4_END: - dgemm_kernel_L4_M2_BEGIN: mov counterI, origM @@ -1042,16 +1084,23 @@ dgemm_kernel_L4_M2_20: cmp counterL , #0 ble dgemm_kernel_L4_M2_40 + .align 5 dgemm_kernel_L4_M2_22: KERNEL2x4_SUB + prfm 
PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL2x4_SUB + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] KERNEL2x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL2x4_SUB KERNEL2x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL2x4_SUB + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] KERNEL2x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL2x4_SUB subs counterL, counterL, #1 @@ -1063,9 +1112,12 @@ dgemm_kernel_L4_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 ble dgemm_kernel_L4_M2_100 + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] dgemm_kernel_L4_M2_42: KERNEL2x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] subs counterL, counterL, #1 bgt dgemm_kernel_L4_M2_42 @@ -1092,15 +1144,22 @@ dgemm_kernel_L4_M1_20: cmp counterL , #0 ble dgemm_kernel_L4_M1_40 + .align 5 dgemm_kernel_L4_M1_22: KERNEL1x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL1x4_SUB KERNEL1x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL1x4_SUB + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + KERNEL1x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL1x4_SUB KERNEL1x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL1x4_SUB subs counterL, counterL, #1 @@ -1112,9 +1171,11 @@ dgemm_kernel_L4_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 ble dgemm_kernel_L4_M1_100 + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] dgemm_kernel_L4_M1_42: KERNEL1x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] subs counterL, counterL, #1 bgt dgemm_kernel_L4_M1_42 @@ -1143,9 +1204,10 @@ dgemm_kernel_L2_BEGIN: // less than 2 left in N direction tst counterJ , #2 ble dgemm_kernel_L1_BEGIN - mov pCRow0, pC // pCRow0 = pC + mov pCRow0, pC + add pCRow1, pCRow0, LDC - add pC,pC,LDC, lsl #1 + add pC, pCRow1, LDC mov pA, origPA // pA = A @@ -1156,6 +1218,7 @@ dgemm_kernel_L2_M8_BEGIN: cmp counterI, #0 ble dgemm_kernel_L2_M4_BEGIN + .align 5 dgemm_kernel_L2_M8_20: INIT8x2 @@ -1165,28 +1228,31 @@ dgemm_kernel_L2_M8_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 ble dgemm_kernel_L2_M8_40 - .align 5 + .align 5 dgemm_kernel_L2_M8_22: 
KERNEL8x2_SUB KERNEL8x2_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL8x2_SUB KERNEL8x2_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L2_M8_22 - dgemm_kernel_L2_M8_40: ands counterL , origK, #7 // counterL = counterL % 8 ble dgemm_kernel_L2_M8_100 + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] dgemm_kernel_L2_M8_42: KERNEL8x2_SUB @@ -1221,17 +1287,23 @@ dgemm_kernel_L2_M4_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 ble dgemm_kernel_L2_M4_40 - .align 5 + .align 5 dgemm_kernel_L2_M4_22: KERNEL4x2_SUB + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] KERNEL4x2_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL4x2_SUB + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] KERNEL4x2_SUB KERNEL4x2_SUB + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] KERNEL4x2_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL4x2_SUB + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] KERNEL4x2_SUB subs counterL, counterL, #1 @@ -1243,9 +1315,12 @@ dgemm_kernel_L2_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 ble dgemm_kernel_L2_M4_100 + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] dgemm_kernel_L2_M4_42: KERNEL4x2_SUB + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] subs counterL, counterL, #1 bgt dgemm_kernel_L2_M4_42 @@ -1279,19 +1354,26 @@ dgemm_kernel_L2_M2_20: dgemm_kernel_L2_M2_22: KERNEL2x2_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL2x2_SUB KERNEL2x2_SUB + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] KERNEL2x2_SUB KERNEL2x2_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL2x2_SUB KERNEL2x2_SUB + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] KERNEL2x2_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L2_M2_22 - + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] dgemm_kernel_L2_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 @@ -1329,18 +1411,24 @@ dgemm_kernel_L2_M1_20: 
dgemm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL1x2_SUB KERNEL1x2_SUB + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + KERNEL1x2_SUB KERNEL1x2_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL1x2_SUB KERNEL1x2_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L2_M1_22 - + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] dgemm_kernel_L2_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 @@ -1380,6 +1468,7 @@ dgemm_kernel_L1_M8_BEGIN: cmp counterI, #0 ble dgemm_kernel_L1_M4_BEGIN + .align 5 dgemm_kernel_L1_M8_20: INIT8x1 @@ -1388,14 +1477,16 @@ dgemm_kernel_L1_M8_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble dgemm_kernel_L1_M8_40 - .align 5 + .align 5 dgemm_kernel_L1_M8_22: KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB @@ -1410,6 +1501,7 @@ dgemm_kernel_L1_M8_40: ands counterL , origK, #7 // counterL = counterL % 8 ble dgemm_kernel_L1_M8_100 + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] dgemm_kernel_L1_M8_42: KERNEL8x1_SUB @@ -1443,17 +1535,23 @@ dgemm_kernel_L1_M4_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble dgemm_kernel_L1_M4_40 - .align 5 + .align 5 dgemm_kernel_L1_M4_22: KERNEL4x1_SUB + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] KERNEL4x1_SUB KERNEL4x1_SUB + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] KERNEL4x1_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNEL4x1_SUB + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] KERNEL4x1_SUB KERNEL4x1_SUB + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] KERNEL4x1_SUB subs counterL, counterL, #1 @@ -1465,9 +1563,11 @@ dgemm_kernel_L1_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 ble dgemm_kernel_L1_M4_100 + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] dgemm_kernel_L1_M4_42: KERNEL4x1_SUB + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] subs counterL, counterL, #1 bgt dgemm_kernel_L1_M4_42 @@ -1501,18 +1601,24 @@ dgemm_kernel_L1_M2_22: 
KERNEL2x1_SUB KERNEL2x1_SUB + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] KERNEL2x1_SUB KERNEL2x1_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNEL2x1_SUB KERNEL2x1_SUB + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] KERNEL2x1_SUB KERNEL2x1_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L1_M2_22 - + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] dgemm_kernel_L1_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 @@ -1547,14 +1653,17 @@ dgemm_kernel_L1_M1_20: cmp counterL , #0 ble dgemm_kernel_L1_M1_40 + dgemm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL1x1_SUB KERNEL1x1_SUB @@ -1567,6 +1676,8 @@ dgemm_kernel_L1_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 ble dgemm_kernel_L1_M1_100 + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] dgemm_kernel_L1_M1_42: KERNEL1x1_SUB From d1c6469283987bb79c920586ab213abbbc24f2ab Mon Sep 17 00:00:00 2001 From: Shivraj Patil Date: Mon, 8 Aug 2016 11:58:01 +0530 Subject: [PATCH 65/70] MIPS n32 ABI support, MSA support detection and rename ARCH, ARCHFLAGS Signed-off-by: Shivraj Patil --- Makefile | 4 +- Makefile.prebuild | 16 +++- Makefile.system | 44 ++++----- c_check | 38 +++++++- f_check | 7 +- kernel/Makefile | 1 + kernel/mips/KERNEL.P5600 | 96 ++++++++++++++++++- lapack-netlib/BLAS/SRC/Makefile | 10 +- lapack-netlib/CBLAS/Makefile.in | 4 +- lapack-netlib/CBLAS/src/Makefile | 32 +++---- lapack-netlib/DOCS/lawn81.tex | 2 +- lapack-netlib/INSTALL/make.inc.ALPHA | 6 +- lapack-netlib/INSTALL/make.inc.HPPA | 6 +- lapack-netlib/INSTALL/make.inc.IRIX64 | 6 +- lapack-netlib/INSTALL/make.inc.O2K | 6 +- lapack-netlib/INSTALL/make.inc.SGI5 | 6 +- lapack-netlib/INSTALL/make.inc.SUN4 | 6 +- lapack-netlib/INSTALL/make.inc.SUN4SOL2 | 6 +- lapack-netlib/INSTALL/make.inc.XLF | 6 +- lapack-netlib/INSTALL/make.inc.gfortran | 6 +- 
lapack-netlib/INSTALL/make.inc.gfortran_debug | 6 +- lapack-netlib/INSTALL/make.inc.ifort | 6 +- lapack-netlib/INSTALL/make.inc.pgf95 | 6 +- lapack-netlib/INSTALL/make.inc.pghpf | 6 +- lapack-netlib/LAPACKE/src/Makefile | 4 +- lapack-netlib/LAPACKE/utils/Makefile | 2 +- lapack-netlib/SRC/Makefile | 10 +- lapack-netlib/SRC/VARIANTS/Makefile | 12 +-- lapack-netlib/TESTING/MATGEN/Makefile | 10 +- lapack-netlib/make.inc.example | 6 +- make.inc | 2 +- param.h | 48 +++------- 32 files changed, 266 insertions(+), 160 deletions(-) diff --git a/Makefile b/Makefile index 2ae004798..693808127 100644 --- a/Makefile +++ b/Makefile @@ -228,8 +228,8 @@ ifndef NOFORTRAN -@echo "LOADOPTS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "override CFLAGS = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "ARCH = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "ARCHFLAGS = -ru" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "AR = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "ARFLAGS = -ru" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "RANLIB = $(RANLIB)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "LAPACKLIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "TMGLIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc diff --git a/Makefile.prebuild b/Makefile.prebuild index ee0b67787..524f0a741 100644 --- a/Makefile.prebuild +++ b/Makefile.prebuild @@ -17,14 +17,26 @@ ifdef CPUIDEMU EXFLAGS = -DCPUIDEMU -DVENDOR=99 endif +ifeq ($(TARGET), P5600) +TARGET_FLAGS = -mips32r5 +endif + +ifeq ($(TARGET), I6400) +TARGET_FLAGS = -mips64r6 +endif + +ifeq ($(TARGET), P6600) +TARGET_FLAGS = -mips64r6 +endif + all: getarch_2nd ./getarch_2nd 0 >> $(TARGET_MAKE) ./getarch_2nd 1 >> $(TARGET_CONF) config.h : c_check f_check getarch - perl ./c_check $(TARGET_MAKE) $(TARGET_CONF) $(CC) + perl ./c_check $(TARGET_MAKE) $(TARGET_CONF) $(CC) $(TARGET_FLAGS) ifneq ($(ONLY_CBLAS), 1) - perl ./f_check $(TARGET_MAKE) $(TARGET_CONF) $(FC) 
+ perl ./f_check $(TARGET_MAKE) $(TARGET_CONF) $(FC) $(TARGET_FLAGS) else #When we only build CBLAS, we set NOFORTRAN=2 echo "NOFORTRAN=2" >> $(TARGET_MAKE) diff --git a/Makefile.system b/Makefile.system index e3abb5723..47b197f6f 100644 --- a/Makefile.system +++ b/Makefile.system @@ -159,7 +159,7 @@ ifndef GOTOBLAS_MAKEFILE export GOTOBLAS_MAKEFILE = 1 # Generating Makefile.conf and config.h -DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" CFLAGS="$(GETARCH_FLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) all) +DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" CFLAGS="$(GETARCH_FLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) TARGET=$(TARGET) all) ifndef TARGET_CORE include $(TOPDIR)/Makefile.conf @@ -502,13 +502,16 @@ endif ifdef NO_BINARY_MODE -ifeq ($(ARCH), $(filter $(ARCH),mips64 mips)) +ifeq ($(ARCH), $(filter $(ARCH),mips64)) ifdef BINARY64 CCOMMON_OPT += -mabi=64 else -CCOMMON_OPT += -mabi=32 +CCOMMON_OPT += -mabi=n32 endif BINARY_DEFINED = 1 +else ifeq ($(ARCH), $(filter $(ARCH),mips)) +CCOMMON_OPT += -mabi=32 +BINARY_DEFINED = 1 endif ifeq ($(CORE), LOONGSON3A) @@ -522,13 +525,18 @@ FCOMMON_OPT += -march=mips64 endif ifeq ($(CORE), P5600) -CCOMMON_OPT += -mmsa -FCOMMON_OPT += -mmsa +CCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MIPS_SIMD_FLAGS) +FCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MIPS_SIMD_FLAGS) +endif + +ifeq ($(CORE), I6400) +CCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=i6400 $(MIPS_SIMD_FLAGS) +FCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=i6400 $(MIPS_SIMD_FLAGS) endif -ifneq ($(filter $(CORE), I6400 P6600),) -CCOMMON_OPT += -mmsa -FCOMMON_OPT += -mmsa +ifeq ($(CORE), P6600) +CCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=p6600 $(MIPS_SIMD_FLAGS) +FCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=p6600 $(MIPS_SIMD_FLAGS) endif ifeq 
($(OSNAME), AIX) @@ -599,12 +607,14 @@ ifneq ($(NO_LAPACK), 1) EXTRALIB += -lgfortran endif ifdef NO_BINARY_MODE -ifeq ($(ARCH), $(filter $(ARCH),mips64 mips)) +ifeq ($(ARCH), $(filter $(ARCH),mips64)) ifdef BINARY64 FCOMMON_OPT += -mabi=64 else -FCOMMON_OPT += -mabi=32 +FCOMMON_OPT += -mabi=n32 endif +else ifeq ($(ARCH), $(filter $(ARCH),mips)) +FCOMMON_OPT += -mabi=32 endif else ifdef BINARY64 @@ -687,20 +697,6 @@ FCOMMON_OPT += -i8 endif endif endif - -ifeq ($(filter $(ARCH),mips64 mips)) -ifndef BINARY64 -FCOMMON_OPT += -m32 -else -FCOMMON_OPT += -m64 -endif -else -ifdef BINARY64 -FCOMMON_OPT += -mabi=64 -else -FCOMMON_OPT += -mabi=32 -endif -endif ifeq ($(USE_OPENMP), 1) FCOMMON_OPT += -mp diff --git a/c_check b/c_check index 4569d842a..a7c7aac2c 100644 --- a/c_check +++ b/c_check @@ -1,6 +1,7 @@ #!/usr/bin/perl use File::Basename; +use File::Temp qw(tempfile); # Checking cross compile $hostos = `uname -s | sed -e s/\-.*//`; chop($hostos); @@ -10,6 +11,7 @@ $hostarch = "arm" if ($hostarch =~ /^arm.*/); $hostarch = "arm64" if ($hostarch eq "aarch64"); $hostarch = "power" if ($hostarch =~ /^(powerpc|ppc).*/); +$tmpf = new File::Temp( UNLINK => 1 ); $binary = $ENV{"BINARY"}; $makefile = shift(@ARGV); @@ -79,10 +81,15 @@ if ($os eq "AIX") { $defined = 1; } -if (($architecture eq "mips") || ($architecture eq "mips64")) { - $compiler_name .= " -mabi=32" if ($binary eq "32"); - $compiler_name .= " -mabi=64" if ($binary eq "64"); - $defined = 1; +if ($architecture eq "mips") { + $compiler_name .= " -mabi=32"; + $defined = 1; +} + +if ($architecture eq "mips64") { + $compiler_name .= " -mabi=n32" if ($binary eq "32"); + $compiler_name .= " -mabi=64" if ($binary eq "64"); + $defined = 1; } if (($architecture eq "arm") || ($architecture eq "arm64")) { @@ -152,6 +159,26 @@ if ($?) 
{ die 1; } +$mips_simd = 1; +if (($architecture eq "mips") || ($architecture eq "mips64")) { + $code = '"addvi.b $w0, $w1, 1"'; + $msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs"; + print $tmpf "#include \n\n"; + print $tmpf "void main(void){ __asm__ volatile($code); }\n"; + + $ret = 0; + $args = "$msa_flags -o $tmpf.o -x c $tmpf"; + my @cmd = ("$compiler_name $args"); + system(@cmd) == 0; + if ($? != 0) { + $ret = ($? >> 8); + } + unlink("$tmpf.o"); + if($ret != 0) { + $mips_simd = 0; + } +} + $architecture = x86 if ($data =~ /ARCH_X86/); $architecture = x86_64 if ($data =~ /ARCH_X86_64/); $architecture = power if ($data =~ /ARCH_POWER/); @@ -246,6 +273,8 @@ print MAKEFILE "FU=$need_fu\n" if $need_fu ne ""; print MAKEFILE "CROSS_SUFFIX=$cross_suffix\n" if $cross != 0 && $cross_suffix ne ""; print MAKEFILE "CROSS=1\n" if $cross != 0; print MAKEFILE "CEXTRALIB=$linker_L $linker_l $linker_a\n"; +print MAKEFILE "MIPS_SIMD=1\n" if $mips_simd eq 1; +print MAKEFILE "MIPS_SIMD_FLAGS=$msa_flags\n" if $mips_simd eq 1; $os =~ tr/[a-z]/[A-Z]/; $architecture =~ tr/[a-z]/[A-Z]/; @@ -257,6 +286,7 @@ print CONFFILE "#define C_$compiler\t1\n"; print CONFFILE "#define __32BIT__\t1\n" if $binformat eq bin32; print CONFFILE "#define __64BIT__\t1\n" if $binformat eq bin64; print CONFFILE "#define FUNDERSCORE\t$need_fu\n" if $need_fu ne ""; +print CONFFILE "#define MIPS_SIMD\t1\n" if $mips_simd eq 1; if ($os eq "LINUX") { diff --git a/f_check b/f_check index 171c91f95..2f01f1c44 100644 --- a/f_check +++ b/f_check @@ -223,7 +223,12 @@ if (!$?) { } #For gfortran MIPS if ($?) 
{ - $link = `$compiler $openmp -mabi=32 -v ftest2.f 2>&1 && rm -f a.out a.exe`; + $mips_data = `$compiler_bin -E -dM - < /dev/null`; + if ($mips_data =~ /_MIPS_ISA_MIPS64/) { + $link = `$compiler $openmp -mabi=n32 -v ftest2.f 2>&1 && rm -f a.out a.exe`; + } else { + $link = `$compiler $openmp -mabi=32 -v ftest2.f 2>&1 && rm -f a.out a.exe`; + } } $binary = "" if ($?); } diff --git a/kernel/Makefile b/kernel/Makefile index a0a8fcd21..8237549f3 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -4,6 +4,7 @@ endif TOPDIR = .. include $(TOPDIR)/Makefile.system +include $(TOPDIR)/Makefile.conf ifdef TARGET_CORE override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) diff --git a/kernel/mips/KERNEL.P5600 b/kernel/mips/KERNEL.P5600 index 7bf90c905..92c0b3c8a 100644 --- a/kernel/mips/KERNEL.P5600 +++ b/kernel/mips/KERNEL.P5600 @@ -30,10 +30,17 @@ IDMAXKERNEL = ../mips/imax.c ISMINKERNEL = ../mips/imin.c IDMINKERNEL = ../mips/imin.c +ifdef MIPS_SIMD SASUMKERNEL = ../mips/sasum_msa.c DASUMKERNEL = ../mips/dasum_msa.c CASUMKERNEL = ../mips/casum_msa.c ZASUMKERNEL = ../mips/zasum_msa.c +else +SASUMKERNEL = ../mips/asum.c +DASUMKERNEL = ../mips/asum.c +CASUMKERNEL = ../mips/asum.c +ZASUMKERNEL = ../mips/asum.c +endif SAXPYKERNEL = ../mips/axpy.c DAXPYKERNEL = ../mips/axpy.c @@ -45,10 +52,17 @@ DCOPYKERNEL = ../mips/copy.c CCOPYKERNEL = ../mips/zcopy.c ZCOPYKERNEL = ../mips/zcopy.c +ifdef MIPS_SIMD SDOTKERNEL = ../mips/sdot_msa.c DDOTKERNEL = ../mips/ddot_msa.c CDOTKERNEL = ../mips/cdot_msa.c ZDOTKERNEL = ../mips/zdot_msa.c +else +SDOTKERNEL = ../mips/dot.c +DDOTKERNEL = ../mips/dot.c +CDOTKERNEL = ../mips/zdot.c +ZDOTKERNEL = ../mips/zdot.c +endif SNRM2KERNEL = ../mips/nrm2.c DNRM2KERNEL = ../mips/nrm2.c @@ -70,22 +84,45 @@ DSWAPKERNEL = ../mips/swap.c CSWAPKERNEL = ../mips/zswap.c ZSWAPKERNEL = ../mips/zswap.c +ifdef MIPS_SIMD SGEMVNKERNEL = ../mips/sgemv_n_msa.c DGEMVNKERNEL = ../mips/dgemv_n_msa.c CGEMVNKERNEL = ../mips/cgemv_n_msa.c ZGEMVNKERNEL = 
../mips/zgemv_n_msa.c - +else +SGEMVNKERNEL = ../mips/gemv_n.c +DGEMVNKERNEL = ../mips/gemv_n.c +CGEMVNKERNEL = ../mips/zgemv_n.c +ZGEMVNKERNEL = ../mips/zgemv_n.c +endif + +ifdef MIPS_SIMD SGEMVTKERNEL = ../mips/sgemv_t_msa.c DGEMVTKERNEL = ../mips/dgemv_t_msa.c CGEMVTKERNEL = ../mips/cgemv_t_msa.c ZGEMVTKERNEL = ../mips/zgemv_t_msa.c - +else +SGEMVTKERNEL = ../mips/gemv_t.c +DGEMVTKERNEL = ../mips/gemv_t.c +CGEMVTKERNEL = ../mips/zgemv_t.c +ZGEMVTKERNEL = ../mips/zgemv_t.c +endif + +ifdef MIPS_SIMD SGEMMKERNEL = ../mips/sgemm_kernel_8x8_msa.c SGEMMONCOPY = ../mips/sgemm_ncopy_8_msa.c SGEMMOTCOPY = ../mips/sgemm_tcopy_8_msa.c SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o +else +SGEMMKERNEL = ../generic/gemmkernel_2x2.c +SGEMMONCOPY = ../generic/gemm_ncopy_2.c +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o +endif +ifdef MIPS_SIMD DGEMMKERNEL = ../mips/dgemm_kernel_8x4_msa.c DGEMMINCOPY = ../mips/dgemm_ncopy_8_msa.c DGEMMITCOPY = ../mips/dgemm_tcopy_8_msa.c @@ -95,7 +132,15 @@ DGEMMINCOPYOBJ = dgemm_incopy.o DGEMMITCOPYOBJ = dgemm_itcopy.o DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o +else +DGEMMKERNEL = ../generic/gemmkernel_2x2.c +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o +endif +ifdef MIPS_SIMD CGEMMKERNEL = ../mips/cgemm_kernel_8x4_msa.c CGEMMINCOPY = ../mips/cgemm_ncopy_8_msa.c CGEMMITCOPY = ../mips/cgemm_tcopy_8_msa.c @@ -105,29 +150,72 @@ CGEMMINCOPYOBJ = cgemm_incopy.o CGEMMITCOPYOBJ = cgemm_itcopy.o CGEMMONCOPYOBJ = cgemm_oncopy.o CGEMMOTCOPYOBJ = cgemm_otcopy.o +else +CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o +endif +ifdef MIPS_SIMD ZGEMMKERNEL = ../mips/zgemm_kernel_4x4_msa.c ZGEMMONCOPY = 
../mips/zgemm_ncopy_4_msa.c ZGEMMOTCOPY = ../mips/zgemm_tcopy_4_msa.c ZGEMMONCOPYOBJ = zgemm_oncopy.o ZGEMMOTCOPYOBJ = zgemm_otcopy.o +else +ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o +endif +ifdef MIPS_SIMD STRSMKERNEL_LN = ../mips/strsm_kernel_LN_8x8_msa.c STRSMKERNEL_LT = ../mips/strsm_kernel_LT_8x8_msa.c STRSMKERNEL_RN = ../mips/strsm_kernel_RN_8x8_msa.c STRSMKERNEL_RT = ../mips/strsm_kernel_RT_8x8_msa.c - +else +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +endif + +ifdef MIPS_SIMD DTRSMKERNEL_LN = ../mips/dtrsm_kernel_LN_8x4_msa.c DTRSMKERNEL_LT = ../mips/dtrsm_kernel_LT_8x4_msa.c DTRSMKERNEL_RN = ../mips/dtrsm_kernel_RN_8x4_msa.c DTRSMKERNEL_RT = ../mips/dtrsm_kernel_RT_8x4_msa.c - +else +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +endif + +ifdef MIPS_SIMD +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +else CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +endif +ifdef MIPS_SIMD +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +else ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +endif \ No newline at end of 
file diff --git a/lapack-netlib/BLAS/SRC/Makefile b/lapack-netlib/BLAS/SRC/Makefile index 43dbfb749..6ab015603 100644 --- a/lapack-netlib/BLAS/SRC/Makefile +++ b/lapack-netlib/BLAS/SRC/Makefile @@ -138,26 +138,26 @@ ALLOBJ=$(SBLAS1) $(SBLAS2) $(SBLAS3) $(DBLAS1) $(DBLAS2) $(DBLAS3) \ $(ZBLAS2) $(ZBLAS3) $(ALLBLAS) $(BLASLIB): $(ALLOBJ) - $(ARCH) $(ARCHFLAGS) $@ $(ALLOBJ) + $(AR) $(ARFLAGS) $@ $(ALLOBJ) $(RANLIB) $@ single: $(SBLAS1) $(ALLBLAS) $(SBLAS2) $(SBLAS3) - $(ARCH) $(ARCHFLAGS) $(BLASLIB) $(SBLAS1) $(ALLBLAS) \ + $(AR) $(ARFLAGS) $(BLASLIB) $(SBLAS1) $(ALLBLAS) \ $(SBLAS2) $(SBLAS3) $(RANLIB) $(BLASLIB) double: $(DBLAS1) $(ALLBLAS) $(DBLAS2) $(DBLAS3) - $(ARCH) $(ARCHFLAGS) $(BLASLIB) $(DBLAS1) $(ALLBLAS) \ + $(AR) $(ARFLAGS) $(BLASLIB) $(DBLAS1) $(ALLBLAS) \ $(DBLAS2) $(DBLAS3) $(RANLIB) $(BLASLIB) complex: $(CBLAS1) $(CB1AUX) $(ALLBLAS) $(CBLAS2) $(CBLAS3) - $(ARCH) $(ARCHFLAGS) $(BLASLIB) $(CBLAS1) $(CB1AUX) \ + $(AR) $(ARFLAGS) $(BLASLIB) $(CBLAS1) $(CB1AUX) \ $(ALLBLAS) $(CBLAS2) $(CBLAS3) $(RANLIB) $(BLASLIB) complex16: $(ZBLAS1) $(ZB1AUX) $(ALLBLAS) $(ZBLAS2) $(ZBLAS3) - $(ARCH) $(ARCHFLAGS) $(BLASLIB) $(ZBLAS1) $(ZB1AUX) \ + $(AR) $(ARFLAGS) $(BLASLIB) $(ZBLAS1) $(ZB1AUX) \ $(ALLBLAS) $(ZBLAS2) $(ZBLAS3) $(RANLIB) $(BLASLIB) diff --git a/lapack-netlib/CBLAS/Makefile.in b/lapack-netlib/CBLAS/Makefile.in index fe0143044..9528cc93e 100644 --- a/lapack-netlib/CBLAS/Makefile.in +++ b/lapack-netlib/CBLAS/Makefile.in @@ -44,6 +44,6 @@ FFLAGS = -O3 # Archive programs and flags #----------------------------------------------------------------------------- -ARCH = ar -ARCHFLAGS = cr +AR = ar +ARFLAGS = cr RANLIB = ranlib diff --git a/lapack-netlib/CBLAS/src/Makefile b/lapack-netlib/CBLAS/src/Makefile index d5c73cbb0..fa1b03dc2 100644 --- a/lapack-netlib/CBLAS/src/Makefile +++ b/lapack-netlib/CBLAS/src/Makefile @@ -73,27 +73,27 @@ alev1 = $(slev1) $(dlev1) $(clev1) $(zlev1) $(sclev1) # Single real precision slib1: $(slev1) $(sclev1) - $(ARCH) $(ARCHFLAGS) 
$(CBLASLIB) $(slev1) $(sclev1) + $(AR) $(ARFLAGS) $(CBLASLIB) $(slev1) $(sclev1) $(RANLIB) $(CBLASLIB) # Double real precision dlib1: $(dlev1) - $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $(dlev1) + $(AR) $(ARFLAGS) $(CBLASLIB) $(dlev1) $(RANLIB) $(CBLASLIB) # Single complex precision clib1: $(clev1) $(sclev1) - $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $(clev1) $(sclev1) + $(AR) $(ARFLAGS) $(CBLASLIB) $(clev1) $(sclev1) $(RANLIB) $(CBLASLIB) # Double complex precision zlib1: $(zlev1) - $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $(zlev1) + $(AR) $(ARFLAGS) $(CBLASLIB) $(zlev1) $(RANLIB) $(CBLASLIB) # All precisions all1: $(alev1) - $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $(alev1) + $(AR) $(ARFLAGS) $(CBLASLIB) $(alev1) $(RANLIB) $(CBLASLIB) # @@ -146,27 +146,27 @@ alev2 = $(slev2) $(dlev2) $(clev2) $(zlev2) # Single real precision slib2: $(slev2) $(errhand) - $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $(slev2) $(errhand) + $(AR) $(ARFLAGS) $(CBLASLIB) $(slev2) $(errhand) $(RANLIB) $(CBLASLIB) # Double real precision dlib2: $(dlev2) $(errhand) - $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $(dlev2) $(errhand) + $(AR) $(ARFLAGS) $(CBLASLIB) $(dlev2) $(errhand) $(RANLIB) $(CBLASLIB) # Single complex precision clib2: $(clev2) $(errhand) - $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $(clev2) $(errhand) + $(AR) $(ARFLAGS) $(CBLASLIB) $(clev2) $(errhand) $(RANLIB) $(CBLASLIB) # Double complex precision zlib2: $(zlev2) $(errhand) - $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $(zlev2) $(errhand) + $(AR) $(ARFLAGS) $(CBLASLIB) $(zlev2) $(errhand) $(RANLIB) $(CBLASLIB) # All precisions all2: $(alev2) $(errhand) - $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $(alev2) $(errhand) + $(AR) $(ARFLAGS) $(CBLASLIB) $(alev2) $(errhand) $(RANLIB) $(CBLASLIB) # # @@ -211,32 +211,32 @@ alev3 = $(slev3) $(dlev3) $(clev3) $(zlev3) # Single real precision slib3: $(slev3) $(errhand) - $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $(slev3) $(errhand) + $(AR) $(ARFLAGS) $(CBLASLIB) $(slev3) $(errhand) $(RANLIB) $(CBLASLIB) # Double real precision dlib3: $(dlev3) $(errhand) - $(ARCH) $(ARCHFLAGS) 
$(CBLASLIB) $(dlev3) $(errhand) + $(AR) $(ARFLAGS) $(CBLASLIB) $(dlev3) $(errhand) $(RANLIB) $(CBLASLIB) # Single complex precision clib3: $(clev3) $(errhand) - $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $(clev3) $(errhand) + $(AR) $(ARFLAGS) $(CBLASLIB) $(clev3) $(errhand) $(RANLIB) $(CBLASLIB) # Single complex precision zlib3: $(zlev3) $(errhand) - $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $(zlev3) $(errhand) + $(AR) $(ARFLAGS) $(CBLASLIB) $(zlev3) $(errhand) $(RANLIB) $(CBLASLIB) # All precisions all3: $(alev3) $(errhand) - $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $(alev3) + $(AR) $(ARFLAGS) $(CBLASLIB) $(alev3) $(RANLIB) $(CBLASLIB) # All levels and precisions cblaslib: $(alev) - $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $(alev) + $(AR) $(ARFLAGS) $(CBLASLIB) $(alev) $(RANLIB) $(CBLASLIB) FRC: diff --git a/lapack-netlib/DOCS/lawn81.tex b/lapack-netlib/DOCS/lawn81.tex index 16efef768..84e12f80b 100644 --- a/lapack-netlib/DOCS/lawn81.tex +++ b/lapack-netlib/DOCS/lawn81.tex @@ -466,7 +466,7 @@ TIMER = EXT_ETIME Refer to the section~\ref{second} to get more information. -Next, you will need to modify \texttt{ARCH}, \texttt{ARCHFLAGS}, and \texttt{RANLIB} to specify archiver, +Next, you will need to modify \texttt{AR}, \texttt{ARFLAGS}, and \texttt{RANLIB} to specify archiver, archiver options, and ranlib for your machine. If your architecture does not require \texttt{ranlib} to be run after each archive command (as is the case with CRAY computers running UNICOS, Hewlett Packard diff --git a/lapack-netlib/INSTALL/make.inc.ALPHA b/lapack-netlib/INSTALL/make.inc.ALPHA index 33353d2d0..d20fe70bc 100644 --- a/lapack-netlib/INSTALL/make.inc.ALPHA +++ b/lapack-netlib/INSTALL/make.inc.ALPHA @@ -52,9 +52,9 @@ CFLAGS = -O4 # The archiver and the flag(s) to use when building archive (library) # If you system has no ranlib, set RANLIB = echo. 
# -ARCH = ar -ARCHFLAGS= cr -RANLIB = ranlib +AR = ar +ARFLAGS = cr +RANLIB = ranlib # # Location of the extended-precision BLAS (XBLAS) Fortran library # used for building and testing extended-precision routines. The diff --git a/lapack-netlib/INSTALL/make.inc.HPPA b/lapack-netlib/INSTALL/make.inc.HPPA index 062e1a56b..f38e16bbc 100644 --- a/lapack-netlib/INSTALL/make.inc.HPPA +++ b/lapack-netlib/INSTALL/make.inc.HPPA @@ -52,9 +52,9 @@ CFLAGS = # The archiver and the flag(s) to use when building archive (library) # If you system has no ranlib, set RANLIB = echo. # -ARCH = ar -ARCHFLAGS= cr -RANLIB = echo +AR = ar +ARFLAGS = cr +RANLIB = echo # # Location of the extended-precision BLAS (XBLAS) Fortran library # used for building and testing extended-precision routines. The diff --git a/lapack-netlib/INSTALL/make.inc.IRIX64 b/lapack-netlib/INSTALL/make.inc.IRIX64 index c8b34e4ed..6ad48c2bc 100644 --- a/lapack-netlib/INSTALL/make.inc.IRIX64 +++ b/lapack-netlib/INSTALL/make.inc.IRIX64 @@ -55,9 +55,9 @@ CFLAGS = -O3 # The archiver and the flag(s) to use when building archive (library) # If you system has no ranlib, set RANLIB = echo. # -ARCH = ar -ARCHFLAGS= cr -RANLIB = echo +AR = ar +ARFLAGS = cr +RANLIB = echo # # Location of the extended-precision BLAS (XBLAS) Fortran library # used for building and testing extended-precision routines. The diff --git a/lapack-netlib/INSTALL/make.inc.O2K b/lapack-netlib/INSTALL/make.inc.O2K index 55b7de245..1ea0a1c8c 100644 --- a/lapack-netlib/INSTALL/make.inc.O2K +++ b/lapack-netlib/INSTALL/make.inc.O2K @@ -55,9 +55,9 @@ CFLAGS = -O3 # The archiver and the flag(s) to use when building archive (library) # If you system has no ranlib, set RANLIB = echo. # -ARCH = ar -ARCHFLAGS= cr -RANLIB = echo +AR = ar +ARFLAGS = cr +RANLIB = echo # # Location of the extended-precision BLAS (XBLAS) Fortran library # used for building and testing extended-precision routines. 
The diff --git a/lapack-netlib/INSTALL/make.inc.SGI5 b/lapack-netlib/INSTALL/make.inc.SGI5 index dccfae333..8afd522b8 100644 --- a/lapack-netlib/INSTALL/make.inc.SGI5 +++ b/lapack-netlib/INSTALL/make.inc.SGI5 @@ -52,9 +52,9 @@ CFLAGS = -O4 # The archiver and the flag(s) to use when building archive (library) # If you system has no ranlib, set RANLIB = echo. # -ARCH = ar -ARCHFLAGS= cr -RANLIB = echo +AR = ar +ARFLAGS = cr +RANLIB = echo # # Location of the extended-precision BLAS (XBLAS) Fortran library # used for building and testing extended-precision routines. The diff --git a/lapack-netlib/INSTALL/make.inc.SUN4 b/lapack-netlib/INSTALL/make.inc.SUN4 index dd5cfd41e..39b0136ac 100644 --- a/lapack-netlib/INSTALL/make.inc.SUN4 +++ b/lapack-netlib/INSTALL/make.inc.SUN4 @@ -52,9 +52,9 @@ CFLAGS = -O3 # The archiver and the flag(s) to use when building archive (library) # If you system has no ranlib, set RANLIB = echo. # -ARCH = ar -ARCHFLAGS= cr -RANLIB = ranlib +AR = ar +ARFLAGS = cr +RANLIB = ranlib # # Location of the extended-precision BLAS (XBLAS) Fortran library # used for building and testing extended-precision routines. The diff --git a/lapack-netlib/INSTALL/make.inc.SUN4SOL2 b/lapack-netlib/INSTALL/make.inc.SUN4SOL2 index eb71a386d..db4725845 100644 --- a/lapack-netlib/INSTALL/make.inc.SUN4SOL2 +++ b/lapack-netlib/INSTALL/make.inc.SUN4SOL2 @@ -56,9 +56,9 @@ CFLAGS = -O3 # The archiver and the flag(s) to use when building archive (library) # If you system has no ranlib, set RANLIB = echo. # -ARCH = ar -ARCHFLAGS= cr -RANLIB = echo +AR = ar +ARFLAGS = cr +RANLIB = echo # # Location of the extended-precision BLAS (XBLAS) Fortran library # used for building and testing extended-precision routines. 
The diff --git a/lapack-netlib/INSTALL/make.inc.XLF b/lapack-netlib/INSTALL/make.inc.XLF index 5824e8f15..2b05fbebb 100644 --- a/lapack-netlib/INSTALL/make.inc.XLF +++ b/lapack-netlib/INSTALL/make.inc.XLF @@ -53,9 +53,9 @@ CFLAGS = -O3 -qnosave # The archiver and the flag(s) to use when building archive (library) # If you system has no ranlib, set RANLIB = echo. # -ARCH = ar -ARCHFLAGS= cr -RANLIB = ranlib +AR = ar +ARFLAGS = cr +RANLIB = ranlib # # Location of the extended-precision BLAS (XBLAS) Fortran library # used for building and testing extended-precision routines. The diff --git a/lapack-netlib/INSTALL/make.inc.gfortran b/lapack-netlib/INSTALL/make.inc.gfortran index 43986435c..27164c2c1 100644 --- a/lapack-netlib/INSTALL/make.inc.gfortran +++ b/lapack-netlib/INSTALL/make.inc.gfortran @@ -56,9 +56,9 @@ CFLAGS = -O3 # The archiver and the flag(s) to use when building archive (library) # If you system has no ranlib, set RANLIB = echo. # -ARCH = ar -ARCHFLAGS= cr -RANLIB = ranlib +AR = ar +ARFLAGS = cr +RANLIB = ranlib # # Location of the extended-precision BLAS (XBLAS) Fortran library # used for building and testing extended-precision routines. The diff --git a/lapack-netlib/INSTALL/make.inc.gfortran_debug b/lapack-netlib/INSTALL/make.inc.gfortran_debug index 294758f09..ceddfc665 100644 --- a/lapack-netlib/INSTALL/make.inc.gfortran_debug +++ b/lapack-netlib/INSTALL/make.inc.gfortran_debug @@ -56,9 +56,9 @@ CFLAGS = -g # The archiver and the flag(s) to use when building archive (library) # If you system has no ranlib, set RANLIB = echo. # -ARCH = ar -ARCHFLAGS= cr -RANLIB = ranlib +AR = ar +ARFLAGS = cr +RANLIB = ranlib # # Location of the extended-precision BLAS (XBLAS) Fortran library # used for building and testing extended-precision routines. 
The diff --git a/lapack-netlib/INSTALL/make.inc.ifort b/lapack-netlib/INSTALL/make.inc.ifort index 5fca5c47e..3e5b83daa 100644 --- a/lapack-netlib/INSTALL/make.inc.ifort +++ b/lapack-netlib/INSTALL/make.inc.ifort @@ -52,9 +52,9 @@ CFLAGS = -O3 # The archiver and the flag(s) to use when building archive (library) # If you system has no ranlib, set RANLIB = echo. # -ARCH = ar -ARCHFLAGS= cr -RANLIB = ranlib +AR = ar +ARFLAGS = cr +RANLIB = ranlib # # Location of the extended-precision BLAS (XBLAS) Fortran library # used for building and testing extended-precision routines. The diff --git a/lapack-netlib/INSTALL/make.inc.pgf95 b/lapack-netlib/INSTALL/make.inc.pgf95 index aaddfa5bd..cfad5dfe3 100644 --- a/lapack-netlib/INSTALL/make.inc.pgf95 +++ b/lapack-netlib/INSTALL/make.inc.pgf95 @@ -52,9 +52,9 @@ CFLAGS = # The archiver and the flag(s) to use when building archive (library) # If you system has no ranlib, set RANLIB = echo. # -ARCH = ar -ARCHFLAGS= cr -RANLIB = echo +AR = ar +ARFLAGS = cr +RANLIB = echo # # Location of the extended-precision BLAS (XBLAS) Fortran library # used for building and testing extended-precision routines. The diff --git a/lapack-netlib/INSTALL/make.inc.pghpf b/lapack-netlib/INSTALL/make.inc.pghpf index 782c16d76..3261da874 100644 --- a/lapack-netlib/INSTALL/make.inc.pghpf +++ b/lapack-netlib/INSTALL/make.inc.pghpf @@ -52,9 +52,9 @@ CFLAGS = # The archiver and the flag(s) to use when building archive (library) # If you system has no ranlib, set RANLIB = echo. # -ARCH = ar -ARCHFLAGS= cr -RANLIB = echo +AR = ar +ARFLAGS = cr +RANLIB = echo # # Location of the extended-precision BLAS (XBLAS) Fortran library # used for building and testing extended-precision routines. 
The diff --git a/lapack-netlib/LAPACKE/src/Makefile b/lapack-netlib/LAPACKE/src/Makefile index 636ca35b6..3e10fff35 100644 --- a/lapack-netlib/LAPACKE/src/Makefile +++ b/lapack-netlib/LAPACKE/src/Makefile @@ -2207,8 +2207,8 @@ OBJ_FILES := $(C_FILES:.o=.o) all: ../../$(LAPACKELIB) ../../$(LAPACKELIB): $(ALLOBJA) $(ALLOBJB) $(ALLXOBJ) $(DEPRECATED) - $(ARCH) $(ARCHFLAGS) ../../$(LAPACKELIB) $(ALLOBJA) - $(ARCH) $(ARCHFLAGS) ../../$(LAPACKELIB) $(ALLOBJB) $(ALLXOBJ) $(DEPRECATED) + $(AR) $(ARFLAGS) ../../$(LAPACKELIB) $(ALLOBJA) + $(AR) $(ARFLAGS) ../../$(LAPACKELIB) $(ALLOBJB) $(ALLXOBJ) $(DEPRECATED) $(RANLIB) ../../$(LAPACKELIB) .c.o: diff --git a/lapack-netlib/LAPACKE/utils/Makefile b/lapack-netlib/LAPACKE/utils/Makefile index 1d7856789..cd3de5c8b 100644 --- a/lapack-netlib/LAPACKE/utils/Makefile +++ b/lapack-netlib/LAPACKE/utils/Makefile @@ -186,7 +186,7 @@ OBJ = lapacke_cgb_nancheck.o \ all: lib lib: $(OBJ) - $(ARCH) $(ARCHFLAGS) ../../$(LAPACKELIB) $(OBJ) + $(AR) $(ARFLAGS) ../../$(LAPACKELIB) $(OBJ) $(RANLIB) ../../$(LAPACKELIB) .c.o: diff --git a/lapack-netlib/SRC/Makefile b/lapack-netlib/SRC/Makefile index 22799769a..257ff136a 100644 --- a/lapack-netlib/SRC/Makefile +++ b/lapack-netlib/SRC/Makefile @@ -455,26 +455,26 @@ endif all: ../$(LAPACKLIB) ../$(LAPACKLIB): $(ALLOBJ) $(ALLXOBJ) $(DEPRECATED) - $(ARCH) $(ARCHFLAGS) $@ $(ALLOBJ) $(ALLXOBJ) $(DEPRECATED) + $(AR) $(ARFLAGS) $@ $(ALLOBJ) $(ALLXOBJ) $(DEPRECATED) $(RANLIB) $@ single: $(SLASRC) $(DSLASRC) $(SXLASRC) $(SCLAUX) $(ALLAUX) - $(ARCH) $(ARCHFLAGS) ../$(LAPACKLIB) $(SLASRC) $(DSLASRC) \ + $(AR) $(ARFLAGS) ../$(LAPACKLIB) $(SLASRC) $(DSLASRC) \ $(SXLASRC) $(SCLAUX) $(ALLAUX) $(ALLXAUX) $(RANLIB) ../$(LAPACKLIB) complex: $(CLASRC) $(ZCLASRC) $(CXLASRC) $(SCLAUX) $(ALLAUX) - $(ARCH) $(ARCHFLAGS) ../$(LAPACKLIB) $(CLASRC) $(ZCLASRC) \ + $(AR) $(ARFLAGS) ../$(LAPACKLIB) $(CLASRC) $(ZCLASRC) \ $(CXLASRC) $(SCLAUX) $(ALLAUX) $(ALLXAUX) $(RANLIB) ../$(LAPACKLIB) double: $(DLASRC) $(DSLASRC) $(DXLASRC) 
$(DZLAUX) $(ALLAUX) - $(ARCH) $(ARCHFLAGS) ../$(LAPACKLIB) $(DLASRC) $(DSLASRC) \ + $(AR) $(ARFLAGS) ../$(LAPACKLIB) $(DLASRC) $(DSLASRC) \ $(DXLASRC) $(DZLAUX) $(ALLAUX) $(ALLXAUX) $(RANLIB) ../$(LAPACKLIB) complex16: $(ZLASRC) $(ZCLASRC) $(ZXLASRC) $(DZLAUX) $(ALLAUX) - $(ARCH) $(ARCHFLAGS) ../$(LAPACKLIB) $(ZLASRC) $(ZCLASRC) \ + $(AR) $(ARFLAGS) ../$(LAPACKLIB) $(ZLASRC) $(ZCLASRC) \ $(ZXLASRC) $(DZLAUX) $(ALLAUX) $(ALLXAUX) $(RANLIB) ../$(LAPACKLIB) diff --git a/lapack-netlib/SRC/VARIANTS/Makefile b/lapack-netlib/SRC/VARIANTS/Makefile index 42446eb55..6034aed60 100644 --- a/lapack-netlib/SRC/VARIANTS/Makefile +++ b/lapack-netlib/SRC/VARIANTS/Makefile @@ -35,27 +35,27 @@ QRLL = qr/LL/cgeqrf.o qr/LL/dgeqrf.o qr/LL/sgeqrf.o qr/LL/zgeqrf.o qr/LL/sceil. all: cholrl choltop lucr lull lurec qrll cholrl: $(CHOLRL) - $(ARCH) $(ARCHFLAGS) $(VARIANTSDIR)/cholrl.a $(CHOLRL) + $(AR) $(ARFLAGS) $(VARIANTSDIR)/cholrl.a $(CHOLRL) $(RANLIB) $(VARIANTSDIR)/cholrl.a choltop: $(CHOLTOP) - $(ARCH) $(ARCHFLAGS) $(VARIANTSDIR)/choltop.a $(CHOLTOP) + $(AR) $(ARFLAGS) $(VARIANTSDIR)/choltop.a $(CHOLTOP) $(RANLIB) $(VARIANTSDIR)/choltop.a lucr: $(LUCR) - $(ARCH) $(ARCHFLAGS) $(VARIANTSDIR)/lucr.a $(LUCR) + $(AR) $(ARFLAGS) $(VARIANTSDIR)/lucr.a $(LUCR) $(RANLIB) $(VARIANTSDIR)/lucr.a lull: $(LULL) - $(ARCH) $(ARCHFLAGS) $(VARIANTSDIR)/lull.a $(LULL) + $(AR) $(ARFLAGS) $(VARIANTSDIR)/lull.a $(LULL) $(RANLIB) $(VARIANTSDIR)/lull.a lurec: $(LUREC) - $(ARCH) $(ARCHFLAGS) $(VARIANTSDIR)/lurec.a $(LUREC) + $(AR) $(ARFLAGS) $(VARIANTSDIR)/lurec.a $(LUREC) $(RANLIB) $(VARIANTSDIR)/lurec.a qrll: $(QRLL) - $(ARCH) $(ARCHFLAGS) $(VARIANTSDIR)/qrll.a $(QRLL) + $(AR) $(ARFLAGS) $(VARIANTSDIR)/qrll.a $(QRLL) $(RANLIB) $(VARIANTSDIR)/qrll.a diff --git a/lapack-netlib/TESTING/MATGEN/Makefile b/lapack-netlib/TESTING/MATGEN/Makefile index ecd9aa5c1..0ba5f44ad 100644 --- a/lapack-netlib/TESTING/MATGEN/Makefile +++ b/lapack-netlib/TESTING/MATGEN/Makefile @@ -59,23 +59,23 @@ ALLOBJ = $(SMATGEN) $(CMATGEN) 
$(SCATGEN) $(DMATGEN) $(ZMATGEN) \ $(DZATGEN) ../../$(TMGLIB): $(ALLOBJ) - $(ARCH) $(ARCHFLAGS) $@ $(ALLOBJ) + $(AR) $(ARFLAGS) $@ $(ALLOBJ) $(RANLIB) $@ single: $(SMATGEN) $(SCATGEN) - $(ARCH) $(ARCHFLAGS) ../../$(TMGLIB) $(SMATGEN) $(SCATGEN) + $(AR) $(ARFLAGS) ../../$(TMGLIB) $(SMATGEN) $(SCATGEN) $(RANLIB) ../../$(TMGLIB) complex: $(CMATGEN) $(SCATGEN) - $(ARCH) $(ARCHFLAGS) ../../$(TMGLIB) $(CMATGEN) $(SCATGEN) + $(AR) $(ARFLAGS) ../../$(TMGLIB) $(CMATGEN) $(SCATGEN) $(RANLIB) ../../$(TMGLIB) double: $(DMATGEN) $(DZATGEN) - $(ARCH) $(ARCHFLAGS) ../../$(TMGLIB) $(DMATGEN) $(DZATGEN) + $(AR) $(ARFLAGS) ../../$(TMGLIB) $(DMATGEN) $(DZATGEN) $(RANLIB) ../../$(TMGLIB) complex16: $(ZMATGEN) $(DZATGEN) - $(ARCH) $(ARCHFLAGS) ../../$(TMGLIB) $(ZMATGEN) $(DZATGEN) + $(AR) $(ARFLAGS) ../../$(TMGLIB) $(ZMATGEN) $(DZATGEN) $(RANLIB) ../../$(TMGLIB) $(SCATGEN): $(FRC) diff --git a/lapack-netlib/make.inc.example b/lapack-netlib/make.inc.example index 504a16421..0379069bb 100644 --- a/lapack-netlib/make.inc.example +++ b/lapack-netlib/make.inc.example @@ -57,9 +57,9 @@ CFLAGS = -O3 # The archiver and the flag(s) to use when building archive (library) # If you system has no ranlib, set RANLIB = echo. # -ARCH = ar -ARCHFLAGS= cr -RANLIB = ranlib +AR = ar +ARFLAGS = cr +RANLIB = ranlib # # Location of the extended-precision BLAS (XBLAS) Fortran library # used for building and testing extended-precision routines. The diff --git a/make.inc b/make.inc index 1fc95b0c6..a05d3f42e 100644 --- a/make.inc +++ b/make.inc @@ -1,6 +1,6 @@ SHELL = /bin/sh PLAT = _LINUX DRVOPTS = $(NOOPT) -ARCHFLAGS= -ru +ARFLAGS= -ru #RANLIB = ranlib diff --git a/param.h b/param.h index 555829d45..9f131eb77 100644 --- a/param.h +++ b/param.h @@ -2174,7 +2174,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define SYMV_P 16 #endif -#if defined(I6400) || defined(P6600) +#if defined(P5600) || defined(I6400) || defined(P6600) #define SNUMOPT 2 #define DNUMOPT 2 @@ -2182,6 +2182,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x03fffUL +#ifdef MIPS_SIMD #define SGEMM_DEFAULT_UNROLL_M 8 #define SGEMM_DEFAULT_UNROLL_N 8 @@ -2193,46 +2194,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM_DEFAULT_UNROLL_M 4 #define ZGEMM_DEFAULT_UNROLL_N 4 +#else +#define SGEMM_DEFAULT_UNROLL_M 2 +#define SGEMM_DEFAULT_UNROLL_N 2 -#define SGEMM_DEFAULT_P 128 -#define DGEMM_DEFAULT_P 128 -#define CGEMM_DEFAULT_P 96 -#define ZGEMM_DEFAULT_P 64 - -#define SGEMM_DEFAULT_Q 240 -#define DGEMM_DEFAULT_Q 120 -#define CGEMM_DEFAULT_Q 120 -#define ZGEMM_DEFAULT_Q 120 - -#define SGEMM_DEFAULT_R 12288 -#define DGEMM_DEFAULT_R 8192 -#define CGEMM_DEFAULT_R 4096 -#define ZGEMM_DEFAULT_R 4096 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define DGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 2 -#define SYMV_P 16 +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 #endif -#if defined(P5600) -#define SNUMOPT 2 -#define DNUMOPT 2 - -#define GEMM_DEFAULT_OFFSET_A 0 -#define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL - -#define SGEMM_DEFAULT_UNROLL_M 8 -#define SGEMM_DEFAULT_UNROLL_N 8 - -#define DGEMM_DEFAULT_UNROLL_M 8 -#define DGEMM_DEFAULT_UNROLL_N 4 - -#define CGEMM_DEFAULT_UNROLL_M 8 -#define CGEMM_DEFAULT_UNROLL_N 4 - -#define ZGEMM_DEFAULT_UNROLL_M 4 -#define ZGEMM_DEFAULT_UNROLL_N 4 - #define SGEMM_DEFAULT_P 128 #define DGEMM_DEFAULT_P 128 #define CGEMM_DEFAULT_P 96 @@ -2248,7 +2223,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 - #define SYMV_P 16 #endif From 96874379286efb14857c3f019c51743893cb7413 Mon Sep 17 00:00:00 2001 From: Shivraj Patil Date: Wed, 10 Aug 2016 17:44:22 +0530 Subject: [PATCH 66/70] MIPS n32 ABI and build time mips simd support check Signed-off-by: Shivraj Patil --- Makefile | 4 +-- Makefile.system | 14 ++++---- c_check | 34 +++++++++---------- kernel/Makefile | 1 - kernel/mips/KERNEL.P5600 | 24 ++++++------- lapack-netlib/BLAS/SRC/Makefile | 10 +++--- lapack-netlib/CBLAS/Makefile.in | 4 +-- lapack-netlib/CBLAS/src/Makefile | 32 ++++++++--------- lapack-netlib/DOCS/lawn81.tex | 2 +- lapack-netlib/INSTALL/make.inc.ALPHA | 6 ++-- lapack-netlib/INSTALL/make.inc.HPPA | 6 ++-- lapack-netlib/INSTALL/make.inc.IRIX64 | 6 ++-- lapack-netlib/INSTALL/make.inc.O2K | 6 ++-- lapack-netlib/INSTALL/make.inc.SGI5 | 6 ++-- lapack-netlib/INSTALL/make.inc.SUN4 | 6 ++-- lapack-netlib/INSTALL/make.inc.SUN4SOL2 | 6 ++-- lapack-netlib/INSTALL/make.inc.XLF | 6 ++-- lapack-netlib/INSTALL/make.inc.gfortran | 6 ++-- lapack-netlib/INSTALL/make.inc.gfortran_debug | 6 ++-- lapack-netlib/INSTALL/make.inc.ifort | 6 ++-- lapack-netlib/INSTALL/make.inc.pgf95 | 6 ++-- lapack-netlib/INSTALL/make.inc.pghpf | 6 ++-- lapack-netlib/LAPACKE/src/Makefile | 4 +-- lapack-netlib/LAPACKE/utils/Makefile | 2 +- lapack-netlib/SRC/Makefile | 10 +++--- lapack-netlib/SRC/VARIANTS/Makefile | 12 +++---- lapack-netlib/TESTING/MATGEN/Makefile | 10 +++--- lapack-netlib/make.inc.example | 6 ++-- make.inc | 2 +- param.h | 2 +- 30 files changed, 125 insertions(+), 126 deletions(-) diff --git a/Makefile b/Makefile index 693808127..2ae004798 100644 --- a/Makefile +++ b/Makefile @@ -228,8 +228,8 @@ ifndef NOFORTRAN -@echo "LOADOPTS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "override CFLAGS = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "AR = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc - 
-@echo "ARFLAGS = -ru" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "ARCH = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "ARCHFLAGS = -ru" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "RANLIB = $(RANLIB)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "LAPACKLIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "TMGLIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc diff --git a/Makefile.system b/Makefile.system index 47b197f6f..b05177b6c 100644 --- a/Makefile.system +++ b/Makefile.system @@ -525,18 +525,18 @@ FCOMMON_OPT += -march=mips64 endif ifeq ($(CORE), P5600) -CCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MIPS_SIMD_FLAGS) -FCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MIPS_SIMD_FLAGS) +CCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS) +FCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS) endif ifeq ($(CORE), I6400) -CCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=i6400 $(MIPS_SIMD_FLAGS) -FCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=i6400 $(MIPS_SIMD_FLAGS) +CCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=i6400 $(MSA_FLAGS) +FCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=i6400 $(MSA_FLAGS) endif ifeq ($(CORE), P6600) -CCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=p6600 $(MIPS_SIMD_FLAGS) -FCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=p6600 $(MIPS_SIMD_FLAGS) +CCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=p6600 $(MSA_FLAGS) +FCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=p6600 $(MSA_FLAGS) endif ifeq ($(OSNAME), AIX) @@ -1132,6 +1132,8 @@ export HAVE_VFP export HAVE_VFPV3 export HAVE_VFPV4 export HAVE_NEON +export HAVE_MSA +export MSA_FLAGS export KERNELDIR export FUNCTION_PROFILE export TARGET_CORE diff --git a/c_check b/c_check index a7c7aac2c..2ec9fc484 100644 --- a/c_check +++ b/c_check @@ -159,24 +159,22 @@ if ($?) 
{ die 1; } -$mips_simd = 1; +$have_msa = 0; if (($architecture eq "mips") || ($architecture eq "mips64")) { - $code = '"addvi.b $w0, $w1, 1"'; - $msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs"; - print $tmpf "#include \n\n"; - print $tmpf "void main(void){ __asm__ volatile($code); }\n"; - - $ret = 0; - $args = "$msa_flags -o $tmpf.o -x c $tmpf"; - my @cmd = ("$compiler_name $args"); + $code = '"addvi.b $w0, $w1, 1"'; + $msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs"; + print $tmpf "#include \n\n"; + print $tmpf "void main(void){ __asm__ volatile($code); }\n"; + + $args = "$msa_flags -o $tmpf.o -x c $tmpf"; + my @cmd = ("$compiler_name $args"); system(@cmd) == 0; - if ($? != 0) { - $ret = ($? >> 8); + if ($? != 0) { + $have_msa = 0; + } else { + $have_msa = 1; } - unlink("$tmpf.o"); - if($ret != 0) { - $mips_simd = 0; - } + unlink("$tmpf.o"); } $architecture = x86 if ($data =~ /ARCH_X86/); @@ -273,8 +271,8 @@ print MAKEFILE "FU=$need_fu\n" if $need_fu ne ""; print MAKEFILE "CROSS_SUFFIX=$cross_suffix\n" if $cross != 0 && $cross_suffix ne ""; print MAKEFILE "CROSS=1\n" if $cross != 0; print MAKEFILE "CEXTRALIB=$linker_L $linker_l $linker_a\n"; -print MAKEFILE "MIPS_SIMD=1\n" if $mips_simd eq 1; -print MAKEFILE "MIPS_SIMD_FLAGS=$msa_flags\n" if $mips_simd eq 1; +print MAKEFILE "HAVE_MSA=1\n" if $have_msa eq 1; +print MAKEFILE "MSA_FLAGS=$msa_flags\n" if $have_msa eq 1; $os =~ tr/[a-z]/[A-Z]/; $architecture =~ tr/[a-z]/[A-Z]/; @@ -286,7 +284,7 @@ print CONFFILE "#define C_$compiler\t1\n"; print CONFFILE "#define __32BIT__\t1\n" if $binformat eq bin32; print CONFFILE "#define __64BIT__\t1\n" if $binformat eq bin64; print CONFFILE "#define FUNDERSCORE\t$need_fu\n" if $need_fu ne ""; -print CONFFILE "#define MIPS_SIMD\t1\n" if $mips_simd eq 1; +print CONFFILE "#define HAVE_MSA\t1\n" if $have_msa eq 1; if ($os eq "LINUX") { diff --git a/kernel/Makefile b/kernel/Makefile index 8237549f3..a0a8fcd21 100644 --- a/kernel/Makefile +++ 
b/kernel/Makefile @@ -4,7 +4,6 @@ endif TOPDIR = .. include $(TOPDIR)/Makefile.system -include $(TOPDIR)/Makefile.conf ifdef TARGET_CORE override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) diff --git a/kernel/mips/KERNEL.P5600 b/kernel/mips/KERNEL.P5600 index 92c0b3c8a..683579221 100644 --- a/kernel/mips/KERNEL.P5600 +++ b/kernel/mips/KERNEL.P5600 @@ -30,7 +30,7 @@ IDMAXKERNEL = ../mips/imax.c ISMINKERNEL = ../mips/imin.c IDMINKERNEL = ../mips/imin.c -ifdef MIPS_SIMD +ifdef HAVE_MSA SASUMKERNEL = ../mips/sasum_msa.c DASUMKERNEL = ../mips/dasum_msa.c CASUMKERNEL = ../mips/casum_msa.c @@ -52,7 +52,7 @@ DCOPYKERNEL = ../mips/copy.c CCOPYKERNEL = ../mips/zcopy.c ZCOPYKERNEL = ../mips/zcopy.c -ifdef MIPS_SIMD +ifdef HAVE_MSA SDOTKERNEL = ../mips/sdot_msa.c DDOTKERNEL = ../mips/ddot_msa.c CDOTKERNEL = ../mips/cdot_msa.c @@ -84,7 +84,7 @@ DSWAPKERNEL = ../mips/swap.c CSWAPKERNEL = ../mips/zswap.c ZSWAPKERNEL = ../mips/zswap.c -ifdef MIPS_SIMD +ifdef HAVE_MSA SGEMVNKERNEL = ../mips/sgemv_n_msa.c DGEMVNKERNEL = ../mips/dgemv_n_msa.c CGEMVNKERNEL = ../mips/cgemv_n_msa.c @@ -96,7 +96,7 @@ CGEMVNKERNEL = ../mips/zgemv_n.c ZGEMVNKERNEL = ../mips/zgemv_n.c endif -ifdef MIPS_SIMD +ifdef HAVE_MSA SGEMVTKERNEL = ../mips/sgemv_t_msa.c DGEMVTKERNEL = ../mips/dgemv_t_msa.c CGEMVTKERNEL = ../mips/cgemv_t_msa.c @@ -108,7 +108,7 @@ CGEMVTKERNEL = ../mips/zgemv_t.c ZGEMVTKERNEL = ../mips/zgemv_t.c endif -ifdef MIPS_SIMD +ifdef HAVE_MSA SGEMMKERNEL = ../mips/sgemm_kernel_8x8_msa.c SGEMMONCOPY = ../mips/sgemm_ncopy_8_msa.c SGEMMOTCOPY = ../mips/sgemm_tcopy_8_msa.c @@ -122,7 +122,7 @@ SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o endif -ifdef MIPS_SIMD +ifdef HAVE_MSA DGEMMKERNEL = ../mips/dgemm_kernel_8x4_msa.c DGEMMINCOPY = ../mips/dgemm_ncopy_8_msa.c DGEMMITCOPY = ../mips/dgemm_tcopy_8_msa.c @@ -140,7 +140,7 @@ DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o endif -ifdef MIPS_SIMD +ifdef HAVE_MSA CGEMMKERNEL = 
../mips/cgemm_kernel_8x4_msa.c CGEMMINCOPY = ../mips/cgemm_ncopy_8_msa.c CGEMMITCOPY = ../mips/cgemm_tcopy_8_msa.c @@ -158,7 +158,7 @@ CGEMMONCOPYOBJ = cgemm_oncopy.o CGEMMOTCOPYOBJ = cgemm_otcopy.o endif -ifdef MIPS_SIMD +ifdef HAVE_MSA ZGEMMKERNEL = ../mips/zgemm_kernel_4x4_msa.c ZGEMMONCOPY = ../mips/zgemm_ncopy_4_msa.c ZGEMMOTCOPY = ../mips/zgemm_tcopy_4_msa.c @@ -172,7 +172,7 @@ ZGEMMONCOPYOBJ = zgemm_oncopy.o ZGEMMOTCOPYOBJ = zgemm_otcopy.o endif -ifdef MIPS_SIMD +ifdef HAVE_MSA STRSMKERNEL_LN = ../mips/strsm_kernel_LN_8x8_msa.c STRSMKERNEL_LT = ../mips/strsm_kernel_LT_8x8_msa.c STRSMKERNEL_RN = ../mips/strsm_kernel_RN_8x8_msa.c @@ -184,7 +184,7 @@ STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c endif -ifdef MIPS_SIMD +ifdef HAVE_MSA DTRSMKERNEL_LN = ../mips/dtrsm_kernel_LN_8x4_msa.c DTRSMKERNEL_LT = ../mips/dtrsm_kernel_LT_8x4_msa.c DTRSMKERNEL_RN = ../mips/dtrsm_kernel_RN_8x4_msa.c @@ -196,7 +196,7 @@ DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c endif -ifdef MIPS_SIMD +ifdef HAVE_MSA CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c @@ -208,7 +208,7 @@ CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c endif -ifdef MIPS_SIMD +ifdef HAVE_MSA ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c diff --git a/lapack-netlib/BLAS/SRC/Makefile b/lapack-netlib/BLAS/SRC/Makefile index 6ab015603..43dbfb749 100644 --- a/lapack-netlib/BLAS/SRC/Makefile +++ b/lapack-netlib/BLAS/SRC/Makefile @@ -138,26 +138,26 @@ ALLOBJ=$(SBLAS1) $(SBLAS2) $(SBLAS3) $(DBLAS1) $(DBLAS2) $(DBLAS3) \ $(ZBLAS2) $(ZBLAS3) $(ALLBLAS) $(BLASLIB): $(ALLOBJ) - $(AR) $(ARFLAGS) $@ $(ALLOBJ) + $(ARCH) $(ARCHFLAGS) $@ $(ALLOBJ) $(RANLIB) $@ single: $(SBLAS1) $(ALLBLAS) $(SBLAS2) $(SBLAS3) - $(AR) $(ARFLAGS) 
$(BLASLIB) $(SBLAS1) $(ALLBLAS) \ + $(ARCH) $(ARCHFLAGS) $(BLASLIB) $(SBLAS1) $(ALLBLAS) \ $(SBLAS2) $(SBLAS3) $(RANLIB) $(BLASLIB) double: $(DBLAS1) $(ALLBLAS) $(DBLAS2) $(DBLAS3) - $(AR) $(ARFLAGS) $(BLASLIB) $(DBLAS1) $(ALLBLAS) \ + $(ARCH) $(ARCHFLAGS) $(BLASLIB) $(DBLAS1) $(ALLBLAS) \ $(DBLAS2) $(DBLAS3) $(RANLIB) $(BLASLIB) complex: $(CBLAS1) $(CB1AUX) $(ALLBLAS) $(CBLAS2) $(CBLAS3) - $(AR) $(ARFLAGS) $(BLASLIB) $(CBLAS1) $(CB1AUX) \ + $(ARCH) $(ARCHFLAGS) $(BLASLIB) $(CBLAS1) $(CB1AUX) \ $(ALLBLAS) $(CBLAS2) $(CBLAS3) $(RANLIB) $(BLASLIB) complex16: $(ZBLAS1) $(ZB1AUX) $(ALLBLAS) $(ZBLAS2) $(ZBLAS3) - $(AR) $(ARFLAGS) $(BLASLIB) $(ZBLAS1) $(ZB1AUX) \ + $(ARCH) $(ARCHFLAGS) $(BLASLIB) $(ZBLAS1) $(ZB1AUX) \ $(ALLBLAS) $(ZBLAS2) $(ZBLAS3) $(RANLIB) $(BLASLIB) diff --git a/lapack-netlib/CBLAS/Makefile.in b/lapack-netlib/CBLAS/Makefile.in index 9528cc93e..fe0143044 100644 --- a/lapack-netlib/CBLAS/Makefile.in +++ b/lapack-netlib/CBLAS/Makefile.in @@ -44,6 +44,6 @@ FFLAGS = -O3 # Archive programs and flags #----------------------------------------------------------------------------- -AR = ar -ARFLAGS = cr +ARCH = ar +ARCHFLAGS = cr RANLIB = ranlib diff --git a/lapack-netlib/CBLAS/src/Makefile b/lapack-netlib/CBLAS/src/Makefile index fa1b03dc2..d5c73cbb0 100644 --- a/lapack-netlib/CBLAS/src/Makefile +++ b/lapack-netlib/CBLAS/src/Makefile @@ -73,27 +73,27 @@ alev1 = $(slev1) $(dlev1) $(clev1) $(zlev1) $(sclev1) # Single real precision slib1: $(slev1) $(sclev1) - $(AR) $(ARFLAGS) $(CBLASLIB) $(slev1) $(sclev1) + $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $(slev1) $(sclev1) $(RANLIB) $(CBLASLIB) # Double real precision dlib1: $(dlev1) - $(AR) $(ARFLAGS) $(CBLASLIB) $(dlev1) + $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $(dlev1) $(RANLIB) $(CBLASLIB) # Single complex precision clib1: $(clev1) $(sclev1) - $(AR) $(ARFLAGS) $(CBLASLIB) $(clev1) $(sclev1) + $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $(clev1) $(sclev1) $(RANLIB) $(CBLASLIB) # Double complex precision zlib1: $(zlev1) - $(AR) $(ARFLAGS) 
$(CBLASLIB) $(zlev1) + $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $(zlev1) $(RANLIB) $(CBLASLIB) # All precisions all1: $(alev1) - $(AR) $(ARFLAGS) $(CBLASLIB) $(alev1) + $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $(alev1) $(RANLIB) $(CBLASLIB) # @@ -146,27 +146,27 @@ alev2 = $(slev2) $(dlev2) $(clev2) $(zlev2) # Single real precision slib2: $(slev2) $(errhand) - $(AR) $(ARFLAGS) $(CBLASLIB) $(slev2) $(errhand) + $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $(slev2) $(errhand) $(RANLIB) $(CBLASLIB) # Double real precision dlib2: $(dlev2) $(errhand) - $(AR) $(ARFLAGS) $(CBLASLIB) $(dlev2) $(errhand) + $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $(dlev2) $(errhand) $(RANLIB) $(CBLASLIB) # Single complex precision clib2: $(clev2) $(errhand) - $(AR) $(ARFLAGS) $(CBLASLIB) $(clev2) $(errhand) + $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $(clev2) $(errhand) $(RANLIB) $(CBLASLIB) # Double complex precision zlib2: $(zlev2) $(errhand) - $(AR) $(ARFLAGS) $(CBLASLIB) $(zlev2) $(errhand) + $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $(zlev2) $(errhand) $(RANLIB) $(CBLASLIB) # All precisions all2: $(alev2) $(errhand) - $(AR) $(ARFLAGS) $(CBLASLIB) $(alev2) $(errhand) + $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $(alev2) $(errhand) $(RANLIB) $(CBLASLIB) # # @@ -211,32 +211,32 @@ alev3 = $(slev3) $(dlev3) $(clev3) $(zlev3) # Single real precision slib3: $(slev3) $(errhand) - $(AR) $(ARFLAGS) $(CBLASLIB) $(slev3) $(errhand) + $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $(slev3) $(errhand) $(RANLIB) $(CBLASLIB) # Double real precision dlib3: $(dlev3) $(errhand) - $(AR) $(ARFLAGS) $(CBLASLIB) $(dlev3) $(errhand) + $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $(dlev3) $(errhand) $(RANLIB) $(CBLASLIB) # Single complex precision clib3: $(clev3) $(errhand) - $(AR) $(ARFLAGS) $(CBLASLIB) $(clev3) $(errhand) + $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $(clev3) $(errhand) $(RANLIB) $(CBLASLIB) # Single complex precision zlib3: $(zlev3) $(errhand) - $(AR) $(ARFLAGS) $(CBLASLIB) $(zlev3) $(errhand) + $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $(zlev3) $(errhand) $(RANLIB) $(CBLASLIB) # All precisions 
all3: $(alev3) $(errhand) - $(AR) $(ARFLAGS) $(CBLASLIB) $(alev3) + $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $(alev3) $(RANLIB) $(CBLASLIB) # All levels and precisions cblaslib: $(alev) - $(AR) $(ARFLAGS) $(CBLASLIB) $(alev) + $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $(alev) $(RANLIB) $(CBLASLIB) FRC: diff --git a/lapack-netlib/DOCS/lawn81.tex b/lapack-netlib/DOCS/lawn81.tex index 84e12f80b..16efef768 100644 --- a/lapack-netlib/DOCS/lawn81.tex +++ b/lapack-netlib/DOCS/lawn81.tex @@ -466,7 +466,7 @@ TIMER = EXT_ETIME Refer to the section~\ref{second} to get more information. -Next, you will need to modify \texttt{AR}, \texttt{ARFLAGS}, and \texttt{RANLIB} to specify archiver, +Next, you will need to modify \texttt{ARCH}, \texttt{ARCHFLAGS}, and \texttt{RANLIB} to specify archiver, archiver options, and ranlib for your machine. If your architecture does not require \texttt{ranlib} to be run after each archive command (as is the case with CRAY computers running UNICOS, Hewlett Packard diff --git a/lapack-netlib/INSTALL/make.inc.ALPHA b/lapack-netlib/INSTALL/make.inc.ALPHA index d20fe70bc..33353d2d0 100644 --- a/lapack-netlib/INSTALL/make.inc.ALPHA +++ b/lapack-netlib/INSTALL/make.inc.ALPHA @@ -52,9 +52,9 @@ CFLAGS = -O4 # The archiver and the flag(s) to use when building archive (library) # If you system has no ranlib, set RANLIB = echo. # -AR = ar -ARFLAGS = cr -RANLIB = ranlib +ARCH = ar +ARCHFLAGS= cr +RANLIB = ranlib # # Location of the extended-precision BLAS (XBLAS) Fortran library # used for building and testing extended-precision routines. The diff --git a/lapack-netlib/INSTALL/make.inc.HPPA b/lapack-netlib/INSTALL/make.inc.HPPA index f38e16bbc..062e1a56b 100644 --- a/lapack-netlib/INSTALL/make.inc.HPPA +++ b/lapack-netlib/INSTALL/make.inc.HPPA @@ -52,9 +52,9 @@ CFLAGS = # The archiver and the flag(s) to use when building archive (library) # If you system has no ranlib, set RANLIB = echo. 
# -AR = ar -ARFLAGS = cr -RANLIB = echo +ARCH = ar +ARCHFLAGS= cr +RANLIB = echo # # Location of the extended-precision BLAS (XBLAS) Fortran library # used for building and testing extended-precision routines. The diff --git a/lapack-netlib/INSTALL/make.inc.IRIX64 b/lapack-netlib/INSTALL/make.inc.IRIX64 index 6ad48c2bc..c8b34e4ed 100644 --- a/lapack-netlib/INSTALL/make.inc.IRIX64 +++ b/lapack-netlib/INSTALL/make.inc.IRIX64 @@ -55,9 +55,9 @@ CFLAGS = -O3 # The archiver and the flag(s) to use when building archive (library) # If you system has no ranlib, set RANLIB = echo. # -AR = ar -ARFLAGS = cr -RANLIB = echo +ARCH = ar +ARCHFLAGS= cr +RANLIB = echo # # Location of the extended-precision BLAS (XBLAS) Fortran library # used for building and testing extended-precision routines. The diff --git a/lapack-netlib/INSTALL/make.inc.O2K b/lapack-netlib/INSTALL/make.inc.O2K index 1ea0a1c8c..55b7de245 100644 --- a/lapack-netlib/INSTALL/make.inc.O2K +++ b/lapack-netlib/INSTALL/make.inc.O2K @@ -55,9 +55,9 @@ CFLAGS = -O3 # The archiver and the flag(s) to use when building archive (library) # If you system has no ranlib, set RANLIB = echo. # -AR = ar -ARFLAGS = cr -RANLIB = echo +ARCH = ar +ARCHFLAGS= cr +RANLIB = echo # # Location of the extended-precision BLAS (XBLAS) Fortran library # used for building and testing extended-precision routines. The diff --git a/lapack-netlib/INSTALL/make.inc.SGI5 b/lapack-netlib/INSTALL/make.inc.SGI5 index 8afd522b8..dccfae333 100644 --- a/lapack-netlib/INSTALL/make.inc.SGI5 +++ b/lapack-netlib/INSTALL/make.inc.SGI5 @@ -52,9 +52,9 @@ CFLAGS = -O4 # The archiver and the flag(s) to use when building archive (library) # If you system has no ranlib, set RANLIB = echo. # -AR = ar -ARFLAGS = cr -RANLIB = echo +ARCH = ar +ARCHFLAGS= cr +RANLIB = echo # # Location of the extended-precision BLAS (XBLAS) Fortran library # used for building and testing extended-precision routines. 
The diff --git a/lapack-netlib/INSTALL/make.inc.SUN4 b/lapack-netlib/INSTALL/make.inc.SUN4 index 39b0136ac..dd5cfd41e 100644 --- a/lapack-netlib/INSTALL/make.inc.SUN4 +++ b/lapack-netlib/INSTALL/make.inc.SUN4 @@ -52,9 +52,9 @@ CFLAGS = -O3 # The archiver and the flag(s) to use when building archive (library) # If you system has no ranlib, set RANLIB = echo. # -AR = ar -ARFLAGS = cr -RANLIB = ranlib +ARCH = ar +ARCHFLAGS= cr +RANLIB = ranlib # # Location of the extended-precision BLAS (XBLAS) Fortran library # used for building and testing extended-precision routines. The diff --git a/lapack-netlib/INSTALL/make.inc.SUN4SOL2 b/lapack-netlib/INSTALL/make.inc.SUN4SOL2 index db4725845..eb71a386d 100644 --- a/lapack-netlib/INSTALL/make.inc.SUN4SOL2 +++ b/lapack-netlib/INSTALL/make.inc.SUN4SOL2 @@ -56,9 +56,9 @@ CFLAGS = -O3 # The archiver and the flag(s) to use when building archive (library) # If you system has no ranlib, set RANLIB = echo. # -AR = ar -ARFLAGS = cr -RANLIB = echo +ARCH = ar +ARCHFLAGS= cr +RANLIB = echo # # Location of the extended-precision BLAS (XBLAS) Fortran library # used for building and testing extended-precision routines. The diff --git a/lapack-netlib/INSTALL/make.inc.XLF b/lapack-netlib/INSTALL/make.inc.XLF index 2b05fbebb..5824e8f15 100644 --- a/lapack-netlib/INSTALL/make.inc.XLF +++ b/lapack-netlib/INSTALL/make.inc.XLF @@ -53,9 +53,9 @@ CFLAGS = -O3 -qnosave # The archiver and the flag(s) to use when building archive (library) # If you system has no ranlib, set RANLIB = echo. # -AR = ar -ARFLAGS = cr -RANLIB = ranlib +ARCH = ar +ARCHFLAGS= cr +RANLIB = ranlib # # Location of the extended-precision BLAS (XBLAS) Fortran library # used for building and testing extended-precision routines. 
The diff --git a/lapack-netlib/INSTALL/make.inc.gfortran b/lapack-netlib/INSTALL/make.inc.gfortran index 27164c2c1..43986435c 100644 --- a/lapack-netlib/INSTALL/make.inc.gfortran +++ b/lapack-netlib/INSTALL/make.inc.gfortran @@ -56,9 +56,9 @@ CFLAGS = -O3 # The archiver and the flag(s) to use when building archive (library) # If you system has no ranlib, set RANLIB = echo. # -AR = ar -ARFLAGS = cr -RANLIB = ranlib +ARCH = ar +ARCHFLAGS= cr +RANLIB = ranlib # # Location of the extended-precision BLAS (XBLAS) Fortran library # used for building and testing extended-precision routines. The diff --git a/lapack-netlib/INSTALL/make.inc.gfortran_debug b/lapack-netlib/INSTALL/make.inc.gfortran_debug index ceddfc665..294758f09 100644 --- a/lapack-netlib/INSTALL/make.inc.gfortran_debug +++ b/lapack-netlib/INSTALL/make.inc.gfortran_debug @@ -56,9 +56,9 @@ CFLAGS = -g # The archiver and the flag(s) to use when building archive (library) # If you system has no ranlib, set RANLIB = echo. # -AR = ar -ARFLAGS = cr -RANLIB = ranlib +ARCH = ar +ARCHFLAGS= cr +RANLIB = ranlib # # Location of the extended-precision BLAS (XBLAS) Fortran library # used for building and testing extended-precision routines. The diff --git a/lapack-netlib/INSTALL/make.inc.ifort b/lapack-netlib/INSTALL/make.inc.ifort index 3e5b83daa..5fca5c47e 100644 --- a/lapack-netlib/INSTALL/make.inc.ifort +++ b/lapack-netlib/INSTALL/make.inc.ifort @@ -52,9 +52,9 @@ CFLAGS = -O3 # The archiver and the flag(s) to use when building archive (library) # If you system has no ranlib, set RANLIB = echo. # -AR = ar -ARFLAGS = cr -RANLIB = ranlib +ARCH = ar +ARCHFLAGS= cr +RANLIB = ranlib # # Location of the extended-precision BLAS (XBLAS) Fortran library # used for building and testing extended-precision routines. 
The diff --git a/lapack-netlib/INSTALL/make.inc.pgf95 b/lapack-netlib/INSTALL/make.inc.pgf95 index cfad5dfe3..aaddfa5bd 100644 --- a/lapack-netlib/INSTALL/make.inc.pgf95 +++ b/lapack-netlib/INSTALL/make.inc.pgf95 @@ -52,9 +52,9 @@ CFLAGS = # The archiver and the flag(s) to use when building archive (library) # If you system has no ranlib, set RANLIB = echo. # -AR = ar -ARFLAGS = cr -RANLIB = echo +ARCH = ar +ARCHFLAGS= cr +RANLIB = echo # # Location of the extended-precision BLAS (XBLAS) Fortran library # used for building and testing extended-precision routines. The diff --git a/lapack-netlib/INSTALL/make.inc.pghpf b/lapack-netlib/INSTALL/make.inc.pghpf index 3261da874..782c16d76 100644 --- a/lapack-netlib/INSTALL/make.inc.pghpf +++ b/lapack-netlib/INSTALL/make.inc.pghpf @@ -52,9 +52,9 @@ CFLAGS = # The archiver and the flag(s) to use when building archive (library) # If you system has no ranlib, set RANLIB = echo. # -AR = ar -ARFLAGS = cr -RANLIB = echo +ARCH = ar +ARCHFLAGS= cr +RANLIB = echo # # Location of the extended-precision BLAS (XBLAS) Fortran library # used for building and testing extended-precision routines. 
The diff --git a/lapack-netlib/LAPACKE/src/Makefile b/lapack-netlib/LAPACKE/src/Makefile index 3e10fff35..636ca35b6 100644 --- a/lapack-netlib/LAPACKE/src/Makefile +++ b/lapack-netlib/LAPACKE/src/Makefile @@ -2207,8 +2207,8 @@ OBJ_FILES := $(C_FILES:.o=.o) all: ../../$(LAPACKELIB) ../../$(LAPACKELIB): $(ALLOBJA) $(ALLOBJB) $(ALLXOBJ) $(DEPRECATED) - $(AR) $(ARFLAGS) ../../$(LAPACKELIB) $(ALLOBJA) - $(AR) $(ARFLAGS) ../../$(LAPACKELIB) $(ALLOBJB) $(ALLXOBJ) $(DEPRECATED) + $(ARCH) $(ARCHFLAGS) ../../$(LAPACKELIB) $(ALLOBJA) + $(ARCH) $(ARCHFLAGS) ../../$(LAPACKELIB) $(ALLOBJB) $(ALLXOBJ) $(DEPRECATED) $(RANLIB) ../../$(LAPACKELIB) .c.o: diff --git a/lapack-netlib/LAPACKE/utils/Makefile b/lapack-netlib/LAPACKE/utils/Makefile index cd3de5c8b..1d7856789 100644 --- a/lapack-netlib/LAPACKE/utils/Makefile +++ b/lapack-netlib/LAPACKE/utils/Makefile @@ -186,7 +186,7 @@ OBJ = lapacke_cgb_nancheck.o \ all: lib lib: $(OBJ) - $(AR) $(ARFLAGS) ../../$(LAPACKELIB) $(OBJ) + $(ARCH) $(ARCHFLAGS) ../../$(LAPACKELIB) $(OBJ) $(RANLIB) ../../$(LAPACKELIB) .c.o: diff --git a/lapack-netlib/SRC/Makefile b/lapack-netlib/SRC/Makefile index 257ff136a..22799769a 100644 --- a/lapack-netlib/SRC/Makefile +++ b/lapack-netlib/SRC/Makefile @@ -455,26 +455,26 @@ endif all: ../$(LAPACKLIB) ../$(LAPACKLIB): $(ALLOBJ) $(ALLXOBJ) $(DEPRECATED) - $(AR) $(ARFLAGS) $@ $(ALLOBJ) $(ALLXOBJ) $(DEPRECATED) + $(ARCH) $(ARCHFLAGS) $@ $(ALLOBJ) $(ALLXOBJ) $(DEPRECATED) $(RANLIB) $@ single: $(SLASRC) $(DSLASRC) $(SXLASRC) $(SCLAUX) $(ALLAUX) - $(AR) $(ARFLAGS) ../$(LAPACKLIB) $(SLASRC) $(DSLASRC) \ + $(ARCH) $(ARCHFLAGS) ../$(LAPACKLIB) $(SLASRC) $(DSLASRC) \ $(SXLASRC) $(SCLAUX) $(ALLAUX) $(ALLXAUX) $(RANLIB) ../$(LAPACKLIB) complex: $(CLASRC) $(ZCLASRC) $(CXLASRC) $(SCLAUX) $(ALLAUX) - $(AR) $(ARFLAGS) ../$(LAPACKLIB) $(CLASRC) $(ZCLASRC) \ + $(ARCH) $(ARCHFLAGS) ../$(LAPACKLIB) $(CLASRC) $(ZCLASRC) \ $(CXLASRC) $(SCLAUX) $(ALLAUX) $(ALLXAUX) $(RANLIB) ../$(LAPACKLIB) double: $(DLASRC) $(DSLASRC) $(DXLASRC) 
$(DZLAUX) $(ALLAUX) - $(AR) $(ARFLAGS) ../$(LAPACKLIB) $(DLASRC) $(DSLASRC) \ + $(ARCH) $(ARCHFLAGS) ../$(LAPACKLIB) $(DLASRC) $(DSLASRC) \ $(DXLASRC) $(DZLAUX) $(ALLAUX) $(ALLXAUX) $(RANLIB) ../$(LAPACKLIB) complex16: $(ZLASRC) $(ZCLASRC) $(ZXLASRC) $(DZLAUX) $(ALLAUX) - $(AR) $(ARFLAGS) ../$(LAPACKLIB) $(ZLASRC) $(ZCLASRC) \ + $(ARCH) $(ARCHFLAGS) ../$(LAPACKLIB) $(ZLASRC) $(ZCLASRC) \ $(ZXLASRC) $(DZLAUX) $(ALLAUX) $(ALLXAUX) $(RANLIB) ../$(LAPACKLIB) diff --git a/lapack-netlib/SRC/VARIANTS/Makefile b/lapack-netlib/SRC/VARIANTS/Makefile index 6034aed60..42446eb55 100644 --- a/lapack-netlib/SRC/VARIANTS/Makefile +++ b/lapack-netlib/SRC/VARIANTS/Makefile @@ -35,27 +35,27 @@ QRLL = qr/LL/cgeqrf.o qr/LL/dgeqrf.o qr/LL/sgeqrf.o qr/LL/zgeqrf.o qr/LL/sceil. all: cholrl choltop lucr lull lurec qrll cholrl: $(CHOLRL) - $(AR) $(ARFLAGS) $(VARIANTSDIR)/cholrl.a $(CHOLRL) + $(ARCH) $(ARCHFLAGS) $(VARIANTSDIR)/cholrl.a $(CHOLRL) $(RANLIB) $(VARIANTSDIR)/cholrl.a choltop: $(CHOLTOP) - $(AR) $(ARFLAGS) $(VARIANTSDIR)/choltop.a $(CHOLTOP) + $(ARCH) $(ARCHFLAGS) $(VARIANTSDIR)/choltop.a $(CHOLTOP) $(RANLIB) $(VARIANTSDIR)/choltop.a lucr: $(LUCR) - $(AR) $(ARFLAGS) $(VARIANTSDIR)/lucr.a $(LUCR) + $(ARCH) $(ARCHFLAGS) $(VARIANTSDIR)/lucr.a $(LUCR) $(RANLIB) $(VARIANTSDIR)/lucr.a lull: $(LULL) - $(AR) $(ARFLAGS) $(VARIANTSDIR)/lull.a $(LULL) + $(ARCH) $(ARCHFLAGS) $(VARIANTSDIR)/lull.a $(LULL) $(RANLIB) $(VARIANTSDIR)/lull.a lurec: $(LUREC) - $(AR) $(ARFLAGS) $(VARIANTSDIR)/lurec.a $(LUREC) + $(ARCH) $(ARCHFLAGS) $(VARIANTSDIR)/lurec.a $(LUREC) $(RANLIB) $(VARIANTSDIR)/lurec.a qrll: $(QRLL) - $(AR) $(ARFLAGS) $(VARIANTSDIR)/qrll.a $(QRLL) + $(ARCH) $(ARCHFLAGS) $(VARIANTSDIR)/qrll.a $(QRLL) $(RANLIB) $(VARIANTSDIR)/qrll.a diff --git a/lapack-netlib/TESTING/MATGEN/Makefile b/lapack-netlib/TESTING/MATGEN/Makefile index 0ba5f44ad..ecd9aa5c1 100644 --- a/lapack-netlib/TESTING/MATGEN/Makefile +++ b/lapack-netlib/TESTING/MATGEN/Makefile @@ -59,23 +59,23 @@ ALLOBJ = $(SMATGEN) $(CMATGEN) 
$(SCATGEN) $(DMATGEN) $(ZMATGEN) \ $(DZATGEN) ../../$(TMGLIB): $(ALLOBJ) - $(AR) $(ARFLAGS) $@ $(ALLOBJ) + $(ARCH) $(ARCHFLAGS) $@ $(ALLOBJ) $(RANLIB) $@ single: $(SMATGEN) $(SCATGEN) - $(AR) $(ARFLAGS) ../../$(TMGLIB) $(SMATGEN) $(SCATGEN) + $(ARCH) $(ARCHFLAGS) ../../$(TMGLIB) $(SMATGEN) $(SCATGEN) $(RANLIB) ../../$(TMGLIB) complex: $(CMATGEN) $(SCATGEN) - $(AR) $(ARFLAGS) ../../$(TMGLIB) $(CMATGEN) $(SCATGEN) + $(ARCH) $(ARCHFLAGS) ../../$(TMGLIB) $(CMATGEN) $(SCATGEN) $(RANLIB) ../../$(TMGLIB) double: $(DMATGEN) $(DZATGEN) - $(AR) $(ARFLAGS) ../../$(TMGLIB) $(DMATGEN) $(DZATGEN) + $(ARCH) $(ARCHFLAGS) ../../$(TMGLIB) $(DMATGEN) $(DZATGEN) $(RANLIB) ../../$(TMGLIB) complex16: $(ZMATGEN) $(DZATGEN) - $(AR) $(ARFLAGS) ../../$(TMGLIB) $(ZMATGEN) $(DZATGEN) + $(ARCH) $(ARCHFLAGS) ../../$(TMGLIB) $(ZMATGEN) $(DZATGEN) $(RANLIB) ../../$(TMGLIB) $(SCATGEN): $(FRC) diff --git a/lapack-netlib/make.inc.example b/lapack-netlib/make.inc.example index 0379069bb..504a16421 100644 --- a/lapack-netlib/make.inc.example +++ b/lapack-netlib/make.inc.example @@ -57,9 +57,9 @@ CFLAGS = -O3 # The archiver and the flag(s) to use when building archive (library) # If you system has no ranlib, set RANLIB = echo. # -AR = ar -ARFLAGS = cr -RANLIB = ranlib +ARCH = ar +ARCHFLAGS= cr +RANLIB = ranlib # # Location of the extended-precision BLAS (XBLAS) Fortran library # used for building and testing extended-precision routines. The diff --git a/make.inc b/make.inc index a05d3f42e..1fc95b0c6 100644 --- a/make.inc +++ b/make.inc @@ -1,6 +1,6 @@ SHELL = /bin/sh PLAT = _LINUX DRVOPTS = $(NOOPT) -ARFLAGS= -ru +ARCHFLAGS= -ru #RANLIB = ranlib diff --git a/param.h b/param.h index 9f131eb77..480518cd4 100644 --- a/param.h +++ b/param.h @@ -2182,7 +2182,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x03fffUL -#ifdef MIPS_SIMD +#ifdef HAVE_MSA #define SGEMM_DEFAULT_UNROLL_M 8 #define SGEMM_DEFAULT_UNROLL_N 8 From 78348a285350425ff3baf782f8f37906e7c6ddc6 Mon Sep 17 00:00:00 2001 From: "nishidha@us.ibm.com" Date: Thu, 11 Aug 2016 14:43:26 +0530 Subject: [PATCH 67/70] Added support of IBM's MASS library that optimizes performance on Power architectures --- Makefile.power | 10 ++++++++++ README.md | 29 +++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/Makefile.power b/Makefile.power index 589d67441..79db83751 100644 --- a/Makefile.power +++ b/Makefile.power @@ -38,6 +38,16 @@ else endif endif +#Either uncomment below line or run make with `USE_MASS=1` to enable support of MASS library +#USE_MASS = 1 + +ifeq ($(USE_MASS), 1) +# Path to MASS libs, change it if the libs are installed at any other location +MASSPATH = /opt/ibm/xlmass/8.1.3/lib +COMMON_OPT += -mveclibabi=mass -ftree-vectorize -funsafe-math-optimizations -DUSE_MASS +EXTRALIB += -L$(MASSPATH) -lmass -lmassvp8 -lmass_simdp8 +endif + ifdef BINARY64 diff --git a/README.md b/README.md index 8ac88840a..ff55edaa1 100644 --- a/README.md +++ b/README.md @@ -43,6 +43,35 @@ On X86 box, compile this library for loongson3a CPU with loongcc (based on Open6 make DEBUG=1 +### Compile with MASS Support on Power CPU (Optional dependency) + +[IBM MASS](http://www-01.ibm.com/software/awdtools/mass/linux/mass-linux.html) library consists of a set of mathematical functions for C, C++, and +Fortran-language applications that are tuned for optimum performance on POWER architectures. OpenBLAS with MASS requires 64-bit, little-endian OS on POWER. 
+The library can be installed as below - + + * On Ubuntu: + + wget -q http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/public.gpg -O- | sudo apt-key add - + echo "deb http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/ trusty main" | sudo tee /etc/apt/sources.list.d/ibm-xl-compiler-eval.list + sudo apt-get update + sudo apt-get install libxlmass-devel.8.1.3 + + * On RHEL/CentOS: + + wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/repodata/repomd.xml.key + sudo rpm --import repomd.xml.key + wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/ibm-xl-compiler-eval.repo + sudo cp ibm-xl-compiler-eval.repo /etc/yum.repos.d/ + sudo yum install libxlmass-devel.8.1.3 + +After installing MASS library, compile openblas with USE_MASS=1. + +Example: + +Compiling on Power8 with MASS support - + + make USE_MASS=1 TARGET=POWER8 + ### Install to the directory (optional) Example: From ae70b916f476e9ce50e23a0286a7e7d7e60bc436 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Thu, 18 Aug 2016 10:24:42 -0700 Subject: [PATCH 68/70] Refs #929. Deal with zero and NaNs for scale. --- kernel/power/KERNEL.POWER8 | 2 +- kernel/power/zscal.c | 17 +++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index 323b67d05..dacff7ab2 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -137,7 +137,7 @@ DROTKERNEL = drot.c # SSCALKERNEL = sscal.c DSCALKERNEL = dscal.c -#CSCALKERNEL = ../arm/zscal.c +CSCALKERNEL = zscal.c ZSCALKERNEL = zscal.c # SSWAPKERNEL = sswap.c diff --git a/kernel/power/zscal.c b/kernel/power/zscal.c index 213839a8f..410fc9840 100644 --- a/kernel/power/zscal.c +++ b/kernel/power/zscal.c @@ -39,8 +39,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#pragma GCC optimize "O1" #if defined(POWER8) +#if defined(DOUBLE) #include "zscal_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_8 @@ -123,6 +125,21 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F if ( inc_x <= 0 ) return(0); + if (da_r == ZERO && da_i == ZERO) { + //clear the vector and return + if (inc_x == 1) { + memset(x, 0, n*COMPSIZE*SIZE); + }else{ + inc_x2 = 2 * inc_x; + for(i=0; i Date: Thu, 18 Aug 2016 18:59:43 -0700 Subject: [PATCH 69/70] Refs #946. Use nrm2 reference implementation for Power8. --- kernel/power/KERNEL.POWER8 | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index dacff7ab2..b9f44db91 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -125,10 +125,10 @@ DDOTKERNEL = ddot.c #CDOTKERNEL = ../arm/zdot.c ZDOTKERNEL = zdot.c # -#SNRM2KERNEL = ../arm/nrm2.c -#DNRM2KERNEL = ../arm/nrm2.c -#CNRM2KERNEL = ../arm/znrm2.c -#ZNRM2KERNEL = ../arm/znrm2.c +SNRM2KERNEL = ../arm/nrm2.c +DNRM2KERNEL = ../arm/nrm2.c +CNRM2KERNEL = ../arm/znrm2.c +ZNRM2KERNEL = ../arm/znrm2.c # SROTKERNEL = srot.c DROTKERNEL = drot.c From 821affb9a00f65d689055489c6672cadde40375b Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Wed, 31 Aug 2016 23:58:29 -0400 Subject: [PATCH 70/70] Update doc for 0.2.19. 
--- CMakeLists.txt | 2 +- Changelog.txt | 18 ++++++++++++++++++ Makefile.rule | 2 +- 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d96140232..ff42643fa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.4) project(OpenBLAS) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 2) -set(OpenBLAS_PATCH_VERSION 19.dev) +set(OpenBLAS_PATCH_VERSION 19) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") enable_language(ASM) diff --git a/Changelog.txt b/Changelog.txt index 7f82e8e88..2eb27ab04 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,22 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.2.19 +1-Sep-2016 +common: + * Improved cross compiling. + * Fix the bug on musl libc. + +POWER: + * Optimize BLAS on Power8 + * Fixed Julia+OpenBLAS bugs on Power8 + +MIPS: + * Optimize BLAS on MIPS P5600 and I6400 (Thanks, Shivraj Patil, Kaustubh Raste) + +ARM: + * Improved on ARM Cortex-A57. (Thanks, Ashwin Sekhar T K) + + ==================================================================== Version 0.2.18 12-Apr-2016 diff --git a/Makefile.rule b/Makefile.rule index 2d27237de..5bb9cf0b7 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.2.19.dev +VERSION = 0.2.19 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library