Signed-off-by: Shivraj Patil <shivraj.patil@imgtec.com>
tags/v0.2.19^2
| @@ -80,11 +80,6 @@ DGEMVTKERNEL = ../mips/gemv_t.c | |||
| CGEMVTKERNEL = ../mips/zgemv_t.c | |||
| ZGEMVTKERNEL = ../mips/zgemv_t.c | |||
| STRMMKERNEL = ../generic/trmmkernel_2x2.c | |||
| DTRMMKERNEL = ../generic/trmmkernel_2x2.c | |||
| CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c | |||
| ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c | |||
| SGEMMKERNEL = ../mips/sgemm_kernel_8x8_msa.c | |||
| SGEMMONCOPY = ../mips/sgemm_ncopy_8_msa.c | |||
| SGEMMOTCOPY = ../mips/sgemm_tcopy_8_msa.c | |||
| @@ -101,15 +96,19 @@ DGEMMITCOPYOBJ = dgemm_itcopy.o | |||
| DGEMMONCOPYOBJ = dgemm_oncopy.o | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy.o | |||
| CGEMMKERNEL = ../generic/zgemmkernel_2x2.c | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
| CGEMMKERNEL = ../mips/cgemm_kernel_8x4_msa.c | |||
| CGEMMINCOPY = ../mips/cgemm_ncopy_8_msa.c | |||
| CGEMMITCOPY = ../mips/cgemm_tcopy_8_msa.c | |||
| CGEMMONCOPY = ../mips/cgemm_ncopy_4_msa.c | |||
| CGEMMOTCOPY = ../mips/cgemm_tcopy_4_msa.c | |||
| CGEMMINCOPYOBJ = cgemm_incopy.o | |||
| CGEMMITCOPYOBJ = cgemm_itcopy.o | |||
| CGEMMONCOPYOBJ = cgemm_oncopy.o | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy.o | |||
| ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
| ZGEMMKERNEL = ../mips/zgemm_kernel_4x4_msa.c | |||
| ZGEMMONCOPY = ../mips/zgemm_ncopy_4_msa.c | |||
| ZGEMMOTCOPY = ../mips/zgemm_tcopy_4_msa.c | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy.o | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy.o | |||
| @@ -0,0 +1,195 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #include "common.h" | |||
| #include "macros_msa.h" | |||
/*
 * Copy an m-by-n region from src to dst, taking up to four source lines
 * at a time and interleaving them element by element.
 *
 * An element is a pair of consecutive FLOATs (presumably the real and
 * imaginary parts of a single-precision complex value -- confirm against
 * the build's FLOAT definition).
 *
 *   m    number of elements consumed from every source line
 *   n    number of source lines
 *   src  start of the first source line
 *   lda  distance between consecutive source lines, in elements
 *   dst  destination buffer; it is written strictly front to back
 *
 * Always returns 0.
 *
 * NOTE(review): the diff rendering does not show this file's name; judging
 * by the kernel list earlier in the commit it is presumably
 * cgemm_ncopy_4_msa.c -- confirm.
 */
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst) | |||
| { | |||
| BLASLONG i, j; | |||
| FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *pdst; | |||
| FLOAT ctemp01, ctemp02, ctemp03, ctemp04; | |||
| FLOAT ctemp05, ctemp06, ctemp07, ctemp08; | |||
| v4f32 src0, src1, src2, src3, src4, src5, src6, src7; | |||
| v4f32 dst0, dst1, dst4, dst5; | |||
| psrc0 = src; | |||
| pdst = dst; | |||
/* lda is given in elements; every element is two FLOATs wide. */
| lda *= 2; | |||
/* Full groups of four source lines. */
| for (j = (n >> 2); j--;) | |||
| { | |||
| psrc1 = psrc0; | |||
| psrc2 = psrc1 + lda; | |||
| psrc3 = psrc2 + lda; | |||
| psrc4 = psrc3 + lda; | |||
| psrc0 += 4 * lda; | |||
/*
 * Four elements per line per pass: two vector loads (8 FLOATs) from each
 * of the four lines. The _INC load/store macros presumably advance their
 * pointer argument past the data they touch -- TODO confirm in
 * macros_msa.h.
 */
| for (i = (m >> 2); i--;) | |||
| { | |||
| LD_SP2_INC(psrc1, 4, src0, src1); | |||
| LD_SP2_INC(psrc2, 4, src2, src3); | |||
| LD_SP2_INC(psrc3, 4, src4, src5); | |||
| LD_SP2_INC(psrc4, 4, src6, src7); | |||
/*
 * ILVRL_D2_SP presumably pairs up the 64-bit halves (one element each) of
 * its two inputs, so dst0/dst1 hold the first element of lines 1..4 and
 * dst4/dst5 the second -- TODO confirm in macros_msa.h.
 */
| ILVRL_D2_SP(src2, src0, dst0, dst4); | |||
| ILVRL_D2_SP(src6, src4, dst1, dst5); | |||
| ST_SP4_INC(dst0, dst1, dst4, dst5, pdst, 4); | |||
/* Same again for the third and fourth element of every line. */
| ILVRL_D2_SP(src3, src1, dst0, dst4); | |||
| ILVRL_D2_SP(src7, src5, dst1, dst5); | |||
| ST_SP4_INC(dst0, dst1, dst4, dst5, pdst, 4); | |||
| } | |||
/* Two leftover elements per line: one vector from each line. */
| if (m & 2) | |||
| { | |||
| src0 = LD_SP(psrc1); | |||
| src2 = LD_SP(psrc2); | |||
| src4 = LD_SP(psrc3); | |||
| src6 = LD_SP(psrc4); | |||
| psrc1 += 4; | |||
| psrc2 += 4; | |||
| psrc3 += 4; | |||
| psrc4 += 4; | |||
| ILVRL_D2_SP(src2, src0, dst0, dst4); | |||
| ILVRL_D2_SP(src6, src4, dst1, dst5); | |||
| ST_SP4_INC(dst0, dst1, dst4, dst5, pdst, 4); | |||
| } | |||
/*
 * One leftover element per line: copied with scalar loads, written in
 * line order (line 1, 2, 3, 4) as eight consecutive FLOATs.
 */
| if (m & 1) | |||
| { | |||
| ctemp01 = *(psrc1 + 0); | |||
| ctemp02 = *(psrc1 + 1); | |||
| ctemp03 = *(psrc2 + 0); | |||
| ctemp04 = *(psrc2 + 1); | |||
| ctemp05 = *(psrc3 + 0); | |||
| ctemp06 = *(psrc3 + 1); | |||
| ctemp07 = *(psrc4 + 0); | |||
| ctemp08 = *(psrc4 + 1); | |||
/* These advances are not read again; the pointers are reset per group. */
| psrc1 += 2; | |||
| psrc2 += 2; | |||
| psrc3 += 2; | |||
| psrc4 += 2; | |||
| *(pdst + 0) = ctemp01; | |||
| *(pdst + 1) = ctemp02; | |||
| *(pdst + 2) = ctemp03; | |||
| *(pdst + 3) = ctemp04; | |||
| *(pdst + 4) = ctemp05; | |||
| *(pdst + 5) = ctemp06; | |||
| *(pdst + 6) = ctemp07; | |||
| *(pdst + 7) = ctemp08; | |||
| pdst += 8; | |||
| } | |||
| } | |||
/* A remaining pair of source lines, interleaved two-wide. */
| if (n & 2) | |||
| { | |||
| psrc1 = psrc0; | |||
| psrc2 = psrc1 + lda; | |||
| psrc0 += 2 * lda; | |||
| for (i = (m >> 2); i--;) | |||
| { | |||
| LD_SP2_INC(psrc1, 4, src0, src1); | |||
| LD_SP2_INC(psrc2, 4, src2, src3); | |||
| ILVRL_D2_SP(src2, src0, dst0, dst4); | |||
| ST_SP2_INC(dst0, dst4, pdst, 4); | |||
| ILVRL_D2_SP(src3, src1, dst0, dst4); | |||
| ST_SP2_INC(dst0, dst4, pdst, 4); | |||
| } | |||
| if (m & 2) | |||
| { | |||
| src0 = LD_SP(psrc1); | |||
| src2 = LD_SP(psrc2); | |||
| psrc1 += 4; | |||
| psrc2 += 4; | |||
| ILVRL_D2_SP(src2, src0, dst0, dst4); | |||
| ST_SP2_INC(dst0, dst4, pdst, 4); | |||
| } | |||
| if (m & 1) | |||
| { | |||
| ctemp01 = *(psrc1 + 0); | |||
| ctemp02 = *(psrc1 + 1); | |||
| ctemp03 = *(psrc2 + 0); | |||
| ctemp04 = *(psrc2 + 1); | |||
| psrc1 += 2; | |||
| psrc2 += 2; | |||
| *(pdst + 0) = ctemp01; | |||
| *(pdst + 1) = ctemp02; | |||
| *(pdst + 2) = ctemp03; | |||
| *(pdst + 3) = ctemp04; | |||
| pdst += 4; | |||
| } | |||
| } | |||
/* A final single source line: nothing to interleave, plain copy. */
| if (n & 1) | |||
| { | |||
| psrc1 = psrc0; | |||
| for (i = (m >> 2); i--;) | |||
| { | |||
| LD_SP2_INC(psrc1, 4, src0, src1); | |||
| ST_SP2_INC(src0, src1, pdst, 4); | |||
| } | |||
| if (m & 2) | |||
| { | |||
| src0 = LD_SP(psrc1); | |||
| psrc1 += 4; | |||
| ST_SP(src0, pdst); | |||
| pdst += 4; | |||
| } | |||
| if (m & 1) | |||
| { | |||
| ctemp01 = *(psrc1 + 0); | |||
| ctemp02 = *(psrc1 + 1); | |||
| psrc1 += 2; | |||
| *(pdst + 0) = ctemp01; | |||
| *(pdst + 1) = ctemp02; | |||
| pdst += 2; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,310 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #include "common.h" | |||
| #include "macros_msa.h" | |||
/*
 * Copy an m-by-n region from src to dst, taking up to eight source lines
 * at a time and interleaving them element by element. Leftover lines are
 * handled in groups of four, two and one.
 *
 * An element is a pair of consecutive FLOATs (presumably the real and
 * imaginary parts of a single-precision complex value -- confirm against
 * the build's FLOAT definition).
 *
 *   m    number of elements consumed from every source line
 *   n    number of source lines
 *   src  start of the first source line
 *   lda  distance between consecutive source lines, in elements
 *   dst  destination buffer; it is written strictly front to back
 *
 * Always returns 0.
 *
 * NOTE(review): the diff rendering does not show this file's name; judging
 * by the kernel list earlier in the commit it is presumably
 * cgemm_ncopy_8_msa.c -- confirm.
 */
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst) | |||
| { | |||
| BLASLONG i, j; | |||
| FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *psrc5, *psrc6, *psrc7; | |||
| FLOAT *psrc8, *pdst; | |||
| FLOAT ctemp01, ctemp02, ctemp03, ctemp04, ctemp05, ctemp06, ctemp07; | |||
| FLOAT ctemp08, ctemp09, ctemp10, ctemp11, ctemp12, ctemp13, ctemp14; | |||
| FLOAT ctemp15, ctemp16; | |||
| v4f32 src0, src1, src2, src3, src4, src5, src6, src7; | |||
| v4f32 src8, src9, src10, src11, src12, src13, src14, src15; | |||
| v4f32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; | |||
| psrc0 = src; | |||
| pdst = dst; | |||
/* lda is given in elements; every element is two FLOATs wide. */
| lda *= 2; | |||
/* Full groups of eight source lines. */
| for (j = (n >> 3); j--;) | |||
| { | |||
| psrc1 = psrc0; | |||
| psrc2 = psrc1 + lda; | |||
| psrc3 = psrc2 + lda; | |||
| psrc4 = psrc3 + lda; | |||
| psrc5 = psrc4 + lda; | |||
| psrc6 = psrc5 + lda; | |||
| psrc7 = psrc6 + lda; | |||
| psrc8 = psrc7 + lda; | |||
| psrc0 += 8 * lda; | |||
/*
 * Four elements per line per pass: two vector loads (8 FLOATs) from each
 * of the eight lines. The _INC load/store macros presumably advance their
 * pointer argument past the data they touch -- TODO confirm in
 * macros_msa.h.
 */
| for (i = (m >> 2); i--;) | |||
| { | |||
| LD_SP2_INC(psrc1, 4, src0, src1); | |||
| LD_SP2_INC(psrc2, 4, src2, src3); | |||
| LD_SP2_INC(psrc3, 4, src4, src5); | |||
| LD_SP2_INC(psrc4, 4, src6, src7); | |||
| LD_SP2_INC(psrc5, 4, src8, src9); | |||
| LD_SP2_INC(psrc6, 4, src10, src11); | |||
| LD_SP2_INC(psrc7, 4, src12, src13); | |||
| LD_SP2_INC(psrc8, 4, src14, src15); | |||
/*
 * ILVRL_D2_SP presumably pairs up the 64-bit halves (one element each) of
 * its two inputs, so dst0..dst3 hold the first element of lines 1..8 and
 * dst4..dst7 the second -- TODO confirm in macros_msa.h.
 */
| ILVRL_D2_SP(src2, src0, dst0, dst4); | |||
| ILVRL_D2_SP(src6, src4, dst1, dst5); | |||
| ILVRL_D2_SP(src10, src8, dst2, dst6); | |||
| ILVRL_D2_SP(src14, src12, dst3, dst7); | |||
| ST_SP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 4); | |||
/* Same again for the third and fourth element of every line. */
| ILVRL_D2_SP(src3, src1, dst0, dst4); | |||
| ILVRL_D2_SP(src7, src5, dst1, dst5); | |||
| ILVRL_D2_SP(src11, src9, dst2, dst6); | |||
| ILVRL_D2_SP(src15, src13, dst3, dst7); | |||
| ST_SP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 4); | |||
| } | |||
/* Two leftover elements per line: one vector from each line. */
| if (m & 2) | |||
| { | |||
| src0 = LD_SP(psrc1); | |||
| src2 = LD_SP(psrc2); | |||
| src4 = LD_SP(psrc3); | |||
| src6 = LD_SP(psrc4); | |||
| src8 = LD_SP(psrc5); | |||
| src10 = LD_SP(psrc6); | |||
| src12 = LD_SP(psrc7); | |||
| src14 = LD_SP(psrc8); | |||
| psrc1 += 4; | |||
| psrc2 += 4; | |||
| psrc3 += 4; | |||
| psrc4 += 4; | |||
| psrc5 += 4; | |||
| psrc6 += 4; | |||
| psrc7 += 4; | |||
| psrc8 += 4; | |||
| ILVRL_D2_SP(src2, src0, dst0, dst4); | |||
| ILVRL_D2_SP(src6, src4, dst1, dst5); | |||
| ILVRL_D2_SP(src10, src8, dst2, dst6); | |||
| ILVRL_D2_SP(src14, src12, dst3, dst7); | |||
| ST_SP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 4); | |||
| } | |||
/*
 * One leftover element per line: copied with scalar loads, written in
 * line order (line 1 .. line 8) as sixteen consecutive FLOATs.
 */
| if (m & 1) | |||
| { | |||
| ctemp01 = *(psrc1 + 0); | |||
| ctemp02 = *(psrc1 + 1); | |||
| ctemp03 = *(psrc2 + 0); | |||
| ctemp04 = *(psrc2 + 1); | |||
| ctemp05 = *(psrc3 + 0); | |||
| ctemp06 = *(psrc3 + 1); | |||
| ctemp07 = *(psrc4 + 0); | |||
| ctemp08 = *(psrc4 + 1); | |||
| ctemp09 = *(psrc5 + 0); | |||
| ctemp10 = *(psrc5 + 1); | |||
| ctemp11 = *(psrc6 + 0); | |||
| ctemp12 = *(psrc6 + 1); | |||
| ctemp13 = *(psrc7 + 0); | |||
| ctemp14 = *(psrc7 + 1); | |||
| ctemp15 = *(psrc8 + 0); | |||
| ctemp16 = *(psrc8 + 1); | |||
/* These advances are not read again; the pointers are reset per group. */
| psrc1 += 2; | |||
| psrc2 += 2; | |||
| psrc3 += 2; | |||
| psrc4 += 2; | |||
| psrc5 += 2; | |||
| psrc6 += 2; | |||
| psrc7 += 2; | |||
| psrc8 += 2; | |||
| *(pdst + 0) = ctemp01; | |||
| *(pdst + 1) = ctemp02; | |||
| *(pdst + 2) = ctemp03; | |||
| *(pdst + 3) = ctemp04; | |||
| *(pdst + 4) = ctemp05; | |||
| *(pdst + 5) = ctemp06; | |||
| *(pdst + 6) = ctemp07; | |||
| *(pdst + 7) = ctemp08; | |||
| *(pdst + 8) = ctemp09; | |||
| *(pdst + 9) = ctemp10; | |||
| *(pdst + 10) = ctemp11; | |||
| *(pdst + 11) = ctemp12; | |||
| *(pdst + 12) = ctemp13; | |||
| *(pdst + 13) = ctemp14; | |||
| *(pdst + 14) = ctemp15; | |||
| *(pdst + 15) = ctemp16; | |||
| pdst += 16; | |||
| } | |||
| } | |||
/* A remaining group of four source lines, interleaved four-wide. */
| if (n & 4) | |||
| { | |||
| psrc1 = psrc0; | |||
| psrc2 = psrc1 + lda; | |||
| psrc3 = psrc2 + lda; | |||
| psrc4 = psrc3 + lda; | |||
| psrc0 += 4 * lda; | |||
| for (i = (m >> 2); i--;) | |||
| { | |||
| LD_SP2_INC(psrc1, 4, src0, src1); | |||
| LD_SP2_INC(psrc2, 4, src2, src3); | |||
| LD_SP2_INC(psrc3, 4, src4, src5); | |||
| LD_SP2_INC(psrc4, 4, src6, src7); | |||
| ILVRL_D2_SP(src2, src0, dst0, dst4); | |||
| ILVRL_D2_SP(src6, src4, dst1, dst5); | |||
| ST_SP4_INC(dst0, dst1, dst4, dst5, pdst, 4); | |||
| ILVRL_D2_SP(src3, src1, dst0, dst4); | |||
| ILVRL_D2_SP(src7, src5, dst1, dst5); | |||
| ST_SP4_INC(dst0, dst1, dst4, dst5, pdst, 4); | |||
| } | |||
| if (m & 2) | |||
| { | |||
| src0 = LD_SP(psrc1); | |||
| src2 = LD_SP(psrc2); | |||
| src4 = LD_SP(psrc3); | |||
| src6 = LD_SP(psrc4); | |||
| psrc1 += 4; | |||
| psrc2 += 4; | |||
| psrc3 += 4; | |||
| psrc4 += 4; | |||
| ILVRL_D2_SP(src2, src0, dst0, dst4); | |||
| ILVRL_D2_SP(src6, src4, dst1, dst5); | |||
| ST_SP4_INC(dst0, dst1, dst4, dst5, pdst, 4); | |||
| } | |||
| if (m & 1) | |||
| { | |||
| ctemp01 = *(psrc1 + 0); | |||
| ctemp02 = *(psrc1 + 1); | |||
| ctemp03 = *(psrc2 + 0); | |||
| ctemp04 = *(psrc2 + 1); | |||
| ctemp05 = *(psrc3 + 0); | |||
| ctemp06 = *(psrc3 + 1); | |||
| ctemp07 = *(psrc4 + 0); | |||
| ctemp08 = *(psrc4 + 1); | |||
| psrc1 += 2; | |||
| psrc2 += 2; | |||
| psrc3 += 2; | |||
| psrc4 += 2; | |||
| *(pdst + 0) = ctemp01; | |||
| *(pdst + 1) = ctemp02; | |||
| *(pdst + 2) = ctemp03; | |||
| *(pdst + 3) = ctemp04; | |||
| *(pdst + 4) = ctemp05; | |||
| *(pdst + 5) = ctemp06; | |||
| *(pdst + 6) = ctemp07; | |||
| *(pdst + 7) = ctemp08; | |||
| pdst += 8; | |||
| } | |||
| } | |||
/* A remaining pair of source lines, interleaved two-wide. */
| if (n & 2) | |||
| { | |||
| psrc1 = psrc0; | |||
| psrc2 = psrc1 + lda; | |||
| psrc0 += 2 * lda; | |||
| for (i = (m >> 2); i--;) | |||
| { | |||
| LD_SP2_INC(psrc1, 4, src0, src1); | |||
| LD_SP2_INC(psrc2, 4, src2, src3); | |||
| ILVRL_D2_SP(src2, src0, dst0, dst4); | |||
| ST_SP2_INC(dst0, dst4, pdst, 4); | |||
| ILVRL_D2_SP(src3, src1, dst0, dst4); | |||
| ST_SP2_INC(dst0, dst4, pdst, 4); | |||
| } | |||
| if (m & 2) | |||
| { | |||
| src0 = LD_SP(psrc1); | |||
| src2 = LD_SP(psrc2); | |||
| psrc1 += 4; | |||
| psrc2 += 4; | |||
| ILVRL_D2_SP(src2, src0, dst0, dst4); | |||
| ST_SP2_INC(dst0, dst4, pdst, 4); | |||
| } | |||
| if (m & 1) | |||
| { | |||
| ctemp01 = *(psrc1 + 0); | |||
| ctemp02 = *(psrc1 + 1); | |||
| ctemp03 = *(psrc2 + 0); | |||
| ctemp04 = *(psrc2 + 1); | |||
| psrc1 += 2; | |||
| psrc2 += 2; | |||
| *(pdst + 0) = ctemp01; | |||
| *(pdst + 1) = ctemp02; | |||
| *(pdst + 2) = ctemp03; | |||
| *(pdst + 3) = ctemp04; | |||
| pdst += 4; | |||
| } | |||
| } | |||
/* A final single source line: nothing to interleave, plain copy. */
| if (n & 1) | |||
| { | |||
| psrc1 = psrc0; | |||
| for (i = (m >> 2); i--;) | |||
| { | |||
| LD_SP2_INC(psrc1, 4, src0, src1); | |||
| ST_SP2_INC(src0, src1, pdst, 4); | |||
| } | |||
| if (m & 2) | |||
| { | |||
| src0 = LD_SP(psrc1); | |||
| psrc1 += 4; | |||
| ST_SP(src0, pdst); | |||
| pdst += 4; | |||
| } | |||
| if (m & 1) | |||
| { | |||
| ctemp01 = *(psrc1 + 0); | |||
| ctemp02 = *(psrc1 + 1); | |||
| psrc1 += 2; | |||
| *(pdst + 0) = ctemp01; | |||
| *(pdst + 1) = ctemp02; | |||
| pdst += 2; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,125 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #include "common.h" | |||
| #include "macros_msa.h" | |||
/*
 * Copy an m-by-n region from src to dst in strips. The source is cut into
 * strips four elements wide (then one strip of two and one of one for the
 * remainder of n); for each strip, the strip's slice of every one of the m
 * source lines is appended to dst, line after line.
 *
 * An element is a pair of consecutive FLOATs (presumably the real and
 * imaginary parts of a single-precision complex value -- confirm against
 * the build's FLOAT definition).
 *
 *   m    number of source lines
 *   n    number of elements taken from every source line
 *   src  start of the first source line
 *   lda  distance between consecutive source lines, in elements
 *   dst  destination buffer; it is written strictly front to back
 *
 * Always returns 0.
 *
 * NOTE(review): the diff rendering does not show this file's name; judging
 * by the kernel list earlier in the commit it is presumably
 * cgemm_tcopy_4_msa.c -- confirm.
 */
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst) | |||
| { | |||
| BLASLONG i, j; | |||
| FLOAT *psrc0; | |||
| FLOAT *psrc1, *psrc2; | |||
| FLOAT *pdst0; | |||
| FLOAT ctemp01, ctemp02, ctemp03, ctemp04; | |||
| v4f32 src0, src1, src2, src3; | |||
| psrc0 = src; | |||
| pdst0 = dst; | |||
/* lda is given in elements; every element is two FLOATs wide. */
| lda *= 2; | |||
/* Strips of four elements (8 FLOATs) per source line. */
| for (j = (n >> 2); j--;) | |||
| { | |||
| psrc1 = psrc0; | |||
| psrc2 = psrc0 + lda; | |||
/* Step to the start of the next strip within the first line. */
| psrc0 += 8; | |||
/*
 * Two source lines per pass. LD_SP2 is used without an _INC suffix and the
 * pointers are advanced by hand, two lines at a time; ST_SP4_INC
 * presumably advances pdst0 past the four stored vectors -- TODO confirm
 * in macros_msa.h.
 */
| for (i = (m >> 1); i--;) | |||
| { | |||
| LD_SP2(psrc1, 4, src0, src1); | |||
| LD_SP2(psrc2, 4, src2, src3); | |||
| ST_SP4_INC(src0, src1, src2, src3, pdst0, 4); | |||
| psrc1 += 2 * lda; | |||
| psrc2 += 2 * lda; | |||
| } | |||
/* Odd line count: the last line of the strip on its own. */
| if (m & 1) | |||
| { | |||
| LD_SP2(psrc1, 4, src0, src1); | |||
| ST_SP2_INC(src0, src1, pdst0, 4); | |||
| } | |||
| } | |||
/* A remaining strip of two elements: one vector per source line. */
| if (n & 2) | |||
| { | |||
| psrc1 = psrc0; | |||
| psrc2 = psrc0 + lda; | |||
| psrc0 += 4; | |||
| for (i = (m >> 1); i--;) | |||
| { | |||
| src0 = LD_SP(psrc1); | |||
| src1 = LD_SP(psrc2); | |||
| ST_SP2_INC(src0, src1, pdst0, 4); | |||
| psrc1 += 2 * lda; | |||
| psrc2 += 2 * lda; | |||
| } | |||
| if (m & 1) | |||
| { | |||
| src0 = LD_SP(psrc1); | |||
| ST_SP(src0, pdst0); | |||
| pdst0 += 4; | |||
| } | |||
| } | |||
/* A final strip of a single element: copied with scalar loads/stores. */
| if (n & 1) | |||
| { | |||
| psrc1 = psrc0; | |||
| psrc2 = psrc0 + lda; | |||
/* psrc0 is not read again after this point. */
| psrc0 += 2; | |||
| for (i = (m >> 1); i--;) | |||
| { | |||
| ctemp01 = *(psrc1 + 0); | |||
| ctemp02 = *(psrc1 + 1); | |||
| ctemp03 = *(psrc2 + 0); | |||
| ctemp04 = *(psrc2 + 1); | |||
| *(pdst0 + 0) = ctemp01; | |||
| *(pdst0 + 1) = ctemp02; | |||
| *(pdst0 + 2) = ctemp03; | |||
| *(pdst0 + 3) = ctemp04; | |||
| psrc1 += 2 * lda; | |||
| psrc2 += 2 * lda; | |||
| pdst0 += 4; | |||
| } | |||
| if (m & 1) | |||
| { | |||
| ctemp01 = *(psrc1 + 0); | |||
| ctemp02 = *(psrc1 + 1); | |||
| *(pdst0 + 0) = ctemp01; | |||
| *(pdst0 + 1) = ctemp02; | |||
| pdst0 += 2; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,214 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #include "common.h" | |||
| #include "macros_msa.h" | |||
/*
 * Copy an m-by-n region from src to dst in strips. The source is cut into
 * strips eight elements wide (then strips of four, two and one for the
 * remainder of n); for each strip, the strip's slice of every one of the m
 * source lines is appended to dst, line after line.
 *
 * An element is a pair of consecutive FLOATs (presumably the real and
 * imaginary parts of a single-precision complex value -- confirm against
 * the build's FLOAT definition).
 *
 *   m    number of source lines
 *   n    number of elements taken from every source line
 *   src  start of the first source line
 *   lda  distance between consecutive source lines, in elements
 *   dst  destination buffer; it is written strictly front to back
 *
 * Always returns 0.
 *
 * NOTE(review): the diff rendering does not show this file's name; judging
 * by the kernel list earlier in the commit it is presumably
 * cgemm_tcopy_8_msa.c -- confirm.
 */
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst) | |||
| { | |||
| BLASLONG i, j; | |||
| FLOAT *psrc0, *psrc1, *psrc2, *pdst0; | |||
| FLOAT ctemp01, ctemp02, ctemp03, ctemp04; | |||
| v4f32 src0, src1, src2, src3, src4, src5, src6, src7; | |||
| v4f32 src8, src9, src10, src11, src12, src13, src14, src15; | |||
| psrc0 = src; | |||
| pdst0 = dst; | |||
/* lda is given in elements; every element is two FLOATs wide. */
| lda *= 2; | |||
/* Strips of eight elements (16 FLOATs) per source line. */
| for (j = (n >> 3); j--;) | |||
| { | |||
| psrc1 = psrc0; | |||
| psrc2 = psrc0 + lda; | |||
/* Step to the start of the next strip within the first line. */
| psrc0 += 16; | |||
/*
 * Four source lines per pass: psrc1/psrc2 address lines 1 and 2, and the
 * same pointers offset by 2 * lda address lines 3 and 4. The ST_*_INC
 * macros presumably advance pdst0 past the stored vectors -- TODO confirm
 * in macros_msa.h.
 */
| for (i = (m >> 2); i--;) | |||
| { | |||
| LD_SP4(psrc1, 4, src0, src1, src2, src3); | |||
| LD_SP4(psrc2, 4, src4, src5, src6, src7); | |||
| LD_SP4(psrc1 + 2 * lda, 4, src8, src9, src10, src11); | |||
| LD_SP4(psrc2 + 2 * lda, 4, src12, src13, src14, src15); | |||
| ST_SP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst0, 4); | |||
| ST_SP8_INC(src8, src9, src10, src11, src12, src13, src14, src15, pdst0, 4); | |||
| psrc1 += 4 * lda; | |||
| psrc2 += 4 * lda; | |||
| } | |||
/* Two leftover source lines. */
| if (m & 2) | |||
| { | |||
| LD_SP4(psrc1, 4, src0, src1, src2, src3); | |||
| LD_SP4(psrc2, 4, src4, src5, src6, src7); | |||
| ST_SP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst0, 4); | |||
| psrc1 += 2 * lda; | |||
| psrc2 += 2 * lda; | |||
| } | |||
/* One leftover source line. */
| if (m & 1) | |||
| { | |||
| LD_SP4(psrc1, 4, src0, src1, src2, src3); | |||
| ST_SP4_INC(src0, src1, src2, src3, pdst0, 4); | |||
| } | |||
| } | |||
/* A remaining strip of four elements: two vectors per source line. */
| if (n & 4) | |||
| { | |||
| psrc1 = psrc0; | |||
| psrc2 = psrc0 + lda; | |||
| psrc0 += 8; | |||
| for (i = (m >> 2); i--;) | |||
| { | |||
| LD_SP2(psrc1, 4, src0, src1); | |||
| LD_SP2(psrc2, 4, src2, src3); | |||
| LD_SP2(psrc1 + 2 * lda, 4, src4, src5); | |||
| LD_SP2(psrc2 + 2 * lda, 4, src6, src7); | |||
| ST_SP4_INC(src0, src1, src2, src3, pdst0, 4); | |||
| ST_SP4_INC(src4, src5, src6, src7, pdst0, 4); | |||
| psrc1 += 4 * lda; | |||
| psrc2 += 4 * lda; | |||
| } | |||
| if (m & 2) | |||
| { | |||
| LD_SP2(psrc1, 4, src0, src1); | |||
| LD_SP2(psrc2, 4, src2, src3); | |||
| ST_SP4_INC(src0, src1, src2, src3, pdst0, 4); | |||
| psrc1 += 2 * lda; | |||
| psrc2 += 2 * lda; | |||
| } | |||
| if (m & 1) | |||
| { | |||
| LD_SP2(psrc1, 4, src0, src1); | |||
| ST_SP2_INC(src0, src1, pdst0, 4); | |||
| } | |||
| } | |||
/* A remaining strip of two elements: one vector per source line. */
| if (n & 2) | |||
| { | |||
| psrc1 = psrc0; | |||
| psrc2 = psrc0 + lda; | |||
| psrc0 += 4; | |||
| for (i = (m >> 2); i--;) | |||
| { | |||
| src0 = LD_SP(psrc1); | |||
| src1 = LD_SP(psrc2); | |||
| src2 = LD_SP(psrc1 + 2 * lda); | |||
| src3 = LD_SP(psrc2 + 2 * lda); | |||
| ST_SP4_INC(src0, src1, src2, src3, pdst0, 4); | |||
| psrc1 += 4 * lda; | |||
| psrc2 += 4 * lda; | |||
| } | |||
| if (m & 2) | |||
| { | |||
| src0 = LD_SP(psrc1); | |||
| src1 = LD_SP(psrc2); | |||
| ST_SP2_INC(src0, src1, pdst0, 4); | |||
| psrc1 += 2 * lda; | |||
| psrc2 += 2 * lda; | |||
| } | |||
| if (m & 1) | |||
| { | |||
| src0 = LD_SP(psrc1); | |||
| ST_SP(src0, pdst0); | |||
| pdst0 += 4; | |||
| } | |||
| } | |||
/* A final strip of a single element: copied with scalar loads/stores. */
| if (n & 1) | |||
| { | |||
| psrc1 = psrc0; | |||
| psrc2 = psrc0 + lda; | |||
/* psrc0 is not read again after this point. */
| psrc0 += 2; | |||
/* Four source lines per pass, handled as two identical two-line steps. */
| for (i = (m >> 2); i--;) | |||
| { | |||
| ctemp01 = *(psrc1 + 0); | |||
| ctemp02 = *(psrc1 + 1); | |||
| ctemp03 = *(psrc2 + 0); | |||
| ctemp04 = *(psrc2 + 1); | |||
| *(pdst0 + 0) = ctemp01; | |||
| *(pdst0 + 1) = ctemp02; | |||
| *(pdst0 + 2) = ctemp03; | |||
| *(pdst0 + 3) = ctemp04; | |||
| psrc1 += 2 * lda; | |||
| psrc2 += 2 * lda; | |||
| pdst0 += 4; | |||
| ctemp01 = *(psrc1 + 0); | |||
| ctemp02 = *(psrc1 + 1); | |||
| ctemp03 = *(psrc2 + 0); | |||
| ctemp04 = *(psrc2 + 1); | |||
| *(pdst0 + 0) = ctemp01; | |||
| *(pdst0 + 1) = ctemp02; | |||
| *(pdst0 + 2) = ctemp03; | |||
| *(pdst0 + 3) = ctemp04; | |||
| psrc1 += 2 * lda; | |||
| psrc2 += 2 * lda; | |||
| pdst0 += 4; | |||
| } | |||
| if (m & 2) | |||
| { | |||
| ctemp01 = *(psrc1 + 0); | |||
| ctemp02 = *(psrc1 + 1); | |||
| ctemp03 = *(psrc2 + 0); | |||
| ctemp04 = *(psrc2 + 1); | |||
| *(pdst0 + 0) = ctemp01; | |||
| *(pdst0 + 1) = ctemp02; | |||
| *(pdst0 + 2) = ctemp03; | |||
| *(pdst0 + 3) = ctemp04; | |||
| psrc1 += 2 * lda; | |||
| psrc2 += 2 * lda; | |||
| pdst0 += 4; | |||
| } | |||
| if (m & 1) | |||
| { | |||
| ctemp01 = *(psrc1 + 0); | |||
| ctemp02 = *(psrc1 + 1); | |||
| *(pdst0 + 0) = ctemp01; | |||
| *(pdst0 + 1) = ctemp02; | |||
| pdst0 += 2; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -32,8 +32,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, | |||
| FLOAT * __restrict dst) | |||
| { | |||
| BLASLONG i, j; | |||
| FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4; | |||
| FLOAT *pdst; | |||
| FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *pdst; | |||
| v2f64 src0, src1, src2, src3, src4, src5, src6, src7; | |||
| v2f64 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; | |||
| @@ -50,28 +49,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, | |||
| for (i = (m >> 2); i--;) | |||
| { | |||
| LD_DP2(psrc1, 2, src0, src1); | |||
| LD_DP2(psrc2, 2, src2, src3); | |||
| LD_DP2(psrc3, 2, src4, src5); | |||
| LD_DP2(psrc4, 2, src6, src7); | |||
| LD_DP2_INC(psrc1, 2, src0, src1); | |||
| LD_DP2_INC(psrc2, 2, src2, src3); | |||
| LD_DP2_INC(psrc3, 2, src4, src5); | |||
| LD_DP2_INC(psrc4, 2, src6, src7); | |||
| psrc1 += 4; | |||
| psrc2 += 4; | |||
| psrc3 += 4; | |||
| psrc4 += 4; | |||
| dst0 = (v2f64) __msa_ilvr_d((v2i64) src2, (v2i64) src0); | |||
| dst1 = (v2f64) __msa_ilvr_d((v2i64) src6, (v2i64) src4); | |||
| dst2 = (v2f64) __msa_ilvr_d((v2i64) src3, (v2i64) src1); | |||
| dst3 = (v2f64) __msa_ilvr_d((v2i64) src7, (v2i64) src5); | |||
| dst4 = (v2f64) __msa_ilvl_d((v2i64) src2, (v2i64) src0); | |||
| dst5 = (v2f64) __msa_ilvl_d((v2i64) src6, (v2i64) src4); | |||
| dst6 = (v2f64) __msa_ilvl_d((v2i64) src3, (v2i64) src1); | |||
| dst7 = (v2f64) __msa_ilvl_d((v2i64) src7, (v2i64) src5); | |||
| ST_DP8(dst0, dst1, dst4, dst5, dst2, dst3, dst6, dst7, pdst, 2); | |||
| pdst += 16; | |||
| ILVRL_D2_DP(src2, src0, dst0, dst4); | |||
| ILVRL_D2_DP(src6, src4, dst1, dst5); | |||
| ILVRL_D2_DP(src3, src1, dst2, dst6); | |||
| ILVRL_D2_DP(src7, src5, dst3, dst7); | |||
| ST_DP8_INC(dst0, dst1, dst4, dst5, dst2, dst3, dst6, dst7, pdst, 2); | |||
| } | |||
| for (i = (m & 3); i--;) | |||
| @@ -91,18 +79,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, | |||
| for (i = (m >> 2); i--;) | |||
| { | |||
| LD_DP2(psrc1, 2, src0, src1); | |||
| LD_DP2(psrc2, 2, src2, src3); | |||
| psrc1 += 4; | |||
| psrc2 += 4; | |||
| LD_DP2_INC(psrc1, 2, src0, src1); | |||
| LD_DP2_INC(psrc2, 2, src2, src3); | |||
| dst0 = (v2f64) __msa_ilvr_d((v2i64) src2, (v2i64) src0); | |||
| dst1 = (v2f64) __msa_ilvr_d((v2i64) src3, (v2i64) src1); | |||
| dst4 = (v2f64) __msa_ilvl_d((v2i64) src2, (v2i64) src0); | |||
| dst5 = (v2f64) __msa_ilvl_d((v2i64) src3, (v2i64) src1); | |||
| ILVRL_D2_DP(src2, src0, dst0, dst4); | |||
| ILVRL_D2_DP(src3, src1, dst1, dst5); | |||
| ST_DP4(dst0, dst4, dst1, dst5, pdst, 2); | |||
| pdst += 8; | |||
| ST_DP4_INC(dst0, dst4, dst1, dst5, pdst, 2); | |||
| } | |||
| for (i = (m & 3); i--;) | |||
| @@ -32,9 +32,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, | |||
| FLOAT * __restrict dst) | |||
| { | |||
| BLASLONG i, j; | |||
| FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4; | |||
| FLOAT *psrc5, *psrc6, *psrc7, *psrc8; | |||
| FLOAT *pdst; | |||
| FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *psrc5, *psrc6, *psrc7; | |||
| FLOAT *psrc8, *pdst; | |||
| v2f64 src0, src1, src2, src3, src4, src5, src6, src7; | |||
| v2f64 src8, src9, src10, src11, src12, src13, src14, src15; | |||
| v2f64 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; | |||
| @@ -56,80 +55,51 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, | |||
| for (i = (m >> 3); i--;) | |||
| { | |||
| LD_DP2(psrc1, 2, src0, src1); | |||
| LD_DP2(psrc2, 2, src2, src3); | |||
| LD_DP2(psrc3, 2, src4, src5); | |||
| LD_DP2(psrc4, 2, src6, src7); | |||
| LD_DP2(psrc5, 2, src8, src9); | |||
| LD_DP2(psrc6, 2, src10, src11); | |||
| LD_DP2(psrc7, 2, src12, src13); | |||
| LD_DP2(psrc8, 2, src14, src15); | |||
| dst0 = (v2f64) __msa_ilvr_d((v2i64) src2, (v2i64) src0); | |||
| dst1 = (v2f64) __msa_ilvr_d((v2i64) src6, (v2i64) src4); | |||
| dst2 = (v2f64) __msa_ilvr_d((v2i64) src10, (v2i64) src8); | |||
| dst3 = (v2f64) __msa_ilvr_d((v2i64) src14, (v2i64) src12); | |||
| dst4 = (v2f64) __msa_ilvl_d((v2i64) src2, (v2i64) src0); | |||
| dst5 = (v2f64) __msa_ilvl_d((v2i64) src6, (v2i64) src4); | |||
| dst6 = (v2f64) __msa_ilvl_d((v2i64) src10, (v2i64) src8); | |||
| dst7 = (v2f64) __msa_ilvl_d((v2i64) src14, (v2i64) src12); | |||
| ST_DP8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 2); | |||
| dst0 = (v2f64) __msa_ilvr_d((v2i64) src3, (v2i64) src1); | |||
| dst1 = (v2f64) __msa_ilvr_d((v2i64) src7, (v2i64) src5); | |||
| dst2 = (v2f64) __msa_ilvr_d((v2i64) src11, (v2i64) src9); | |||
| dst3 = (v2f64) __msa_ilvr_d((v2i64) src15, (v2i64) src13); | |||
| dst4 = (v2f64) __msa_ilvl_d((v2i64) src3, (v2i64) src1); | |||
| dst5 = (v2f64) __msa_ilvl_d((v2i64) src7, (v2i64) src5); | |||
| dst6 = (v2f64) __msa_ilvl_d((v2i64) src11, (v2i64) src9); | |||
| dst7 = (v2f64) __msa_ilvl_d((v2i64) src15, (v2i64) src13); | |||
| ST_DP8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst + 16, | |||
| 2); | |||
| LD_DP2(psrc1 + 4, 2, src0, src1); | |||
| LD_DP2(psrc2 + 4, 2, src2, src3); | |||
| LD_DP2(psrc3 + 4, 2, src4, src5); | |||
| LD_DP2(psrc4 + 4, 2, src6, src7); | |||
| LD_DP2(psrc5 + 4, 2, src8, src9); | |||
| LD_DP2(psrc6 + 4, 2, src10, src11); | |||
| LD_DP2(psrc7 + 4, 2, src12, src13); | |||
| LD_DP2(psrc8 + 4, 2, src14, src15); | |||
| dst0 = (v2f64) __msa_ilvr_d((v2i64) src2, (v2i64) src0); | |||
| dst1 = (v2f64) __msa_ilvr_d((v2i64) src6, (v2i64) src4); | |||
| dst2 = (v2f64) __msa_ilvr_d((v2i64) src10, (v2i64) src8); | |||
| dst3 = (v2f64) __msa_ilvr_d((v2i64) src14, (v2i64) src12); | |||
| dst4 = (v2f64) __msa_ilvl_d((v2i64) src2, (v2i64) src0); | |||
| dst5 = (v2f64) __msa_ilvl_d((v2i64) src6, (v2i64) src4); | |||
| dst6 = (v2f64) __msa_ilvl_d((v2i64) src10, (v2i64) src8); | |||
| dst7 = (v2f64) __msa_ilvl_d((v2i64) src14, (v2i64) src12); | |||
| ST_DP8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst + 32, | |||
| 2); | |||
| dst0 = (v2f64) __msa_ilvr_d((v2i64) src3, (v2i64) src1); | |||
| dst1 = (v2f64) __msa_ilvr_d((v2i64) src7, (v2i64) src5); | |||
| dst2 = (v2f64) __msa_ilvr_d((v2i64) src11, (v2i64) src9); | |||
| dst3 = (v2f64) __msa_ilvr_d((v2i64) src15, (v2i64) src13); | |||
| dst4 = (v2f64) __msa_ilvl_d((v2i64) src3, (v2i64) src1); | |||
| dst5 = (v2f64) __msa_ilvl_d((v2i64) src7, (v2i64) src5); | |||
| dst6 = (v2f64) __msa_ilvl_d((v2i64) src11, (v2i64) src9); | |||
| dst7 = (v2f64) __msa_ilvl_d((v2i64) src15, (v2i64) src13); | |||
| ST_DP8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst + 48, | |||
| 2); | |||
| psrc1 += 8; | |||
| psrc2 += 8; | |||
| psrc3 += 8; | |||
| psrc4 += 8; | |||
| psrc5 += 8; | |||
| psrc6 += 8; | |||
| psrc7 += 8; | |||
| psrc8 += 8; | |||
| pdst += 64; | |||
| LD_DP2_INC(psrc1, 2, src0, src1); | |||
| LD_DP2_INC(psrc2, 2, src2, src3); | |||
| LD_DP2_INC(psrc3, 2, src4, src5); | |||
| LD_DP2_INC(psrc4, 2, src6, src7); | |||
| LD_DP2_INC(psrc5, 2, src8, src9); | |||
| LD_DP2_INC(psrc6, 2, src10, src11); | |||
| LD_DP2_INC(psrc7, 2, src12, src13); | |||
| LD_DP2_INC(psrc8, 2, src14, src15); | |||
| ILVRL_D2_DP(src2, src0, dst0, dst4); | |||
| ILVRL_D2_DP(src6, src4, dst1, dst5); | |||
| ILVRL_D2_DP(src10, src8, dst2, dst6); | |||
| ILVRL_D2_DP(src14, src12, dst3, dst7); | |||
| ST_DP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 2); | |||
| ILVRL_D2_DP(src3, src1, dst0, dst4); | |||
| ILVRL_D2_DP(src7, src5, dst1, dst5); | |||
| ILVRL_D2_DP(src11, src9, dst2, dst6); | |||
| ILVRL_D2_DP(src15, src13, dst3, dst7); | |||
| ST_DP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 2); | |||
| LD_DP2_INC(psrc1, 2, src0, src1); | |||
| LD_DP2_INC(psrc2, 2, src2, src3); | |||
| LD_DP2_INC(psrc3, 2, src4, src5); | |||
| LD_DP2_INC(psrc4, 2, src6, src7); | |||
| LD_DP2_INC(psrc5, 2, src8, src9); | |||
| LD_DP2_INC(psrc6, 2, src10, src11); | |||
| LD_DP2_INC(psrc7, 2, src12, src13); | |||
| LD_DP2_INC(psrc8, 2, src14, src15); | |||
| ILVRL_D2_DP(src2, src0, dst0, dst4); | |||
| ILVRL_D2_DP(src6, src4, dst1, dst5); | |||
| ILVRL_D2_DP(src10, src8, dst2, dst6); | |||
| ILVRL_D2_DP(src14, src12, dst3, dst7); | |||
| ST_DP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 2); | |||
| ILVRL_D2_DP(src3, src1, dst0, dst4); | |||
| ILVRL_D2_DP(src7, src5, dst1, dst5); | |||
| ILVRL_D2_DP(src11, src9, dst2, dst6); | |||
| ILVRL_D2_DP(src15, src13, dst3, dst7); | |||
| ST_DP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 2); | |||
| } | |||
| for (i = (m & 7); i--;) | |||
| @@ -155,27 +125,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, | |||
| for (i = (m >> 2); i--;) | |||
| { | |||
| LD_DP2(psrc1, 2, src0, src1); | |||
| LD_DP2(psrc2, 2, src2, src3); | |||
| LD_DP2(psrc3, 2, src4, src5); | |||
| LD_DP2(psrc4, 2, src6, src7); | |||
| psrc1 += 4; | |||
| psrc2 += 4; | |||
| psrc3 += 4; | |||
| psrc4 += 4; | |||
| dst0 = (v2f64) __msa_ilvr_d((v2i64) src2, (v2i64) src0); | |||
| dst1 = (v2f64) __msa_ilvr_d((v2i64) src6, (v2i64) src4); | |||
| dst2 = (v2f64) __msa_ilvr_d((v2i64) src3, (v2i64) src1); | |||
| dst3 = (v2f64) __msa_ilvr_d((v2i64) src7, (v2i64) src5); | |||
| dst4 = (v2f64) __msa_ilvl_d((v2i64) src2, (v2i64) src0); | |||
| dst5 = (v2f64) __msa_ilvl_d((v2i64) src6, (v2i64) src4); | |||
| dst6 = (v2f64) __msa_ilvl_d((v2i64) src3, (v2i64) src1); | |||
| dst7 = (v2f64) __msa_ilvl_d((v2i64) src7, (v2i64) src5); | |||
| ST_DP8(dst0, dst1, dst4, dst5, dst2, dst3, dst6, dst7, pdst, 2); | |||
| pdst += 16; | |||
| LD_DP2_INC(psrc1, 2, src0, src1); | |||
| LD_DP2_INC(psrc2, 2, src2, src3); | |||
| LD_DP2_INC(psrc3, 2, src4, src5); | |||
| LD_DP2_INC(psrc4, 2, src6, src7); | |||
| ILVRL_D2_DP(src2, src0, dst0, dst4); | |||
| ILVRL_D2_DP(src6, src4, dst1, dst5); | |||
| ILVRL_D2_DP(src3, src1, dst2, dst6); | |||
| ILVRL_D2_DP(src7, src5, dst3, dst7); | |||
| ST_DP8_INC(dst0, dst1, dst4, dst5, dst2, dst3, dst6, dst7, pdst, 2); | |||
| } | |||
| for (i = (m & 3); i--;) | |||
| @@ -200,11 +160,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, | |||
| psrc1 += 2; | |||
| psrc2 += 2; | |||
| dst0 = (v2f64) __msa_ilvr_d((v2i64) src1, (v2i64) src0); | |||
| dst1 = (v2f64) __msa_ilvl_d((v2i64) src1, (v2i64) src0); | |||
| ILVRL_D2_DP(src1, src0, dst0, dst1); | |||
| ST_DP2(dst0, dst1, pdst, 2); | |||
| pdst += 4; | |||
| ST_DP2_INC(dst0, dst1, pdst, 2); | |||
| } | |||
| if (m & 1) | |||
| @@ -55,14 +55,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, | |||
| for (i = (n >> 2); i--;) | |||
| { | |||
| LD_DP2(psrc1, 2, src0, src1); | |||
| LD_DP2(psrc2, 2, src2, src3); | |||
| LD_DP2(psrc3, 2, src4, src5); | |||
| LD_DP2(psrc4, 2, src6, src7); | |||
| psrc1 += 4; | |||
| psrc2 += 4; | |||
| psrc3 += 4; | |||
| psrc4 += 4; | |||
| LD_DP2_INC(psrc1, 2, src0, src1); | |||
| LD_DP2_INC(psrc2, 2, src2, src3); | |||
| LD_DP2_INC(psrc3, 2, src4, src5); | |||
| LD_DP2_INC(psrc4, 2, src6, src7); | |||
| ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2); | |||
| pdst1 += m * 4; | |||
| @@ -79,8 +75,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, | |||
| psrc3 += 2; | |||
| psrc4 += 2; | |||
| ST_DP4(src0, src1, src2, src3, pdst2, 2); | |||
| pdst2 += 8; | |||
| ST_DP4_INC(src0, src1, src2, src3, pdst2, 2); | |||
| } | |||
| if (n & 1) | |||
| @@ -103,10 +98,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, | |||
| for (i = (n >> 2); i--;) | |||
| { | |||
| LD_DP2(psrc1, 2, src0, src1); | |||
| LD_DP2(psrc2, 2, src2, src3); | |||
| psrc1 += 4; | |||
| psrc2 += 4; | |||
| LD_DP2_INC(psrc1, 2, src0, src1); | |||
| LD_DP2_INC(psrc2, 2, src2, src3); | |||
| ST_DP4(src0, src1, src2, src3, pdst1, 2); | |||
| pdst1 += m * 4; | |||
| @@ -119,8 +112,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, | |||
| psrc1 += 2; | |||
| psrc2 += 2; | |||
| ST_DP2(src0, src1, pdst2, 2); | |||
| pdst2 += 4; | |||
| ST_DP2_INC(src0, src1, pdst2, 2); | |||
| } | |||
| if (n & 1) | |||
| @@ -137,8 +129,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, | |||
| for (i = (n >> 2); i--;) | |||
| { | |||
| LD_DP2(psrc1, 2, src0, src1); | |||
| psrc1 += 4; | |||
| LD_DP2_INC(psrc1, 2, src0, src1); | |||
| ST_DP2(src0, src1, pdst1, 2); | |||
| pdst1 += 4 * m; | |||
| @@ -62,27 +62,19 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, | |||
| for (i = (n >> 3); i--;) | |||
| { | |||
| LD_DP4(psrc1, 2, src0, src1, src2, src3); | |||
| LD_DP4(psrc2, 2, src4, src5, src6, src7); | |||
| LD_DP4(psrc3, 2, src8, src9, src10, src11); | |||
| LD_DP4(psrc4, 2, src12, src13, src14, src15); | |||
| psrc1 += 8; | |||
| psrc2 += 8; | |||
| psrc3 += 8; | |||
| psrc4 += 8; | |||
| LD_DP4_INC(psrc1, 2, src0, src1, src2, src3); | |||
| LD_DP4_INC(psrc2, 2, src4, src5, src6, src7); | |||
| LD_DP4_INC(psrc3, 2, src8, src9, src10, src11); | |||
| LD_DP4_INC(psrc4, 2, src12, src13, src14, src15); | |||
| ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2); | |||
| ST_DP8(src8, src9, src10, src11, src12, src13, src14, src15, | |||
| pdst1 + 16, 2); | |||
| LD_DP4(psrc5, 2, src0, src1, src2, src3); | |||
| LD_DP4(psrc6, 2, src4, src5, src6, src7); | |||
| LD_DP4(psrc7, 2, src8, src9, src10, src11); | |||
| LD_DP4(psrc8, 2, src12, src13, src14, src15); | |||
| psrc5 += 8; | |||
| psrc6 += 8; | |||
| psrc7 += 8; | |||
| psrc8 += 8; | |||
| LD_DP4_INC(psrc5, 2, src0, src1, src2, src3); | |||
| LD_DP4_INC(psrc6, 2, src4, src5, src6, src7); | |||
| LD_DP4_INC(psrc7, 2, src8, src9, src10, src11); | |||
| LD_DP4_INC(psrc8, 2, src12, src13, src14, src15); | |||
| ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1 + 32, | |||
| 2); | |||
| @@ -93,27 +85,18 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, | |||
| if (n & 4) | |||
| { | |||
| LD_DP2(psrc1, 2, src0, src1); | |||
| LD_DP2(psrc2, 2, src2, src3); | |||
| LD_DP2(psrc3, 2, src4, src5); | |||
| LD_DP2(psrc4, 2, src6, src7); | |||
| LD_DP2(psrc5, 2, src8, src9); | |||
| LD_DP2(psrc6, 2, src10, src11); | |||
| LD_DP2(psrc7, 2, src12, src13); | |||
| LD_DP2(psrc8, 2, src14, src15); | |||
| psrc1 += 4; | |||
| psrc2 += 4; | |||
| psrc3 += 4; | |||
| psrc4 += 4; | |||
| psrc5 += 4; | |||
| psrc6 += 4; | |||
| psrc7 += 4; | |||
| psrc8 += 4; | |||
| ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst2, 2); | |||
| ST_DP8(src8, src9, src10, src11, src12, src13, src14, src15, | |||
| pdst2 + 16, 2); | |||
| pdst2 += 32; | |||
| LD_DP2_INC(psrc1, 2, src0, src1); | |||
| LD_DP2_INC(psrc2, 2, src2, src3); | |||
| LD_DP2_INC(psrc3, 2, src4, src5); | |||
| LD_DP2_INC(psrc4, 2, src6, src7); | |||
| LD_DP2_INC(psrc5, 2, src8, src9); | |||
| LD_DP2_INC(psrc6, 2, src10, src11); | |||
| LD_DP2_INC(psrc7, 2, src12, src13); | |||
| LD_DP2_INC(psrc8, 2, src14, src15); | |||
| ST_DP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst2, 2); | |||
| ST_DP8_INC(src8, src9, src10, src11, src12, src13, src14, src15, | |||
| pdst2, 2); | |||
| } | |||
| if (n & 2) | |||
| @@ -135,8 +118,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, | |||
| psrc7 += 2; | |||
| psrc8 += 2; | |||
| ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst3, 2); | |||
| pdst3 += 16; | |||
| ST_DP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst3, 2); | |||
| } | |||
| if (n & 1) | |||
| @@ -165,18 +147,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, | |||
| for (i = (n >> 3); i--;) | |||
| { | |||
| LD_DP4(psrc1, 2, src0, src1, src2, src3); | |||
| LD_DP4(psrc2, 2, src4, src5, src6, src7); | |||
| LD_DP4(psrc3, 2, src8, src9, src10, src11); | |||
| LD_DP4(psrc4, 2, src12, src13, src14, src15); | |||
| psrc1 += 8; | |||
| psrc2 += 8; | |||
| psrc3 += 8; | |||
| psrc4 += 8; | |||
| psrc5 += 8; | |||
| psrc6 += 8; | |||
| psrc7 += 8; | |||
| psrc8 += 8; | |||
| LD_DP4_INC(psrc1, 2, src0, src1, src2, src3); | |||
| LD_DP4_INC(psrc2, 2, src4, src5, src6, src7); | |||
| LD_DP4_INC(psrc3, 2, src8, src9, src10, src11); | |||
| LD_DP4_INC(psrc4, 2, src12, src13, src14, src15); | |||
| ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2); | |||
| ST_DP8(src8, src9, src10, src11, src12, src13, src14, src15, | |||
| @@ -186,17 +160,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, | |||
| if (n & 4) | |||
| { | |||
| LD_DP2(psrc1, 2, src0, src1); | |||
| LD_DP2(psrc2, 2, src2, src3); | |||
| LD_DP2(psrc3, 2, src4, src5); | |||
| LD_DP2(psrc4, 2, src6, src7); | |||
| psrc1 += 4; | |||
| psrc2 += 4; | |||
| psrc3 += 4; | |||
| psrc4 += 4; | |||
| ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst2, 2); | |||
| pdst2 += 16; | |||
| LD_DP2_INC(psrc1, 2, src0, src1); | |||
| LD_DP2_INC(psrc2, 2, src2, src3); | |||
| LD_DP2_INC(psrc3, 2, src4, src5); | |||
| LD_DP2_INC(psrc4, 2, src6, src7); | |||
| ST_DP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst2, 2); | |||
| } | |||
| if (n & 2) | |||
| @@ -210,8 +179,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, | |||
| psrc3 += 2; | |||
| psrc4 += 2; | |||
| ST_DP4(src0, src1, src2, src3, pdst3, 2); | |||
| pdst3 += 8; | |||
| ST_DP4_INC(src0, src1, src2, src3, pdst3, 2); | |||
| } | |||
| if (n & 1) | |||
| @@ -234,10 +202,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, | |||
| for (i = (n >> 3); i--;) | |||
| { | |||
| LD_DP4(psrc1, 2, src0, src1, src2, src3); | |||
| LD_DP4(psrc2, 2, src4, src5, src6, src7); | |||
| psrc1 += 8; | |||
| psrc2 += 8; | |||
| LD_DP4_INC(psrc1, 2, src0, src1, src2, src3); | |||
| LD_DP4_INC(psrc2, 2, src4, src5, src6, src7); | |||
| ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2); | |||
| pdst1 += 8 * m; | |||
| @@ -245,13 +211,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, | |||
| if (n & 4) | |||
| { | |||
| LD_DP2(psrc1, 2, src0, src1); | |||
| LD_DP2(psrc2, 2, src2, src3); | |||
| psrc1 += 4; | |||
| psrc2 += 4; | |||
| LD_DP2_INC(psrc1, 2, src0, src1); | |||
| LD_DP2_INC(psrc2, 2, src2, src3); | |||
| ST_DP4(src0, src1, src2, src3, pdst2, 2); | |||
| pdst2 += 8; | |||
| ST_DP4_INC(src0, src1, src2, src3, pdst2, 2); | |||
| } | |||
| if (n & 2) | |||
| @@ -261,8 +224,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, | |||
| psrc1 += 2; | |||
| psrc2 += 2; | |||
| ST_DP2(src0, src1, pdst3, 2); | |||
| pdst3 += 4; | |||
| ST_DP2_INC(src0, src1, pdst3, 2); | |||
| } | |||
| if (n & 1) | |||
| @@ -282,8 +244,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, | |||
| for (i = (n >> 3); i--;) | |||
| { | |||
| LD_DP4(psrc1, 2, src0, src1, src2, src3); | |||
| psrc1 += 8; | |||
| LD_DP4_INC(psrc1, 2, src0, src1, src2, src3); | |||
| ST_DP4(src0, src1, src2, src3, pdst1, 2); | |||
| pdst1 += 8 * m; | |||
| @@ -291,11 +252,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, | |||
| if (n & 4) | |||
| { | |||
| LD_DP2(psrc1, 2, src0, src1); | |||
| psrc1 += 4; | |||
| LD_DP2_INC(psrc1, 2, src0, src1); | |||
| ST_DP2(src0, src1, pdst2, 2); | |||
| pdst2 += 4; | |||
| ST_DP2_INC(src0, src1, pdst2, 2); | |||
| } | |||
| if (n & 2) | |||
| @@ -42,10 +42,82 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
/* ST_D  : assigns 'in' to the location (pdst), accessed through a pointer
           converted to 'RTYPE *'.  Expands to a bare assignment expression
           with no trailing semicolon.
   ST_DP : ST_D with RTYPE fixed to v2f64; remaining arguments are forwarded
           unchanged. */
| #define ST_D(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) | |||
| #define ST_DP(...) ST_D(v2f64, __VA_ARGS__) | |||
/* Two-argument form of COPY_FLOAT_TO_VECTOR: assigns to 'b' the result of
   __msa_cast_to_vector_float(a), then reassigns 'b' from __msa_splati_w()
   applied to it with element index 0.  Expands to two bare statements (no
   enclosing braces); the last one already carries its ';'.
   NOTE(review): a one-argument macro with the same name is defined directly
   below; this looks like the pre-change side of a diff -- confirm which
   definition is meant to be live. */
| #define COPY_FLOAT_TO_VECTOR(a, b) \ | |||
| b = __msa_cast_to_vector_float(a); \ | |||
| b = (v4f32) __msa_splati_w((v4i32) b, 0); | |||
/* Description : Build a v4f32 from a scalar float
   Arguments   : Input  - a
                 Return - v4f32 (value of the statement expression)
   Details     : 'a' is passed to __msa_cast_to_vector_float() and the result
                 is run through __msa_splati_w() with element index 0.
                 The temporary uses a macro-private name: with the previous
                 name 'out', a call such as COPY_FLOAT_TO_VECTOR(out) would
                 have read the macro's own uninitialized temporary instead of
                 the caller's variable.
                 Relies on the GNU statement-expression extension.
*/
#define COPY_FLOAT_TO_VECTOR(a) ( {                                         \
    v4f32 copy_f2v_tmp_m;                                                    \
    copy_f2v_tmp_m = __msa_cast_to_vector_float(a);                          \
    copy_f2v_tmp_m = (v4f32) __msa_splati_w((v4i32) copy_f2v_tmp_m, 0);      \
    copy_f2v_tmp_m;                                                          \
} )
/* Description : Build a v2f64 from a scalar double
   Arguments   : Input  - a
                 Return - v2f64 (value of the statement expression)
   Details     : 'a' is passed to __msa_cast_to_vector_double() and the result
                 is run through __msa_splati_d() with element index 0.
                 The temporary uses a macro-private name: with the previous
                 name 'out', a call such as COPY_DOUBLE_TO_VECTOR(out) would
                 have read the macro's own uninitialized temporary instead of
                 the caller's variable.
                 Relies on the GNU statement-expression extension.
*/
#define COPY_DOUBLE_TO_VECTOR(a) ( {                                        \
    v2f64 copy_d2v_tmp_m;                                                    \
    copy_d2v_tmp_m = __msa_cast_to_vector_double(a);                         \
    copy_d2v_tmp_m = (v2f64) __msa_splati_d((v2i64) copy_d2v_tmp_m, 0);      \
    copy_d2v_tmp_m;                                                          \
} )
/* Description : Load N scalar variables with stride, advancing the source
                 pointer (N = 2 .. 8)
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0 .. out(N-1)
   Details     : For each output in order, the value at (psrc) is read into
                 the output and psrc is then advanced by 'stride' elements.
                 psrc must be a modifiable pointer lvalue; on exit it has
                 moved N * stride elements.
*/
#define LD_GP2_INC(psrc, stride, out0, out1)  \
{                                             \
    out0 = *(psrc);                           \
    (psrc) += (stride);                       \
    out1 = *(psrc);                           \
    (psrc) += (stride);                       \
}

#define LD_GP3_INC(psrc, stride, out0, out1, out2)  \
{                                                   \
    out0 = *(psrc);                                 \
    (psrc) += (stride);                             \
    LD_GP2_INC(psrc, stride, out1, out2);           \
}

#define LD_GP4_INC(psrc, stride, out0, out1, out2, out3)  \
{                                                         \
    LD_GP3_INC(psrc, stride, out0, out1, out2);           \
    out3 = *(psrc);                                       \
    (psrc) += (stride);                                   \
}

#define LD_GP5_INC(psrc, stride, out0, out1, out2, out3, out4)  \
{                                                               \
    LD_GP4_INC(psrc, stride, out0, out1, out2, out3);           \
    out4 = *(psrc);                                             \
    (psrc) += (stride);                                         \
}

#define LD_GP6_INC(psrc, stride, out0, out1, out2, out3, out4, out5)  \
{                                                                     \
    LD_GP4_INC(psrc, stride, out0, out1, out2, out3);                 \
    LD_GP2_INC(psrc, stride, out4, out5);                             \
}

#define LD_GP7_INC(psrc, stride, out0, out1, out2, out3, out4, out5, out6)  \
{                                                                           \
    LD_GP4_INC(psrc, stride, out0, out1, out2, out3);                       \
    LD_GP3_INC(psrc, stride, out4, out5, out6);                             \
}

#define LD_GP8_INC(psrc, stride, out0, out1, out2,          \
                   out3, out4, out5, out6, out7)            \
{                                                           \
    LD_GP6_INC(psrc, stride, out0, out1, out2, out3,        \
               out4, out5);                                 \
    LD_GP2_INC(psrc, stride, out6, out7);                   \
}
| /* Description : Load 2 vectors of single precision floating point elements with stride | |||
| Arguments : Inputs - psrc, stride | |||
| @@ -58,6 +130,82 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| out1 = LD_SP((psrc) + stride); \ | |||
| } | |||
/* Description : Load 4 vectors of single precision floating point elements
                 with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1, out2, out3
   Details     : Two LD_SP2 loads: the first starts at (psrc) and fills
                 out0/out1, the second starts at (psrc) + 2 * (stride) and
                 fills out2/out3.  psrc is not modified.
                 'psrc' and 'stride' are parenthesized in the offset
                 computation so that expression arguments (e.g. a stride of
                 's + 1') are scaled as a whole rather than by precedence.
*/
#define LD_SP4(psrc, stride, out0, out1, out2, out3)     \
{                                                        \
    LD_SP2(psrc, stride, out0, out1)                     \
    LD_SP2((psrc) + 2 * (stride), stride, out2, out3)    \
}
/* Description : Load N vectors with LD_SP, advancing the source pointer
                 (N = 2 .. 8, 16)
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0 .. out(N-1)
   Details     : For each output in order, LD_SP((psrc)) is assigned to the
                 output and psrc is then advanced by 'stride' elements of its
                 pointed-to type.  psrc must be a modifiable pointer lvalue;
                 on exit it has moved N * stride elements.
*/
#define LD_SP2_INC(psrc, stride, out0, out1)  \
{                                             \
    out0 = LD_SP((psrc));                     \
    (psrc) += (stride);                       \
    out1 = LD_SP((psrc));                     \
    (psrc) += (stride);                       \
}

#define LD_SP3_INC(psrc, stride, out0, out1, out2)  \
{                                                   \
    out0 = LD_SP((psrc));                           \
    (psrc) += (stride);                             \
    LD_SP2_INC(psrc, stride, out1, out2);           \
}

#define LD_SP4_INC(psrc, stride, out0, out1, out2, out3)  \
{                                                         \
    LD_SP3_INC(psrc, stride, out0, out1, out2);           \
    out3 = LD_SP((psrc));                                 \
    (psrc) += (stride);                                   \
}

#define LD_SP5_INC(psrc, stride, out0, out1, out2, out3, out4)  \
{                                                               \
    LD_SP4_INC(psrc, stride, out0, out1, out2, out3);           \
    out4 = LD_SP((psrc));                                       \
    (psrc) += (stride);                                         \
}

#define LD_SP6_INC(psrc, stride, out0, out1, out2, out3, out4, out5)  \
{                                                                     \
    LD_SP4_INC(psrc, stride, out0, out1, out2, out3);                 \
    LD_SP2_INC(psrc, stride, out4, out5);                             \
}

#define LD_SP7_INC(psrc, stride, out0, out1, out2, out3, out4, out5, out6)  \
{                                                                           \
    LD_SP4_INC(psrc, stride, out0, out1, out2, out3);                       \
    LD_SP3_INC(psrc, stride, out4, out5, out6);                             \
}

#define LD_SP8_INC(psrc, stride, out0, out1, out2,          \
                   out3, out4, out5, out6, out7)            \
{                                                           \
    LD_SP6_INC(psrc, stride, out0, out1, out2, out3,        \
               out4, out5);                                 \
    LD_SP2_INC(psrc, stride, out6, out7);                   \
}

#define LD_SP16_INC(psrc, stride, out0, out1, out2,         \
                    out3, out4, out5, out6, out7, out8,     \
                    out9, out10, out11, out12, out13,       \
                    out14, out15)                           \
{                                                           \
    LD_SP4_INC(psrc, stride, out0, out1, out2, out3);       \
    LD_SP4_INC(psrc, stride, out4, out5, out6, out7);       \
    LD_SP4_INC(psrc, stride, out8, out9, out10, out11);     \
    LD_SP4_INC(psrc, stride, out12, out13, out14, out15);   \
}
| /* Description : Load 2 vectors of double precision floating point elements with stride | |||
| Arguments : Inputs - psrc, stride | |||
| Outputs - out0, out1 | |||
| @@ -75,6 +223,139 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| LD_DP2(psrc + 2 * stride, stride, out2, out3) \ | |||
| } | |||
/* Description : Load N vectors with LD_DP, advancing the source pointer
                 (N = 2 .. 8, 16)
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0 .. out(N-1)
   Details     : For each output in order, LD_DP((psrc)) is assigned to the
                 output and psrc is then advanced by 'stride' elements of its
                 pointed-to type.  psrc must be a modifiable pointer lvalue;
                 on exit it has moved N * stride elements.
*/
#define LD_DP2_INC(psrc, stride, out0, out1)  \
{                                             \
    out0 = LD_DP((psrc));                     \
    (psrc) += (stride);                       \
    out1 = LD_DP((psrc));                     \
    (psrc) += (stride);                       \
}

#define LD_DP3_INC(psrc, stride, out0, out1, out2)  \
{                                                   \
    out0 = LD_DP((psrc));                           \
    (psrc) += (stride);                             \
    LD_DP2_INC(psrc, stride, out1, out2);           \
}

#define LD_DP4_INC(psrc, stride, out0, out1, out2, out3)  \
{                                                         \
    LD_DP3_INC(psrc, stride, out0, out1, out2);           \
    out3 = LD_DP((psrc));                                 \
    (psrc) += (stride);                                   \
}

#define LD_DP5_INC(psrc, stride, out0, out1, out2, out3, out4)  \
{                                                               \
    LD_DP4_INC(psrc, stride, out0, out1, out2, out3);           \
    out4 = LD_DP((psrc));                                       \
    (psrc) += (stride);                                         \
}

#define LD_DP6_INC(psrc, stride, out0, out1, out2, out3, out4, out5)  \
{                                                                     \
    LD_DP4_INC(psrc, stride, out0, out1, out2, out3);                 \
    LD_DP2_INC(psrc, stride, out4, out5);                             \
}

#define LD_DP7_INC(psrc, stride, out0, out1, out2, out3, out4, out5, out6)  \
{                                                                           \
    LD_DP4_INC(psrc, stride, out0, out1, out2, out3);                       \
    LD_DP3_INC(psrc, stride, out4, out5, out6);                             \
}

#define LD_DP8_INC(psrc, stride, out0, out1, out2,          \
                   out3, out4, out5, out6, out7)            \
{                                                           \
    LD_DP6_INC(psrc, stride, out0, out1, out2, out3,        \
               out4, out5);                                 \
    LD_DP2_INC(psrc, stride, out6, out7);                   \
}

#define LD_DP16_INC(psrc, stride, out0, out1, out2,         \
                    out3, out4, out5, out6, out7, out8,     \
                    out9, out10, out11, out12, out13,       \
                    out14, out15)                           \
{                                                           \
    LD_DP4_INC(psrc, stride, out0, out1, out2, out3);       \
    LD_DP4_INC(psrc, stride, out4, out5, out6, out7);       \
    LD_DP4_INC(psrc, stride, out8, out9, out10, out11);     \
    LD_DP4_INC(psrc, stride, out12, out13, out14, out15);   \
}
/* Description : Store N scalar (GP) variables with stride, advancing the
                 destination pointer (N = 2 .. 8)
   Arguments   : Inputs - in0 .. in(N-1), pdst, stride
   Details     : For each input in order, the scalar value is written to
                 (pdst) and pdst is then advanced by 'stride' elements.
                 pdst must be a modifiable pointer lvalue; on exit it has
                 moved N * stride elements.
                 (These macros store one scalar per step, not a vector of
                 single precision elements.)
*/
#define ST_GP2_INC(in0, in1, pdst, stride)  \
{                                           \
    *(pdst) = in0;                          \
    (pdst) += (stride);                     \
    *(pdst) = in1;                          \
    (pdst) += (stride);                     \
}

#define ST_GP3_INC(in0, in1, in2, pdst, stride)  \
{                                                \
    *(pdst) = in0;                               \
    (pdst) += (stride);                          \
    ST_GP2_INC(in1, in2, pdst, stride);          \
}

#define ST_GP4_INC(in0, in1, in2, in3, pdst, stride)  \
{                                                     \
    ST_GP3_INC(in0, in1, in2, pdst, stride);          \
    *(pdst) = in3;                                    \
    (pdst) += (stride);                               \
}

#define ST_GP5_INC(in0, in1, in2, in3, in4, pdst, stride)  \
{                                                          \
    ST_GP4_INC(in0, in1, in2, in3, pdst, stride);          \
    *(pdst) = in4;                                         \
    (pdst) += (stride);                                    \
}

#define ST_GP6_INC(in0, in1, in2, in3, in4, in5, pdst, stride)  \
{                                                               \
    ST_GP4_INC(in0, in1, in2, in3, pdst, stride);               \
    ST_GP2_INC(in4, in5, pdst, stride);                         \
}

#define ST_GP7_INC(in0, in1, in2, in3, in4, in5, in6, pdst, stride)  \
{                                                                    \
    ST_GP4_INC(in0, in1, in2, in3, pdst, stride);                    \
    ST_GP3_INC(in4, in5, in6, pdst, stride);                         \
}

#define ST_GP8_INC(in0, in1, in2, in3, in4, in5,            \
                   in6, in7, pdst, stride)                  \
{                                                           \
    ST_GP6_INC(in0, in1, in2, in3, in4, in5, pdst, stride); \
    ST_GP2_INC(in6, in7, pdst, stride);                     \
}
| /* Description : Store vectors of single precision floating point elements with stride | |||
| Arguments : Inputs - in0, in1, pdst, stride | |||
| Details : Store 4 single precision floating point elements from 'in0' to (pdst) | |||
| @@ -98,6 +379,73 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ST_SP4(in4, in5, in6, in7, (pdst + 4 * stride), stride); \ | |||
| } | |||
/* Description : Store N vectors with ST_SP, advancing the destination pointer
                 (N = 2 .. 8, 16)
   Arguments   : Inputs - in0 .. in(N-1), pdst, stride
   Details     : For each input in order, ST_SP(in, (pdst)) is issued and pdst
                 is then advanced by 'stride' elements of its pointed-to type.
                 pdst must be a modifiable pointer lvalue; on exit it has
                 moved N * stride elements.
*/
#define ST_SP2_INC(in0, in1, pdst, stride)  \
{                                           \
    ST_SP(in0, (pdst));                     \
    (pdst) += (stride);                     \
    ST_SP(in1, (pdst));                     \
    (pdst) += (stride);                     \
}

#define ST_SP3_INC(in0, in1, in2, pdst, stride)  \
{                                                \
    ST_SP(in0, (pdst));                          \
    (pdst) += (stride);                          \
    ST_SP2_INC(in1, in2, pdst, stride);          \
}

#define ST_SP4_INC(in0, in1, in2, in3, pdst, stride)  \
{                                                     \
    ST_SP3_INC(in0, in1, in2, pdst, stride);          \
    ST_SP(in3, (pdst));                               \
    (pdst) += (stride);                               \
}

#define ST_SP5_INC(in0, in1, in2, in3, in4, pdst, stride)  \
{                                                          \
    ST_SP4_INC(in0, in1, in2, in3, pdst, stride);          \
    ST_SP(in4, (pdst));                                    \
    (pdst) += (stride);                                    \
}

#define ST_SP6_INC(in0, in1, in2, in3, in4, in5, pdst, stride)  \
{                                                               \
    ST_SP4_INC(in0, in1, in2, in3, pdst, stride);               \
    ST_SP2_INC(in4, in5, pdst, stride);                         \
}

#define ST_SP7_INC(in0, in1, in2, in3, in4, in5, in6, pdst, stride)  \
{                                                                    \
    ST_SP4_INC(in0, in1, in2, in3, pdst, stride);                    \
    ST_SP3_INC(in4, in5, in6, pdst, stride);                         \
}

#define ST_SP8_INC(in0, in1, in2, in3, in4, in5,            \
                   in6, in7, pdst, stride)                  \
{                                                           \
    ST_SP6_INC(in0, in1, in2, in3, in4, in5, pdst, stride); \
    ST_SP2_INC(in6, in7, pdst, stride);                     \
}

#define ST_SP16_INC(in0, in1, in2, in3, in4, in5, in6,      \
                    in7, in8, in9, in10, in11, in12,        \
                    in13, in14, in15, pdst, stride)         \
{                                                           \
    ST_SP4_INC(in0, in1, in2, in3, pdst, stride);           \
    ST_SP4_INC(in4, in5, in6, in7, pdst, stride);           \
    ST_SP4_INC(in8, in9, in10, in11, pdst, stride);         \
    ST_SP4_INC(in12, in13, in14, in15, pdst, stride);       \
}
| /* Description : Store vectors of double precision floating point elements with stride | |||
| Arguments : Inputs - in0, in1, pdst, stride | |||
| Details : Store 2 double precision floating point elements from 'in0' to (pdst) | |||
| @@ -121,6 +469,104 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ST_DP4(in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ | |||
| } | |||
/* Description : Store N vectors with ST_DP, advancing the destination pointer
                 (N = 2 .. 8, 16)
   Arguments   : Inputs - in0 .. in(N-1), pdst, stride
   Details     : For each input in order, ST_DP(in, (pdst)) is issued and pdst
                 is then advanced by 'stride' elements of its pointed-to type.
                 pdst must be a modifiable pointer lvalue; on exit it has
                 moved N * stride elements.
*/
#define ST_DP2_INC(in0, in1, pdst, stride)  \
{                                           \
    ST_DP(in0, (pdst));                     \
    (pdst) += (stride);                     \
    ST_DP(in1, (pdst));                     \
    (pdst) += (stride);                     \
}

#define ST_DP3_INC(in0, in1, in2, pdst, stride)  \
{                                                \
    ST_DP(in0, (pdst));                          \
    (pdst) += (stride);                          \
    ST_DP2_INC(in1, in2, pdst, stride);          \
}

#define ST_DP4_INC(in0, in1, in2, in3, pdst, stride)  \
{                                                     \
    ST_DP3_INC(in0, in1, in2, pdst, stride);          \
    ST_DP(in3, (pdst));                               \
    (pdst) += (stride);                               \
}

#define ST_DP5_INC(in0, in1, in2, in3, in4, pdst, stride)  \
{                                                          \
    ST_DP4_INC(in0, in1, in2, in3, pdst, stride);          \
    ST_DP(in4, (pdst));                                    \
    (pdst) += (stride);                                    \
}

#define ST_DP6_INC(in0, in1, in2, in3, in4, in5, pdst, stride)  \
{                                                               \
    ST_DP4_INC(in0, in1, in2, in3, pdst, stride);               \
    ST_DP2_INC(in4, in5, pdst, stride);                         \
}

#define ST_DP7_INC(in0, in1, in2, in3, in4, in5, in6, pdst, stride)  \
{                                                                    \
    ST_DP4_INC(in0, in1, in2, in3, pdst, stride);                    \
    ST_DP3_INC(in4, in5, in6, pdst, stride);                         \
}

#define ST_DP8_INC(in0, in1, in2, in3, in4, in5,            \
                   in6, in7, pdst, stride)                  \
{                                                           \
    ST_DP6_INC(in0, in1, in2, in3, in4, in5, pdst, stride); \
    ST_DP2_INC(in6, in7, pdst, stride);                     \
}

#define ST_DP16_INC(in0, in1, in2, in3, in4, in5, in6,      \
                    in7, in8, in9, in10, in11, in12,        \
                    in13, in14, in15, pdst, stride)         \
{                                                           \
    ST_DP4_INC(in0, in1, in2, in3, pdst, stride);           \
    ST_DP4_INC(in4, in5, in6, in7, pdst, stride);           \
    ST_DP4_INC(in8, in9, in10, in11, pdst, stride);         \
    ST_DP4_INC(in12, in13, in14, in15, pdst, stride);       \
}
/* Description : Shuffle elements in vector as shf_val
   Arguments   : Inputs  - in0, in1 (, in2, in3), shf_val
                 Outputs - out0, out1 (, out2, out3)
                 Return Type - as per RTYPE
   Details     : Each input is converted to v4i32 and passed to
                 __msa_shf_w() together with 'shf_val'; the result is
                 converted to RTYPE and assigned to the matching output.
                 Inputs are parenthesized before the cast so that an
                 expression argument is converted as a whole instead of the
                 cast binding to its first operand only.
*/
#define SHF_W2(RTYPE, in0, in1, out0, out1, shf_val)     \
{                                                        \
    out0 = (RTYPE) __msa_shf_w((v4i32) (in0), shf_val);  \
    out1 = (RTYPE) __msa_shf_w((v4i32) (in1), shf_val);  \
}
#define SHF_W2_SP(...) SHF_W2(v4f32, __VA_ARGS__)
#define SHF_W2_DP(...) SHF_W2(v2f64, __VA_ARGS__)

#define SHF_W3(RTYPE, in0, in1, in2, out0, out1, out2,   \
               shf_val)                                  \
{                                                        \
    out0 = (RTYPE) __msa_shf_w((v4i32) (in0), shf_val);  \
    out1 = (RTYPE) __msa_shf_w((v4i32) (in1), shf_val);  \
    out2 = (RTYPE) __msa_shf_w((v4i32) (in2), shf_val);  \
}
#define SHF_W3_SP(...) SHF_W3(v4f32, __VA_ARGS__)

#define SHF_W4(RTYPE, in0, in1, in2, in3,                \
               out0, out1, out2, out3, shf_val)          \
{                                                        \
    SHF_W2(RTYPE, in0, in1, out0, out1, shf_val);        \
    SHF_W2(RTYPE, in2, in3, out2, out3, shf_val);        \
}
#define SHF_W4_SP(...) SHF_W4(v4f32, __VA_ARGS__)
#define SHF_W4_DP(...) SHF_W4(v2f64, __VA_ARGS__)
| /* Description : Interleave both left and right half of input vectors | |||
| Arguments : Inputs - in0, in1 | |||
| Outputs - out0, out1 | |||
| @@ -134,12 +580,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| out1 = (RTYPE) __msa_ilvl_w((v4i32) in0, (v4i32) in1); \ | |||
| } | |||
| #define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__) | |||
| #define ILVRL_W2_SP(...) ILVRL_W2(v4f32, __VA_ARGS__) | |||
/* Description : Interleave both right and left halves of a pair of vectors
                 at double word granularity
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : out0 = __msa_ilvr_d(in0, in1) and out1 = __msa_ilvl_d(in0,
                 in1), with both inputs converted to v2i64 and the results
                 converted to RTYPE.
                 Inputs are parenthesized before the cast so that an
                 expression argument is converted as a whole.
*/
#define ILVRL_D2(RTYPE, in0, in1, out0, out1)                   \
{                                                               \
    out0 = (RTYPE) __msa_ilvr_d((v2i64) (in0), (v2i64) (in1));  \
    out1 = (RTYPE) __msa_ilvl_d((v2i64) (in0), (v2i64) (in1));  \
}
#define ILVRL_D2_SP(...) ILVRL_D2(v4f32, __VA_ARGS__)
#define ILVRL_D2_DP(...) ILVRL_D2(v2f64, __VA_ARGS__)
| /* Description : Indexed word element values are replicated to all | |||
| @@ -158,6 +606,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| out0 = (RTYPE) __msa_splati_w((v4i32) in, stidx); \ | |||
| out1 = (RTYPE) __msa_splati_w((v4i32) in, (stidx+1)); \ | |||
| } | |||
| #define SPLATI_W2_SP(...) SPLATI_W2(v4f32, __VA_ARGS__) | |||
| #define SPLATI_W4(RTYPE, in, out0, out1, out2, out3) \ | |||
| { \ | |||
| @@ -166,22 +615,132 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| } | |||
| #define SPLATI_W4_SP(...) SPLATI_W4(v4f32, __VA_ARGS__) | |||
/* Description : Produce two vectors from the two double word elements of 'in'
   Arguments   : Input   - in
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : out0 = __msa_splati_d(in, 0) and out1 = __msa_splati_d(in, 1),
                 with 'in' converted to v2i64 and the results converted to
                 RTYPE.
                 'in' is parenthesized before the cast so that an expression
                 argument is converted as a whole.
*/
#define SPLATI_D2(RTYPE, in, out0, out1)               \
{                                                      \
    out0 = (RTYPE) __msa_splati_d((v2i64) (in), 0);    \
    out1 = (RTYPE) __msa_splati_d((v2i64) (in), 1);    \
}
#define SPLATI_D2_DP(...) SPLATI_D2(v2f64, __VA_ARGS__)
/* Description : Pack even double word elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3 (, in4, in5, in6, in7)
                 Outputs - out0, out1 (, out2, out3)
                 Return Type - as per RTYPE
   Details     : Even double word elements of 'in0' are copied to the left half
                 of 'out0' & even double word elements of 'in1' are copied to
                 the right half of 'out0'.  Each following input pair fills
                 the next output in the same way via __msa_pckev_d().
                 Inputs are parenthesized before the cast so that an
                 expression argument is converted as a whole.
*/
#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                                \
    out0 = (RTYPE) __msa_pckev_d((v2i64) (in0), (v2i64) (in1));  \
    out1 = (RTYPE) __msa_pckev_d((v2i64) (in2), (v2i64) (in3));  \
}
#define PCKEV_D2_SP(...) PCKEV_D2(v4f32, __VA_ARGS__)
#define PCKEV_D2_SD(...) PCKEV_D2(v2f64, __VA_ARGS__)

#define PCKEV_D3(RTYPE, in0, in1, in2, in3, in4, in5,            \
                 out0, out1, out2)                               \
{                                                                \
    out0 = (RTYPE) __msa_pckev_d((v2i64) (in0), (v2i64) (in1));  \
    out1 = (RTYPE) __msa_pckev_d((v2i64) (in2), (v2i64) (in3));  \
    out2 = (RTYPE) __msa_pckev_d((v2i64) (in4), (v2i64) (in5));  \
}
#define PCKEV_D3_SP(...) PCKEV_D3(v4f32, __VA_ARGS__)

#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define PCKEV_D4_SP(...) PCKEV_D4(v4f32, __VA_ARGS__)
| /* Description : Pack both the even and the odd elements of two input vectors | |||
| Arguments : Inputs - in0, in1 | |||
| Outputs - out0, out1 | |||
| Return Type - as per RTYPE | |||
| Details : Even elements of 'in0' and 'in1' are packed into 'out0' and | |||
| odd elements of 'in0' and 'in1' are packed into 'out1'. | |||
| The element size is a word for PCKEVOD_W2 and a double word | |||
| for PCKEVOD_D2. | |||
| */ | |||
| #define PCKEVOD_W2(RTYPE, in0, in1, out0, out1) \ | |||
| { \ | |||
| out0 = (RTYPE) __msa_pckev_w((v4i32) in0, (v4i32) in1); /* even words */ \ | |||
| out1 = (RTYPE) __msa_pckod_w((v4i32) in0, (v4i32) in1); /* odd words */ \ | |||
| } | |||
| #define PCKEVOD_W2_SP(...) PCKEVOD_W2(v4f32, __VA_ARGS__) /* RTYPE fixed to v4f32 */ | |||
| #define PCKEVOD_D2(RTYPE, in0, in1, out0, out1) /* double-word counterpart of PCKEVOD_W2: same input pair feeds both outputs */ \ | |||
| { \ | |||
| out0 = (RTYPE) __msa_pckev_d((v2i64) in0, (v2i64) in1); /* even double words */ \ | |||
| out1 = (RTYPE) __msa_pckod_d((v2i64) in0, (v2i64) in1); /* odd double words */ \ | |||
| } | |||
| #define PCKEVOD_D2_DP(...) PCKEVOD_D2(v2f64, __VA_ARGS__) /* RTYPE fixed to v2f64 */ | |||
| /* Description : Multiplication of pairs of vectors | |||
| Arguments : Inputs - in0, in1, in2, in3 | |||
| Outputs - out0, out1 | |||
| Details : 'in0' is multiplied by 'in1' with the '*' operator and the | |||
| result is written to 'out0'; likewise 'in2' * 'in3' is | |||
| written to 'out1'. | |||
| Note : The arguments are pasted into the expressions without | |||
| parentheses, so pass plain operands rather than compound | |||
| expressions. | |||
| */ | |||
| #define MUL2(in0, in1, in2, in3, out0, out1) \ | |||
| { \ | |||
| out0 = in0 * in1; \ | |||
| out1 = in2 * in3; \ | |||
| } | |||
| #define MUL3(in0, in1, in2, in3, in4, in5, \ | |||
| out0, out1, out2) /* three independent products, one per input pair; arguments are used unparenthesized */ \ | |||
| { \ | |||
| out0 = in0 * in1; \ | |||
| out1 = in2 * in3; \ | |||
| out2 = in4 * in5; \ | |||
| } | |||
| #define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, \ | |||
| out0, out1, out2, out3) /* four independent products, implemented as two MUL2 expansions */ \ | |||
| { \ | |||
| MUL2(in0, in1, in2, in3, out0, out1); /* out0 = in0 * in1, out1 = in2 * in3 */ \ | |||
| MUL2(in4, in5, in6, in7, out2, out3); /* out2 = in4 * in5, out3 = in6 * in7 */ \ | |||
| } | |||
| /* Description : Addition of 2 pairs of variables | |||
| Arguments : Inputs - in0, in1, in2, in3 | |||
| Outputs - out0, out1 | |||
| Details : 'in0' is added to 'in1' with the '+' operator and the result | |||
| is written to 'out0'; likewise 'in2' + 'in3' is written to | |||
| 'out1'. | |||
| */ | |||
| #define ADD2(in0, in1, in2, in3, out0, out1) \ | |||
| { \ | |||
| out0 = in0 + in1; \ | |||
| out1 = in2 + in3; \ | |||
| } | |||
| #define ADD3(in0, in1, in2, in3, in4, in5, \ | |||
| out0, out1, out2) /* three independent sums, one per input pair */ \ | |||
| { \ | |||
| out0 = in0 + in1; \ | |||
| out1 = in2 + in3; \ | |||
| out2 = in4 + in5; \ | |||
| } | |||
| #define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, \ | |||
| out0, out1, out2, out3) /* four independent sums, implemented as two ADD2 expansions */ \ | |||
| { \ | |||
| ADD2(in0, in1, in2, in3, out0, out1); /* out0 = in0 + in1, out1 = in2 + in3 */ \ | |||
| ADD2(in4, in5, in6, in7, out2, out3); /* out2 = in4 + in5, out3 = in6 + in7 */ \ | |||
| } | |||
| /* Description : Transpose 4x4 block with word elements in vectors | |||
| Arguments : Inputs - in0, in1, in2, in3 | |||
| Outputs - out0, out1, out2, out3 | |||
| Return Type - as per RTYPE | |||
| */ | |||
| #define TRANSPOSE4x4_W(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) \ | |||
| { \ | |||
| v4i32 s0_m, s1_m, s2_m, s3_m; \ | |||
| \ | |||
| ILVRL_W2_SW(in1, in0, s0_m, s1_m); \ | |||
| ILVRL_W2_SW(in3, in2, s2_m, s3_m); \ | |||
| \ | |||
| out0 = (RTYPE) __msa_ilvr_d((v2i64) s2_m, (v2i64) s0_m); \ | |||
| out1 = (RTYPE) __msa_ilvl_d((v2i64) s2_m, (v2i64) s0_m); \ | |||
| out2 = (RTYPE) __msa_ilvr_d((v2i64) s3_m, (v2i64) s1_m); \ | |||
| out3 = (RTYPE) __msa_ilvl_d((v2i64) s3_m, (v2i64) s1_m); \ | |||
| #define TRANSPOSE4x4_W(RTYPE, in0, in1, in2, in3, \ | |||
| out0, out1, out2, out3) \ | |||
| { \ | |||
| v4i32 s0_m, s1_m, s2_m, s3_m; \ | |||
| \ | |||
| ILVRL_W2_SW(in1, in0, s0_m, s1_m); \ | |||
| ILVRL_W2_SW(in3, in2, s2_m, s3_m); \ | |||
| ILVRL_D2(RTYPE, s2_m, s0_m, out0, out1); \ | |||
| ILVRL_D2(RTYPE, s3_m, s1_m, out2, out3); \ | |||
| } | |||
| #define TRANSPOSE4x4_SP_SP(...) TRANSPOSE4x4_W(v4f32, __VA_ARGS__) | |||
| @@ -28,14 +28,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #include "macros_msa.h" | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, | |||
| FLOAT * __restrict dst) | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst) | |||
| { | |||
| BLASLONG i, j; | |||
| FLOAT *psrc0; | |||
| FLOAT *psrc1, *psrc2, *psrc3, *psrc4; | |||
| FLOAT *psrc5, *psrc6, *psrc7, *psrc8; | |||
| FLOAT *pdst; | |||
| FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *psrc5, *psrc6, *psrc7; | |||
| FLOAT *psrc8, *pdst; | |||
| v4f32 src0, src1, src2, src3, src4, src5, src6, src7; | |||
| v4f32 src8, src9, src10, src11, src12, src13, src14, src15; | |||
| v4f32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; | |||
| @@ -58,22 +55,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, | |||
| for (i = (m >> 3); i--;) | |||
| { | |||
| LD_SP2(psrc1, 4, src0, src1); | |||
| LD_SP2(psrc2, 4, src2, src3); | |||
| LD_SP2(psrc3, 4, src4, src5); | |||
| LD_SP2(psrc4, 4, src6, src7); | |||
| LD_SP2(psrc5, 4, src8, src9); | |||
| LD_SP2(psrc6, 4, src10, src11); | |||
| LD_SP2(psrc7, 4, src12, src13); | |||
| LD_SP2(psrc8, 4, src14, src15); | |||
| psrc1 += 8; | |||
| psrc2 += 8; | |||
| psrc3 += 8; | |||
| psrc4 += 8; | |||
| psrc5 += 8; | |||
| psrc6 += 8; | |||
| psrc7 += 8; | |||
| psrc8 += 8; | |||
| LD_SP2_INC(psrc1, 4, src0, src1); | |||
| LD_SP2_INC(psrc2, 4, src2, src3); | |||
| LD_SP2_INC(psrc3, 4, src4, src5); | |||
| LD_SP2_INC(psrc4, 4, src6, src7); | |||
| LD_SP2_INC(psrc5, 4, src8, src9); | |||
| LD_SP2_INC(psrc6, 4, src10, src11); | |||
| LD_SP2_INC(psrc7, 4, src12, src13); | |||
| LD_SP2_INC(psrc8, 4, src14, src15); | |||
| TRANSPOSE4x4_SP_SP(src0, src2, src4, src6, dst0, dst2, dst4, dst6); | |||
| TRANSPOSE4x4_SP_SP(src8, src10, src12, src14, dst1, dst3, dst5, | |||
| @@ -83,15 +72,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, | |||
| TRANSPOSE4x4_SP_SP(src9, src11, src13, src15, dst9, dst11, dst13, | |||
| dst15); | |||
| ST_SP2(dst0, dst1, pdst, 4); | |||
| ST_SP2(dst2, dst3, pdst + 8, 4); | |||
| ST_SP2(dst4, dst5, pdst + 16, 4); | |||
| ST_SP2(dst6, dst7, pdst + 24, 4); | |||
| ST_SP2(dst8, dst9, pdst + 32, 4); | |||
| ST_SP2(dst10, dst11, pdst + 40, 4); | |||
| ST_SP2(dst12, dst13, pdst + 48, 4); | |||
| ST_SP2(dst14, dst15, pdst + 56, 4); | |||
| pdst += 64; | |||
| ST_SP2_INC(dst0, dst1, pdst, 4); | |||
| ST_SP2_INC(dst2, dst3, pdst, 4); | |||
| ST_SP2_INC(dst4, dst5, pdst, 4); | |||
| ST_SP2_INC(dst6, dst7, pdst, 4); | |||
| ST_SP2_INC(dst8, dst9, pdst, 4); | |||
| ST_SP2_INC(dst10, dst11, pdst, 4); | |||
| ST_SP2_INC(dst12, dst13, pdst, 4); | |||
| ST_SP2_INC(dst14, dst15, pdst, 4); | |||
| } | |||
| for (i = (m & 7); i--;) | |||
| @@ -128,9 +116,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, | |||
| TRANSPOSE4x4_SP_SP(src0, src1, src2, src3, dst0, dst1, dst2, dst3); | |||
| ST_SP2(dst0, dst1, pdst, 4); | |||
| ST_SP2(dst2, dst3, pdst + 8, 4); | |||
| pdst += 16; | |||
| ST_SP2_INC(dst0, dst1, pdst, 4); | |||
| ST_SP2_INC(dst2, dst3, pdst, 4); | |||
| } | |||
| for (i = (m & 3); i--;) | |||
| @@ -28,14 +28,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #include "macros_msa.h" | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, | |||
| FLOAT * __restrict dst) | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst) | |||
| { | |||
| BLASLONG i, j; | |||
| FLOAT *psrc0; | |||
| FLOAT *psrc1, *psrc2, *psrc3, *psrc4; | |||
| FLOAT *psrc5, *psrc6, *psrc7, *psrc8; | |||
| FLOAT *pdst0, *pdst1, *pdst2, *pdst3, *pdst4; | |||
| FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *psrc5, *psrc6, *psrc7; | |||
| FLOAT *psrc8, *pdst0, *pdst1, *pdst2, *pdst3, *pdst4; | |||
| v4f32 src0, src1, src2, src3, src4, src5, src6, src7; | |||
| v4f32 src8, src9, src10, src11, src12, src13, src14, src15; | |||
| @@ -63,22 +60,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, | |||
| for (i = (n >> 3); i--;) | |||
| { | |||
| LD_SP2(psrc1, 4, src0, src1); | |||
| LD_SP2(psrc2, 4, src2, src3); | |||
| LD_SP2(psrc3, 4, src4, src5); | |||
| LD_SP2(psrc4, 4, src6, src7); | |||
| LD_SP2(psrc5, 4, src8, src9); | |||
| LD_SP2(psrc6, 4, src10, src11); | |||
| LD_SP2(psrc7, 4, src12, src13); | |||
| LD_SP2(psrc8, 4, src14, src15); | |||
| psrc1 += 8; | |||
| psrc2 += 8; | |||
| psrc3 += 8; | |||
| psrc4 += 8; | |||
| psrc5 += 8; | |||
| psrc6 += 8; | |||
| psrc7 += 8; | |||
| psrc8 += 8; | |||
| LD_SP2_INC(psrc1, 4, src0, src1); | |||
| LD_SP2_INC(psrc2, 4, src2, src3); | |||
| LD_SP2_INC(psrc3, 4, src4, src5); | |||
| LD_SP2_INC(psrc4, 4, src6, src7); | |||
| LD_SP2_INC(psrc5, 4, src8, src9); | |||
| LD_SP2_INC(psrc6, 4, src10, src11); | |||
| LD_SP2_INC(psrc7, 4, src12, src13); | |||
| LD_SP2_INC(psrc8, 4, src14, src15); | |||
| ST_SP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 4); | |||
| ST_SP8(src8, src9, src10, src11, src12, src13, src14, src15, | |||
| @@ -105,8 +94,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, | |||
| psrc7 += 4; | |||
| psrc8 += 4; | |||
| ST_SP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst2, 4); | |||
| pdst2 += 32; | |||
| ST_SP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst2, 4); | |||
| } | |||
| if (n & 2) | |||
| @@ -155,14 +143,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, | |||
| for (i = (n >> 3); i--;) | |||
| { | |||
| LD_SP2(psrc1, 4, src0, src1); | |||
| LD_SP2(psrc2, 4, src2, src3); | |||
| LD_SP2(psrc3, 4, src4, src5); | |||
| LD_SP2(psrc4, 4, src6, src7); | |||
| psrc1 += 8; | |||
| psrc2 += 8; | |||
| psrc3 += 8; | |||
| psrc4 += 8; | |||
| LD_SP2_INC(psrc1, 4, src0, src1); | |||
| LD_SP2_INC(psrc2, 4, src2, src3); | |||
| LD_SP2_INC(psrc3, 4, src4, src5); | |||
| LD_SP2_INC(psrc4, 4, src6, src7); | |||
| ST_SP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 4); | |||
| pdst1 += 8 * m; | |||
| @@ -179,8 +163,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, | |||
| psrc3 += 4; | |||
| psrc4 += 4; | |||
| ST_SP4(src0, src1, src2, src3, pdst2, 4); | |||
| pdst2 += 16; | |||
| ST_SP4_INC(src0, src1, src2, src3, pdst2, 4); | |||
| } | |||
| if (n & 2) | |||
| @@ -215,10 +198,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, | |||
| for (i = (n >> 3); i--;) | |||
| { | |||
| LD_SP2(psrc1, 4, src0, src1); | |||
| LD_SP2(psrc2, 4, src2, src3); | |||
| psrc1 += 8; | |||
| psrc2 += 8; | |||
| LD_SP2_INC(psrc1, 4, src0, src1); | |||
| LD_SP2_INC(psrc2, 4, src2, src3); | |||
| ST_SP4(src0, src1, src2, src3, pdst1, 4); | |||
| pdst1 += 8 * m; | |||
| @@ -231,8 +212,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, | |||
| psrc1 += 4; | |||
| psrc2 += 4; | |||
| ST_SP2(src0, src1, pdst2, 4); | |||
| pdst2 += 8; | |||
| ST_SP2_INC(src0, src1, pdst2, 4); | |||
| } | |||
| if (n & 2) | |||
| @@ -260,8 +240,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, | |||
| for (i = (n >> 3); i--;) | |||
| { | |||
| LD_SP2(psrc1, 4, src0, src1); | |||
| psrc1 += 8; | |||
| LD_SP2_INC(psrc1, 4, src0, src1); | |||
| ST_SP2(src0, src1, pdst1, 4); | |||
| pdst1 += 8 * m; | |||
| @@ -288,5 +267,5 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, | |||
| } | |||
| } | |||
| return 0; | |||
| return 0; | |||
| } | |||
| @@ -166,7 +166,7 @@ static void ssolve_8x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| src_a = LD_SP(a + 32); | |||
| SPLATI_W4_SP(src_a, src_a32, src_a33, src_a34, src_a35); | |||
| COPY_FLOAT_TO_VECTOR(*(a + 36), src_a36); | |||
| src_a36 = COPY_FLOAT_TO_VECTOR(*(a + 36)); | |||
| res_c4 *= src_a36; | |||
| res_c12 *= src_a36; | |||
| @@ -220,9 +220,9 @@ static void ssolve_8x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| res_c0 -= res_c2 * src_a16; | |||
| res_c8 -= res_c10 * src_a16; | |||
| COPY_FLOAT_TO_VECTOR(*(a + 9), src_a9); | |||
| COPY_FLOAT_TO_VECTOR(*(a + 8), src_a8); | |||
| COPY_FLOAT_TO_VECTOR(*(a + 0), src_a0); | |||
| src_a9 = COPY_FLOAT_TO_VECTOR(*(a + 9)); | |||
| src_a8 = COPY_FLOAT_TO_VECTOR(*(a + 8)); | |||
| src_a0 = COPY_FLOAT_TO_VECTOR(*(a + 0)); | |||
| res_c1 *= src_a9; | |||
| res_c9 *= src_a9; | |||
| @@ -306,7 +306,7 @@ static void ssolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| bb += 4; | |||
| } | |||
| if (bk & 1) | |||
| if ((bk & 1) && (bk > 0)) | |||
| { | |||
| LD_SP2(aa, 4, src_a0, src_a1); | |||
| @@ -374,7 +374,7 @@ static void ssolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| src_a = LD_SP(a + 32); | |||
| SPLATI_W4_SP(src_a, src_a32, src_a33, src_a34, src_a35); | |||
| COPY_FLOAT_TO_VECTOR(*(a + 36), src_a36); | |||
| src_a36 = COPY_FLOAT_TO_VECTOR(*(a + 36)); | |||
| res_c4 *= src_a36; | |||
| res_c3 -= res_c4 * src_a35; | |||
| @@ -399,9 +399,9 @@ static void ssolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| res_c1 -= res_c2 * src_a17; | |||
| res_c0 -= res_c2 * src_a16; | |||
| COPY_FLOAT_TO_VECTOR(*(a + 9), src_a9); | |||
| COPY_FLOAT_TO_VECTOR(*(a + 8), src_a8); | |||
| COPY_FLOAT_TO_VECTOR(*(a + 0), src_a0); | |||
| src_a9 = COPY_FLOAT_TO_VECTOR(*(a + 9)); | |||
| src_a8 = COPY_FLOAT_TO_VECTOR(*(a + 8)); | |||
| src_a0 = COPY_FLOAT_TO_VECTOR(*(a + 0)); | |||
| res_c1 *= src_a9; | |||
| res_c0 -= res_c1 * src_a8; | |||
| @@ -826,9 +826,9 @@ static void ssolve_4x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| src_a9 = (v4f32) __msa_splati_w((v4i32) src_a8, 1); | |||
| src_a8 = (v4f32) __msa_splati_w((v4i32) src_a8, 0); | |||
| COPY_FLOAT_TO_VECTOR(*(a + 5), src_a5); | |||
| COPY_FLOAT_TO_VECTOR(*(a + 4), src_a4); | |||
| COPY_FLOAT_TO_VECTOR(*(a + 0), src_a0); | |||
| src_a5 = COPY_FLOAT_TO_VECTOR(*(a + 5)); | |||
| src_a4 = COPY_FLOAT_TO_VECTOR(*(a + 4)); | |||
| src_a0 = COPY_FLOAT_TO_VECTOR(*(a + 0)); | |||
| res_c3 *= src_a15; | |||
| res_c7 *= src_a15; | |||
| @@ -916,7 +916,7 @@ static void ssolve_4x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| bb += 4; | |||
| } | |||
| if (bk & 1) | |||
| if ((bk & 1) && (bk > 0)) | |||
| { | |||
| src_a0 = LD_SP(aa); | |||
| @@ -940,9 +940,9 @@ static void ssolve_4x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| src_a10 = (v4f32) __msa_splati_w((v4i32) src_a8, 2); | |||
| src_a9 = (v4f32) __msa_splati_w((v4i32) src_a8, 1); | |||
| src_a8 = (v4f32) __msa_splati_w((v4i32) src_a8, 0); | |||
| COPY_FLOAT_TO_VECTOR(*(a + 5), src_a5); | |||
| COPY_FLOAT_TO_VECTOR(*(a + 4), src_a4); | |||
| COPY_FLOAT_TO_VECTOR(*(a + 0), src_a0); | |||
| src_a5 = COPY_FLOAT_TO_VECTOR(*(a + 5)); | |||
| src_a4 = COPY_FLOAT_TO_VECTOR(*(a + 4)); | |||
| src_a0 = COPY_FLOAT_TO_VECTOR(*(a + 0)); | |||
| res_c3 *= src_a15; | |||
| res_c2 -= res_c3 * src_a14; | |||
| @@ -162,7 +162,7 @@ static void ssolve_8x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| src_a = LD_SP(a + 27); | |||
| SPLATI_W4_SP(src_a, src_a27, src_a28, src_a29, src_a30); | |||
| COPY_FLOAT_TO_VECTOR(*(a + 31), src_a31); | |||
| src_a31 = COPY_FLOAT_TO_VECTOR(*(a + 31)); | |||
| res_c3 *= src_a27; | |||
| res_c11 *= src_a27; | |||
| @@ -216,9 +216,9 @@ static void ssolve_8x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| res_c7 -= res_c5 * src_a47; | |||
| res_c15 -= res_c13 * src_a47; | |||
| COPY_FLOAT_TO_VECTOR(*(a + 54), src_a54); | |||
| COPY_FLOAT_TO_VECTOR(*(a + 55), src_a55); | |||
| COPY_FLOAT_TO_VECTOR(*(a + 63), src_a63); | |||
| src_a54 = COPY_FLOAT_TO_VECTOR(*(a + 54)); | |||
| src_a55 = COPY_FLOAT_TO_VECTOR(*(a + 55)); | |||
| src_a63 = COPY_FLOAT_TO_VECTOR(*(a + 63)); | |||
| res_c6 *= src_a54; | |||
| res_c14 *= src_a54; | |||
| @@ -334,7 +334,7 @@ static void ssolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| src_a = LD_SP(a + 27); | |||
| SPLATI_W4_SP(src_a, src_a27, src_a28, src_a29, src_a30); | |||
| COPY_FLOAT_TO_VECTOR(*(a + 31), src_a31); | |||
| src_a31 = COPY_FLOAT_TO_VECTOR(*(a + 31)); | |||
| res_c3 *= src_a27; | |||
| res_c4 -= res_c3 * src_a28; | |||
| @@ -359,9 +359,9 @@ static void ssolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| res_c6 -= res_c5 * src_a46; | |||
| res_c7 -= res_c5 * src_a47; | |||
| COPY_FLOAT_TO_VECTOR(*(a + 54), src_a54); | |||
| COPY_FLOAT_TO_VECTOR(*(a + 55), src_a55); | |||
| COPY_FLOAT_TO_VECTOR(*(a + 63), src_a63); | |||
| src_a54 = COPY_FLOAT_TO_VECTOR(*(a + 54)); | |||
| src_a55 = COPY_FLOAT_TO_VECTOR(*(a + 55)); | |||
| src_a63 = COPY_FLOAT_TO_VECTOR(*(a + 63)); | |||
| res_c6 *= src_a54; | |||
| res_c7 -= res_c6 * src_a55; | |||
| @@ -780,7 +780,7 @@ static void ssolve_4x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| b += 8; | |||
| } | |||
| if (bk & 1) | |||
| if ((bk & 1) && (bk > 0)) | |||
| { | |||
| src_a0 = LD_SP(a); | |||
| @@ -813,9 +813,9 @@ static void ssolve_4x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| src_a7 = (v4f32) __msa_splati_w((v4i32) src_a5, 2); | |||
| src_a6 = (v4f32) __msa_splati_w((v4i32) src_a5, 1); | |||
| src_a5 = (v4f32) __msa_splati_w((v4i32) src_a5, 0); | |||
| COPY_FLOAT_TO_VECTOR(*(a + 10), src_a10); | |||
| COPY_FLOAT_TO_VECTOR(*(a + 11), src_a11); | |||
| COPY_FLOAT_TO_VECTOR(*(a + 15), src_a15); | |||
| src_a10 = COPY_FLOAT_TO_VECTOR(*(a + 10)); | |||
| src_a11 = COPY_FLOAT_TO_VECTOR(*(a + 11)); | |||
| src_a15 = COPY_FLOAT_TO_VECTOR(*(a + 15)); | |||
| res_c0 *= src_a0; | |||
| res_c4 *= src_a0; | |||
| @@ -902,7 +902,7 @@ static void ssolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| b += 4; | |||
| } | |||
| if (bk & 1) | |||
| if ((bk & 1) && (bk > 0)) | |||
| { | |||
| src_a0 = LD_SP(a); | |||
| @@ -926,9 +926,9 @@ static void ssolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| src_a7 = (v4f32) __msa_splati_w((v4i32) src_a5, 2); | |||
| src_a6 = (v4f32) __msa_splati_w((v4i32) src_a5, 1); | |||
| src_a5 = (v4f32) __msa_splati_w((v4i32) src_a5, 0); | |||
| COPY_FLOAT_TO_VECTOR(*(a + 10), src_a10); | |||
| COPY_FLOAT_TO_VECTOR(*(a + 11), src_a11); | |||
| COPY_FLOAT_TO_VECTOR(*(a + 15), src_a15); | |||
| src_a10 = COPY_FLOAT_TO_VECTOR(*(a + 10)); | |||
| src_a11 = COPY_FLOAT_TO_VECTOR(*(a + 11)); | |||
| src_a15 = COPY_FLOAT_TO_VECTOR(*(a + 15)); | |||
| res_c0 *= src_a0; | |||
| res_c1 -= res_c0 * src_a1; | |||
| @@ -144,7 +144,7 @@ static void ssolve_8x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| src_b = LD_SP(b + 27); | |||
| SPLATI_W4_SP(src_b, src_b27, src_b28, src_b29, src_b30); | |||
| COPY_FLOAT_TO_VECTOR(*(b + 31), src_b31); | |||
| src_b31 = COPY_FLOAT_TO_VECTOR(*(b + 31)); | |||
| src_c4 *= src_b18; | |||
| src_c5 *= src_b18; | |||
| @@ -184,9 +184,9 @@ static void ssolve_8x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| src_b46 = (v4f32) __msa_splati_w((v4i32) src_b45, 1); | |||
| src_b45 = (v4f32) __msa_splati_w((v4i32) src_b45, 0); | |||
| COPY_FLOAT_TO_VECTOR(*(b + 54), src_b54); | |||
| COPY_FLOAT_TO_VECTOR(*(b + 55), src_b55); | |||
| COPY_FLOAT_TO_VECTOR(*(b + 63), src_b63); | |||
| src_b54 = COPY_FLOAT_TO_VECTOR(*(b + 54)); | |||
| src_b55 = COPY_FLOAT_TO_VECTOR(*(b + 55)); | |||
| src_b63 = COPY_FLOAT_TO_VECTOR(*(b + 63)); | |||
| src_c8 *= src_b36; | |||
| src_c9 *= src_b36; | |||
| @@ -275,7 +275,7 @@ static void ssolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| b += 4; | |||
| } | |||
| if (bk & 1) | |||
| if ((bk & 1) && (bk > 0)) | |||
| { | |||
| LD_SP2(a, 4, src_a0, src_a1); | |||
| @@ -300,9 +300,9 @@ static void ssolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| src_b7 = (v4f32) __msa_splati_w((v4i32) src_b5, 2); | |||
| src_b6 = (v4f32) __msa_splati_w((v4i32) src_b5, 1); | |||
| src_b5 = (v4f32) __msa_splati_w((v4i32) src_b5, 0); | |||
| COPY_FLOAT_TO_VECTOR(*(b + 10), src_b10); | |||
| COPY_FLOAT_TO_VECTOR(*(b + 11), src_b11); | |||
| COPY_FLOAT_TO_VECTOR(*(b + 15), src_b15); | |||
| src_b10 = COPY_FLOAT_TO_VECTOR(*(b + 10)); | |||
| src_b11 = COPY_FLOAT_TO_VECTOR(*(b + 11)); | |||
| src_b15 = COPY_FLOAT_TO_VECTOR(*(b + 15)); | |||
| src_c0 *= src_b0; | |||
| src_c1 *= src_b0; | |||
| @@ -351,8 +351,8 @@ static void ssolve_8x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| { | |||
| LD_SP2(a, 4, src_a0, src_a1); | |||
| COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); | |||
| COPY_FLOAT_TO_VECTOR(*(b + 1), src_b1); | |||
| src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); | |||
| src_b1 = COPY_FLOAT_TO_VECTOR(*(b + 1)); | |||
| src_c0 -= src_a0 * src_b0; | |||
| src_c1 -= src_a1 * src_b0; | |||
| @@ -364,8 +364,8 @@ static void ssolve_8x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| LD_SP2(a, 4, src_a0, src_a1); | |||
| COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); | |||
| COPY_FLOAT_TO_VECTOR(*(b + 1), src_b1); | |||
| src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); | |||
| src_b1 = COPY_FLOAT_TO_VECTOR(*(b + 1)); | |||
| src_c0 -= src_a0 * src_b0; | |||
| src_c1 -= src_a1 * src_b0; | |||
| @@ -376,12 +376,12 @@ static void ssolve_8x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| b += 2; | |||
| } | |||
| if (bk & 1) | |||
| if ((bk & 1) && (bk > 0)) | |||
| { | |||
| LD_SP2(a, 4, src_a0, src_a1); | |||
| COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); | |||
| COPY_FLOAT_TO_VECTOR(*(b + 1), src_b1); | |||
| src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); | |||
| src_b1 = COPY_FLOAT_TO_VECTOR(*(b + 1)); | |||
| src_c0 -= src_a0 * src_b0; | |||
| src_c1 -= src_a1 * src_b0; | |||
| @@ -392,9 +392,9 @@ static void ssolve_8x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| b += 2; | |||
| } | |||
| COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); | |||
| COPY_FLOAT_TO_VECTOR(*(b + 1), src_b1); | |||
| COPY_FLOAT_TO_VECTOR(*(b + 3), src_b3); | |||
| src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); | |||
| src_b1 = COPY_FLOAT_TO_VECTOR(*(b + 1)); | |||
| src_b3 = COPY_FLOAT_TO_VECTOR(*(b + 3)); | |||
| src_c0 *= src_b0; | |||
| src_c1 *= src_b0; | |||
| @@ -419,7 +419,7 @@ static void ssolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| { | |||
| LD_SP2(a, 4, src_a0, src_a1); | |||
| COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); | |||
| src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); | |||
| src_c0 -= src_a0 * src_b0; | |||
| src_c1 -= src_a1 * src_b0; | |||
| @@ -429,7 +429,7 @@ static void ssolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| LD_SP2(a, 4, src_a0, src_a1); | |||
| COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); | |||
| src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); | |||
| src_c0 -= src_a0 * src_b0; | |||
| src_c1 -= src_a1 * src_b0; | |||
| @@ -439,7 +439,7 @@ static void ssolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| LD_SP2(a, 4, src_a0, src_a1); | |||
| COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); | |||
| src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); | |||
| src_c0 -= src_a0 * src_b0; | |||
| src_c1 -= src_a1 * src_b0; | |||
| @@ -449,7 +449,7 @@ static void ssolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| LD_SP2(a, 4, src_a0, src_a1); | |||
| COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); | |||
| src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); | |||
| src_c0 -= src_a0 * src_b0; | |||
| src_c1 -= src_a1 * src_b0; | |||
| @@ -458,13 +458,13 @@ static void ssolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| b += 1; | |||
| } | |||
| if (bk & 3) | |||
| if ((bk & 3) && (bk > 0)) | |||
| { | |||
| if (bk & 2) | |||
| { | |||
| LD_SP2(a, 4, src_a0, src_a1); | |||
| COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); | |||
| src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); | |||
| src_c0 -= src_a0 * src_b0; | |||
| src_c1 -= src_a1 * src_b0; | |||
| @@ -474,7 +474,7 @@ static void ssolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| LD_SP2(a, 4, src_a0, src_a1); | |||
| COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); | |||
| src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); | |||
| src_c0 -= src_a0 * src_b0; | |||
| src_c1 -= src_a1 * src_b0; | |||
| @@ -487,7 +487,7 @@ static void ssolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| { | |||
| LD_SP2(a, 4, src_a0, src_a1); | |||
| COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); | |||
| src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); | |||
| src_c0 -= src_a0 * src_b0; | |||
| src_c1 -= src_a1 * src_b0; | |||
| @@ -497,7 +497,7 @@ static void ssolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| } | |||
| } | |||
| COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); | |||
| src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); | |||
| src_c0 *= src_b0; | |||
| src_c1 *= src_b0; | |||
| @@ -574,7 +574,7 @@ static void ssolve_4x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| src_b = LD_SP(b + 27); | |||
| SPLATI_W4_SP(src_b, src_b27, src_b28, src_b29, src_b30); | |||
| COPY_FLOAT_TO_VECTOR(*(b + 31), src_b31); | |||
| src_b31 = COPY_FLOAT_TO_VECTOR(*(b + 31)); | |||
| src_b = LD_SP(b + 36); | |||
| SPLATI_W4_SP(src_b, src_b36, src_b37, src_b38, src_b39); | |||
| @@ -584,9 +584,9 @@ static void ssolve_4x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| src_b46 = (v4f32) __msa_splati_w((v4i32) src_b45, 1); | |||
| src_b45 = (v4f32) __msa_splati_w((v4i32) src_b45, 0); | |||
| COPY_FLOAT_TO_VECTOR(*(b + 54), src_b54); | |||
| COPY_FLOAT_TO_VECTOR(*(b + 55), src_b55); | |||
| COPY_FLOAT_TO_VECTOR(*(b + 63), src_b63); | |||
| src_b54 = COPY_FLOAT_TO_VECTOR(*(b + 54)); | |||
| src_b55 = COPY_FLOAT_TO_VECTOR(*(b + 55)); | |||
| src_b63 = COPY_FLOAT_TO_VECTOR(*(b + 63)); | |||
| src_c0 *= src_b0; | |||
| src_c1 -= src_c0 * src_b1; | |||
| @@ -686,7 +686,7 @@ static void ssolve_4x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| b += 4; | |||
| } | |||
| if (bk & 1) | |||
| if ((bk & 1) && (bk > 0)) | |||
| { | |||
| src_a0 = LD_SP(a); | |||
| @@ -707,9 +707,9 @@ static void ssolve_4x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| src_b7 = (v4f32) __msa_splati_w((v4i32) src_b5, 2); | |||
| src_b6 = (v4f32) __msa_splati_w((v4i32) src_b5, 1); | |||
| src_b5 = (v4f32) __msa_splati_w((v4i32) src_b5, 0); | |||
| COPY_FLOAT_TO_VECTOR(*(b + 10), src_b10); | |||
| COPY_FLOAT_TO_VECTOR(*(b + 11), src_b11); | |||
| COPY_FLOAT_TO_VECTOR(*(b + 15), src_b15); | |||
| src_b10 = COPY_FLOAT_TO_VECTOR(*(b + 10)); | |||
| src_b11 = COPY_FLOAT_TO_VECTOR(*(b + 11)); | |||
| src_b15 = COPY_FLOAT_TO_VECTOR(*(b + 15)); | |||
| src_c0 *= src_b0; | |||
| src_c1 -= src_c0 * src_b1; | |||
| @@ -789,7 +789,7 @@ static void ssolve_4x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| b += 2; | |||
| } | |||
| if (bk & 3) | |||
| if ((bk & 3) && (bk > 0)) | |||
| { | |||
| if (bk & 2) | |||
| { | |||
| @@ -831,9 +831,9 @@ static void ssolve_4x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| } | |||
| } | |||
| COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); | |||
| COPY_FLOAT_TO_VECTOR(*(b + 1), src_b1); | |||
| COPY_FLOAT_TO_VECTOR(*(b + 3), src_b3); | |||
| src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); | |||
| src_b1 = COPY_FLOAT_TO_VECTOR(*(b + 1)); | |||
| src_b3 = COPY_FLOAT_TO_VECTOR(*(b + 3)); | |||
| src_c0 *= src_b0; | |||
| src_c1 -= src_c0 * src_b1; | |||
| @@ -158,7 +158,7 @@ static void ssolve_8x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| src_b = LD_SP(b + 32); | |||
| SPLATI_W4_SP(src_b, src_b32, src_b33, src_b34, src_b35); | |||
| COPY_FLOAT_TO_VECTOR(*(b + 36), src_b36); | |||
| src_b36 = COPY_FLOAT_TO_VECTOR(*(b + 36)); | |||
| src_c8 *= src_b36; | |||
| src_c9 *= src_b36; | |||
| @@ -203,9 +203,9 @@ static void ssolve_8x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| ST_SP2(src_c4, src_c5, c_nxt2line, 4); | |||
| ST_SP2(src_c6, src_c7, c_nxt3line, 4); | |||
| COPY_FLOAT_TO_VECTOR(*(b + 9), src_b9); | |||
| COPY_FLOAT_TO_VECTOR(*(b + 8), src_b8); | |||
| COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); | |||
| src_b9 = COPY_FLOAT_TO_VECTOR(*(b + 9)); | |||
| src_b8 = COPY_FLOAT_TO_VECTOR(*(b + 8)); | |||
| src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); | |||
| src_c2 *= src_b9; | |||
| src_c3 *= src_b9; | |||
| @@ -273,7 +273,7 @@ static void ssolve_8x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| bb += 4; | |||
| } | |||
| if (bk & 1) | |||
| if ((bk & 1) && (bk > 0)) | |||
| { | |||
| LD_SP2(aa, 4, src_a0, src_a1); | |||
| @@ -298,9 +298,9 @@ static void ssolve_8x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| src_b10 = (v4f32) __msa_splati_w((v4i32) src_b8, 2); | |||
| src_b9 = (v4f32) __msa_splati_w((v4i32) src_b8, 1); | |||
| src_b8 = (v4f32) __msa_splati_w((v4i32) src_b8, 0); | |||
| COPY_FLOAT_TO_VECTOR(*(b + 5), src_b5); | |||
| COPY_FLOAT_TO_VECTOR(*(b + 4), src_b4); | |||
| COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); | |||
| src_b5 = COPY_FLOAT_TO_VECTOR(*(b + 5)); | |||
| src_b4 = COPY_FLOAT_TO_VECTOR(*(b + 4)); | |||
| src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); | |||
| src_c7 *= src_b15; | |||
| src_c6 *= src_b15; | |||
| @@ -350,8 +350,8 @@ static void ssolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| { | |||
| LD_SP2(aa, 4, src_a0, src_a1); | |||
| COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0); | |||
| COPY_FLOAT_TO_VECTOR(*(bb + 1), src_b1); | |||
| src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0)); | |||
| src_b1 = COPY_FLOAT_TO_VECTOR(*(bb + 1)); | |||
| src_c0 -= src_a0 * src_b0; | |||
| src_c1 -= src_a1 * src_b0; | |||
| @@ -363,8 +363,8 @@ static void ssolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| LD_SP2(aa, 4, src_a0, src_a1); | |||
| COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0); | |||
| COPY_FLOAT_TO_VECTOR(*(bb + 1), src_b1); | |||
| src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0)); | |||
| src_b1 = COPY_FLOAT_TO_VECTOR(*(bb + 1)); | |||
| src_c0 -= src_a0 * src_b0; | |||
| src_c1 -= src_a1 * src_b0; | |||
| @@ -375,12 +375,12 @@ static void ssolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| bb += 2; | |||
| } | |||
| if (bk & 1) | |||
| if ((bk & 1) && (bk > 0)) | |||
| { | |||
| LD_SP2(aa, 4, src_a0, src_a1); | |||
| COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0); | |||
| COPY_FLOAT_TO_VECTOR(*(bb + 1), src_b1); | |||
| src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0)); | |||
| src_b1 = COPY_FLOAT_TO_VECTOR(*(bb + 1)); | |||
| src_c0 -= src_a0 * src_b0; | |||
| src_c1 -= src_a1 * src_b0; | |||
| @@ -391,9 +391,9 @@ static void ssolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| a -= 16; | |||
| b -= 4; | |||
| COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); | |||
| COPY_FLOAT_TO_VECTOR(*(b + 2), src_b2); | |||
| COPY_FLOAT_TO_VECTOR(*(b + 3), src_b3); | |||
| src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); | |||
| src_b2 = COPY_FLOAT_TO_VECTOR(*(b + 2)); | |||
| src_b3 = COPY_FLOAT_TO_VECTOR(*(b + 3)); | |||
| src_c2 *= src_b3; | |||
| src_c3 *= src_b3; | |||
| @@ -419,7 +419,7 @@ static void ssolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) | |||
| { | |||
| LD_SP2(aa, 4, src_a0, src_a1); | |||
| COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0); | |||
| src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0)); | |||
| src_c0 -= src_a0 * src_b0; | |||
| src_c1 -= src_a1 * src_b0; | |||
| @@ -429,7 +429,7 @@ static void ssolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) | |||
| LD_SP2(aa, 4, src_a0, src_a1); | |||
| COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0); | |||
| src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0)); | |||
| src_c0 -= src_a0 * src_b0; | |||
| src_c1 -= src_a1 * src_b0; | |||
| @@ -439,7 +439,7 @@ static void ssolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) | |||
| LD_SP2(aa, 4, src_a0, src_a1); | |||
| COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0); | |||
| src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0)); | |||
| src_c0 -= src_a0 * src_b0; | |||
| src_c1 -= src_a1 * src_b0; | |||
| @@ -449,7 +449,7 @@ static void ssolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) | |||
| LD_SP2(aa, 4, src_a0, src_a1); | |||
| COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0); | |||
| src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0)); | |||
| src_c0 -= src_a0 * src_b0; | |||
| src_c1 -= src_a1 * src_b0; | |||
| @@ -458,13 +458,13 @@ static void ssolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) | |||
| bb += 1; | |||
| } | |||
| if (bk & 3) | |||
| if ((bk & 3) && (bk > 0)) | |||
| { | |||
| if (bk & 2) | |||
| { | |||
| LD_SP2(aa, 4, src_a0, src_a1); | |||
| COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0); | |||
| src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0)); | |||
| src_c0 -= src_a0 * src_b0; | |||
| src_c1 -= src_a1 * src_b0; | |||
| @@ -474,7 +474,7 @@ static void ssolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) | |||
| LD_SP2(aa, 4, src_a0, src_a1); | |||
| COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0); | |||
| src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0)); | |||
| src_c0 -= src_a0 * src_b0; | |||
| src_c1 -= src_a1 * src_b0; | |||
| @@ -487,7 +487,7 @@ static void ssolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) | |||
| { | |||
| LD_SP2(aa, 4, src_a0, src_a1); | |||
| COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0); | |||
| src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0)); | |||
| src_c0 -= src_a0 * src_b0; | |||
| src_c1 -= src_a1 * src_b0; | |||
| @@ -497,7 +497,7 @@ static void ssolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) | |||
| a -= 8; | |||
| b -= 1; | |||
| COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); | |||
| src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); | |||
| src_c0 *= src_b0; | |||
| src_c1 *= src_b0; | |||
| @@ -579,7 +579,7 @@ static void ssolve_4x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| src_b = LD_SP(b + 32); | |||
| SPLATI_W4_SP(src_b, src_b32, src_b33, src_b34, src_b35); | |||
| COPY_FLOAT_TO_VECTOR(*(b + 36), src_b36); | |||
| src_b36 = COPY_FLOAT_TO_VECTOR(*(b + 36)); | |||
| src_b = LD_SP(b + 24); | |||
| SPLATI_W4_SP(src_b, src_b24, src_b25, src_b26, src_b27); | |||
| @@ -589,9 +589,9 @@ static void ssolve_4x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| src_b17 = (v4f32) __msa_splati_w((v4i32) src_b16, 1); | |||
| src_b16 = (v4f32) __msa_splati_w((v4i32) src_b16, 0); | |||
| COPY_FLOAT_TO_VECTOR(*(b + 9), src_b9); | |||
| COPY_FLOAT_TO_VECTOR(*(b + 8), src_b8); | |||
| COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); | |||
| src_b9 = COPY_FLOAT_TO_VECTOR(*(b + 9)); | |||
| src_b8 = COPY_FLOAT_TO_VECTOR(*(b + 8)); | |||
| src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); | |||
| src_c7 *= src_b63; | |||
| src_c6 -= src_c7 * src_b62; | |||
| @@ -695,7 +695,7 @@ static void ssolve_4x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| bb += 4; | |||
| } | |||
| if (bk & 1) | |||
| if ((bk & 1) && (bk > 0)) | |||
| { | |||
| src_a = LD_SP(aa); | |||
| @@ -717,9 +717,9 @@ static void ssolve_4x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| src_b10 = (v4f32) __msa_splati_w((v4i32) src_b8, 2); | |||
| src_b9 = (v4f32) __msa_splati_w((v4i32) src_b8, 1); | |||
| src_b8 = (v4f32) __msa_splati_w((v4i32) src_b8, 0); | |||
| COPY_FLOAT_TO_VECTOR(*(b + 5), src_b5); | |||
| COPY_FLOAT_TO_VECTOR(*(b + 4), src_b4); | |||
| COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); | |||
| src_b5 = COPY_FLOAT_TO_VECTOR(*(b + 5)); | |||
| src_b4 = COPY_FLOAT_TO_VECTOR(*(b + 4)); | |||
| src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); | |||
| src_c3 *= src_b15; | |||
| src_c2 -= src_c3 * src_b14; | |||
| @@ -800,7 +800,7 @@ static void ssolve_4x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| bb += 2; | |||
| } | |||
| if (bk & 3) | |||
| if ((bk & 3) && (bk > 0)) | |||
| { | |||
| if (bk & 2) | |||
| { | |||
| @@ -842,9 +842,9 @@ static void ssolve_4x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO | |||
| a -= 8; | |||
| b -= 4; | |||
| COPY_FLOAT_TO_VECTOR(*(b + 3), src_b3); | |||
| COPY_FLOAT_TO_VECTOR(*(b + 2), src_b2); | |||
| COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); | |||
| src_b3 = COPY_FLOAT_TO_VECTOR(*(b + 3)); | |||
| src_b2 = COPY_FLOAT_TO_VECTOR(*(b + 2)); | |||
| src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); | |||
| src_c1 *= src_b3; | |||
| src_c0 -= src_c1 * src_b2; | |||
| @@ -0,0 +1,144 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #include "common.h" | |||
| #include "macros_msa.h" | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst) /* Packs src into dst, walking groups of 4, then 2, then 1 lda-strided source lines; always returns 0. */ | |||
| { | |||
| BLASLONG i, j; | |||
| FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *pdst; /* psrc0: start of the next group of source lines; psrc1..psrc4: read cursors for the current group; pdst: write cursor */ | |||
| v2f64 src0, src1, src2, src3, src4, src5, src6, src7; | |||
| v2f64 src8, src9, src10, src11, src12, src13, src14, src15; | |||
| psrc0 = src; | |||
| pdst = dst; | |||
| lda *= 2; /* presumably lda arrives counted in complex elements of two FLOATs each -- TODO confirm against callers */ | |||
| for (j = (n >> 2); j--;) /* n / 4 groups of four source lines */ | |||
| { | |||
| psrc1 = psrc0; | |||
| psrc2 = psrc1 + lda; | |||
| psrc3 = psrc2 + lda; | |||
| psrc4 = psrc3 + lda; | |||
| psrc0 += 4 * lda; /* step past the four lines handled by this iteration */ | |||
| for (i = (m >> 2); i--;) /* m / 4 iterations, four v2f64 vectors read from each line */ | |||
| { | |||
| LD_DP4_INC(psrc1, 2, src0, src1, src2, src3); /* NOTE(review): the *_INC macros come from macros_msa.h (not shown); presumably they also advance the pointer argument */ | |||
| LD_DP4_INC(psrc2, 2, src4, src5, src6, src7); | |||
| LD_DP4_INC(psrc3, 2, src8, src9, src10, src11); | |||
| LD_DP4_INC(psrc4, 2, src12, src13, src14, src15); | |||
| ST_DP8_INC(src0, src4, src8, src12, src1, src5, src9, src13, pdst, 2); /* interleave the lines: 1st vector of each line, then 2nd vector of each line */ | |||
| ST_DP8_INC(src2, src6, src10, src14, src3, src7, src11, src15, | |||
| pdst, 2); /* then 3rd vector of each line, then 4th vector of each line */ | |||
| } | |||
| if (m & 2) /* two leftover vectors per line */ | |||
| { | |||
| LD_DP2_INC(psrc1, 2, src0, src1); | |||
| LD_DP2_INC(psrc2, 2, src4, src5); | |||
| LD_DP2_INC(psrc3, 2, src8, src9); | |||
| LD_DP2_INC(psrc4, 2, src12, src13); | |||
| ST_DP8_INC(src0, src4, src8, src12, src1, src5, src9, src13, pdst, 2); | |||
| } | |||
| if (m & 1) /* one leftover vector per line */ | |||
| { | |||
| src0 = LD_DP(psrc1); | |||
| src4 = LD_DP(psrc2); | |||
| src8 = LD_DP(psrc3); | |||
| src12 = LD_DP(psrc4); | |||
| psrc1 += 2; /* psrc1..psrc4 are reassigned before they are read again, so these four advances have no later effect */ | |||
| psrc2 += 2; | |||
| psrc3 += 2; | |||
| psrc4 += 2; | |||
| ST_DP4_INC(src0, src4, src8, src12, pdst, 2); | |||
| } | |||
| } | |||
| if (n & 2) /* two leftover source lines */ | |||
| { | |||
| psrc1 = psrc0; | |||
| psrc2 = psrc1 + lda; | |||
| psrc0 += 2 * lda; | |||
| for (i = (m >> 2); i--;) | |||
| { | |||
| LD_DP4_INC(psrc1, 2, src0, src1, src2, src3); | |||
| LD_DP4_INC(psrc2, 2, src4, src5, src6, src7); | |||
| ST_DP8_INC(src0, src4, src1, src5, src2, src6, src3, src7, pdst, 2); /* alternate between the two lines, vector by vector */ | |||
| } | |||
| if (m & 2) | |||
| { | |||
| LD_DP2_INC(psrc1, 2, src0, src1); | |||
| LD_DP2_INC(psrc2, 2, src4, src5); | |||
| ST_DP4_INC(src0, src4, src1, src5, pdst, 2); | |||
| } | |||
| if (m & 1) | |||
| { | |||
| src0 = LD_DP(psrc1); | |||
| src4 = LD_DP(psrc2); | |||
| psrc1 += 2; | |||
| psrc2 += 2; | |||
| ST_DP2_INC(src0, src4, pdst, 2); | |||
| } | |||
| } | |||
| if (n & 1) /* single leftover source line: vectors are passed through in the order they were loaded */ | |||
| { | |||
| psrc1 = psrc0; | |||
| for (i = (m >> 2); i--;) | |||
| { | |||
| LD_DP4_INC(psrc1, 2, src0, src1, src2, src3); | |||
| ST_DP4_INC(src0, src1, src2, src3, pdst, 2); | |||
| } | |||
| if (m & 2) | |||
| { | |||
| LD_DP2_INC(psrc1, 2, src0, src1); | |||
| ST_DP2_INC(src0, src1, pdst, 2); | |||
| } | |||
| if (m & 1) | |||
| { | |||
| src0 = LD_DP(psrc1); | |||
| ST_DP(src0, pdst); | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,161 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #include "common.h" | |||
| #include "macros_msa.h" | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst) /* Packs src into dst: outer walk over m in groups of 4/2/1 lda-strided lines, inner walk over n in groups of 4/2/1 vectors; always returns 0. */ | |||
| { | |||
| BLASLONG i, j; | |||
| FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4; /* psrc0: start of the next group of source lines; psrc1..psrc4: read cursors for the current group */ | |||
| FLOAT *pdst0, *pdst1, *pdst2, *pdst3; /* pdst0/pdst1: write positions for the 4-vector groups; pdst2: cursor for the (n & 2) remainder; pdst3: cursor for the (n & 1) remainder */ | |||
| v2f64 src0, src1, src2, src3, src4, src5, src6, src7; | |||
| v2f64 src8, src9, src10, src11, src12, src13, src14, src15; | |||
| psrc0 = src; | |||
| pdst0 = dst; | |||
| lda *= 2; /* presumably lda arrives counted in complex elements of two FLOATs each -- TODO confirm against callers */ | |||
| pdst2 = dst + 2 * m * (n & ~3); /* offset of 2 * m * (n rounded down to a multiple of 4) FLOATs */ | |||
| pdst3 = dst + 2 * m * (n & ~1); /* offset of 2 * m * (n rounded down to a multiple of 2) FLOATs */ | |||
| for (j = (m >> 2); j--;) /* m / 4 groups of four source lines */ | |||
| { | |||
| psrc1 = psrc0; | |||
| psrc2 = psrc1 + lda; | |||
| psrc3 = psrc2 + lda; | |||
| psrc4 = psrc3 + lda; | |||
| psrc0 += 4 * lda; /* step past the four lines handled by this iteration */ | |||
| pdst1 = pdst0; | |||
| pdst0 += 32; /* next outer iteration starts 32 FLOATs further on; presumably the size of the 16 vectors stored per inner iteration below */ | |||
| for (i = (n >> 2); i--;) /* n / 4 iterations, four v2f64 vectors read from each line */ | |||
| { | |||
| LD_DP4_INC(psrc1, 2, src0, src1, src2, src3); /* NOTE(review): the *_INC macros come from macros_msa.h (not shown); presumably they also advance the pointer argument */ | |||
| LD_DP4_INC(psrc2, 2, src4, src5, src6, src7); | |||
| LD_DP4_INC(psrc3, 2, src8, src9, src10, src11); | |||
| LD_DP4_INC(psrc4, 2, src12, src13, src14, src15); | |||
| ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2); /* vectors are stored line by line, in load order; pdst1 is advanced explicitly below */ | |||
| ST_DP8(src8, src9, src10, src11, src12, src13, src14, src15, | |||
| pdst1 + 16, 2); | |||
| pdst1 += m * 8; /* the next inner iteration writes m * 8 FLOATs further on */ | |||
| } | |||
| if (n & 2) /* two leftover vectors per line go to the pdst2 region */ | |||
| { | |||
| LD_DP2_INC(psrc1, 2, src0, src1); | |||
| LD_DP2_INC(psrc2, 2, src2, src3); | |||
| LD_DP2_INC(psrc3, 2, src4, src5); | |||
| LD_DP2_INC(psrc4, 2, src6, src7); | |||
| ST_DP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst2, 2); | |||
| } | |||
| if (n & 1) /* one leftover vector per line goes to the pdst3 region */ | |||
| { | |||
| src0 = LD_DP(psrc1); | |||
| src1 = LD_DP(psrc2); | |||
| src2 = LD_DP(psrc3); | |||
| src3 = LD_DP(psrc4); | |||
| psrc1 += 2; /* psrc1..psrc4 are reassigned before they are read again, so these four advances have no later effect */ | |||
| psrc2 += 2; | |||
| psrc3 += 2; | |||
| psrc4 += 2; | |||
| ST_DP4_INC(src0, src1, src2, src3, pdst3, 2); | |||
| } | |||
| } | |||
| if (m & 2) /* two leftover source lines */ | |||
| { | |||
| psrc1 = psrc0; | |||
| psrc2 = psrc1 + lda; | |||
| psrc0 += 2 * lda; | |||
| pdst1 = pdst0; | |||
| pdst0 += 16; /* half the 32-FLOAT step used for four lines */ | |||
| for (i = (n >> 2); i--;) | |||
| { | |||
| LD_DP4_INC(psrc1, 2, src0, src1, src2, src3); | |||
| LD_DP4_INC(psrc2, 2, src4, src5, src6, src7); | |||
| ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2); | |||
| pdst1 += m * 8; | |||
| } | |||
| if (n & 2) | |||
| { | |||
| LD_DP2_INC(psrc1, 2, src0, src1); | |||
| LD_DP2_INC(psrc2, 2, src2, src3); | |||
| ST_DP4_INC(src0, src1, src2, src3, pdst2, 2); | |||
| } | |||
| if (n & 1) | |||
| { | |||
| src0 = LD_DP(psrc1); | |||
| src1 = LD_DP(psrc2); | |||
| ST_DP2_INC(src0, src1, pdst3, 2); | |||
| psrc1 += 2; | |||
| psrc2 += 2; | |||
| } | |||
| } | |||
| if (m & 1) /* single leftover source line */ | |||
| { | |||
| psrc1 = psrc0; | |||
| pdst1 = pdst0; | |||
| for (i = (n >> 2); i--;) | |||
| { | |||
| LD_DP4_INC(psrc1, 2, src0, src1, src2, src3); | |||
| ST_DP4(src0, src1, src2, src3, pdst1, 2); | |||
| pdst1 += m * 8; | |||
| } | |||
| if (n & 2) | |||
| { | |||
| LD_DP2_INC(psrc1, 2, src0, src1); | |||
| ST_DP2_INC(src0, src1, pdst2, 2); | |||
| } | |||
| if (n & 1) | |||
| { | |||
| src0 = LD_DP(psrc1); | |||
| ST_DP(src0, pdst3); | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -2188,11 +2188,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define DGEMM_DEFAULT_UNROLL_M 8 | |||
| #define DGEMM_DEFAULT_UNROLL_N 4 | |||
| #define CGEMM_DEFAULT_UNROLL_M 2 | |||
| #define CGEMM_DEFAULT_UNROLL_N 2 | |||
| #define ZGEMM_DEFAULT_UNROLL_M 2 | |||
| #define ZGEMM_DEFAULT_UNROLL_N 2 | |||
| #define CGEMM_DEFAULT_UNROLL_M 8 | |||
| #define CGEMM_DEFAULT_UNROLL_N 4 | |||
| #define ZGEMM_DEFAULT_UNROLL_M 4 | |||
| #define ZGEMM_DEFAULT_UNROLL_N 4 | |||
| #define SGEMM_DEFAULT_P 128 | |||
| #define DGEMM_DEFAULT_P 128 | |||
| @@ -2227,11 +2227,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define DGEMM_DEFAULT_UNROLL_M 8 | |||
| #define DGEMM_DEFAULT_UNROLL_N 4 | |||
| #define CGEMM_DEFAULT_UNROLL_M 2 | |||
| #define CGEMM_DEFAULT_UNROLL_N 2 | |||
| #define CGEMM_DEFAULT_UNROLL_M 8 | |||
| #define CGEMM_DEFAULT_UNROLL_N 4 | |||
| #define ZGEMM_DEFAULT_UNROLL_M 2 | |||
| #define ZGEMM_DEFAULT_UNROLL_N 2 | |||
| #define ZGEMM_DEFAULT_UNROLL_M 4 | |||
| #define ZGEMM_DEFAULT_UNROLL_N 4 | |||
| #define SGEMM_DEFAULT_P 128 | |||
| #define DGEMM_DEFAULT_P 128 | |||