/******************************************************************************* Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ #include "common.h" #include "macros_msa.h" int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc #ifdef TRMMKERNEL , BLASLONG offset #endif ) { BLASLONG i, j, l; FLOAT *pc0, *pc1, *pc2, *pc3, *pc4, *pc5, *pc6, *pc7; FLOAT *pa0, *pb0; FLOAT tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; FLOAT tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; FLOAT a0, a1; FLOAT b0, b1, b2, b3, b4, b5, b6, b7; v4f32 v_alpha = {alpha, alpha, alpha, alpha}; v4f32 src_a0, src_a1, src_b, src_b0, src_b1; v4f32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; v4f32 res0, res1, res2, res3, res4, res5, res6, res7; v4f32 res8, res9, res10, res11, res12, res13, res14, res15; for (j = (n / 8); j--;) { pc0 = C; pc1 = pc0 + ldc; pc2 = pc1 + ldc; pc3 = pc2 + ldc; pc4 = pc3 + ldc; pc5 = pc4 + ldc; pc6 = pc5 + ldc; pc7 = pc6 + ldc; pa0 = A; for (i = (m / 8); i--;) { pb0 = B; LD_SP2(pa0, 4, src_a0, src_a1); LD_SP2(pb0, 4, src_b0, src_b1); src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 = src_a0 * src_b; res1 = src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); res2 = src_a0 * src_b; res3 = src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA); res4 = src_a0 * src_b; res5 = src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF); res6 = src_a0 * src_b; res7 = src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0); res8 = src_a0 * src_b; res9 = src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55); res10 = src_a0 * src_b; res11 = src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA); res12 = src_a0 * src_b; res13 = src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF); res14 = src_a0 * src_b; res15 = src_a1 * src_b; pa0 += 8; pb0 += 8; for (l = ((k - 1) / 2); l--;) { LD_SP2(pa0, 4, src_a0, src_a1); LD_SP2(pb0, 4, src_b0, src_b1); src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 += src_a0 * src_b; res1 += src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); res2 += src_a0 * src_b; res3 += src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA); res4 += src_a0 * src_b; res5 += src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF); res6 += src_a0 * src_b; res7 += src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0); res8 += src_a0 * src_b; res9 += src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55); res10 += src_a0 * src_b; res11 += src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA); res12 += src_a0 * src_b; res13 += src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF); res14 += src_a0 * src_b; res15 += src_a1 * src_b; pa0 += 8; pb0 += 8; LD_SP2(pa0, 4, src_a0, src_a1); LD_SP2(pb0, 4, src_b0, src_b1); src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 += src_a0 * src_b; res1 += src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); res2 += src_a0 * src_b; res3 += src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA); res4 += src_a0 * src_b; res5 += src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF); res6 += src_a0 * src_b; res7 += src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0); res8 += src_a0 * src_b; res9 += src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55); res10 += src_a0 * src_b; res11 += src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA); res12 += src_a0 * src_b; res13 += src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF); res14 += src_a0 * src_b; res15 += src_a1 * src_b; pa0 += 8; pb0 += 8; } if ((k - 1) & 1) { LD_SP2(pa0, 4, src_a0, src_a1); LD_SP2(pb0, 4, src_b0, src_b1); src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 += src_a0 * src_b; res1 += src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); res2 += src_a0 * src_b; res3 += src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA); res4 += src_a0 * src_b; res5 += src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF); res6 += src_a0 * src_b; res7 += src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0); res8 += src_a0 * src_b; res9 += src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55); res10 += src_a0 * src_b; res11 += src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA); res12 += src_a0 * src_b; res13 += src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF); res14 += src_a0 * src_b; res15 += src_a1 * src_b; pa0 += 8; pb0 += 8; } LD_SP2(pc0, 4, dst0, dst1); LD_SP2(pc1, 4, dst2, dst3); LD_SP2(pc2, 4, dst4, dst5); LD_SP2(pc3, 4, dst6, dst7); dst0 += res0 * v_alpha; dst1 += res1 * v_alpha; dst2 += res2 * v_alpha; dst3 += res3 * v_alpha; dst4 += res4 * v_alpha; dst5 += res5 * v_alpha; dst6 += res6 * v_alpha; dst7 += res7 * v_alpha; ST_SP2(dst0, dst1, pc0, 4); ST_SP2(dst2, dst3, pc1, 4); ST_SP2(dst4, dst5, pc2, 4); ST_SP2(dst6, dst7, pc3, 4); LD_SP2(pc4, 4, dst0, dst1); LD_SP2(pc5, 4, dst2, dst3); LD_SP2(pc6, 4, dst4, dst5); LD_SP2(pc7, 4, dst6, dst7); dst0 += res8 * v_alpha; dst1 += res9 * v_alpha; dst2 += res10 * v_alpha; dst3 += res11 * v_alpha; dst4 += res12 * v_alpha; dst5 += res13 * v_alpha; dst6 += res14 * v_alpha; dst7 += res15 * v_alpha; ST_SP2(dst0, dst1, pc4, 4); ST_SP2(dst2, dst3, pc5, 4); ST_SP2(dst4, dst5, pc6, 4); ST_SP2(dst6, dst7, pc7, 4); pc0 += 8; pc1 += 8; pc2 += 8; pc3 += 8; pc4 += 8; pc5 += 8; pc6 += 8; pc7 += 8; } for (i = ((m & 4) / 4); i--;) { pb0 = B; src_a0 = LD_SP(pa0); LD_SP2(pb0, 4, src_b0, src_b1); src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 = src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); res1 = src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA); res2 = src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF); res3 = src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0); res4 = src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55); res5 = src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA); res6 = src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF); res7 = src_a0 * src_b; pa0 += 4; pb0 += 8; for (l = ((k - 1) / 2); l--;) { src_a0 = LD_SP(pa0); LD_SP2(pb0, 4, src_b0, src_b1); src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 += src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); res1 += src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA); res2 += src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF); res3 += src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0); res4 += src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55); res5 += src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA); res6 += src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF); res7 += src_a0 * src_b; pa0 += 4; pb0 += 8; src_a0 = LD_SP(pa0); LD_SP2(pb0, 4, src_b0, src_b1); src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 += src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); res1 += src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA); res2 += src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF); res3 += src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0); res4 += src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55); res5 += src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA); res6 += src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF); res7 += src_a0 * src_b; pa0 += 4; pb0 += 8; } if ((k - 1) & 1) { src_a0 = LD_SP(pa0); LD_SP2(pb0, 4, src_b0, src_b1); src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 += src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); res1 += src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA); res2 += src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF); res3 += src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0); res4 += src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55); res5 += src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA); res6 += src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF); res7 += src_a0 * src_b; pa0 += 4; pb0 += 8; } dst0 = LD_SP(pc0); dst1 = LD_SP(pc1); dst2 = LD_SP(pc2); dst3 = LD_SP(pc3); dst0 += res0 * v_alpha; dst1 += res1 * v_alpha; dst2 += res2 * v_alpha; dst3 += res3 * v_alpha; ST_SP(dst0, pc0); ST_SP(dst1, pc1); ST_SP(dst2, pc2); ST_SP(dst3, pc3); dst0 = LD_SP(pc4); dst1 = LD_SP(pc5); dst2 = LD_SP(pc6); dst3 = LD_SP(pc7); dst0 += res4 * v_alpha; dst1 += res5 * v_alpha; dst2 += res6 * v_alpha; dst3 += res7 * v_alpha; ST_SP(dst0, pc4); ST_SP(dst1, pc5); ST_SP(dst2, pc6); ST_SP(dst3, pc7); pc0 += 4; pc1 += 4; pc2 += 4; pc3 += 4; pc4 += 4; pc5 += 4; pc6 += 4; pc7 += 4; } for (i = ((m & 2) / 2); i--;) { pb0 = B; a0 = pa0[0]; b0 = pb0[0]; tmp0 = a0 * b0; a1 = pa0[1]; tmp1 = a1 * b0; b1 = pb0[1]; tmp2 = a0 * b1; tmp3 = a1 * b1; b2 = pb0[2]; tmp4 = a0 * b2; tmp5 = a1 * b2; b3 = pb0[3]; tmp6 = a0 * b3; tmp7 = a1 * b3; b4 = pb0[4]; tmp8 = a0 * b4; tmp9 = a1 * b4; b5 = pb0[5]; tmp10 = a0 * b5; tmp11 = a1 * b5; b6 = pb0[6]; tmp12 = a0 * b6; tmp13 = a1 * b6; b7 = pb0[7]; tmp14 = a0 * b7; tmp15 = a1 * b7; pa0 += 2; pb0 += 8; for (l = ((k - 1) / 2); l--;) { a0 = pa0[0]; b0 = pb0[0]; tmp0 += a0 * b0; a1 = pa0[1]; tmp1 += a1 * b0; b1 = pb0[1]; tmp2 += a0 * b1; tmp3 += a1 * b1; b2 = pb0[2]; tmp4 += a0 * b2; tmp5 += a1 * b2; b3 = pb0[3]; tmp6 += a0 * b3; tmp7 += a1 * b3; b4 = pb0[4]; tmp8 += a0 * b4; tmp9 += a1 * b4; b5 = pb0[5]; tmp10 += a0 * b5; tmp11 += a1 * b5; b6 = pb0[6]; tmp12 += a0 * b6; tmp13 += a1 * b6; b7 = pb0[7]; tmp14 += a0 * b7; tmp15 += a1 * b7; pa0 += 2; pb0 += 8; a0 = pa0[0]; b0 = pb0[0]; tmp0 += a0 * b0; a1 = pa0[1]; tmp1 += a1 * b0; b1 = pb0[1]; tmp2 += a0 * b1; tmp3 += a1 * b1; b2 = pb0[2]; tmp4 += a0 * b2; tmp5 += a1 * b2; b3 = pb0[3]; tmp6 += a0 * b3; tmp7 += a1 * b3; b4 = pb0[4]; tmp8 += a0 * b4; tmp9 += a1 * b4; b5 = pb0[5]; tmp10 += a0 * b5; tmp11 += a1 * b5; b6 = pb0[6]; tmp12 += a0 * b6; tmp13 += a1 * b6; b7 = pb0[7]; tmp14 += a0 * b7; tmp15 += a1 * b7; pa0 += 2; pb0 += 8; } if ((k - 1) & 1) { a0 = pa0[0]; b0 = pb0[0]; tmp0 += a0 * b0; a1 = pa0[1]; tmp1 += a1 * b0; b1 = pb0[1]; tmp2 += a0 * b1; tmp3 += a1 * b1; b2 = pb0[2]; tmp4 += a0 * b2; tmp5 += a1 * b2; b3 = pb0[3]; tmp6 += a0 * b3; tmp7 += a1 * b3; b4 = pb0[4]; tmp8 += a0 * b4; tmp9 += a1 * b4; b5 = pb0[5]; tmp10 += a0 * b5; tmp11 += a1 * b5; b6 = pb0[6]; tmp12 += a0 * b6; tmp13 += a1 * b6; b7 = pb0[7]; tmp14 += a0 * b7; tmp15 += a1 * b7; pa0 += 2; pb0 += 8; } tmp0 = alpha * tmp0; tmp2 = alpha * tmp2; tmp4 = alpha * tmp4; tmp6 = alpha * tmp6; tmp8 = alpha * tmp8; tmp10 = alpha * tmp10; tmp12 = alpha * tmp12; tmp14 = alpha * tmp14; pc0[0] += tmp0; pc1[0] += tmp2; pc2[0] += tmp4; pc3[0] += tmp6; pc4[0] += tmp8; pc5[0] += tmp10; pc6[0] += tmp12; pc7[0] += tmp14; tmp1 = alpha * tmp1; tmp3 = alpha * tmp3; tmp5 = alpha * tmp5; tmp7 = alpha * tmp7; tmp9 = alpha * tmp9; tmp11 = alpha * tmp11; tmp13 = alpha * tmp13; tmp15 = alpha * tmp15; pc0[1] += tmp1; pc1[1] += tmp3; pc2[1] += tmp5; pc3[1] += tmp7; pc4[1] += tmp9; pc5[1] += tmp11; pc6[1] += tmp13; pc7[1] += tmp15; pc0 += 2; pc1 += 2; pc2 += 2; pc3 += 2; pc4 += 2; pc5 += 2; pc6 += 2; pc7 += 2; } for (i = (m & 1); i--;) { pb0 = B; a0 = pa0[0]; b0 = pb0[0]; tmp0 = a0 * b0; b1 = pb0[1]; tmp1 = a0 * b1; b2 = pb0[2]; tmp2 = a0 * b2; b3 = pb0[3]; tmp3 = a0 * b3; b4 = pb0[4]; tmp4 = a0 * b4; b5 = pb0[5]; tmp5 = a0 * b5; b6 = pb0[6]; tmp6 = a0 * b6; b7 = pb0[7]; tmp7 = a0 * b7; pa0 += 1; pb0 += 8; for (l = ((k - 1) / 2); l--;) { a0 = pa0[0]; b0 = pb0[0]; tmp0 += a0 * b0; b1 = pb0[1]; tmp1 += a0 * b1; b2 = pb0[2]; tmp2 += a0 * b2; b3 = pb0[3]; tmp3 += a0 * b3; b4 = pb0[4]; tmp4 += a0 * b4; b5 = pb0[5]; tmp5 += a0 * b5; b6 = pb0[6]; tmp6 += a0 * b6; b7 = pb0[7]; tmp7 += a0 * b7; pa0 += 1; pb0 += 8; a0 = pa0[0]; b0 = pb0[0]; tmp0 += a0 * b0; b1 = pb0[1]; tmp1 += a0 * b1; b2 = pb0[2]; tmp2 += a0 * b2; b3 = pb0[3]; tmp3 += a0 * b3; b4 = pb0[4]; tmp4 += a0 * b4; b5 = pb0[5]; tmp5 += a0 * b5; b6 = pb0[6]; tmp6 += a0 * b6; b7 = pb0[7]; tmp7 += a0 * b7; pa0 += 1; pb0 += 8; } if ((k - 1) & 1) { a0 = pa0[0]; b0 = pb0[0]; tmp0 += a0 * b0; b1 = pb0[1]; tmp1 += a0 * b1; b2 = pb0[2]; tmp2 += a0 * b2; b3 = pb0[3]; tmp3 += a0 * b3; b4 = pb0[4]; tmp4 += a0 * b4; b5 = pb0[5]; tmp5 += a0 * b5; b6 = pb0[6]; tmp6 += a0 * b6; b7 = pb0[7]; tmp7 += a0 * b7; pa0 += 1; pb0 += 8; } tmp0 = alpha * tmp0; tmp1 = alpha * tmp1; tmp2 = alpha * tmp2; tmp3 = alpha * tmp3; tmp4 = alpha * tmp4; tmp5 = alpha * tmp5; tmp6 = alpha * tmp6; tmp7 = alpha * tmp7; pc0[0] += tmp0; pc1[0] += tmp1; pc2[0] += tmp2; pc3[0] += tmp3; pc4[0] += tmp4; pc5[0] += tmp5; pc6[0] += tmp6; pc7[0] += tmp7; pc0 += 1; pc1 += 1; pc2 += 1; pc3 += 1; pc4 += 1; pc5 += 1; pc6 += 1; pc7 += 1; } l = (k << 3); B = B + l; i = (ldc << 3); C = C + i; } for (j = ((n & 4) / 4); j--;) { pc0 = C; pc1 = pc0 + ldc; pc2 = pc1 + ldc; pc3 = pc2 + ldc; pa0 = A; for (i = (m / 8); i--;) { pb0 = B; LD_SP2(pa0, 4, src_a0, src_a1); src_b0 = LD_SP(pb0); src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 = src_a0 * src_b; res1 = src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); res2 = src_a0 * src_b; res3 = src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA); res4 = src_a0 * src_b; res5 = src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF); res6 = src_a0 * src_b; res7 = src_a1 * src_b; pa0 += 8; pb0 += 4; for (l = ((k - 1) / 2); l--;) { LD_SP2(pa0, 4, src_a0, src_a1); src_b0 = LD_SP(pb0); src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 += src_a0 * src_b; res1 += src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); res2 += src_a0 * src_b; res3 += src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA); res4 += src_a0 * src_b; res5 += src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF); res6 += src_a0 * src_b; res7 += src_a1 * src_b; pa0 += 8; pb0 += 4; LD_SP2(pa0, 4, src_a0, src_a1); src_b0 = LD_SP(pb0); src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 += src_a0 * src_b; res1 += src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); res2 += src_a0 * src_b; res3 += src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA); res4 += src_a0 * src_b; res5 += src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF); res6 += src_a0 * src_b; res7 += src_a1 * src_b; pa0 += 8; pb0 += 4; } if ((k - 1) & 1) { LD_SP2(pa0, 4, src_a0, src_a1); src_b0 = LD_SP(pb0); src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 += src_a0 * src_b; res1 += src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); res2 += src_a0 * src_b; res3 += src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA); res4 += src_a0 * src_b; res5 += src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF); res6 += src_a0 * src_b; res7 += src_a1 * src_b; pa0 += 8; pb0 += 4; } LD_SP2(pc0, 4, dst0, dst1); LD_SP2(pc1, 4, dst2, dst3); LD_SP2(pc2, 4, dst4, dst5); LD_SP2(pc3, 4, dst6, dst7); dst0 += res0 * v_alpha; dst1 += res1 * v_alpha; dst2 += res2 * v_alpha; dst3 += res3 * v_alpha; dst4 += res4 * v_alpha; dst5 += res5 * v_alpha; dst6 += res6 * v_alpha; dst7 += res7 * v_alpha; ST_SP2(dst0, dst1, pc0, 4); ST_SP2(dst2, dst3, pc1, 4); ST_SP2(dst4, dst5, pc2, 4); ST_SP2(dst6, dst7, pc3, 4); pc0 += 8; pc1 += 8; pc2 += 8; pc3 += 8; } for (i = ((m & 4) / 4); i--;) { pb0 = B; src_a0 = LD_SP(pa0); src_b0 = LD_SP(pb0); src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 = src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); res1 = src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA); res2 = src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF); res3 = src_a0 * src_b; pa0 += 4; pb0 += 4; for (l = ((k - 1) / 2); l--;) { src_a0 = LD_SP(pa0); src_b0 = LD_SP(pb0); src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 += src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); res1 += src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA); res2 += src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF); res3 += src_a0 * src_b; pa0 += 4; pb0 += 4; src_a0 = LD_SP(pa0); src_b0 = LD_SP(pb0); src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 += src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); res1 += src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA); res2 += src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF); res3 += src_a0 * src_b; pa0 += 4; pb0 += 4; } if ((k - 1) & 1) { src_a0 = LD_SP(pa0); src_b0 = LD_SP(pb0); src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 += src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); res1 += src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA); res2 += src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF); res3 += src_a0 * src_b; pa0 += 4; pb0 += 4; } dst0 = LD_SP(pc0); dst1 = LD_SP(pc1); dst2 = LD_SP(pc2); dst3 = LD_SP(pc3); dst0 += res0 * v_alpha; dst1 += res1 * v_alpha; dst2 += res2 * v_alpha; dst3 += res3 * v_alpha; ST_SP(dst0, pc0); ST_SP(dst1, pc1); ST_SP(dst2, pc2); ST_SP(dst3, pc3); pc0 += 4; pc1 += 4; pc2 += 4; pc3 += 4; } for (i = ((m & 2) / 2); i--;) { pb0 = B; a0 = pa0[0]; b0 = pb0[0]; tmp0 = a0 * b0; a1 = pa0[1]; tmp1 = a1 * b0; b1 = pb0[1]; tmp2 = a0 * b1; tmp3 = a1 * b1; b2 = pb0[2]; tmp4 = a0 * b2; tmp5 = a1 * b2; b3 = pb0[3]; tmp6 = a0 * b3; tmp7 = a1 * b3; pa0 += 2; pb0 += 4; for (l = ((k - 1) / 2); l--;) { a0 = pa0[0]; b0 = pb0[0]; tmp0 += a0 * b0; a1 = pa0[1]; tmp1 += a1 * b0; b1 = pb0[1]; tmp2 += a0 * b1; tmp3 += a1 * b1; b2 = pb0[2]; tmp4 += a0 * b2; tmp5 += a1 * b2; b3 = pb0[3]; tmp6 += a0 * b3; tmp7 += a1 * b3; pa0 += 2; pb0 += 4; a0 = pa0[0]; b0 = pb0[0]; tmp0 += a0 * b0; a1 = pa0[1]; tmp1 += a1 * b0; b1 = pb0[1]; tmp2 += a0 * b1; tmp3 += a1 * b1; b2 = pb0[2]; tmp4 += a0 * b2; tmp5 += a1 * b2; b3 = pb0[3]; tmp6 += a0 * b3; tmp7 += a1 * b3; pa0 += 2; pb0 += 4; } if ((k - 1) & 1) { a0 = pa0[0]; b0 = pb0[0]; tmp0 += a0 * b0; a1 = pa0[1]; tmp1 += a1 * b0; b1 = pb0[1]; tmp2 += a0 * b1; tmp3 += a1 * b1; b2 = pb0[2]; tmp4 += a0 * b2; tmp5 += a1 * b2; b3 = pb0[3]; tmp6 += a0 * b3; tmp7 += a1 * b3; pa0 += 2; pb0 += 4; } tmp0 = alpha * tmp0; tmp2 = alpha * tmp2; tmp4 = alpha * tmp4; tmp6 = alpha * tmp6; pc0[0] += tmp0; pc1[0] += tmp2; pc2[0] += tmp4; pc3[0] += tmp6; tmp1 = alpha * tmp1; tmp3 = alpha * tmp3; tmp5 = alpha * tmp5; tmp7 = alpha * tmp7; pc0[1] += tmp1; pc1[1] += tmp3; pc2[1] += tmp5; pc3[1] += tmp7; pc0 += 2; pc1 += 2; pc2 += 2; pc3 += 2; } for (i = (m & 1); i--;) { pb0 = B; a0 = pa0[0]; b0 = pb0[0]; tmp0 = a0 * b0; b1 = pb0[1]; tmp1 = a0 * b1; b2 = pb0[2]; tmp2 = a0 * b2; b3 = pb0[3]; tmp3 = a0 * b3; pa0 += 1; pb0 += 4; for (l = ((k - 1) / 2); l--;) { a0 = pa0[0]; b0 = pb0[0]; tmp0 += a0 * b0; b1 = pb0[1]; tmp1 += a0 * b1; b2 = pb0[2]; tmp2 += a0 * b2; b3 = pb0[3]; tmp3 += a0 * b3; pa0 += 1; pb0 += 4; a0 = pa0[0]; b0 = pb0[0]; tmp0 += a0 * b0; b1 = pb0[1]; tmp1 += a0 * b1; b2 = pb0[2]; tmp2 += a0 * b2; b3 = pb0[3]; tmp3 += a0 * b3; pa0 += 1; pb0 += 4; } if ((k - 1) & 1) { a0 = pa0[0]; b0 = pb0[0]; tmp0 += a0 * b0; b1 = pb0[1]; tmp1 += a0 * b1; b2 = pb0[2]; tmp2 += a0 * b2; b3 = pb0[3]; tmp3 += a0 * b3; pa0 += 1; pb0 += 4; } tmp0 = alpha * tmp0; tmp1 = alpha * tmp1; tmp2 = alpha * tmp2; tmp3 = alpha * tmp3; pc0[0] += tmp0; pc1[0] += tmp1; pc2[0] += tmp2; pc3[0] += tmp3; pc0 += 1; pc1 += 1; pc2 += 1; pc3 += 1; } l = (k << 2); B = B + l; i = (ldc << 2); C = C + i; } for (j = ((n & 2) / 2); j--;) { pc0 = C; pc1 = pc0 + ldc; pa0 = A; for (i = (m / 8); i--;) { pb0 = B; LD_SP2(pa0, 4, src_a0, src_a1); src_b0[0] = pb0[0]; src_b0[1] = pb0[1]; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 = src_a0 * src_b; res1 = src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); res2 = src_a0 * src_b; res3 = src_a1 * src_b; pa0 += 8; pb0 += 2; for (l = ((k - 1) / 2); l--;) { LD_SP2(pa0, 4, src_a0, src_a1); src_b0[0] = pb0[0]; src_b0[1] = pb0[1]; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 += src_a0 * src_b; res1 += src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); res2 += src_a0 * src_b; res3 += src_a1 * src_b; pa0 += 8; pb0 += 2; LD_SP2(pa0, 4, src_a0, src_a1); src_b0[0] = pb0[0]; src_b0[1] = pb0[1]; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 += src_a0 * src_b; res1 += src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); res2 += src_a0 * src_b; res3 += src_a1 * src_b; pa0 += 8; pb0 += 2; } if ((k - 1) & 1) { LD_SP2(pa0, 4, src_a0, src_a1); src_b0[0] = pb0[0]; src_b0[1] = pb0[1]; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 += src_a0 * src_b; res1 += src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); res2 += src_a0 * src_b; res3 += src_a1 * src_b; pa0 += 8; pb0 += 2; } LD_SP2(pc0, 4, dst0, dst1); LD_SP2(pc1, 4, dst2, dst3); dst0 += res0 * v_alpha; dst1 += res1 * v_alpha; dst2 += res2 * v_alpha; dst3 += res3 * v_alpha; ST_SP2(dst0, dst1, pc0, 4); ST_SP2(dst2, dst3, pc1, 4); pc0 += 8; pc1 += 8; } for (i = ((m & 4) / 4); i--;) { pb0 = B; src_a0 = LD_SP(pa0); src_b0[0] = pb0[0]; src_b0[1] = pb0[1]; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 = src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); res1 = src_a0 * src_b; pa0 += 4; pb0 += 2; for (l = ((k - 1) / 2); l--;) { src_a0 = LD_SP(pa0); src_b0[0] = pb0[0]; src_b0[1] = pb0[1]; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 += src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); res1 += src_a0 * src_b; pa0 += 4; pb0 += 2; src_a0 = LD_SP(pa0); src_b0[0] = pb0[0]; src_b0[1] = pb0[1]; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 += src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); res1 += src_a0 * src_b; pa0 += 4; pb0 += 2; } if ((k - 1) & 1) { src_a0 = LD_SP(pa0); src_b0[0] = pb0[0]; src_b0[1] = pb0[1]; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 += src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); res1 += src_a0 * src_b; pa0 += 4; pb0 += 2; } dst0 = LD_SP(pc0); dst1 = LD_SP(pc1); dst0 += res0 * v_alpha; dst1 += res1 * v_alpha; ST_SP(dst0, pc0); ST_SP(dst1, pc1); pc0 += 4; pc1 += 4; } for (i = ((m & 2) / 2); i--;) { pb0 = B; a0 = pa0[0]; b0 = pb0[0]; tmp0 = a0 * b0; a1 = pa0[1]; tmp1 = a1 * b0; b1 = pb0[1]; tmp2 = a0 * b1; tmp3 = a1 * b1; pa0 += 2; pb0 += 2; for (l = ((k - 1) / 2); l--;) { a0 = pa0[0]; b0 = pb0[0]; tmp0 += a0 * b0; a1 = pa0[1]; tmp1 += a1 * b0; b1 = pb0[1]; tmp2 += a0 * b1; tmp3 += a1 * b1; pa0 += 2; pb0 += 2; a0 = pa0[0]; b0 = pb0[0]; tmp0 += a0 * b0; a1 = pa0[1]; tmp1 += a1 * b0; b1 = pb0[1]; tmp2 += a0 * b1; tmp3 += a1 * b1; pa0 += 2; pb0 += 2; } if ((k - 1) & 1) { a0 = pa0[0]; b0 = pb0[0]; tmp0 += a0 * b0; a1 = pa0[1]; tmp1 += a1 * b0; b1 = pb0[1]; tmp2 += a0 * b1; tmp3 += a1 * b1; pa0 += 2; pb0 += 2; } tmp0 = alpha * tmp0; tmp2 = alpha * tmp2; pc0[0] += tmp0; pc1[0] += tmp2; tmp1 = alpha * tmp1; tmp3 = alpha * tmp3; pc0[1] += tmp1; pc1[1] += tmp3; pc0 += 2; pc1 += 2; } for (i = (m & 1); i--;) { pb0 = B; a0 = pa0[0]; b0 = pb0[0]; tmp0 = a0 * b0; b1 = pb0[1]; tmp1 = a0 * b1; pa0 += 1; pb0 += 2; for (l = ((k - 1) / 2); l--;) { a0 = pa0[0]; b0 = pb0[0]; tmp0 += a0 * b0; b1 = pb0[1]; tmp1 += a0 * b1; pa0 += 1; pb0 += 2; a0 = pa0[0]; b0 = pb0[0]; tmp0 += a0 * b0; b1 = pb0[1]; tmp1 += a0 * b1; pa0 += 1; pb0 += 2; } if ((k - 1) & 1) { a0 = pa0[0]; b0 = pb0[0]; tmp0 += a0 * b0; b1 = pb0[1]; tmp1 += a0 * b1; pa0 += 1; pb0 += 2; } tmp0 = alpha * tmp0; tmp1 = alpha * tmp1; pc0[0] += tmp0; pc1[0] += tmp1; pc0 += 1; pc1 += 1; } l = (k << 1); B = B + l; i = (ldc << 1); C = C + i; } for (j = (n & 1); j--;) { pc0 = C; pa0 = A; for (i = (m / 8); i--;) { pb0 = B; LD_SP2(pa0, 4, src_a0, src_a1); src_b0[0] = pb0[0]; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 = src_a0 * src_b; res1 = src_a1 * src_b; pa0 += 8; pb0 += 1; for (l = ((k - 1) / 2); l--;) { LD_SP2(pa0, 4, src_a0, src_a1); src_b0[0] = pb0[0]; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 += src_a0 * src_b; res1 += src_a1 * src_b; pa0 += 8; pb0 += 1; LD_SP2(pa0, 4, src_a0, src_a1); src_b0[0] = pb0[0]; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 += src_a0 * src_b; res1 += src_a1 * src_b; pa0 += 8; pb0 += 1; } if ((k - 1) & 1) { LD_SP2(pa0, 4, src_a0, src_a1); src_b0[0] = pb0[0]; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 += src_a0 * src_b; res1 += src_a1 * src_b; pa0 += 8; pb0 += 1; } LD_SP2(pc0, 4, dst0, dst1); dst0 += res0 * v_alpha; dst1 += res1 * v_alpha; ST_SP2(dst0, dst1, pc0, 4); pc0 += 8; } for (i = ((m & 4) / 4); i--;) { pb0 = B; src_a0 = LD_SP(pa0); src_b0[0] = pb0[0]; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 = src_a0 * src_b; pa0 += 4; pb0 += 1; for (l = ((k - 1) / 2); l--;) { src_a0 = LD_SP(pa0); src_b0[0] = pb0[0]; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 += src_a0 * src_b; pa0 += 4; pb0 += 1; src_a0 = LD_SP(pa0); src_b0[0] = pb0[0]; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 += src_a0 * src_b; pa0 += 4; pb0 += 1; } if ((k - 1) & 1) { src_a0 = LD_SP(pa0); src_b0[0] = pb0[0]; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 += src_a0 * src_b; pa0 += 4; pb0 += 1; } dst0 = LD_SP(pc0); dst0 += res0 * v_alpha; ST_SP(dst0, pc0); pc0 += 4; } for (i = (m & 2) / 2; i--;) { pb0 = B; a0 = pa0[0]; b0 = pb0[0]; tmp0 = a0 * b0; a1 = pa0[1]; tmp1 = a1 * b0; pa0 += 2; pb0 += 1; for (l = ((k - 1) / 2); l--;) { a0 = pa0[0]; b0 = pb0[0]; tmp0 += a0 * b0; a1 = pa0[1]; tmp1 += a1 * b0; pa0 += 2; pb0 += 1; a0 = pa0[0]; b0 = pb0[0]; tmp0 += a0 * b0; a1 = pa0[1]; tmp1 += a1 * b0; pa0 += 2; pb0 += 1; } if ((k - 1) & 1) { a0 = pa0[0]; b0 = pb0[0]; tmp0 += a0 * b0; a1 = pa0[1]; tmp1 += a1 * b0; pa0 += 2; pb0 += 1; } tmp0 = alpha * tmp0; pc0[0] += tmp0; tmp1 = alpha * tmp1; pc0[1] += tmp1; pc0 += 2; } for (i = (m & 1); i--;) { pb0 = B; a0 = pa0[0]; b0 = pb0[0]; tmp0 = a0 * b0; pa0 += 1; pb0 += 1; for (l = ((k - 1) / 2); l--;) { a0 = pa0[0]; b0 = pb0[0]; tmp0 += a0 * b0; pa0 += 1; pb0 += 1; a0 = pa0[0]; b0 = pb0[0]; tmp0 += a0 * b0; pa0 += 1; pb0 += 1; } if ((k - 1) & 1) { a0 = pa0[0]; b0 = pb0[0]; tmp0 += a0 * b0; pa0 += 1; pb0 += 1; } pc0[0] += alpha * tmp0; pc0 += 1; } l = (k << 0); B = B + l; i = (ldc << 0); C = C + i; } return 0; }