Added generic 4x4 and 4x2 gemm kernels Added generic 4x2 trmm kerneltags/v0.2.20^2
| @@ -0,0 +1,317 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2017, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <stdbool.h> | |||
| int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc) | |||
| { | |||
| BLASLONG i,j,k; | |||
| FLOAT *C0,*C1,*ptrba,*ptrbb; | |||
| FLOAT res0_0; | |||
| FLOAT res0_1; | |||
| FLOAT res0_2; | |||
| FLOAT res0_3; | |||
| FLOAT res1_0; | |||
| FLOAT res1_1; | |||
| FLOAT res1_2; | |||
| FLOAT res1_3; | |||
| FLOAT a0; | |||
| FLOAT a1; | |||
| FLOAT b0; | |||
| FLOAT b1; | |||
| for (j=0; j<(bn/2); j+=2) | |||
| { | |||
| C0 = C; | |||
| C1 = C0+ldc; | |||
| ptrba = ba; | |||
| for (i=0; i<bm/4; i+=1) | |||
| { | |||
| ptrbb = bb; | |||
| res0_0 = 0; | |||
| res0_1 = 0; | |||
| res0_2 = 0; | |||
| res0_3 = 0; | |||
| res1_0 = 0; | |||
| res1_1 = 0; | |||
| res1_2 = 0; | |||
| res1_3 = 0; | |||
| for (k=0; k<bk; k++) | |||
| { | |||
| b0 = ptrbb[0]; | |||
| b1 = ptrbb[1]; | |||
| a0 = ptrba[0]; | |||
| res0_0 += a0*b0; | |||
| res1_0 += a0*b1; | |||
| a1 = ptrba[1]; | |||
| res0_1 += a1*b0; | |||
| res1_1 += a1*b1; | |||
| a0 = ptrba[2]; | |||
| res0_2 += a0*b0; | |||
| res1_2 += a0*b1; | |||
| a1 = ptrba[3]; | |||
| res0_3 += a1*b0; | |||
| res1_3 += a1*b1; | |||
| ptrba = ptrba+4; | |||
| ptrbb = ptrbb+2; | |||
| } | |||
| res0_0 *= alpha; | |||
| res0_1 *= alpha; | |||
| res0_2 *= alpha; | |||
| res0_3 *= alpha; | |||
| res1_0 *= alpha; | |||
| res1_1 *= alpha; | |||
| res1_2 *= alpha; | |||
| res1_3 *= alpha; | |||
| C0[0] += res0_0; | |||
| C0[1] += res0_1; | |||
| C0[2] += res0_2; | |||
| C0[3] += res0_3; | |||
| C1[0] += res1_0; | |||
| C1[1] += res1_1; | |||
| C1[2] += res1_2; | |||
| C1[3] += res1_3; | |||
| C0 = C0+4; | |||
| C1 = C1+4; | |||
| } | |||
| if ( bm & 2 ) | |||
| { | |||
| ptrbb = bb; | |||
| res0_0 = 0; | |||
| res0_1 = 0; | |||
| res1_0 = 0; | |||
| res1_1 = 0; | |||
| for (k=0; k<bk; k++) | |||
| { | |||
| b0 = ptrbb[0]; | |||
| b1 = ptrbb[1]; | |||
| a0 = ptrba[0]; | |||
| res0_0 += a0*b0; | |||
| res1_0 += a0*b1; | |||
| a1 = ptrba[1]; | |||
| res0_1 += a1*b0; | |||
| res1_1 += a1*b1; | |||
| ptrba = ptrba+2; | |||
| ptrbb = ptrbb+2; | |||
| } | |||
| res0_0 *= alpha; | |||
| res0_1 *= alpha; | |||
| res1_0 *= alpha; | |||
| res1_1 *= alpha; | |||
| C0[0] = res0_0; | |||
| C0[1] = res0_1; | |||
| C1[0] = res1_0; | |||
| C1[1] = res1_1; | |||
| C0 = C0+2; | |||
| C1 = C1+2; | |||
| } | |||
| if ( bm & 1 ) | |||
| { | |||
| ptrbb = bb; | |||
| res0_0 = 0; | |||
| res1_0 = 0; | |||
| for (k=0; k<bk; k++) | |||
| { | |||
| b0 = ptrbb[0]; | |||
| b1 = ptrbb[1]; | |||
| a0 = ptrba[0]; | |||
| res0_0 += a0*b0; | |||
| res1_0 += a0*b1; | |||
| ptrba = ptrba+1; | |||
| ptrbb = ptrbb+2; | |||
| } | |||
| res0_0 *= alpha; | |||
| res1_0 *= alpha; | |||
| C0[0] = res0_0; | |||
| C1[0] = res1_0; | |||
| C0 = C0+1; | |||
| C1 = C1+1; | |||
| } | |||
| k = (bk<<1); | |||
| bb = bb+k; | |||
| i = (ldc<<1); | |||
| C = C+i; | |||
| } | |||
| for (j=0; j<(bn&1); j+=1) // do the Mx1 loops | |||
| { | |||
| C0 = C; | |||
| ptrba = ba; | |||
| for (i=0; i<bm/4; i+=1) // do blocks of 4x1 loops | |||
| { | |||
| ptrbb = bb; | |||
| res0_0 = 0; | |||
| res0_1 = 0; | |||
| res0_2 = 0; | |||
| res0_3 = 0; | |||
| for (k=0; k<bk; k++) | |||
| { | |||
| b0 = ptrbb[0]; | |||
| a0 = ptrba[0]; | |||
| res0_0 += a0*b0; | |||
| a1 = ptrba[1]; | |||
| res0_1 += a1*b0; | |||
| a0 = ptrba[2]; | |||
| res0_2 += a0*b0; | |||
| a1 = ptrba[3]; | |||
| res0_3 += a1*b0; | |||
| ptrba = ptrba+4; | |||
| ptrbb = ptrbb+1; | |||
| } | |||
| res0_0 *= alpha; | |||
| res0_1 *= alpha; | |||
| res0_2 *= alpha; | |||
| res0_3 *= alpha; | |||
| C0[0] = res0_0; | |||
| C0[1] = res0_1; | |||
| C0[2] = res0_2; | |||
| C0[3] = res0_3; | |||
| C0 = C0+4; | |||
| } | |||
| if ( bm & 2 ) // do any 2x1 loop | |||
| { | |||
| ptrbb = bb; | |||
| res0_0 = 0; | |||
| res0_1 = 0; | |||
| for (k=0; k<bk; k++) | |||
| { | |||
| b0 = ptrbb[0]; | |||
| a0 = ptrba[0]; | |||
| res0_0 += a0*b0; | |||
| a1 = ptrba[1]; | |||
| res0_1 += a1*b0; | |||
| ptrba = ptrba+2; | |||
| ptrbb = ptrbb+1; | |||
| } | |||
| res0_0 *= alpha; | |||
| res0_1 *= alpha; | |||
| C0[0] = res0_0; | |||
| C0[1] = res0_1; | |||
| C0 = C0+2; | |||
| } | |||
| if ( bm & 1 ) // do any 1x1 loop | |||
| { | |||
| ptrbb = bb; | |||
| res0_0 = 0; | |||
| for (k=0; k<bk; k++) | |||
| { | |||
| b0 = ptrbb[0]; | |||
| a0 = ptrba[0]; | |||
| res0_0 += a0*b0; | |||
| ptrba = ptrba+1; | |||
| ptrbb = ptrbb+1; | |||
| } | |||
| res0_0 *= alpha; | |||
| C0[0] = res0_0; | |||
| C0 = C0+1; | |||
| } | |||
| k = (bk<<0); | |||
| bb = bb+k; | |||
| C = C+ldc; | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,571 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2017, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <stdbool.h> | |||
| int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc) | |||
| { | |||
| BLASLONG i,j,k; | |||
| FLOAT *C0,*C1,*C2,*C3,*ptrba,*ptrbb; | |||
| FLOAT res0_0; | |||
| FLOAT res0_1; | |||
| FLOAT res0_2; | |||
| FLOAT res0_3; | |||
| FLOAT res1_0; | |||
| FLOAT res1_1; | |||
| FLOAT res1_2; | |||
| FLOAT res1_3; | |||
| FLOAT res2_0; | |||
| FLOAT res2_1; | |||
| FLOAT res2_2; | |||
| FLOAT res2_3; | |||
| FLOAT res3_0; | |||
| FLOAT res3_1; | |||
| FLOAT res3_2; | |||
| FLOAT res3_3; | |||
| FLOAT a0; | |||
| FLOAT a1; | |||
| FLOAT b0; | |||
| FLOAT b1; | |||
| FLOAT b2; | |||
| FLOAT b3; | |||
| for (j=0; j<bn/4; j+=1) // do blocks of the Mx4 loops | |||
| { | |||
| C0 = C; | |||
| C1 = C0+ldc; | |||
| C2 = C1+ldc; | |||
| C3 = C2+ldc; | |||
| ptrba = ba; | |||
| for (i=0; i<bm/4; i+=1) // do blocks of 4x4 | |||
| { | |||
| ptrbb = bb; | |||
| res0_0 = 0; | |||
| res0_1 = 0; | |||
| res0_2 = 0; | |||
| res0_3 = 0; | |||
| res1_0 = 0; | |||
| res1_1 = 0; | |||
| res1_2 = 0; | |||
| res1_3 = 0; | |||
| res2_0 = 0; | |||
| res2_1 = 0; | |||
| res2_2 = 0; | |||
| res2_3 = 0; | |||
| res3_0 = 0; | |||
| res3_1 = 0; | |||
| res3_2 = 0; | |||
| res3_3 = 0; | |||
| for (k=0; k<bk; k++) | |||
| { | |||
| b0 = ptrbb[0]; | |||
| b1 = ptrbb[1]; | |||
| b2 = ptrbb[2]; | |||
| b3 = ptrbb[3]; | |||
| a0 = ptrba[0]; | |||
| res0_0 += a0*b0; | |||
| res1_0 += a0*b1; | |||
| res2_0 += a0*b2; | |||
| res3_0 += a0*b3; | |||
| a1 = ptrba[1]; | |||
| res0_1 += a1*b0; | |||
| res1_1 += a1*b1; | |||
| res2_1 += a1*b2; | |||
| res3_1 += a1*b3; | |||
| a0 = ptrba[2]; | |||
| res0_2 += a0*b0; | |||
| res1_2 += a0*b1; | |||
| res2_2 += a0*b2; | |||
| res3_2 += a0*b3; | |||
| a1 = ptrba[3]; | |||
| res0_3 += a1*b0; | |||
| res1_3 += a1*b1; | |||
| res2_3 += a1*b2; | |||
| res3_3 += a1*b3; | |||
| ptrba = ptrba+4; | |||
| ptrbb = ptrbb+4; | |||
| } | |||
| res0_0 *= alpha; | |||
| res0_1 *= alpha; | |||
| res0_2 *= alpha; | |||
| res0_3 *= alpha; | |||
| res1_0 *= alpha; | |||
| res1_1 *= alpha; | |||
| res1_2 *= alpha; | |||
| res1_3 *= alpha; | |||
| res2_0 *= alpha; | |||
| res2_1 *= alpha; | |||
| res2_2 *= alpha; | |||
| res2_3 *= alpha; | |||
| res3_0 *= alpha; | |||
| res3_1 *= alpha; | |||
| res3_2 *= alpha; | |||
| res3_3 *= alpha; | |||
| C0[0] = res0_0; | |||
| C0[1] = res0_1; | |||
| C0[2] = res0_2; | |||
| C0[3] = res0_3; | |||
| C1[0] = res1_0; | |||
| C1[1] = res1_1; | |||
| C1[2] = res1_2; | |||
| C1[3] = res1_3; | |||
| C2[0] = res2_0; | |||
| C2[1] = res2_1; | |||
| C2[2] = res2_2; | |||
| C2[3] = res2_3; | |||
| C3[0] = res3_0; | |||
| C3[1] = res3_1; | |||
| C3[2] = res3_2; | |||
| C3[3] = res3_3; | |||
| C0 = C0+4; | |||
| C1 = C1+4; | |||
| C2 = C2+4; | |||
| C3 = C3+4; | |||
| } | |||
| if ( bm & 2 ) // do any 2x4 loop | |||
| { | |||
| ptrbb = bb; | |||
| res0_0 = 0; | |||
| res0_1 = 0; | |||
| res1_0 = 0; | |||
| res1_1 = 0; | |||
| res2_0 = 0; | |||
| res2_1 = 0; | |||
| res3_0 = 0; | |||
| res3_1 = 0; | |||
| for (k=0; k<bk; k++) | |||
| { | |||
| b0 = ptrbb[0]; | |||
| b1 = ptrbb[1]; | |||
| b2 = ptrbb[2]; | |||
| b3 = ptrbb[3]; | |||
| a0 = ptrba[0]; | |||
| res0_0 += a0*b0; | |||
| res1_0 += a0*b1; | |||
| res2_0 += a0*b2; | |||
| res3_0 += a0*b3; | |||
| a1 = ptrba[1]; | |||
| res0_1 += a1*b0; | |||
| res1_1 += a1*b1; | |||
| res2_1 += a1*b2; | |||
| res3_1 += a1*b3; | |||
| ptrba = ptrba+2; | |||
| ptrbb = ptrbb+4; | |||
| } | |||
| res0_0 *= alpha; | |||
| res0_1 *= alpha; | |||
| res1_0 *= alpha; | |||
| res1_1 *= alpha; | |||
| res2_0 *= alpha; | |||
| res2_1 *= alpha; | |||
| res3_0 *= alpha; | |||
| res3_1 *= alpha; | |||
| C0[0] = res0_0; | |||
| C0[1] = res0_1; | |||
| C1[0] = res1_0; | |||
| C1[1] = res1_1; | |||
| C2[0] = res2_0; | |||
| C2[1] = res2_1; | |||
| C3[0] = res3_0; | |||
| C3[1] = res3_1; | |||
| C0 = C0+2; | |||
| C1 = C1+2; | |||
| C2 = C2+2; | |||
| C3 = C3+2; | |||
| } | |||
| if ( bm & 1 ) // do any 1x4 loop | |||
| { | |||
| ptrbb = bb; | |||
| res0_0 = 0; | |||
| res1_0 = 0; | |||
| res2_0 = 0; | |||
| res3_0 = 0; | |||
| for (k=0; k<bk; k++) | |||
| { | |||
| b0 = ptrbb[0]; | |||
| b1 = ptrbb[1]; | |||
| b2 = ptrbb[2]; | |||
| b3 = ptrbb[3]; | |||
| a0 = ptrba[0]; | |||
| res0_0 += a0*b0; | |||
| res1_0 += a0*b1; | |||
| res2_0 += a0*b2; | |||
| res3_0 += a0*b3; | |||
| ptrba = ptrba+1; | |||
| ptrbb = ptrbb+4; | |||
| } | |||
| res0_0 *= alpha; | |||
| res1_0 *= alpha; | |||
| res2_0 *= alpha; | |||
| res3_0 *= alpha; | |||
| C0[0] = res0_0; | |||
| C1[0] = res1_0; | |||
| C2[0] = res2_0; | |||
| C3[0] = res3_0; | |||
| C0 = C0+1; | |||
| C1 = C1+1; | |||
| C2 = C2+1; | |||
| C3 = C3+1; | |||
| } | |||
| k = (bk<<2); | |||
| bb = bb+k; | |||
| i = (ldc<<2); | |||
| C = C+i; | |||
| } | |||
| for (j=0; j<(bn&2); j+=2) // do the Mx2 loops | |||
| { | |||
| C0 = C; | |||
| C1 = C0+ldc; | |||
| ptrba = ba; | |||
| for (i=0; i<bm/4; i+=1) // do blocks of 4x2 | |||
| { | |||
| ptrbb = bb; | |||
| res0_0 = 0; | |||
| res0_1 = 0; | |||
| res0_2 = 0; | |||
| res0_3 = 0; | |||
| res1_0 = 0; | |||
| res1_1 = 0; | |||
| res1_2 = 0; | |||
| res1_3 = 0; | |||
| for (k=0; k<bk; k++) | |||
| { | |||
| b0 = ptrbb[0]; | |||
| b1 = ptrbb[1]; | |||
| a0 = ptrba[0]; | |||
| res0_0 += a0*b0; | |||
| res1_0 += a0*b1; | |||
| a1 = ptrba[1]; | |||
| res0_1 += a1*b0; | |||
| res1_1 += a1*b1; | |||
| a0 = ptrba[2]; | |||
| res0_2 += a0*b0; | |||
| res1_2 += a0*b1; | |||
| a1 = ptrba[3]; | |||
| res0_3 += a1*b0; | |||
| res1_3 += a1*b1; | |||
| ptrba = ptrba+4; | |||
| ptrbb = ptrbb+2; | |||
| } | |||
| res0_0 *= alpha; | |||
| res0_1 *= alpha; | |||
| res0_2 *= alpha; | |||
| res0_3 *= alpha; | |||
| res1_0 *= alpha; | |||
| res1_1 *= alpha; | |||
| res1_2 *= alpha; | |||
| res1_3 *= alpha; | |||
| C0[0] = res0_0; | |||
| C0[1] = res0_1; | |||
| C0[2] = res0_2; | |||
| C0[3] = res0_3; | |||
| C1[0] = res1_0; | |||
| C1[1] = res1_1; | |||
| C1[2] = res1_2; | |||
| C1[3] = res1_3; | |||
| C0 = C0+4; | |||
| C1 = C1+4; | |||
| } | |||
| if ( bm & 2 ) // do any 2x2 loop | |||
| { | |||
| ptrbb = bb; | |||
| res0_0 = 0; | |||
| res0_1 = 0; | |||
| res1_0 = 0; | |||
| res1_1 = 0; | |||
| for (k=0; k<bk; k++) | |||
| { | |||
| b0 = ptrbb[0]; | |||
| b1 = ptrbb[1]; | |||
| a0 = ptrba[0]; | |||
| res0_0 += a0*b0; | |||
| res1_0 += a0*b1; | |||
| a1 = ptrba[1]; | |||
| res0_1 += a1*b0; | |||
| res1_1 += a1*b1; | |||
| ptrba = ptrba+2; | |||
| ptrbb = ptrbb+2; | |||
| } | |||
| res0_0 *= alpha; | |||
| res0_1 *= alpha; | |||
| res1_0 *= alpha; | |||
| res1_1 *= alpha; | |||
| C0[0] = res0_0; | |||
| C0[1] = res0_1; | |||
| C1[0] = res1_0; | |||
| C1[1] = res1_1; | |||
| C0 = C0+2; | |||
| C1 = C1+2; | |||
| } | |||
| if ( bm & 1 ) // do any 1x2 loop | |||
| { | |||
| ptrbb = bb; | |||
| res0_0 = 0; | |||
| res1_0 = 0; | |||
| for (k=0; k<bk; k++) | |||
| { | |||
| b0 = ptrbb[0]; | |||
| b1 = ptrbb[1]; | |||
| a0 = ptrba[0]; | |||
| res0_0 += a0*b0; | |||
| res1_0 += a0*b1; | |||
| ptrba = ptrba+1; | |||
| ptrbb = ptrbb+2; | |||
| } | |||
| res0_0 *= alpha; | |||
| res1_0 *= alpha; | |||
| C0[0] = res0_0; | |||
| C1[0] = res1_0; | |||
| C0 = C0+1; | |||
| C1 = C1+1; | |||
| } | |||
| k = (bk<<1); | |||
| bb = bb+k; | |||
| i = (ldc<<1); | |||
| C = C+i; | |||
| } | |||
| for (j=0; j<(bn&1); j+=1) // do the Mx1 loops | |||
| { | |||
| C0 = C; | |||
| ptrba = ba; | |||
| for (i=0; i<bm/4; i+=1) // do blocks of 4x1 loops | |||
| { | |||
| ptrbb = bb; | |||
| res0_0 = 0; | |||
| res0_1 = 0; | |||
| res0_2 = 0; | |||
| res0_3 = 0; | |||
| for (k=0; k<bk; k++) | |||
| { | |||
| b0 = ptrbb[0]; | |||
| a0 = ptrba[0]; | |||
| res0_0 += a0*b0; | |||
| a1 = ptrba[1]; | |||
| res0_1 += a1*b0; | |||
| a0 = ptrba[2]; | |||
| res0_2 += a0*b0; | |||
| a1 = ptrba[3]; | |||
| res0_3 += a1*b0; | |||
| ptrba = ptrba+4; | |||
| ptrbb = ptrbb+1; | |||
| } | |||
| res0_0 *= alpha; | |||
| res0_1 *= alpha; | |||
| res0_2 *= alpha; | |||
| res0_3 *= alpha; | |||
| C0[0] = res0_0; | |||
| C0[1] = res0_1; | |||
| C0[2] = res0_2; | |||
| C0[3] = res0_3; | |||
| C0 = C0+4; | |||
| } | |||
| if ( bm & 2 ) // do any 2x1 loop | |||
| { | |||
| ptrbb = bb; | |||
| res0_0 = 0; | |||
| res0_1 = 0; | |||
| for (k=0; k<bk; k++) | |||
| { | |||
| b0 = ptrbb[0]; | |||
| a0 = ptrba[0]; | |||
| res0_0 += a0*b0; | |||
| a1 = ptrba[1]; | |||
| res0_1 += a1*b0; | |||
| ptrba = ptrba+2; | |||
| ptrbb = ptrbb+1; | |||
| } | |||
| res0_0 *= alpha; | |||
| res0_1 *= alpha; | |||
| C0[0] = res0_0; | |||
| C0[1] = res0_1; | |||
| C0 = C0+2; | |||
| } | |||
| if ( bm & 1 ) // do any 1x1 loop | |||
| { | |||
| ptrbb = bb; | |||
| res0_0 = 0; | |||
| for (k=0; k<bk; k++) | |||
| { | |||
| b0 = ptrbb[0]; | |||
| a0 = ptrba[0]; | |||
| res0_0 += a0*b0; | |||
| ptrba = ptrba+1; | |||
| ptrbb = ptrbb+1; | |||
| } | |||
| res0_0 *= alpha; | |||
| C0[0] = res0_0; | |||
| C0 = C0+1; | |||
| } | |||
| k = (bk<<0); | |||
| bb = bb+k; | |||
| C = C+ldc; | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,528 @@ | |||
| #include "common.h" | |||
| #include <stdbool.h> | |||
| int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset) | |||
| { | |||
| BLASLONG i,j,k; | |||
| FLOAT *C0,*C1,*ptrba,*ptrbb; | |||
| FLOAT res0_0; | |||
| FLOAT res0_1; | |||
| FLOAT res0_2; | |||
| FLOAT res0_3; | |||
| FLOAT res1_0; | |||
| FLOAT res1_1; | |||
| FLOAT res1_2; | |||
| FLOAT res1_3; | |||
| FLOAT a0; | |||
| FLOAT a1; | |||
| FLOAT b0; | |||
| FLOAT b1; | |||
| BLASLONG off, temp; | |||
| bool left; | |||
| bool transposed; | |||
| bool backwards; | |||
| #ifdef LEFT | |||
| left = true; | |||
| #else | |||
| left = false; | |||
| #endif | |||
| #ifdef TRANSA | |||
| transposed = true; | |||
| #else | |||
| transposed = false; | |||
| #endif | |||
| backwards = left != transposed; | |||
| if (!left) { | |||
| off = -offset; | |||
| } | |||
| for (j=0; j<(bn/2); j+=2) // do the Mx2 loops | |||
| { | |||
| C0 = C; | |||
| C1 = C0+ldc; | |||
| #if defined(TRMMKERNEL) && defined(LEFT) | |||
| off = offset; | |||
| #endif | |||
| ptrba = ba; | |||
| for (i=0; i<bm/4; i+=1) // do blocks of 4x2 | |||
| { | |||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| ptrbb = bb; | |||
| #else | |||
| ptrba += off*4; | |||
| ptrbb = bb + off*2; | |||
| #endif | |||
| res0_0 = 0; | |||
| res0_1 = 0; | |||
| res0_2 = 0; | |||
| res0_3 = 0; | |||
| res1_0 = 0; | |||
| res1_1 = 0; | |||
| res1_2 = 0; | |||
| res1_3 = 0; | |||
| #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) | |||
| temp = bk-off; | |||
| #elif defined(LEFT) | |||
| temp = off+4; // number of values in A | |||
| #else | |||
| temp = off+2; // number of values in B | |||
| #endif | |||
| for (k=0; k<temp; k++) | |||
| { | |||
| b0 = ptrbb[0]; | |||
| b1 = ptrbb[1]; | |||
| a0 = ptrba[0]; | |||
| res0_0 += a0*b0; | |||
| res1_0 += a0*b1; | |||
| a1 = ptrba[1]; | |||
| res0_1 += a1*b0; | |||
| res1_1 += a1*b1; | |||
| a0 = ptrba[2]; | |||
| res0_2 += a0*b0; | |||
| res1_2 += a0*b1; | |||
| a1 = ptrba[3]; | |||
| res0_3 += a1*b0; | |||
| res1_3 += a1*b1; | |||
| ptrba = ptrba+4; | |||
| ptrbb = ptrbb+2; | |||
| } | |||
| res0_0 *= alpha; | |||
| res0_1 *= alpha; | |||
| res0_2 *= alpha; | |||
| res0_3 *= alpha; | |||
| res1_0 *= alpha; | |||
| res1_1 *= alpha; | |||
| res1_2 *= alpha; | |||
| res1_3 *= alpha; | |||
| C0[0] = res0_0; | |||
| C0[1] = res0_1; | |||
| C0[2] = res0_2; | |||
| C0[3] = res0_3; | |||
| C1[0] = res1_0; | |||
| C1[1] = res1_1; | |||
| C1[2] = res1_2; | |||
| C1[3] = res1_3; | |||
| #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| temp = bk - off; | |||
| #ifdef LEFT | |||
| temp -= 4; // number of values in A | |||
| #else | |||
| temp -= 2; // number of values in B | |||
| #endif | |||
| ptrba += temp*4; | |||
| ptrbb += temp*2; | |||
| #endif | |||
| #ifdef LEFT | |||
| off += 4; // number of values in A | |||
| #endif | |||
| C0 = C0+4; | |||
| C1 = C1+4; | |||
| } | |||
| if ( bm & 2 ) // do any 2x2 loop | |||
| { | |||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| ptrbb = bb; | |||
| #else | |||
| ptrba += off*2; | |||
| ptrbb = bb + off*2; | |||
| #endif | |||
| res0_0 = 0; | |||
| res0_1 = 0; | |||
| res1_0 = 0; | |||
| res1_1 = 0; | |||
| #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) | |||
| temp = bk-off; | |||
| #elif defined(LEFT) | |||
| temp = off+2; // number of values in A | |||
| #else | |||
| temp = off+2; // number of values in B | |||
| #endif | |||
| for (k=0; k<temp; k++) | |||
| { | |||
| b0 = ptrbb[0]; | |||
| b1 = ptrbb[1]; | |||
| a0 = ptrba[0]; | |||
| res0_0 += a0*b0; | |||
| res1_0 += a0*b1; | |||
| a1 = ptrba[1]; | |||
| res0_1 += a1*b0; | |||
| res1_1 += a1*b1; | |||
| ptrba = ptrba+2; | |||
| ptrbb = ptrbb+2; | |||
| } | |||
| res0_0 *= alpha; | |||
| res0_1 *= alpha; | |||
| res1_0 *= alpha; | |||
| res1_1 *= alpha; | |||
| C0[0] = res0_0; | |||
| C0[1] = res0_1; | |||
| C1[0] = res1_0; | |||
| C1[1] = res1_1; | |||
| #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| temp = bk - off; | |||
| #ifdef LEFT | |||
| temp -= 2; // number of values in A | |||
| #else | |||
| temp -= 2; // number of values in B | |||
| #endif | |||
| ptrba += temp*2; | |||
| ptrbb += temp*2; | |||
| #endif | |||
| #ifdef LEFT | |||
| off += 2; // number of values in A | |||
| #endif | |||
| C0 = C0+2; | |||
| C1 = C1+2; | |||
| } | |||
| if ( bm & 1 ) // do any 1x2 loop | |||
| { | |||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| ptrbb = bb; | |||
| #else | |||
| ptrba += off*1; | |||
| ptrbb = bb + off*2; | |||
| #endif | |||
| res0_0 = 0; | |||
| res1_0 = 0; | |||
| #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) | |||
| temp = bk-off; | |||
| #elif defined(LEFT) | |||
| temp = off+1; // number of values in A | |||
| #else | |||
| temp = off+2; // number of values in B | |||
| #endif | |||
| for (k=0; k<temp; k++) | |||
| { | |||
| b0 = ptrbb[0]; | |||
| b1 = ptrbb[1]; | |||
| a0 = ptrba[0]; | |||
| res0_0 += a0*b0; | |||
| res1_0 += a0*b1; | |||
| ptrba = ptrba+1; | |||
| ptrbb = ptrbb+2; | |||
| } | |||
| res0_0 *= alpha; | |||
| res1_0 *= alpha; | |||
| C0[0] = res0_0; | |||
| C1[0] = res1_0; | |||
| #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| temp = bk - off; | |||
| #ifdef LEFT | |||
| temp -= 1; // number of values in A | |||
| #else | |||
| temp -= 2; // number of values in B | |||
| #endif | |||
| ptrba += temp*1; | |||
| ptrbb += temp*2; | |||
| #endif | |||
| #ifdef LEFT | |||
| off += 1; // number of values in A | |||
| #endif | |||
| C0 = C0+1; | |||
| C1 = C1+1; | |||
| } | |||
| #if defined(TRMMKERNEL) && !defined(LEFT) | |||
| off += 2; | |||
| #endif | |||
| k = (bk<<1); | |||
| bb = bb+k; | |||
| i = (ldc<<1); | |||
| C = C+i; | |||
| } | |||
| for (j=0; j<(bn&1); j+=1) // do the Mx1 loops | |||
| { | |||
| C0 = C; | |||
| #if defined(TRMMKERNEL) && defined(LEFT) | |||
| off = offset; | |||
| #endif | |||
| ptrba = ba; | |||
| for (i=0; i<bm/4; i+=1) // do blocks of 4x1 loops | |||
| { | |||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| ptrbb = bb; | |||
| #else | |||
| ptrba += off*4; | |||
| ptrbb = bb + off*1; | |||
| #endif | |||
| res0_0 = 0; | |||
| res0_1 = 0; | |||
| res0_2 = 0; | |||
| res0_3 = 0; | |||
| #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) | |||
| temp = bk-off; | |||
| #elif defined(LEFT) | |||
| temp = off+4; // number of values in A | |||
| #else | |||
| temp = off+1; // number of values in B | |||
| #endif | |||
| for (k=0; k<temp; k++) | |||
| { | |||
| b0 = ptrbb[0]; | |||
| a0 = ptrba[0]; | |||
| res0_0 += a0*b0; | |||
| a1 = ptrba[1]; | |||
| res0_1 += a1*b0; | |||
| a0 = ptrba[2]; | |||
| res0_2 += a0*b0; | |||
| a1 = ptrba[3]; | |||
| res0_3 += a1*b0; | |||
| ptrba = ptrba+4; | |||
| ptrbb = ptrbb+1; | |||
| } | |||
| res0_0 *= alpha; | |||
| res0_1 *= alpha; | |||
| res0_2 *= alpha; | |||
| res0_3 *= alpha; | |||
| C0[0] = res0_0; | |||
| C0[1] = res0_1; | |||
| C0[2] = res0_2; | |||
| C0[3] = res0_3; | |||
| #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| temp = bk - off; | |||
| #ifdef LEFT | |||
| temp -= 4; // number of values in A | |||
| #else | |||
| temp -= 1; // number of values in B | |||
| #endif | |||
| ptrba += temp*4; | |||
| ptrbb += temp*1; | |||
| #endif | |||
| #ifdef LEFT | |||
| off += 4; // number of values in A | |||
| #endif | |||
| C0 = C0+4; | |||
| } | |||
| if ( bm & 2 ) // do any 2x1 loop | |||
| { | |||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| ptrbb = bb; | |||
| #else | |||
| ptrba += off*2; | |||
| ptrbb = bb + off*1; | |||
| #endif | |||
| res0_0 = 0; | |||
| res0_1 = 0; | |||
| #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) | |||
| temp = bk-off; | |||
| #elif defined(LEFT) | |||
| temp = off+2; // number of values in A | |||
| #else | |||
| temp = off+1; // number of values in B | |||
| #endif | |||
| for (k=0; k<temp; k++) | |||
| { | |||
| b0 = ptrbb[0]; | |||
| a0 = ptrba[0]; | |||
| res0_0 += a0*b0; | |||
| a1 = ptrba[1]; | |||
| res0_1 += a1*b0; | |||
| ptrba = ptrba+2; | |||
| ptrbb = ptrbb+1; | |||
| } | |||
| res0_0 *= alpha; | |||
| res0_1 *= alpha; | |||
| C0[0] = res0_0; | |||
| C0[1] = res0_1; | |||
| #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| temp = bk - off; | |||
| #ifdef LEFT | |||
| temp -= 2; // number of values in A | |||
| #else | |||
| temp -= 1; // number of values in B | |||
| #endif | |||
| ptrba += temp*2; | |||
| ptrbb += temp*1; | |||
| #endif | |||
| #ifdef LEFT | |||
| off += 2; // number of values in A | |||
| #endif | |||
| C0 = C0+2; | |||
| } | |||
| if ( bm & 1 ) // do any 1x1 loop | |||
| { | |||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| ptrbb = bb; | |||
| #else | |||
| ptrba += off*1; | |||
| ptrbb = bb + off*1; | |||
| #endif | |||
| res0_0 = 0; | |||
| #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) | |||
| temp = bk-off; | |||
| #elif defined(LEFT) | |||
| temp = off+1; // number of values in A | |||
| #else | |||
| temp = off+1; // number of values in B | |||
| #endif | |||
| for (k=0; k<temp; k++) | |||
| { | |||
| b0 = ptrbb[0]; | |||
| a0 = ptrba[0]; | |||
| res0_0 += a0*b0; | |||
| ptrba = ptrba+1; | |||
| ptrbb = ptrbb+1; | |||
| } | |||
| res0_0 *= alpha; | |||
| C0[0] = res0_0; | |||
| #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| temp = bk - off; | |||
| #ifdef LEFT | |||
| temp -= 1; // number of values in A | |||
| #else | |||
| temp -= 1; // number of values in B | |||
| #endif | |||
| ptrba += temp*1; | |||
| ptrbb += temp*1; | |||
| #endif | |||
| #ifdef LEFT | |||
| off += 1; // number of values in A | |||
| #endif | |||
| C0 = C0+1; | |||
| } | |||
| #if defined(TRMMKERNEL) && !defined(LEFT) | |||
| off += 1; | |||
| #endif | |||
| k = (bk<<0); | |||
| bb = bb+k; | |||
| C = C+ldc; | |||
| } | |||
| return 0; | |||
| } | |||