s390x: fix cscal and zscal implementationstags/v0.3.11^2
| @@ -12,5 +12,5 @@ endif | |||
| # Enable floating-point expression contraction for clang, since it is the | |||
| # default for gcc | |||
| ifeq ($(C_COMPILER), CLANG) | |||
| CCOMMON_OPT += -ffp-contract=fast | |||
| CCOMMON_OPT += -ffp-contract=on | |||
| endif | |||
| @@ -25,67 +25,35 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /* | |||
| * Avoid contraction of floating point operations, specifically fused | |||
| * multiply-add, because they can cause unexpected results in complex | |||
| * multiplication. | |||
| */ | |||
| #if defined(__GNUC__) && !defined(__clang__) | |||
| #pragma GCC optimize ("fp-contract=off") | |||
| #endif | |||
| #if defined(__clang__) | |||
| #pragma clang fp contract(off) | |||
| #endif | |||
| #include "common.h" | |||
| #include "vector-common.h" | |||
| static void cscal_kernel_16(BLASLONG n, FLOAT *alpha, FLOAT *x) { | |||
| __asm__("vlrepf %%v0,0(%[alpha])\n\t" | |||
| "vlef %%v1,4(%[alpha]),0\n\t" | |||
| "vlef %%v1,4(%[alpha]),2\n\t" | |||
| "vflcsb %%v1,%%v1\n\t" | |||
| "vlef %%v1,4(%[alpha]),1\n\t" | |||
| "vlef %%v1,4(%[alpha]),3\n\t" | |||
| "srlg %[n],%[n],4\n\t" | |||
| "xgr %%r1,%%r1\n\t" | |||
| "0:\n\t" | |||
| "pfd 2, 1024(%%r1,%[x])\n\t" | |||
| "vl %%v16,0(%%r1,%[x])\n\t" | |||
| "vl %%v17,16(%%r1,%[x])\n\t" | |||
| "vl %%v18,32(%%r1,%[x])\n\t" | |||
| "vl %%v19,48(%%r1,%[x])\n\t" | |||
| "vl %%v20,64(%%r1,%[x])\n\t" | |||
| "vl %%v21,80(%%r1,%[x])\n\t" | |||
| "vl %%v22,96(%%r1,%[x])\n\t" | |||
| "vl %%v23,112(%%r1,%[x])\n\t" | |||
| "verllg %%v24,%%v16,32\n\t" | |||
| "verllg %%v25,%%v17,32\n\t" | |||
| "verllg %%v26,%%v18,32\n\t" | |||
| "verllg %%v27,%%v19,32\n\t" | |||
| "verllg %%v28,%%v20,32\n\t" | |||
| "verllg %%v29,%%v21,32\n\t" | |||
| "verllg %%v30,%%v22,32\n\t" | |||
| "verllg %%v31,%%v23,32\n\t" | |||
| "vfmsb %%v16,%%v16,%%v0\n\t" | |||
| "vfmsb %%v17,%%v17,%%v0\n\t" | |||
| "vfmsb %%v18,%%v18,%%v0\n\t" | |||
| "vfmsb %%v19,%%v19,%%v0\n\t" | |||
| "vfmsb %%v20,%%v20,%%v0\n\t" | |||
| "vfmsb %%v21,%%v21,%%v0\n\t" | |||
| "vfmsb %%v22,%%v22,%%v0\n\t" | |||
| "vfmsb %%v23,%%v23,%%v0\n\t" | |||
| "vfmasb %%v16,%%v24,%%v1,%%v16\n\t" | |||
| "vfmasb %%v17,%%v25,%%v1,%%v17\n\t" | |||
| "vfmasb %%v18,%%v26,%%v1,%%v18\n\t" | |||
| "vfmasb %%v19,%%v27,%%v1,%%v19\n\t" | |||
| "vfmasb %%v20,%%v28,%%v1,%%v20\n\t" | |||
| "vfmasb %%v21,%%v29,%%v1,%%v21\n\t" | |||
| "vfmasb %%v22,%%v30,%%v1,%%v22\n\t" | |||
| "vfmasb %%v23,%%v31,%%v1,%%v23\n\t" | |||
| "vst %%v16,0(%%r1,%[x])\n\t" | |||
| "vst %%v17,16(%%r1,%[x])\n\t" | |||
| "vst %%v18,32(%%r1,%[x])\n\t" | |||
| "vst %%v19,48(%%r1,%[x])\n\t" | |||
| "vst %%v20,64(%%r1,%[x])\n\t" | |||
| "vst %%v21,80(%%r1,%[x])\n\t" | |||
| "vst %%v22,96(%%r1,%[x])\n\t" | |||
| "vst %%v23,112(%%r1,%[x])\n\t" | |||
| "agfi %%r1,128\n\t" | |||
| "brctg %[n],0b" | |||
| : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) | |||
| : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha), | |||
| [alpha] "a"(alpha) | |||
| : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", | |||
| "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", | |||
| "v31"); | |||
| static void cscal_kernel_16(BLASLONG n, FLOAT da_r, FLOAT da_i, FLOAT *x) { | |||
| vector_float da_r_vec = vec_splats(da_r); | |||
| vector_float da_i_vec = { -da_i, da_i, -da_i, da_i }; | |||
| vector_float *x_vec_ptr = (vector_float *)x; | |||
| #pragma GCC unroll 16 | |||
| for (size_t i = 0; i < n/2; i++) { | |||
| vector_float x_vec = vec_load_hinted(x + i * VLEN_FLOATS); | |||
| vector_float x_swapped = {x_vec[1], x_vec[0], x_vec[3], x_vec[2]}; | |||
| x_vec_ptr[i] = x_vec * da_r_vec + x_swapped * da_i_vec; | |||
| } | |||
| } | |||
| static void cscal_kernel_16_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) { | |||
| @@ -199,14 +167,12 @@ static void cscal_kernel_16_zero(BLASLONG n, FLOAT *x) { | |||
| : "cc", "r1", "v0"); | |||
| } | |||
| static void cscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, | |||
| static void cscal_kernel_inc_8(BLASLONG n, FLOAT da_r, FLOAT da_i, FLOAT *x, | |||
| BLASLONG inc_x) { | |||
| BLASLONG i; | |||
| BLASLONG inc_x2 = 2 * inc_x; | |||
| BLASLONG inc_x3 = inc_x2 + inc_x; | |||
| FLOAT t0, t1, t2, t3; | |||
| FLOAT da_r = alpha[0]; | |||
| FLOAT da_i = alpha[1]; | |||
| for (i = 0; i < n; i += 4) { | |||
| t0 = da_r * x[0] - da_i * x[1]; | |||
| @@ -324,9 +290,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
| BLASLONG n1 = n & -8; | |||
| if (n1 > 0) { | |||
| alpha[0] = da_r; | |||
| alpha[1] = da_i; | |||
| cscal_kernel_inc_8(n1, alpha, x, inc_x); | |||
| cscal_kernel_inc_8(n1, da_r, da_i, x, inc_x); | |||
| j = n1; | |||
| i = n1 * inc_x; | |||
| } | |||
| @@ -362,7 +326,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
| else if (da_i == 0) | |||
| cscal_kernel_16_zero_i(n1, alpha, x); | |||
| else | |||
| cscal_kernel_16(n1, alpha, x); | |||
| cscal_kernel_16(n1, da_r, da_i, x); | |||
| i = n1 << 1; | |||
| j = n1; | |||
| @@ -30,12 +30,13 @@ | |||
| * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| */ | |||
| #include "common.h" | |||
| #include <vecintrin.h> | |||
| #include "vector-common.h" | |||
| #include <stdbool.h> | |||
| #include <stdio.h> | |||
| #include <stdlib.h> | |||
| #ifdef COMPLEX | |||
| #error "Handling for complex numbers is not supported in this kernel" | |||
| #endif | |||
| @@ -153,37 +154,6 @@ static const bool backwards = false; | |||
| * 3, May 2008. | |||
| */ | |||
| #define VLEN_BYTES 16 | |||
| #define VLEN_FLOATS (VLEN_BYTES / sizeof(FLOAT)) | |||
| typedef FLOAT vector_float __attribute__ ((vector_size (16))); | |||
| /** | |||
| * Load a vector into register, and hint on 8-byte alignment to improve | |||
| * performance. gcc-9 and newer will create these hints by itself. For older | |||
| * compiler versions, use inline assembly to explicitly express the hint. | |||
| * Provide explicit hex encoding to cater for binutils versions that do not know | |||
| * about vector-load with alignment hints yet. | |||
| * | |||
| * Note that, for block sizes where we apply vectorization, vectors in A will | |||
| * always be 8-byte aligned. | |||
| */ | |||
| static inline vector_float vec_load_hinted(FLOAT const *restrict a) { | |||
| vector_float const *restrict addr = (vector_float const *restrict)a; | |||
| vector_float y; | |||
| #if __GNUC__ < 9 && !defined(__clang__) | |||
| // hex-encode vl %[out],%[addr],3 | |||
| asm(".insn vrx,0xe70000003006,%[out],%[addr],3" | |||
| : [ out ] "=v"(y) | |||
| : [ addr ] "R"(*addr)); | |||
| #else | |||
| y = *addr; | |||
| #endif | |||
| return y; | |||
| } | |||
| /** | |||
| * Calculate for a row-block in C_i of size ROWSxCOLS using vector intrinsics. | |||
| * | |||
| @@ -0,0 +1,64 @@ | |||
| /* | |||
| * Copyright (c) IBM Corporation 2020. | |||
| * All rights reserved. | |||
| * | |||
| * Redistribution and use in source and binary forms, with or without | |||
| * modification, are permitted provided that the following conditions are | |||
| * met: | |||
| * | |||
| * 1. Redistributions of source code must retain the above copyright | |||
| * notice, this list of conditions and the following disclaimer. | |||
| * | |||
| * 2. Redistributions in binary form must reproduce the above copyright | |||
| * notice, this list of conditions and the following disclaimer in | |||
| * the documentation and/or other materials provided with the | |||
| * distribution. | |||
| * 3. Neither the name of the OpenBLAS project nor the names of | |||
| * its contributors may be used to endorse or promote products | |||
| * derived from this software without specific prior written | |||
| * permission. | |||
| * | |||
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
| * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| */ | |||
| #include <vecintrin.h> | |||
| #define VLEN_BYTES 16 | |||
| #define VLEN_FLOATS (VLEN_BYTES / sizeof(FLOAT)) | |||
| typedef FLOAT vector_float __attribute__ ((vector_size (VLEN_BYTES))); | |||
| /** | |||
| * Load a vector into register, and hint on 8-byte alignment to improve | |||
| * performance. gcc-9 and newer will create these hints by itself. For older | |||
| * compiler versions, use inline assembly to explicitly express the hint. | |||
| * Provide explicit hex encoding to cater for binutils versions that do not know | |||
| * about vector-load with alignment hints yet. | |||
| * | |||
| * Note that, for block sizes where we apply vectorization, vectors in A will | |||
| * always be 8-byte aligned. | |||
| */ | |||
| static inline vector_float vec_load_hinted(FLOAT const *restrict a) { | |||
| vector_float const *restrict addr = (vector_float const *restrict)a; | |||
| vector_float y; | |||
| #if __GNUC__ < 9 && !defined(__clang__) | |||
| // hex-encode vl %[out],%[addr],3 | |||
| asm(".insn vrx,0xe70000003006,%[out],%[addr],3" | |||
| : [ out ] "=v"(y) | |||
| : [ addr ] "R"(*addr)); | |||
| #else | |||
| y = *addr; | |||
| #endif | |||
| return y; | |||
| } | |||
| @@ -25,65 +25,35 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /* | |||
| * Avoid contraction of floating point operations, specifically fused | |||
| * multiply-add, because they can cause unexpected results in complex | |||
| * multiplication. | |||
| */ | |||
| #if defined(__GNUC__) && !defined(__clang__) | |||
| #pragma GCC optimize ("fp-contract=off") | |||
| #endif | |||
| #if defined(__clang__) | |||
| #pragma clang fp contract(off) | |||
| #endif | |||
| #include "common.h" | |||
| #include "vector-common.h" | |||
| static void zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x) { | |||
| __asm__("vlrepg %%v0,0(%[alpha])\n\t" | |||
| "vleg %%v1,8(%[alpha]),0\n\t" | |||
| "wflcdb %%v1,%%v1\n\t" | |||
| "vleg %%v1,8(%[alpha]),1\n\t" | |||
| "srlg %[n],%[n],3\n\t" | |||
| "xgr %%r1,%%r1\n\t" | |||
| "0:\n\t" | |||
| "pfd 2, 1024(%%r1,%[x])\n\t" | |||
| "vl %%v16,0(%%r1,%[x])\n\t" | |||
| "vl %%v17,16(%%r1,%[x])\n\t" | |||
| "vl %%v18,32(%%r1,%[x])\n\t" | |||
| "vl %%v19,48(%%r1,%[x])\n\t" | |||
| "vl %%v20,64(%%r1,%[x])\n\t" | |||
| "vl %%v21,80(%%r1,%[x])\n\t" | |||
| "vl %%v22,96(%%r1,%[x])\n\t" | |||
| "vl %%v23,112(%%r1,%[x])\n\t" | |||
| "vpdi %%v24,%%v16,%%v16,4\n\t" | |||
| "vpdi %%v25,%%v17,%%v17,4\n\t" | |||
| "vpdi %%v26,%%v18,%%v18,4\n\t" | |||
| "vpdi %%v27,%%v19,%%v19,4\n\t" | |||
| "vpdi %%v28,%%v20,%%v20,4\n\t" | |||
| "vpdi %%v29,%%v21,%%v21,4\n\t" | |||
| "vpdi %%v30,%%v22,%%v22,4\n\t" | |||
| "vpdi %%v31,%%v23,%%v23,4\n\t" | |||
| "vfmdb %%v16,%%v16,%%v0\n\t" | |||
| "vfmdb %%v17,%%v17,%%v0\n\t" | |||
| "vfmdb %%v18,%%v18,%%v0\n\t" | |||
| "vfmdb %%v19,%%v19,%%v0\n\t" | |||
| "vfmdb %%v20,%%v20,%%v0\n\t" | |||
| "vfmdb %%v21,%%v21,%%v0\n\t" | |||
| "vfmdb %%v22,%%v22,%%v0\n\t" | |||
| "vfmdb %%v23,%%v23,%%v0\n\t" | |||
| "vfmadb %%v16,%%v24,%%v1,%%v16\n\t" | |||
| "vfmadb %%v17,%%v25,%%v1,%%v17\n\t" | |||
| "vfmadb %%v18,%%v26,%%v1,%%v18\n\t" | |||
| "vfmadb %%v19,%%v27,%%v1,%%v19\n\t" | |||
| "vfmadb %%v20,%%v28,%%v1,%%v20\n\t" | |||
| "vfmadb %%v21,%%v29,%%v1,%%v21\n\t" | |||
| "vfmadb %%v22,%%v30,%%v1,%%v22\n\t" | |||
| "vfmadb %%v23,%%v31,%%v1,%%v23\n\t" | |||
| "vst %%v16,0(%%r1,%[x])\n\t" | |||
| "vst %%v17,16(%%r1,%[x])\n\t" | |||
| "vst %%v18,32(%%r1,%[x])\n\t" | |||
| "vst %%v19,48(%%r1,%[x])\n\t" | |||
| "vst %%v20,64(%%r1,%[x])\n\t" | |||
| "vst %%v21,80(%%r1,%[x])\n\t" | |||
| "vst %%v22,96(%%r1,%[x])\n\t" | |||
| "vst %%v23,112(%%r1,%[x])\n\t" | |||
| "agfi %%r1,128\n\t" | |||
| "brctg %[n],0b" | |||
| : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) | |||
| : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha), | |||
| [alpha] "a"(alpha) | |||
| : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", | |||
| "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", | |||
| "v31"); | |||
| static void zscal_kernel_8(BLASLONG n, FLOAT da_r, FLOAT da_i, FLOAT *x) { | |||
| vector_float da_r_vec = vec_splats(da_r); | |||
| vector_float da_i_vec = { -da_i, da_i }; | |||
| vector_float * x_vec_ptr = (vector_float *)x; | |||
| #pragma GCC unroll 16 | |||
| for (size_t i = 0; i < n; i++) { | |||
| vector_float x_vec = vec_load_hinted(x + i * VLEN_FLOATS); | |||
| vector_float x_swapped = {x_vec[1], x_vec[0]}; | |||
| x_vec_ptr[i] = x_vec * da_r_vec + x_swapped * da_i_vec; | |||
| } | |||
| } | |||
| static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) { | |||
| @@ -195,14 +165,12 @@ static void zscal_kernel_8_zero(BLASLONG n, FLOAT *x) { | |||
| : "cc", "r1", "v0"); | |||
| } | |||
| static void zscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, | |||
| static void zscal_kernel_inc_8(BLASLONG n, FLOAT da_r, FLOAT da_i, FLOAT *x, | |||
| BLASLONG inc_x) { | |||
| BLASLONG i; | |||
| BLASLONG inc_x2 = 2 * inc_x; | |||
| BLASLONG inc_x3 = inc_x2 + inc_x; | |||
| FLOAT t0, t1, t2, t3; | |||
| FLOAT da_r = alpha[0]; | |||
| FLOAT da_i = alpha[1]; | |||
| for (i = 0; i < n; i += 4) { | |||
| t0 = da_r * x[0] - da_i * x[1]; | |||
| @@ -320,9 +288,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
| BLASLONG n1 = n & -8; | |||
| if (n1 > 0) { | |||
| alpha[0] = da_r; | |||
| alpha[1] = da_i; | |||
| zscal_kernel_inc_8(n1, alpha, x, inc_x); | |||
| zscal_kernel_inc_8(n1, da_r, da_i, x, inc_x); | |||
| j = n1; | |||
| i = n1 * inc_x; | |||
| } | |||
| @@ -358,7 +324,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
| else if (da_i == 0) | |||
| zscal_kernel_8_zero_i(n1, alpha, x); | |||
| else | |||
| zscal_kernel_8(n1, alpha, x); | |||
| zscal_kernel_8(n1, da_r, da_i, x); | |||
| i = n1 << 1; | |||
| j = n1; | |||