Browse Source

Merge branch 'xianyi:develop' into cirrusjobs

pull/3997/head
Martin Kroeker GitHub 3 years ago
parent
commit
d6a7809504
No known key found for this signature in database. GPG Key ID: 4AEE18F83AFDEB23
20 changed files with 84 additions and 48 deletions
  1. +1
    -0
      c_check
  2. +2
    -0
      common_param.h
  3. +3
    -3
      cpuid_arm64.c
  4. +9
    -6
      driver/level3/level3_gemm3m_thread.c
  5. +8
    -5
      driver/level3/level3_syrk_threaded.c
  6. +19
    -12
      driver/level3/level3_thread.c
  7. +1
    -1
      kernel/arm64/cgemm_kernel_8x4.S
  8. +1
    -1
      kernel/arm64/cgemm_kernel_8x4_thunderx2t99.S
  9. +4
    -4
      kernel/arm64/ctrmm_kernel_8x4.S
  10. +3
    -2
      kernel/arm64/dznrm2_thunderx2t99.c
  11. +4
    -1
      kernel/setparam-ref.c
  12. +1
    -1
      kernel/x86_64/sgemv_n_4.c
  13. +1
    -1
      kernel/x86_64/sgemv_t_4.c
  14. +1
    -1
      kernel/x86_64/ssymv_L.c
  15. +1
    -1
      kernel/x86_64/ssymv_U.c
  16. +1
    -1
      kernel/x86_64/zdot.c
  17. +1
    -1
      kernel/x86_64/zgemv_n_4.c
  18. +1
    -1
      kernel/x86_64/zgemv_t_4.c
  19. +0
    -4
      lapack/potrf/potrf_parallel.c
  20. +22
    -2
      param.h

+ 1
- 0
c_check View File

@@ -40,6 +40,7 @@ bn=`basename \"$compiler_name\"`
case "$bn" in
*-*) if [ "$bn" != '-' ]; then
cross_suffix="$cross_suffix${bn%-*}-"
cross_suffix=`echo $cross_suffix|sed -e 's/ -$//'`
fi
esac



+ 2
- 0
common_param.h View File

@@ -1,5 +1,6 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
@@ -45,6 +46,7 @@

typedef struct {
int dtb_entries;
int switch_ratio;
int offsetA, offsetB, align;

#if BUILD_BFLOAT16 == 1


+ 3
- 3
cpuid_arm64.c View File

@@ -267,9 +267,9 @@ int detect(void)
}
#else
#ifdef __APPLE__
sysctlbyname("hw.cpufamily",&value,&length,NULL,0);
if (value ==131287967|| value == 458787763 ) return CPU_VORTEX; //A12/M1
if (value == 3660830781) return CPU_VORTEX; //A15/M2
sysctlbyname("hw.cpufamily",&value64,&length64,NULL,0);
if (value64 ==131287967|| value64 == 458787763 ) return CPU_VORTEX; //A12/M1
if (value64 == 3660830781) return CPU_VORTEX; //A15/M2
#endif
return CPU_ARMV8;
#endif


+ 9
- 6
driver/level3/level3_gemm3m_thread.c View File

@@ -1,5 +1,6 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
@@ -44,10 +45,6 @@
#define DIVIDE_RATE 2
#endif

#ifndef SWITCH_RATIO
#define SWITCH_RATIO 2
#endif

//The array of job_t may overflow the stack.
//Instead, use malloc to alloc job_t.
#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
@@ -1015,6 +1012,12 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
BLASLONG divN, divT;
int mode;

#if defined(DYNAMIC_ARCH)
int switch_ratio = gotoblas->switch_ratio;
#else
int switch_ratio = SWITCH_RATIO;
#endif

if (range_m) {
BLASLONG m_from = *(((BLASLONG *)range_m) + 0);
BLASLONG m_to = *(((BLASLONG *)range_m) + 1);
@@ -1030,7 +1033,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
}
*/

if ((args -> m < nthreads * SWITCH_RATIO) || (args -> n < nthreads * SWITCH_RATIO)) {
if ((args -> m < nthreads * switch_ratio) || (args -> n < nthreads * switch_ratio)) {
GEMM3M_LOCAL(args, range_m, range_n, sa, sb, 0);
return 0;
}
@@ -1038,7 +1041,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
divT = nthreads;
divN = 1;

while ((GEMM3M_P * divT > m * SWITCH_RATIO) && (divT > 1)) {
while ((GEMM3M_P * divT > m * switch_ratio) && (divT > 1)) {
do {
divT --;
divN = 1;


+ 8
- 5
driver/level3/level3_syrk_threaded.c View File

@@ -1,5 +1,6 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
@@ -44,10 +45,6 @@
#define DIVIDE_RATE 2
#endif

#ifndef SWITCH_RATIO
#define SWITCH_RATIO 2
#endif

//The array of job_t may overflow the stack.
//Instead, use malloc to alloc job_t.
#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
@@ -528,7 +525,13 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
int mode, mask;
double dnum, di, dinum;

if ((nthreads == 1) || (args -> n < nthreads * SWITCH_RATIO)) {
#if defined(DYNAMIC_ARCH)
int switch_ratio = gotoblas->switch_ratio;
#else
int switch_ratio = SWITCH_RATIO;
#endif

if ((nthreads == 1) || (args->n < nthreads * switch_ratio)) {
SYRK_LOCAL(args, range_m, range_n, sa, sb, 0);
return 0;
}


+ 19
- 12
driver/level3/level3_thread.c View File

@@ -1,5 +1,6 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
@@ -44,10 +45,6 @@
#define DIVIDE_RATE 2
#endif

#ifndef SWITCH_RATIO
#define SWITCH_RATIO 2
#endif

#ifndef GEMM_PREFERED_SIZE
#define GEMM_PREFERED_SIZE 1
#endif
@@ -577,6 +574,11 @@ InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock);
BLASLONG width, i, j, k, js;
BLASLONG m, n, n_from, n_to;
int mode;
#if defined(DYNAMIC_ARCH)
int switch_ratio = gotoblas->switch_ratio;
#else
int switch_ratio = SWITCH_RATIO;
#endif

/* Get execution mode */
#ifndef COMPLEX
@@ -698,8 +700,8 @@ EnterCriticalSection((PCRITICAL_SECTION)&level3_lock);
num_parts = 0;
while (n > 0){
width = blas_quickdivide(n + nthreads - num_parts - 1, nthreads - num_parts);
if (width < SWITCH_RATIO) {
width = SWITCH_RATIO;
if (width < switch_ratio) {
width = switch_ratio;
}
width = round_up(n, width, GEMM_PREFERED_SIZE);

@@ -746,6 +748,11 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, IFLOAT *sa, IF
BLASLONG m = args -> m;
BLASLONG n = args -> n;
BLASLONG nthreads_m, nthreads_n;
#if defined(DYNAMIC_ARCH)
int switch_ratio = gotoblas->switch_ratio;
#else
int switch_ratio = SWITCH_RATIO;
#endif

/* Get dimensions from index ranges if available */
if (range_m) {
@@ -755,21 +762,21 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, IFLOAT *sa, IF
n = range_n[1] - range_n[0];
}

/* Partitions in m should have at least SWITCH_RATIO rows */
if (m < 2 * SWITCH_RATIO) {
/* Partitions in m should have at least switch_ratio rows */
if (m < 2 * switch_ratio) {
nthreads_m = 1;
} else {
nthreads_m = args -> nthreads;
while (m < nthreads_m * SWITCH_RATIO) {
while (m < nthreads_m * switch_ratio) {
nthreads_m = nthreads_m / 2;
}
}

/* Partitions in n should have at most SWITCH_RATIO * nthreads_m columns */
if (n < SWITCH_RATIO * nthreads_m) {
/* Partitions in n should have at most switch_ratio * nthreads_m columns */
if (n < switch_ratio * nthreads_m) {
nthreads_n = 1;
} else {
nthreads_n = (n + SWITCH_RATIO * nthreads_m - 1) / (SWITCH_RATIO * nthreads_m);
nthreads_n = (n + switch_ratio * nthreads_m - 1) / (switch_ratio * nthreads_m);
if (nthreads_m * nthreads_n > args -> nthreads) {
nthreads_n = blas_quickdivide(args -> nthreads, nthreads_m);
}


+ 1
- 1
kernel/arm64/cgemm_kernel_8x4.S View File

@@ -49,7 +49,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define pCRow3 x15
#define pA x16
#define alphaR w17
#define alphaI w18
#define alphaI w19

#define alpha0_R s10
#define alphaV0_R v10.s[0]


+ 1
- 1
kernel/arm64/cgemm_kernel_8x4_thunderx2t99.S View File

@@ -49,7 +49,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define pCRow3 x15
#define pA x16
#define alphaR w17
#define alphaI w18
#define alphaI w19

#define alpha0_R s10
#define alphaV0_R v10.s[0]


+ 4
- 4
kernel/arm64/ctrmm_kernel_8x4.S View File

@@ -49,10 +49,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define pCRow3 x15
#define pA x16
#define alphaR w17
#define alphaI w18
#define temp x19
#define tempOffset x20
#define tempK x21
#define alphaI w19
#define temp x20
#define tempOffset x21
#define tempK x22

#define alpha0_R s10
#define alphaV0_R v10.s[0]


+ 3
- 2
kernel/arm64/dznrm2_thunderx2t99.c View File

@@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


#include "common.h"
#include <float.h>
#include <arm_neon.h>

#if defined(SMP)
@@ -404,7 +404,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
#else
nrm2_compute(n, x, inc_x, &ssq, &scale);
#endif
if (fabs(scale) <1.e-300) return 0.;
volatile FLOAT sca = fabs(scale);
if (sca < DBL_MIN) return 0.;
ssq = sqrt(ssq) * scale;

return ssq;


+ 4
- 1
kernel/setparam-ref.c View File

@@ -1,5 +1,6 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
@@ -49,7 +50,9 @@
static void init_parameter(void);

gotoblas_t TABLE_NAME = {
DTB_DEFAULT_ENTRIES ,
DTB_DEFAULT_ENTRIES,

SWITCH_RATIO,

GEMM_DEFAULT_OFFSET_A, GEMM_DEFAULT_OFFSET_B, GEMM_DEFAULT_ALIGN,



+ 1
- 1
kernel/x86_64/sgemv_n_4.c View File

@@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "common.h"

#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11)
#if (defined(__GNUC__) && __GNUC__ > 11)
#pragma GCC optimize("no-tree-vectorize")
#endif



+ 1
- 1
kernel/x86_64/sgemv_t_4.c View File

@@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "common.h"

#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11)
#if (defined(__GNUC__) && __GNUC__ > 11)
#pragma GCC optimize("no-tree-vectorize")
#endif



+ 1
- 1
kernel/x86_64/ssymv_L.c View File

@@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "common.h"

#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11)
#if (defined(__GNUC__) && __GNUC__ > 11)
#pragma GCC optimize("no-tree-vectorize")
#endif



+ 1
- 1
kernel/x86_64/ssymv_U.c View File

@@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "common.h"

#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11)
#if (defined(__GNUC__) && __GNUC__ > 11)
#pragma GCC optimize("no-tree-vectorize")
#endif



+ 1
- 1
kernel/x86_64/zdot.c View File

@@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "common.h"

#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11)
#if (defined(__GNUC__) && __GNUC__ > 11)
#pragma GCC optimize("no-tree-vectorize")
#endif



+ 1
- 1
kernel/x86_64/zgemv_n_4.c View File

@@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "common.h"

#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11)
#if (defined(__GNUC__) && __GNUC__ > 11)
#pragma GCC optimize("no-tree-vectorize")
#endif



+ 1
- 1
kernel/x86_64/zgemv_t_4.c View File

@@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "common.h"

#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11)
#if (defined(__GNUC__) && __GNUC__ > 11)
#pragma GCC optimize("no-tree-vectorize")
#endif



+ 0
- 4
lapack/potrf/potrf_parallel.c View File

@@ -80,10 +80,6 @@ static FLOAT dm1 = -1.;
#define DIVIDE_RATE 2
#endif

#ifndef SWITCH_RATIO
#define SWITCH_RATIO 2
#endif

#ifndef LOWER
#define TRANS
#endif


+ 22
- 2
param.h View File

@@ -1,5 +1,5 @@
/*****************************************************************************
Copyright (c) 2011-2014, The OpenBLAS Project
Copyright (c) 2011-2023, The OpenBLAS Project
All rights reserved.

Redistribution and use in source and binary forms, with or without
@@ -3338,6 +3338,12 @@ is a big desktop or server with abundant cache rather than a phone or embedded d

#elif defined(NEOVERSEN1)

#if defined(XDOUBLE) || defined(DOUBLE)
#define SWITCH_RATIO 8
#else
#define SWITCH_RATIO 16
#endif

#define SGEMM_DEFAULT_UNROLL_M 16
#define SGEMM_DEFAULT_UNROLL_N 4

@@ -3367,7 +3373,11 @@ is a big desktop or server with abundant cache rather than a phone or embedded d

#elif defined(NEOVERSEV1)

#define SWITCH_RATIO 16
#if defined(XDOUBLE) || defined(DOUBLE)
#define SWITCH_RATIO 8
#else
#define SWITCH_RATIO 16
#endif

#define SGEMM_DEFAULT_UNROLL_M 16
#define SGEMM_DEFAULT_UNROLL_N 4
@@ -3398,6 +3408,12 @@ is a big desktop or server with abundant cache rather than a phone or embedded d

#elif defined(NEOVERSEN2)

#if defined(XDOUBLE) || defined(DOUBLE)
#define SWITCH_RATIO 8
#else
#define SWITCH_RATIO 16
#endif

#undef SBGEMM_ALIGN_K
#define SBGEMM_ALIGN_K 4

@@ -3838,6 +3854,10 @@ Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy rout

#endif

#ifndef SWITCH_RATIO
#define SWITCH_RATIO 2
#endif

#ifndef QGEMM_DEFAULT_UNROLL_M
#define QGEMM_DEFAULT_UNROLL_M 2
#endif


Loading…
Cancel
Save