fix: resolve the compilation failure when zfh instructions are unavailable

- Modify the macro conditions in Makefile.system. - Delete development test code. Related to issue #5279.
7 months ago · 4e1a381e5b
--- a/Makefile.rule
+++ b/Makefile.rule
@@ -309,7 +309,7 @@ COMMON_PROF = -pg
 # BUILD_BFLOAT16 = 1

 # If you want to enable the experimental HFLOAT16 support
 BUILD_HFLOAT16 = 1
 # BUILD_HFLOAT16 = 1

 # Set the thread number threshold beyond which the job array for the threaded level3 BLAS
 # will be allocated on the heap rather than the stack. (This array alone requires 
--- a/Makefile.system
+++ b/Makefile.system
@@ -280,7 +280,6 @@ GEMM_GEMV_FORWARD_BF16 = 1
 endif
 ifeq ($(ARCH), riscv)
 GEMM_GEMV_FORWARD = 1
 BUILD_HFLOAT16 = 1
 endif
 ifeq ($(ARCH), power)
 GEMM_GEMV_FORWARD = 1
--- a/install/generate.py
+++ b/install/generate.py
@@ -1,58 +0,0 @@
 import numpy as np
 import torch
 # Matrix dimensions: A is M x K, B is K x N, so the product C is M x N.
 M, K, N = 31, 31, 31  # may be changed to a larger size

 # Generate random input matrices of type float16.
 # Entries are drawn as integers in [0, 11) and then cast to float16.
 A = np.random.randint(0, 11, size=(M, K)).astype(np.float16)
 B = np.random.randint(0, 11, size=(K, N)).astype(np.float16)
 # Compute the reference product with torch in float16 on the 'cuda' device
 # (this script therefore needs a CUDA-capable torch installation to run).
 A_torch = torch.tensor(A, dtype=torch.float16, device='cuda')
 B_torch = torch.tensor(B, dtype=torch.float16, device='cuda')
 C_torch = torch.matmul(A_torch, B_torch)
 # Copy the result back to host memory and widen it to float32.
 C_ref = C_torch.cpu().numpy().astype(np.float32)

 def format_array_c(name, array, c_type="hfloat16"):
    """Render ``array`` as a single-line C array definition.

    The array is flattened, every element is printed with five digits after
    the decimal point, and the result has the form
    ``<c_type> <name>[<count>] = { v0, v1, ... };`` followed by a newline.

    Args:
        name: Identifier to use for the C array.
        array: Object exposing ``flatten()`` (presumably a numpy array --
            verify against callers).
        c_type: C element type written in front of the identifier.

    Returns:
        The C source line as a string, terminated by ``"\n"``.
    """
    values = array.flatten()
    count = len(values)
    literals = [format(value, ".5f") for value in values]
    body = ", ".join(literals)
    return f"{c_type} {name}[{count}] = {{ {body} }};\n"

 def format_array_c_float(name, array):
    """Render ``array`` as a single-line C ``float`` array definition.

    The array is flattened and each element is written with five digits
    after the decimal point, giving
    ``float <name>[<count>] = { v0, v1, ... };`` followed by a newline.

    Args:
        name: Identifier to use for the C array.
        array: Object exposing ``flatten()`` (presumably a numpy array --
            verify against callers).

    Returns:
        The C source line as a string, terminated by ``"\n"``.
    """
    items = array.flatten()
    size = len(items)
    joined = ", ".join(map("{:.5f}".format, items))
    return f"float {name}[{size}] = {{ {joined} }};\n"

 # Write the C file.
 # The output is a standalone C program ("generated_test.c", created in the
 # current working directory) that calls cblas_shgemm on the inputs generated
 # above and prints the resulting matrix.
 with open("generated_test.c", "w") as f:
    f.write('#include <stdio.h>\n')
    f.write('#include <stdlib.h>\n')
    f.write('#include <string.h>\n')
    f.write('#include <cblas.h>\n\n')

    # Problem dimensions and GEMM scalars as C constants.
    f.write(f"const int M = {M}, K = {K}, N = {N};\n")
    f.write("const float alpha = 1.0f, beta = 0.0f;\n\n")

    # Inputs use format_array_c's default element type ("hfloat16");
    # NOTE(review): that type is presumably provided by cblas.h -- confirm.
    f.write(format_array_c("A", A))
    f.write(format_array_c("B", B))
    # Output buffer is a zero-initialised float array of M*N elements.
    f.write(f"float C[{M*N}] = {{ 0 }};\n\n")

    # Row-major, non-transposed GEMM: leading dimensions are K for A and
    # N for B and C.
    f.write("int main() {\n")
    f.write("    cblas_shgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,\n")
    f.write("                 M, N, K,\n")
    f.write("                 alpha,\n")
    f.write("                 A, K,\n")
    f.write("                 B, N,\n")
    f.write("                 beta,\n")
    f.write("                 C, N);\n\n")

    # Print C with one matrix row (N values) per output line.
    f.write('    printf("Result C = A * B:\\n");\n')
    f.write("    for (int i = 0; i < M * N; i++) {\n")
    f.write("        printf(\"%.5f \", C[i]);\n")
    f.write("        if ((i + 1) % N == 0) printf(\"\\n\");\n")
    f.write("    }\n")
    f.write("    return 0;\n")
    f.write("}\n\n")

    # The reference result is emitted only as a trailing C comment; the
    # generated program does not compare its output against it.
    f.write("// Reference result computed in Python:\n")
    c_ref_flat = ", ".join(f"{x:.5f}" for x in C_ref.flatten())
    f.write(f"// C_ref = {{ {c_ref_flat} }}\n")

--- a/install/generated_test
+++ b/install/generated_test
--- a/install/generated_test.c
+++ b/install/generated_test.c
--- a/install/include/cblas.h
+++ b/install/include/cblas.h
@@ -1,457 +0,0 @@
 #ifndef CBLAS_H
 #define CBLAS_H

 #include <stddef.h>
 #include "openblas_config.h"

 #ifdef __cplusplus
 extern "C" {
 	/* Assume C declarations for C++ */
 #endif  /* __cplusplus */

 /* Set the number of threads at runtime. */
 void openblas_set_num_threads(int num_threads);
 void goto_set_num_threads(int num_threads);
 int openblas_set_num_threads_local(int num_threads);

 /* Get the number of threads at runtime. */
 int openblas_get_num_threads(void);

 /*Get the number of physical processors (cores).*/
 int openblas_get_num_procs(void);

 /* Get the build configuration at runtime. */
 char* openblas_get_config(void);

 /* Get the CPU core name at runtime. */
 char* openblas_get_corename(void);

 /*Set the threading backend to a custom callback.*/
 typedef void (*openblas_dojob_callback)(int thread_num, void *jobdata, int dojob_data);
 typedef void (*openblas_threads_callback)(int sync, openblas_dojob_callback dojob, int numjobs, size_t jobdata_elsize, void *jobdata, int dojob_data);
 void openblas_set_threads_callback_function(openblas_threads_callback callback);

 #ifdef OPENBLAS_OS_LINUX
 /* Sets thread affinity for OpenBLAS threads. `thread_idx` is in [0, openblas_get_num_threads()-1]. */
 int openblas_setaffinity(int thread_idx, size_t cpusetsize, cpu_set_t* cpu_set);
 /* Queries thread affinity for OpenBLAS threads. `thread_idx` is in [0, openblas_get_num_threads()-1]. */
 int openblas_getaffinity(int thread_idx, size_t cpusetsize, cpu_set_t* cpu_set);
 #endif

 /* Get the parallelization type which is used by OpenBLAS */
 int openblas_get_parallel(void);
 /* OpenBLAS is compiled for sequential use  */
 #define OPENBLAS_SEQUENTIAL  0
 /* OpenBLAS is compiled using normal threading model */
 #define OPENBLAS_THREAD  1
 /* OpenBLAS is compiled using OpenMP threading model */
 #define OPENBLAS_OPENMP 2


 /*
 * Since all of GotoBlas was written without const,
 * we disable it at build time.
 */
 #ifndef OPENBLAS_CONST
 # define OPENBLAS_CONST const
 #endif


 #define CBLAS_INDEX size_t

 typedef enum CBLAS_ORDER     {CblasRowMajor=101, CblasColMajor=102} CBLAS_ORDER;
 typedef enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, CblasConjNoTrans=114} CBLAS_TRANSPOSE;
 typedef enum CBLAS_UPLO      {CblasUpper=121, CblasLower=122} CBLAS_UPLO;
 typedef enum CBLAS_DIAG      {CblasNonUnit=131, CblasUnit=132} CBLAS_DIAG;
 typedef enum CBLAS_SIDE      {CblasLeft=141, CblasRight=142} CBLAS_SIDE;
 typedef CBLAS_ORDER CBLAS_LAYOUT;
 	
 float  cblas_sdsdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy);
 double cblas_dsdot (OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy);
 float  cblas_sdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST float  *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float  *y, OPENBLAS_CONST blasint incy);
 double cblas_ddot(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST double *y, OPENBLAS_CONST blasint incy);

 openblas_complex_float  cblas_cdotu(OPENBLAS_CONST blasint n, OPENBLAS_CONST void  *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST void  *y, OPENBLAS_CONST blasint incy);
 openblas_complex_float  cblas_cdotc(OPENBLAS_CONST blasint n, OPENBLAS_CONST void  *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST void  *y, OPENBLAS_CONST blasint incy);
 openblas_complex_double cblas_zdotu(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST void *y, OPENBLAS_CONST blasint incy);
 openblas_complex_double cblas_zdotc(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST void *y, OPENBLAS_CONST blasint incy);

 void  cblas_cdotu_sub(OPENBLAS_CONST blasint n, OPENBLAS_CONST void  *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST void  *y, OPENBLAS_CONST blasint incy, void  *ret);
 void  cblas_cdotc_sub(OPENBLAS_CONST blasint n, OPENBLAS_CONST void  *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST void  *y, OPENBLAS_CONST blasint incy, void  *ret);
 void  cblas_zdotu_sub(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST void *y, OPENBLAS_CONST blasint incy, void *ret);
 void  cblas_zdotc_sub(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST void *y, OPENBLAS_CONST blasint incy, void *ret);

 float  cblas_sasum (OPENBLAS_CONST blasint n, OPENBLAS_CONST float  *x, OPENBLAS_CONST blasint incx);
 double cblas_dasum (OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
 float  cblas_scasum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void  *x, OPENBLAS_CONST blasint incx);
 double cblas_dzasum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);

 float  cblas_ssum (OPENBLAS_CONST blasint n, OPENBLAS_CONST float  *x, OPENBLAS_CONST blasint incx);
 double cblas_dsum (OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
 float  cblas_scsum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void  *x, OPENBLAS_CONST blasint incx);
 double cblas_dzsum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);

 float  cblas_snrm2 (OPENBLAS_CONST blasint N, OPENBLAS_CONST float  *X, OPENBLAS_CONST blasint incX);
 double cblas_dnrm2 (OPENBLAS_CONST blasint N, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX);
 float  cblas_scnrm2(OPENBLAS_CONST blasint N, OPENBLAS_CONST void  *X, OPENBLAS_CONST blasint incX);
 double cblas_dznrm2(OPENBLAS_CONST blasint N, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX);

 CBLAS_INDEX cblas_isamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float  *x, OPENBLAS_CONST blasint incx);
 CBLAS_INDEX cblas_idamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
 CBLAS_INDEX cblas_icamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void  *x, OPENBLAS_CONST blasint incx);
 CBLAS_INDEX cblas_izamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);

 CBLAS_INDEX cblas_isamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float  *x, OPENBLAS_CONST blasint incx);
 CBLAS_INDEX cblas_idamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
 CBLAS_INDEX cblas_icamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void  *x, OPENBLAS_CONST blasint incx);
 CBLAS_INDEX cblas_izamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);

 float cblas_samax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float  *x, OPENBLAS_CONST blasint incx);
 double cblas_damax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
 float cblas_scamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void  *x, OPENBLAS_CONST blasint incx);
 double cblas_dzamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);

 float cblas_samin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float  *x, OPENBLAS_CONST blasint incx);
 double cblas_damin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
 float cblas_scamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void  *x, OPENBLAS_CONST blasint incx);
 double cblas_dzamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);

 CBLAS_INDEX cblas_ismax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float  *x, OPENBLAS_CONST blasint incx);
 CBLAS_INDEX cblas_idmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
 CBLAS_INDEX cblas_icmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void  *x, OPENBLAS_CONST blasint incx);
 CBLAS_INDEX cblas_izmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);

 CBLAS_INDEX cblas_ismin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float  *x, OPENBLAS_CONST blasint incx);
 CBLAS_INDEX cblas_idmin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
 CBLAS_INDEX cblas_icmin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void  *x, OPENBLAS_CONST blasint incx);
 CBLAS_INDEX cblas_izmin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);

 void cblas_saxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy);
 void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy);
 void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
 void cblas_zaxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);

 void cblas_caxpyc(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
 void cblas_zaxpyc(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);

 void cblas_scopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy);
 void cblas_dcopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy);
 void cblas_ccopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
 void cblas_zcopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);

 void cblas_sswap(OPENBLAS_CONST blasint n, float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy);
 void cblas_dswap(OPENBLAS_CONST blasint n, double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy);
 void cblas_cswap(OPENBLAS_CONST blasint n, void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
 void cblas_zswap(OPENBLAS_CONST blasint n, void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);

 void cblas_srot(OPENBLAS_CONST blasint N, float *X, OPENBLAS_CONST blasint incX, float *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float c, OPENBLAS_CONST float s);
 void cblas_drot(OPENBLAS_CONST blasint N, double *X, OPENBLAS_CONST blasint incX, double *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double c, OPENBLAS_CONST double  s);
 void cblas_csrot(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float c, OPENBLAS_CONST float s);
 void cblas_zdrot(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double c, OPENBLAS_CONST double s);

 void cblas_srotg(float *a, float *b, float *c, float *s);
 void cblas_drotg(double *a, double *b, double *c, double *s);
 void cblas_crotg(void *a, void *b, float *c, void *s);
 void cblas_zrotg(void *a, void *b, double *c, void *s);


 void cblas_srotm(OPENBLAS_CONST blasint N, float *X, OPENBLAS_CONST blasint incX, float *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float *P);
 void cblas_drotm(OPENBLAS_CONST blasint N, double *X, OPENBLAS_CONST blasint incX, double *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double *P);

 void cblas_srotmg(float *d1, float *d2, float *b1, OPENBLAS_CONST float b2, float *P);
 void cblas_drotmg(double *d1, double *d2, double *b1, OPENBLAS_CONST double b2, double *P);

 void cblas_sscal(OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, float *X, OPENBLAS_CONST blasint incX);
 void cblas_dscal(OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, double *X, OPENBLAS_CONST blasint incX);
 void cblas_cscal(OPENBLAS_CONST blasint N, OPENBLAS_CONST void *alpha, void *X, OPENBLAS_CONST blasint incX);
 void cblas_zscal(OPENBLAS_CONST blasint N, OPENBLAS_CONST void *alpha, void *X, OPENBLAS_CONST blasint incX);
 void cblas_csscal(OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, void *X, OPENBLAS_CONST blasint incX);
 void cblas_zdscal(OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, void *X, OPENBLAS_CONST blasint incX);

 void cblas_sgemv(OPENBLAS_CONST enum CBLAS_ORDER order,  OPENBLAS_CONST enum CBLAS_TRANSPOSE trans,  OPENBLAS_CONST blasint m, OPENBLAS_CONST blasint n,
 		 OPENBLAS_CONST float alpha, OPENBLAS_CONST float  *a, OPENBLAS_CONST blasint lda,  OPENBLAS_CONST float  *x, OPENBLAS_CONST blasint incx,  OPENBLAS_CONST float beta,  float  *y, OPENBLAS_CONST blasint incy);
 void cblas_dgemv(OPENBLAS_CONST enum CBLAS_ORDER order,  OPENBLAS_CONST enum CBLAS_TRANSPOSE trans,  OPENBLAS_CONST blasint m, OPENBLAS_CONST blasint n,
 		 OPENBLAS_CONST double alpha, OPENBLAS_CONST double  *a, OPENBLAS_CONST blasint lda,  OPENBLAS_CONST double  *x, OPENBLAS_CONST blasint incx,  OPENBLAS_CONST double beta,  double  *y, OPENBLAS_CONST blasint incy);
 void cblas_cgemv(OPENBLAS_CONST enum CBLAS_ORDER order,  OPENBLAS_CONST enum CBLAS_TRANSPOSE trans,  OPENBLAS_CONST blasint m, OPENBLAS_CONST blasint n,
 		 OPENBLAS_CONST void *alpha, OPENBLAS_CONST void  *a, OPENBLAS_CONST blasint lda,  OPENBLAS_CONST void  *x, OPENBLAS_CONST blasint incx,  OPENBLAS_CONST void *beta,  void  *y, OPENBLAS_CONST blasint incy);
 void cblas_zgemv(OPENBLAS_CONST enum CBLAS_ORDER order,  OPENBLAS_CONST enum CBLAS_TRANSPOSE trans,  OPENBLAS_CONST blasint m, OPENBLAS_CONST blasint n,
 		 OPENBLAS_CONST void *alpha, OPENBLAS_CONST void  *a, OPENBLAS_CONST blasint lda,  OPENBLAS_CONST void  *x, OPENBLAS_CONST blasint incx,  OPENBLAS_CONST void *beta,  void  *y, OPENBLAS_CONST blasint incy);

 void cblas_sger (OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST float   alpha, OPENBLAS_CONST float  *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST float  *Y, OPENBLAS_CONST blasint incY, float  *A, OPENBLAS_CONST blasint lda);
 void cblas_dger (OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST double  alpha, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST double *Y, OPENBLAS_CONST blasint incY, double *A, OPENBLAS_CONST blasint lda);
 void cblas_cgeru(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST void  *alpha, OPENBLAS_CONST void  *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST void  *Y, OPENBLAS_CONST blasint incY, void  *A, OPENBLAS_CONST blasint lda);
 void cblas_cgerc(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST void  *alpha, OPENBLAS_CONST void  *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST void  *Y, OPENBLAS_CONST blasint incY, void  *A, OPENBLAS_CONST blasint lda);
 void cblas_zgeru(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST void *Y, OPENBLAS_CONST blasint incY, void *A, OPENBLAS_CONST blasint lda);
 void cblas_zgerc(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST void *Y, OPENBLAS_CONST blasint incY, void *A, OPENBLAS_CONST blasint lda);

 void cblas_strsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint N, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, float *X, OPENBLAS_CONST blasint incX);
 void cblas_dtrsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint N, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, double *X, OPENBLAS_CONST blasint incX);
 void cblas_ctrsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, void *X, OPENBLAS_CONST blasint incX);
 void cblas_ztrsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, void *X, OPENBLAS_CONST blasint incX);

 void cblas_strmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint N, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, float *X, OPENBLAS_CONST blasint incX);
 void cblas_dtrmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint N, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, double *X, OPENBLAS_CONST blasint incX);
 void cblas_ctrmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, void *X, OPENBLAS_CONST blasint incX);
 void cblas_ztrmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, void *X, OPENBLAS_CONST blasint incX);

 void cblas_ssyr(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX, float *A, OPENBLAS_CONST blasint lda);
 void cblas_dsyr(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX, double *A, OPENBLAS_CONST blasint lda);
 void cblas_cher(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX, void *A, OPENBLAS_CONST blasint lda);
 void cblas_zher(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX, void *A, OPENBLAS_CONST blasint lda);

 void cblas_ssyr2(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo,OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *X,
                OPENBLAS_CONST blasint incX, OPENBLAS_CONST float *Y, OPENBLAS_CONST blasint incY, float *A, OPENBLAS_CONST blasint lda);
 void cblas_dsyr2(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *X,
                OPENBLAS_CONST blasint incX, OPENBLAS_CONST double *Y, OPENBLAS_CONST blasint incY, double *A, OPENBLAS_CONST blasint lda);
 void cblas_cher2(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX,
                OPENBLAS_CONST void *Y, OPENBLAS_CONST blasint incY, void *A, OPENBLAS_CONST blasint lda);
 void cblas_zher2(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX,
                OPENBLAS_CONST void *Y, OPENBLAS_CONST blasint incY, void *A, OPENBLAS_CONST blasint lda);

 void cblas_sgbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N,
                 OPENBLAS_CONST blasint KL, OPENBLAS_CONST blasint KU, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST float beta, float *Y, OPENBLAS_CONST blasint incY);
 void cblas_dgbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N,
                 OPENBLAS_CONST blasint KL, OPENBLAS_CONST blasint KU, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST double beta, double *Y, OPENBLAS_CONST blasint incY);
 void cblas_cgbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N,
                 OPENBLAS_CONST blasint KL, OPENBLAS_CONST blasint KU, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST void *beta, void *Y, OPENBLAS_CONST blasint incY);
 void cblas_zgbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N,
                 OPENBLAS_CONST blasint KL, OPENBLAS_CONST blasint KU, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST void *beta, void *Y, OPENBLAS_CONST blasint incY);

 void cblas_ssbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A,
                 OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST float beta, float *Y, OPENBLAS_CONST blasint incY);
 void cblas_dsbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A,
                 OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST double beta, double *Y, OPENBLAS_CONST blasint incY);


 void cblas_stbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag,
                 OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, float *X, OPENBLAS_CONST blasint incX);
 void cblas_dtbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag,
                 OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, double *X, OPENBLAS_CONST blasint incX);
 void cblas_ctbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag,
                 OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, void *X, OPENBLAS_CONST blasint incX);
 void cblas_ztbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag,
                 OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, void *X, OPENBLAS_CONST blasint incX);

 void cblas_stbsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag,
                 OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, float *X, OPENBLAS_CONST blasint incX);
 void cblas_dtbsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag,
                 OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, double *X, OPENBLAS_CONST blasint incX);
 void cblas_ctbsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag,
                 OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, void *X, OPENBLAS_CONST blasint incX);
 void cblas_ztbsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag,
                 OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, void *X, OPENBLAS_CONST blasint incX);

 void cblas_stpmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag,
                 OPENBLAS_CONST blasint N, OPENBLAS_CONST float *Ap, float *X, OPENBLAS_CONST blasint incX);
 void cblas_dtpmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag,
                 OPENBLAS_CONST blasint N, OPENBLAS_CONST double *Ap, double *X, OPENBLAS_CONST blasint incX);
 void cblas_ctpmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag,
                 OPENBLAS_CONST blasint N, OPENBLAS_CONST void *Ap, void *X, OPENBLAS_CONST blasint incX);
 void cblas_ztpmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag,
                 OPENBLAS_CONST blasint N, OPENBLAS_CONST void *Ap, void *X, OPENBLAS_CONST blasint incX);

 void cblas_stpsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag,
                 OPENBLAS_CONST blasint N, OPENBLAS_CONST float *Ap, float *X, OPENBLAS_CONST blasint incX);
 void cblas_dtpsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag,
                 OPENBLAS_CONST blasint N, OPENBLAS_CONST double *Ap, double *X, OPENBLAS_CONST blasint incX);
 void cblas_ctpsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag,
                 OPENBLAS_CONST blasint N, OPENBLAS_CONST void *Ap, void *X, OPENBLAS_CONST blasint incX);
 void cblas_ztpsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag,
                 OPENBLAS_CONST blasint N, OPENBLAS_CONST void *Ap, void *X, OPENBLAS_CONST blasint incX);

 void cblas_ssymv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A,
                 OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST float beta, float *Y, OPENBLAS_CONST blasint incY);
 void cblas_dsymv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A,
                 OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST double beta, double *Y, OPENBLAS_CONST blasint incY);
 void cblas_chemv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A,
                 OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST void *beta, void *Y, OPENBLAS_CONST blasint incY);
 void cblas_zhemv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A,
                 OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST void *beta, void *Y, OPENBLAS_CONST blasint incY);


 void cblas_sspmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *Ap,
                 OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST float beta, float *Y, OPENBLAS_CONST blasint incY);
 void cblas_dspmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *Ap,
                 OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST double beta, double *Y, OPENBLAS_CONST blasint incY);

 void cblas_sspr(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX, float *Ap);
 void cblas_dspr(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX, double *Ap);

 void cblas_chpr(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX, void *A);
 void cblas_zhpr(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, OPENBLAS_CONST void *X,OPENBLAS_CONST blasint incX, void *A);

 void cblas_sspr2(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST float *Y, OPENBLAS_CONST blasint incY, float *A);
 void cblas_dspr2(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST double *Y, OPENBLAS_CONST blasint incY, double *A);
 void cblas_chpr2(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST void *Y, OPENBLAS_CONST blasint incY, void *Ap);
 void cblas_zhpr2(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST void *Y, OPENBLAS_CONST blasint incY, void *Ap);

 void cblas_chbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
 		 OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST void *beta, void *Y, OPENBLAS_CONST blasint incY);
 void cblas_zhbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
 		 OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST void *beta, void *Y, OPENBLAS_CONST blasint incY);

 void cblas_chpmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N,
 		 OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *Ap, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST void *beta, void *Y, OPENBLAS_CONST blasint incY);
 void cblas_zhpmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N,
 		 OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *Ap, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST void *beta, void *Y, OPENBLAS_CONST blasint incY);

 void cblas_sgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
 		 OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc);
 void cblas_dgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
 		 OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double beta, double *C, OPENBLAS_CONST blasint ldc);
 void cblas_cgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
 		 OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);
 void cblas_cgemm3m(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
 		 OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);
 void cblas_zgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
 		 OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);
 void cblas_zgemm3m(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
 		 OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);

 void cblas_sgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K,
 		 OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc);
 void cblas_dgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K,
 		 OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double beta, double *C, OPENBLAS_CONST blasint ldc);
 void cblas_cgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K,
 		 OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);
 void cblas_zgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K,
 		 OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);

 void cblas_ssymm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N,
                 OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc);
 void cblas_dsymm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N,
                 OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double beta, double *C, OPENBLAS_CONST blasint ldc);
 void cblas_csymm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N,
                 OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);
 void cblas_zsymm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N,
                 OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);

 void cblas_ssyrk(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans,
 		 OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc);
 void cblas_dsyrk(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans,
 		 OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double beta, double *C, OPENBLAS_CONST blasint ldc);
 void cblas_csyrk(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans,
 		 OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);
 void cblas_zsyrk(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans,
 		 OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);

 void cblas_ssyr2k(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans,
 		  OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc);
 void cblas_dsyr2k(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans,
 		  OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double beta, double *C, OPENBLAS_CONST blasint ldc);
 void cblas_csyr2k(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans,
 		  OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);
 void cblas_zsyr2k(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans,
 		  OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);

 void cblas_strmm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA,
                 OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, float *B, OPENBLAS_CONST blasint ldb);
 void cblas_dtrmm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA,
                 OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, double *B, OPENBLAS_CONST blasint ldb);
 void cblas_ctrmm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA,
                 OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, void *B, OPENBLAS_CONST blasint ldb);
 void cblas_ztrmm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA,
                 OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, void *B, OPENBLAS_CONST blasint ldb);

 void cblas_strsm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA,
                 OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, float *B, OPENBLAS_CONST blasint ldb);
 void cblas_dtrsm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA,
                 OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, double *B, OPENBLAS_CONST blasint ldb);
 void cblas_ctrsm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA,
                 OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, void *B, OPENBLAS_CONST blasint ldb);
 void cblas_ztrsm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA,
                 OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, void *B, OPENBLAS_CONST blasint ldb);

 void cblas_chemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N,
                 OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);
 void cblas_zhemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N,
                 OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);

 void cblas_cherk(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
                 OPENBLAS_CONST float alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float beta, void *C, OPENBLAS_CONST blasint ldc);
 void cblas_zherk(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
                 OPENBLAS_CONST double alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double beta, void *C, OPENBLAS_CONST blasint ldc);

 void cblas_cher2k(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
                  OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, void *C, OPENBLAS_CONST blasint ldc);
 void cblas_zher2k(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
                  OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double beta, void *C, OPENBLAS_CONST blasint ldc);

 void cblas_xerbla(blasint p, OPENBLAS_CONST char *rout, OPENBLAS_CONST char *form, ...);

 /*** BLAS extensions ***/

 void cblas_saxpby(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx,OPENBLAS_CONST float beta, float *y, OPENBLAS_CONST blasint incy);

 void cblas_daxpby(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx,OPENBLAS_CONST double beta, double *y, OPENBLAS_CONST blasint incy);

 void cblas_caxpby(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx,OPENBLAS_CONST void *beta, void *y, OPENBLAS_CONST blasint incy);

 void cblas_zaxpby(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx,OPENBLAS_CONST void *beta, void *y, OPENBLAS_CONST blasint incy);

 void cblas_somatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float calpha, OPENBLAS_CONST float *a, 
 		     OPENBLAS_CONST blasint clda, float *b, OPENBLAS_CONST blasint cldb); 
 void cblas_domatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double calpha, OPENBLAS_CONST double *a,
 		     OPENBLAS_CONST blasint clda, double *b, OPENBLAS_CONST blasint cldb); 
 void cblas_comatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float* calpha, OPENBLAS_CONST float* a, 
 		     OPENBLAS_CONST blasint clda, float*b, OPENBLAS_CONST blasint cldb); 
 void cblas_zomatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double* calpha, OPENBLAS_CONST double* a, 
 		     OPENBLAS_CONST blasint clda,  double *b, OPENBLAS_CONST blasint cldb); 

 void cblas_simatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float calpha, float *a, 
 		     OPENBLAS_CONST blasint clda, OPENBLAS_CONST blasint cldb); 
 void cblas_dimatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double calpha, double *a,
 		     OPENBLAS_CONST blasint clda, OPENBLAS_CONST blasint cldb); 
 void cblas_cimatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float* calpha, float* a, 
 		     OPENBLAS_CONST blasint clda, OPENBLAS_CONST blasint cldb); 
 void cblas_zimatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double* calpha, double* a, 
 		     OPENBLAS_CONST blasint clda, OPENBLAS_CONST blasint cldb); 

 void cblas_sgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float calpha, OPENBLAS_CONST float *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST float cbeta, 
 		  float *c, OPENBLAS_CONST blasint cldc); 
 void cblas_dgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double calpha, OPENBLAS_CONST double *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST double cbeta, 
 		  double *c, OPENBLAS_CONST blasint cldc); 
 void cblas_cgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float *calpha, OPENBLAS_CONST float *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST float *cbeta, 
 		  float *c, OPENBLAS_CONST blasint cldc); 
 void cblas_zgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double *calpha, OPENBLAS_CONST double *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST double *cbeta, 
 		  double *c, OPENBLAS_CONST blasint cldc); 

 void cblas_sgemm_batch(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransA_array, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransB_array, OPENBLAS_CONST blasint * M_array, OPENBLAS_CONST blasint * N_array, OPENBLAS_CONST blasint * K_array,
 		       OPENBLAS_CONST float * alpha_array, OPENBLAS_CONST float ** A_array, OPENBLAS_CONST blasint * lda_array, OPENBLAS_CONST float ** B_array, OPENBLAS_CONST blasint * ldb_array, OPENBLAS_CONST float * beta_array, float ** C_array, OPENBLAS_CONST blasint * ldc_array, OPENBLAS_CONST blasint group_count, OPENBLAS_CONST blasint * group_size);

 void cblas_dgemm_batch(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransA_array, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransB_array, OPENBLAS_CONST blasint * M_array, OPENBLAS_CONST blasint * N_array, OPENBLAS_CONST blasint * K_array,
 		       OPENBLAS_CONST double * alpha_array, OPENBLAS_CONST double ** A_array, OPENBLAS_CONST blasint * lda_array, OPENBLAS_CONST double ** B_array, OPENBLAS_CONST blasint * ldb_array, OPENBLAS_CONST double * beta_array, double ** C_array, OPENBLAS_CONST blasint * ldc_array, OPENBLAS_CONST blasint group_count, OPENBLAS_CONST blasint * group_size);

 void cblas_cgemm_batch(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransA_array, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransB_array, OPENBLAS_CONST blasint * M_array, OPENBLAS_CONST blasint * N_array, OPENBLAS_CONST blasint * K_array,
 		       OPENBLAS_CONST void * alpha_array, OPENBLAS_CONST void ** A_array, OPENBLAS_CONST blasint * lda_array, OPENBLAS_CONST void ** B_array, OPENBLAS_CONST blasint * ldb_array, OPENBLAS_CONST void * beta_array, void ** C_array, OPENBLAS_CONST blasint * ldc_array, OPENBLAS_CONST blasint group_count, OPENBLAS_CONST blasint * group_size);

 void cblas_zgemm_batch(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransA_array, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransB_array, OPENBLAS_CONST blasint * M_array, OPENBLAS_CONST blasint * N_array, OPENBLAS_CONST blasint * K_array,
 		       OPENBLAS_CONST void * alpha_array, OPENBLAS_CONST void ** A_array, OPENBLAS_CONST blasint * lda_array, OPENBLAS_CONST void ** B_array, OPENBLAS_CONST blasint * ldb_array, OPENBLAS_CONST void * beta_array, void ** C_array, OPENBLAS_CONST blasint * ldc_array, OPENBLAS_CONST blasint group_count, OPENBLAS_CONST blasint * group_size);

 /*** BFLOAT16 and INT8 extensions ***/
 /* convert float array to BFLOAT16 array by rounding */
 void   cblas_sbstobf16(OPENBLAS_CONST blasint n, OPENBLAS_CONST float  *in, OPENBLAS_CONST blasint incin, bfloat16 *out, OPENBLAS_CONST blasint incout);
 /* convert double array to BFLOAT16 array by rounding */
 void   cblas_sbdtobf16(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *in, OPENBLAS_CONST blasint incin, bfloat16 *out, OPENBLAS_CONST blasint incout);
 /* convert BFLOAT16 array to float array */
 void   cblas_sbf16tos(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *in, OPENBLAS_CONST blasint incin, float  *out, OPENBLAS_CONST blasint incout);
 /* convert BFLOAT16 array to double array */
 void   cblas_dbf16tod(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *in, OPENBLAS_CONST blasint incin, double *out, OPENBLAS_CONST blasint incout);
 /* dot product of BFLOAT16 input arrays, with the result returned as float */
 float  cblas_sbdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST bfloat16 *y, OPENBLAS_CONST blasint incy);
 /* GEMV variant with BFLOAT16 inputs (matrix a, vector x); the scalars alpha and beta and the non-const vector y are float */
 void   cblas_sbgemv(OPENBLAS_CONST enum CBLAS_ORDER order,  OPENBLAS_CONST enum CBLAS_TRANSPOSE trans,  OPENBLAS_CONST blasint m, OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *a, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float beta, float *y, OPENBLAS_CONST blasint incy);

 /* GEMM variant with BFLOAT16 input matrices A and B; the scalars alpha and beta and the non-const matrix C are float */
 void   cblas_sbgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
 		    OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc);
 /* Batched form of cblas_sbgemm: transposes, dimensions, scalars, leading dimensions and matrix pointers are all
  * passed as arrays, with BFLOAT16 A/B matrices and float C matrices; Order is a single value for the whole call.
  * NOTE(review): presumably each array holds one entry per group (group_count groups) and group_size[i] gives the
  * number of matrices in group i -- confirm against the implementation. */
 void cblas_sbgemm_batch(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransA_array, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransB_array, OPENBLAS_CONST blasint * M_array, OPENBLAS_CONST blasint * N_array, OPENBLAS_CONST blasint * K_array,
 		       OPENBLAS_CONST float * alpha_array, OPENBLAS_CONST bfloat16 ** A_array, OPENBLAS_CONST blasint * lda_array, OPENBLAS_CONST bfloat16 ** B_array, OPENBLAS_CONST blasint * ldb_array, OPENBLAS_CONST float * beta_array, float ** C_array, OPENBLAS_CONST blasint * ldc_array, OPENBLAS_CONST blasint group_count, OPENBLAS_CONST blasint * group_size);

 /*** FLOAT16 extensions ***/
 /* GEMM variant with hfloat16 (half-precision) input matrices A and B; the scalars alpha and beta and the non-const matrix C are float */
 void cblas_shgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
 		    OPENBLAS_CONST float alpha, OPENBLAS_CONST hfloat16 *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST hfloat16 *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc);

 #ifdef __cplusplus
 }
 #endif  /* __cplusplus */

 #endif
--- a/install/include/f77blas.h
+++ b/install/include/f77blas.h
@@ -1,811 +0,0 @@
 #ifndef OPENBLAS_F77BLAS_H
 #define OPENBLAS_F77BLAS_H
 #include "openblas_config.h"
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
 /* without modification, are permitted provided that the following   */
 /* conditions are met:                                               */
 /*                                                                   */
 /*   1. Redistributions of source code must retain the above         */
 /*      copyright notice, this list of conditions and the following  */
 /*      disclaimer.                                                  */
 /*                                                                   */
 /*   2. Redistributions in binary form must reproduce the above      */
 /*      copyright notice, this list of conditions and the following  */
 /*      disclaimer in the documentation and/or other materials       */
 /*      provided with the distribution.                              */
 /*                                                                   */
 /*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
 /*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
 /*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
 /*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
 /*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
 /*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
 /*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
 /*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
 /*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
 /*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
 /*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
 /*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
 /*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
 /*    POSSIBILITY OF SUCH DAMAGE.                                    */
 /*                                                                   */
 /* The views and conclusions contained in the software and           */
 /* documentation are those of the authors and should not be          */
 /* interpreted as representing official policies, either expressed   */
 /* or implied, of The University of Texas at Austin.                 */
 /*********************************************************************/

 #ifndef ASSEMBLER

 #ifdef __cplusplus
 extern "C" {
 	/* Assume C declarations for C++ */
 #endif  /* __cplusplus */

 /* xerbla in the BLASFUNC-decorated namespace (by BLAS convention the error
    handler -- not verifiable from this header).  The middle argument is info. */
 int    BLASFUNC(xerbla)(char *, blasint *info, blasint);

 /* Trailing-underscore variant of the thread-count setter; the count is passed
    by pointer rather than by value. */
 void    openblas_set_num_threads_(int *);

 /* Set the threading backend to a custom callback. */
 /* openblas_dojob_callback: signature of a worker invoked with a thread number,
    an opaque job-data pointer and an int of caller data. */
 typedef void (*openblas_dojob_callback)(int thread_num, void *jobdata, int dojob_data);
 /* openblas_threads_callback: signature of the dispatcher; it receives the
    worker (dojob) together with the job count, per-job element size, the job
    data and the same int of caller data. */
 typedef void (*openblas_threads_callback)(int sync, openblas_dojob_callback dojob, int numjobs, size_t jobdata_elsize, void *jobdata, int dojob_data);
 /* Global of the dispatcher type; only declared here, defined elsewhere. */
 extern openblas_threads_callback openblas_threads_callback_;

 /* Dot-product prototypes (BLAS naming).  All arguments are pointers; the
    pattern is count, then (vector, stride) pairs -- presumably n, x, incx, y,
    incy as in reference BLAS.  FLOATRET is a macro defined elsewhere that
    selects the C return type used for single-precision results. */
 FLOATRET  BLASFUNC(sdot)  (blasint *, float  *, blasint *, float  *, blasint *);
 /* sdsdot takes one extra float* right after the count. */
 FLOATRET  BLASFUNC(sdsdot)(blasint *, float  *,        float  *, blasint *, float  *, blasint *);

 /* dsdot: float inputs, double result.  ddot / qdot: double / xdouble throughout. */
 double BLASFUNC(dsdot) (blasint *, float  *, blasint *, float  *, blasint *);
 double BLASFUNC(ddot)  (blasint *, double *, blasint *, double *, blasint *);
 xdouble BLASFUNC(qdot)  (blasint *, xdouble *, blasint *, xdouble *, blasint *);

 /* bfloat16 helpers: a dot product on bfloat16 vectors returning float, and
    conversions between bfloat16 and float/double.  Argument order in each is
    count, source, source stride, destination, destination stride (inferred
    from the pointer types -- confirm against the implementation). */
 float  BLASFUNC(sbdot)     (blasint *, bfloat16 *, blasint *, bfloat16 *, blasint *);
 void   BLASFUNC(sbstobf16) (blasint *, float *,    blasint *, bfloat16 *, blasint *);
 void   BLASFUNC(sbdtobf16) (blasint *, double *,   blasint *, bfloat16 *, blasint *);
 void   BLASFUNC(sbf16tos)  (blasint *, bfloat16 *, blasint *, float *,    blasint *);
 void   BLASFUNC(dbf16tod)  (blasint *, bfloat16 *, blasint *, double *,   blasint *);

 /* Complex dot products (?dotu / ?dotc) are declared in one of three shapes,
    chosen at compile time:
      - RETURN_BY_STRUCT: the result is returned by value as a local
        {r, i} struct type defined right here;
      - RETURN_BY_STACK:  the function returns void and the result is written
        through an extra leading openblas_complex_* pointer argument;
      - otherwise:        the result is returned by value as openblas_complex_*.
    The remaining arguments are identical across the three variants. */
 #ifdef RETURN_BY_STRUCT
 /* Plain real/imaginary pairs used only as return types in this branch. */
 typedef struct {
  float r, i;
 } myccomplex_t;

 typedef struct {
  double r, i;
 } myzcomplex_t;

 typedef struct {
  xdouble r, i;
 } myxcomplex_t;

 myccomplex_t    BLASFUNC(cdotu)  (blasint *, float  *, blasint *, float  *, blasint *);
 myccomplex_t    BLASFUNC(cdotc)  (blasint *, float  *, blasint *, float  *, blasint *);
 myzcomplex_t    BLASFUNC(zdotu)  (blasint *, double  *, blasint *, double  *, blasint *);
 myzcomplex_t    BLASFUNC(zdotc)  (blasint *, double  *, blasint *, double  *, blasint *);
 myxcomplex_t    BLASFUNC(xdotu)  (blasint *, xdouble  *, blasint *, xdouble  *, blasint *);
 myxcomplex_t    BLASFUNC(xdotc)  (blasint *, xdouble  *, blasint *, xdouble  *, blasint *);

 #elif defined RETURN_BY_STACK
 /* Result comes back through the first pointer argument. */
 void  BLASFUNC(cdotu)  (openblas_complex_float   *,  blasint *, float  * , blasint *, float  *,  blasint *);
 void  BLASFUNC(cdotc)  (openblas_complex_float   *,  blasint *, float  *,  blasint *, float  *,  blasint *);
 void  BLASFUNC(zdotu)  (openblas_complex_double  *, blasint *, double  *, blasint *, double  *, blasint *);
 void  BLASFUNC(zdotc)  (openblas_complex_double  *, blasint *, double  *, blasint *, double  *, blasint *);
 void  BLASFUNC(xdotu)  (openblas_complex_xdouble *, blasint *, xdouble  *, blasint *, xdouble  *, blasint *);
 void  BLASFUNC(xdotc)  (openblas_complex_xdouble *, blasint *, xdouble  *, blasint *, xdouble  *, blasint *);
 #else
 /* Default: return the library's complex type by value. */
 openblas_complex_float   BLASFUNC(cdotu)  (blasint *, float  *, blasint *, float  *, blasint *);
 openblas_complex_float   BLASFUNC(cdotc)  (blasint *, float  *, blasint *, float  *, blasint *);
 openblas_complex_double  BLASFUNC(zdotu)  (blasint *, double  *, blasint *, double  *, blasint *);
 openblas_complex_double  BLASFUNC(zdotc)  (blasint *, double  *, blasint *, double  *, blasint *);
 openblas_complex_xdouble BLASFUNC(xdotu)  (blasint *, xdouble  *, blasint *, xdouble  *, blasint *);
 openblas_complex_xdouble BLASFUNC(xdotc)  (blasint *, xdouble  *, blasint *, xdouble  *, blasint *);
 #endif

 /* axpy family.  Prefixes s/d/q use float/double/xdouble; c/z/x reuse the same
    element pointer types (complex data is passed as a pointer to its real
    component type).  Argument pattern: count, scalar, then two
    (vector, stride) pairs.  The ?axpyc names are an extension beyond reference
    BLAS -- presumably a conjugating variant; confirm in the implementation. */
 void    BLASFUNC(saxpy) (blasint *, float  *, float  *, blasint *, float  *, blasint *);
 void    BLASFUNC(daxpy) (blasint *, double *, double *, blasint *, double *, blasint *);
 void    BLASFUNC(qaxpy) (blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *);
 void    BLASFUNC(caxpy) (blasint *, float  *, float  *, blasint *, float  *, blasint *);
 void    BLASFUNC(zaxpy) (blasint *, double *, double *, blasint *, double *, blasint *);
 void    BLASFUNC(xaxpy) (blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *);
 void    BLASFUNC(caxpyc)(blasint *, float  *, float  *, blasint *, float  *, blasint *);
 void    BLASFUNC(zaxpyc)(blasint *, double *, double *, blasint *, double *, blasint *);
 void    BLASFUNC(xaxpyc)(blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *);

 /* copy family: count followed by two (vector, stride) pairs. */
 void    BLASFUNC(scopy) (blasint *, float  *, blasint *, float  *, blasint *);
 void    BLASFUNC(dcopy) (blasint *, double *, blasint *, double *, blasint *);
 void    BLASFUNC(qcopy) (blasint *, xdouble *, blasint *, xdouble *, blasint *);
 void    BLASFUNC(ccopy) (blasint *, float  *, blasint *, float  *, blasint *);
 void    BLASFUNC(zcopy) (blasint *, double *, blasint *, double *, blasint *);
 void    BLASFUNC(xcopy) (blasint *, xdouble *, blasint *, xdouble *, blasint *);

 /* swap family: same argument shape as copy. */
 void    BLASFUNC(sswap) (blasint *, float  *, blasint *, float  *, blasint *);
 void    BLASFUNC(dswap) (blasint *, double *, blasint *, double *, blasint *);
 void    BLASFUNC(qswap) (blasint *, xdouble *, blasint *, xdouble *, blasint *);
 void    BLASFUNC(cswap) (blasint *, float  *, blasint *, float  *, blasint *);
 void    BLASFUNC(zswap) (blasint *, double *, blasint *, double *, blasint *);
 void    BLASFUNC(xswap) (blasint *, xdouble *, blasint *, xdouble *, blasint *);

 /* asum family: scalar reductions over one vector (count, vector, stride).
    Two-letter prefixes (sc, dz, qx) denote complex input with a real result,
    following BLAS naming.  Single-precision results use FLOATRET. */
 FLOATRET  BLASFUNC(sasum) (blasint *, float  *, blasint *);
 FLOATRET  BLASFUNC(scasum)(blasint *, float  *, blasint *);
 double BLASFUNC(dasum) (blasint *, double *, blasint *);
 xdouble BLASFUNC(qasum) (blasint *, xdouble *, blasint *);
 double BLASFUNC(dzasum)(blasint *, double *, blasint *);
 xdouble BLASFUNC(qxasum)(blasint *, xdouble *, blasint *);

 /* sum family: same signatures as asum; not part of reference BLAS
    (presumably a plain sum without absolute values -- confirm). */
 FLOATRET  BLASFUNC(ssum) (blasint *, float  *, blasint *);
 FLOATRET  BLASFUNC(scsum)(blasint *, float  *, blasint *);
 double BLASFUNC(dsum) (blasint *, double *, blasint *);
 xdouble BLASFUNC(qsum) (blasint *, xdouble *, blasint *);
 double BLASFUNC(dzsum)(blasint *, double *, blasint *);
 xdouble BLASFUNC(qxsum)(blasint *, xdouble *, blasint *);

 /* Index-returning searches over one vector (count, vector, stride); each
    returns a blasint.  By BLAS convention i?amax is the position of the element
    of largest absolute value; the max / amin / min variants below are
    extensions with the same signature (exact semantics and index base are not
    visible in this header). */
 blasint    BLASFUNC(isamax)(blasint *, float  *, blasint *);
 blasint    BLASFUNC(idamax)(blasint *, double *, blasint *);
 blasint    BLASFUNC(iqamax)(blasint *, xdouble *, blasint *);
 blasint    BLASFUNC(icamax)(blasint *, float  *, blasint *);
 blasint    BLASFUNC(izamax)(blasint *, double *, blasint *);
 blasint    BLASFUNC(ixamax)(blasint *, xdouble *, blasint *);

 /* i?max */
 blasint    BLASFUNC(ismax) (blasint *, float  *, blasint *);
 blasint    BLASFUNC(idmax) (blasint *, double *, blasint *);
 blasint    BLASFUNC(iqmax) (blasint *, xdouble *, blasint *);
 blasint    BLASFUNC(icmax) (blasint *, float  *, blasint *);
 blasint    BLASFUNC(izmax) (blasint *, double *, blasint *);
 blasint    BLASFUNC(ixmax) (blasint *, xdouble *, blasint *);

 /* i?amin */
 blasint    BLASFUNC(isamin)(blasint *, float  *, blasint *);
 blasint    BLASFUNC(idamin)(blasint *, double *, blasint *);
 blasint    BLASFUNC(iqamin)(blasint *, xdouble *, blasint *);
 blasint    BLASFUNC(icamin)(blasint *, float  *, blasint *);
 blasint    BLASFUNC(izamin)(blasint *, double *, blasint *);
 blasint    BLASFUNC(ixamin)(blasint *, xdouble *, blasint *);

 /* i?min */
 blasint    BLASFUNC(ismin)(blasint *, float  *, blasint *);
 blasint    BLASFUNC(idmin)(blasint *, double *, blasint *);
 blasint    BLASFUNC(iqmin)(blasint *, xdouble *, blasint *);
 blasint    BLASFUNC(icmin)(blasint *, float  *, blasint *);
 blasint    BLASFUNC(izmin)(blasint *, double *, blasint *);
 blasint    BLASFUNC(ixmin)(blasint *, xdouble *, blasint *);

 /* Value-returning counterparts of the index searches: same
    (count, vector, stride) arguments, but the return type is the real scalar
    type of the precision (FLOATRET / double / xdouble) rather than blasint.
    These names are not in reference BLAS; semantics are presumably
    "largest/smallest (absolute) value" -- confirm in the implementation. */
 FLOATRET  BLASFUNC(samax) (blasint *, float  *, blasint *);
 double BLASFUNC(damax) (blasint *, double *, blasint *);
 xdouble BLASFUNC(qamax) (blasint *, xdouble *, blasint *);
 FLOATRET  BLASFUNC(scamax)(blasint *, float  *, blasint *);
 double BLASFUNC(dzamax)(blasint *, double *, blasint *);
 xdouble BLASFUNC(qxamax)(blasint *, xdouble *, blasint *);

 /* ?amin */
 FLOATRET  BLASFUNC(samin) (blasint *, float  *, blasint *);
 double BLASFUNC(damin) (blasint *, double *, blasint *);
 xdouble BLASFUNC(qamin) (blasint *, xdouble *, blasint *);
 FLOATRET  BLASFUNC(scamin)(blasint *, float  *, blasint *);
 double BLASFUNC(dzamin)(blasint *, double *, blasint *);
 xdouble BLASFUNC(qxamin)(blasint *, xdouble *, blasint *);

 /* ?max */
 FLOATRET  BLASFUNC(smax)  (blasint *, float  *, blasint *);
 double BLASFUNC(dmax)  (blasint *, double *, blasint *);
 xdouble BLASFUNC(qmax)  (blasint *, xdouble *, blasint *);
 FLOATRET  BLASFUNC(scmax) (blasint *, float  *, blasint *);
 double BLASFUNC(dzmax) (blasint *, double *, blasint *);
 xdouble BLASFUNC(qxmax) (blasint *, xdouble *, blasint *);

 /* ?min */
 FLOATRET  BLASFUNC(smin)  (blasint *, float  *, blasint *);
 double BLASFUNC(dmin)  (blasint *, double *, blasint *);
 xdouble BLASFUNC(qmin)  (blasint *, xdouble *, blasint *);
 FLOATRET  BLASFUNC(scmin) (blasint *, float  *, blasint *);
 double BLASFUNC(dzmin) (blasint *, double *, blasint *);
 xdouble BLASFUNC(qxmin) (blasint *, xdouble *, blasint *);

 /* scal family: count, scalar, vector, stride.  csscal / zdscal / xqscal are,
    by BLAS naming, the complex-vector-by-real-scalar forms; their prototypes
    are indistinguishable from cscal / zscal / xscal at the C type level. */
 void    BLASFUNC(sscal) (blasint *,  float  *, float  *, blasint *);
 void    BLASFUNC(dscal) (blasint *,  double *, double *, blasint *);
 void    BLASFUNC(qscal) (blasint *,  xdouble *, xdouble *, blasint *);
 void    BLASFUNC(cscal) (blasint *,  float  *, float  *, blasint *);
 void    BLASFUNC(zscal) (blasint *,  double *, double *, blasint *);
 void    BLASFUNC(xscal) (blasint *,  xdouble *, xdouble *, blasint *);
 void    BLASFUNC(csscal)(blasint *,  float  *, float  *, blasint *);
 void    BLASFUNC(zdscal)(blasint *,  double *, double *, blasint *);
 void    BLASFUNC(xqscal)(blasint *,  xdouble *, xdouble *, blasint *);

 /* nrm2 family: (count, vector, stride) returning a real scalar; the
    single-precision variants return FLOATRET. */
 FLOATRET  BLASFUNC(snrm2) (blasint *, float  *, blasint *);
 FLOATRET  BLASFUNC(scnrm2)(blasint *, float  *, blasint *);

 double BLASFUNC(dnrm2) (blasint *, double *, blasint *);
 xdouble BLASFUNC(qnrm2) (blasint *, xdouble *, blasint *);
 double BLASFUNC(dznrm2)(blasint *, double *, blasint *);
 xdouble BLASFUNC(qxnrm2)(blasint *, xdouble *, blasint *);

 /* rot family: count, two (vector, stride) pairs, then two scalar pointers
    (presumably the rotation's c and s, as in reference BLAS). */
 void  BLASFUNC(srot)  (blasint *, float  *, blasint *, float  *, blasint *, float  *, float  *);
 void  BLASFUNC(drot)  (blasint *, double *, blasint *, double *, blasint *, double *, double *);
 void  BLASFUNC(qrot)  (blasint *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *);
 void  BLASFUNC(csrot) (blasint *, float  *, blasint *, float  *, blasint *, float  *, float  *);
 void  BLASFUNC(zdrot) (blasint *, double *, blasint *, double *, blasint *, double *, double *);
 void  BLASFUNC(xqrot) (blasint *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *);

 /* rotg family: four scalar pointers, no count or stride arguments. */
 void  BLASFUNC(srotg) (float  *, float  *, float  *, float  *);
 void  BLASFUNC(drotg) (double *, double *, double *, double *);
 void  BLASFUNC(qrotg) (xdouble *, xdouble *, xdouble *, xdouble *);
 void  BLASFUNC(crotg) (float  *, float  *, float  *, float  *);
 void  BLASFUNC(zrotg) (double *, double *, double *, double *);
 void  BLASFUNC(xrotg) (xdouble *, xdouble *, xdouble *, xdouble *);

 /* rotmg: five pointers; only float and double variants are declared here. */
 void  BLASFUNC(srotmg)(float  *, float  *, float  *, float  *, float  *);
 void  BLASFUNC(drotmg)(double *, double *, double *, double *, double *);

 /* rotm: count, two (vector, stride) pairs, and one trailing pointer
    (presumably the parameter array of reference BLAS ?rotm). */
 void  BLASFUNC(srotm) (blasint *, float  *, blasint *, float  *, blasint *, float  *);
 void  BLASFUNC(drotm) (blasint *, double *, blasint *, double *, blasint *, double *);
 void  BLASFUNC(qrotm) (blasint *, xdouble *, blasint *, xdouble *, blasint *, xdouble *);

 /* Level 2 routines */

 /* ger family: two dimensions, a scalar, two (vector, stride) pairs, then a
    matrix pointer and its leading dimension.  Complex precisions come in
    ?geru / ?gerc pairs (unconjugated / conjugated by BLAS naming); the real
    precisions have a single ?ger. */
 void BLASFUNC(sger)(blasint *,    blasint *, float *,  float *, blasint *,
 		   float *,  blasint *, float *,  blasint *);
 void BLASFUNC(dger)(blasint *,    blasint *, double *, double *, blasint *,
 		   double *, blasint *, double *, blasint *);
 void BLASFUNC(qger)(blasint *,    blasint *, xdouble *, xdouble *, blasint *,
 		   xdouble *, blasint *, xdouble *, blasint *);
 void BLASFUNC(cgeru)(blasint *,    blasint *, float *,  float *, blasint *,
 		    float *,  blasint *, float *,  blasint *);
 void BLASFUNC(cgerc)(blasint *,    blasint *, float *,  float *, blasint *,
 		    float *,  blasint *, float *,  blasint *);
 void BLASFUNC(zgeru)(blasint *,    blasint *, double *, double *, blasint *,
 		    double *, blasint *, double *, blasint *);
 void BLASFUNC(zgerc)(blasint *,    blasint *, double *, double *, blasint *,
 		    double *, blasint *, double *, blasint *);
 void BLASFUNC(xgeru)(blasint *,    blasint *, xdouble *, xdouble *, blasint *,
 		    xdouble *, blasint *, xdouble *, blasint *);
 void BLASFUNC(xgerc)(blasint *,    blasint *, xdouble *, xdouble *, blasint *,
 		    xdouble *, blasint *, xdouble *, blasint *);

 /* gemv family: one char* option flag, two dimensions, a scalar, a matrix with
    its leading dimension, an input (vector, stride) pair, a second scalar and
    an output (vector, stride) pair -- presumably trans, m, n, alpha, A, lda,
    x, incx, beta, y, incy as in reference BLAS.
    sbgemv is the mixed-precision form: bfloat16 matrix and input vector,
    float scalars and float output. */
 void BLASFUNC(sbgemv)(char *, blasint *, blasint *, float  *, bfloat16 *, blasint *,
            bfloat16  *, blasint *, float  *, float  *, blasint *);
 void BLASFUNC(sgemv)(char *, blasint *, blasint *, float  *, float  *, blasint *,
 		    float  *, blasint *, float  *, float  *, blasint *);
 void BLASFUNC(dgemv)(char *, blasint *, blasint *, double *, double *, blasint *,
 		    double *, blasint *, double *, double *, blasint *);
 void BLASFUNC(qgemv)(char *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
 		    xdouble *, blasint *, xdouble *, xdouble *, blasint *);
 void BLASFUNC(cgemv)(char *, blasint *, blasint *, float  *, float  *, blasint *,
 		    float  *, blasint *, float  *, float  *, blasint *);
 void BLASFUNC(zgemv)(char *, blasint *, blasint *, double *, double *, blasint *,
 		    double *, blasint *, double *, double *, blasint *);
 void BLASFUNC(xgemv)(char *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
 		    xdouble *, blasint *, xdouble *, xdouble *, blasint *);

 /* trsv family: three char* option flags (presumably uplo, trans, diag), the
    order, a matrix with its leading dimension, and one (vector, stride) pair.
    No scalar arguments. */
 void BLASFUNC(strsv) (char *, char *, char *, blasint *, float  *, blasint *,
 		     float  *, blasint *);
 void BLASFUNC(dtrsv) (char *, char *, char *, blasint *, double *, blasint *,
 		     double *, blasint *);
 void BLASFUNC(qtrsv) (char *, char *, char *, blasint *, xdouble *, blasint *,
 		     xdouble *, blasint *);
 void BLASFUNC(ctrsv) (char *, char *, char *, blasint *, float  *, blasint *,
 		     float  *, blasint *);
 void BLASFUNC(ztrsv) (char *, char *, char *, blasint *, double *, blasint *,
 		     double *, blasint *);
 void BLASFUNC(xtrsv) (char *, char *, char *, blasint *, xdouble *, blasint *,
 		     xdouble *, blasint *);

 /* trmv family: identical argument list to trsv. */
 void BLASFUNC(strmv) (char *, char *, char *, blasint *, float  *, blasint *,
 		     float  *, blasint *);
 void BLASFUNC(dtrmv) (char *, char *, char *, blasint *, double *, blasint *,
 		     double *, blasint *);
 void BLASFUNC(qtrmv) (char *, char *, char *, blasint *, xdouble *, blasint *,
 		     xdouble *, blasint *);
 void BLASFUNC(ctrmv) (char *, char *, char *, blasint *, float  *, blasint *,
 		     float  *, blasint *);
 void BLASFUNC(ztrmv) (char *, char *, char *, blasint *, double *, blasint *,
 		     double *, blasint *);
 void BLASFUNC(xtrmv) (char *, char *, char *, blasint *, xdouble *, blasint *,
 		     xdouble *, blasint *);

 /* tpsv / tpmv families: like trsv / trmv but the matrix pointer has no
    leading-dimension argument (packed storage by BLAS naming). */
 void BLASFUNC(stpsv) (char *, char *, char *, blasint *, float  *, float  *, blasint *);
 void BLASFUNC(dtpsv) (char *, char *, char *, blasint *, double *, double *, blasint *);
 void BLASFUNC(qtpsv) (char *, char *, char *, blasint *, xdouble *, xdouble *, blasint *);
 void BLASFUNC(ctpsv) (char *, char *, char *, blasint *, float  *, float  *, blasint *);
 void BLASFUNC(ztpsv) (char *, char *, char *, blasint *, double *, double *, blasint *);
 void BLASFUNC(xtpsv) (char *, char *, char *, blasint *, xdouble *, xdouble *, blasint *);

 void BLASFUNC(stpmv) (char *, char *, char *, blasint *, float  *, float  *, blasint *);
 void BLASFUNC(dtpmv) (char *, char *, char *, blasint *, double *, double *, blasint *);
 void BLASFUNC(qtpmv) (char *, char *, char *, blasint *, xdouble *, xdouble *, blasint *);
 void BLASFUNC(ctpmv) (char *, char *, char *, blasint *, float  *, float  *, blasint *);
 void BLASFUNC(ztpmv) (char *, char *, char *, blasint *, double *, double *, blasint *);
 void BLASFUNC(xtpmv) (char *, char *, char *, blasint *, xdouble *, xdouble *, blasint *);

 /* tbmv / tbsv families: like trmv / trsv with one additional blasint* after
    the order (presumably the band width k of reference BLAS). */
 void BLASFUNC(stbmv) (char *, char *, char *, blasint *, blasint *, float  *, blasint *, float  *, blasint *);
 void BLASFUNC(dtbmv) (char *, char *, char *, blasint *, blasint *, double *, blasint *, double *, blasint *);
 void BLASFUNC(qtbmv) (char *, char *, char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, blasint *);
 void BLASFUNC(ctbmv) (char *, char *, char *, blasint *, blasint *, float  *, blasint *, float  *, blasint *);
 void BLASFUNC(ztbmv) (char *, char *, char *, blasint *, blasint *, double *, blasint *, double *, blasint *);
 void BLASFUNC(xtbmv) (char *, char *, char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, blasint *);

 void BLASFUNC(stbsv) (char *, char *, char *, blasint *, blasint *, float  *, blasint *, float  *, blasint *);
 void BLASFUNC(dtbsv) (char *, char *, char *, blasint *, blasint *, double *, blasint *, double *, blasint *);
 void BLASFUNC(qtbsv) (char *, char *, char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, blasint *);
 void BLASFUNC(ctbsv) (char *, char *, char *, blasint *, blasint *, float  *, blasint *, float  *, blasint *);
 void BLASFUNC(ztbsv) (char *, char *, char *, blasint *, blasint *, double *, blasint *, double *, blasint *);
 void BLASFUNC(xtbsv) (char *, char *, char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, blasint *);

 /* symv family: one char* option flag, the order, a scalar, a matrix with its
    leading dimension, an input (vector, stride) pair, a second scalar and an
    output (vector, stride) pair.  Complex variants (c/z/x) are declared too,
    which goes beyond the reference BLAS set. */
 void BLASFUNC(ssymv) (char *, blasint *, float  *, float *, blasint *,
 		     float  *, blasint *, float *, float *, blasint *);
 void BLASFUNC(dsymv) (char *, blasint *, double  *, double *, blasint *,
 		     double  *, blasint *, double *, double *, blasint *);
 void BLASFUNC(qsymv) (char *, blasint *, xdouble  *, xdouble *, blasint *,
 		     xdouble  *, blasint *, xdouble *, xdouble *, blasint *);
 void BLASFUNC(csymv) (char *, blasint *, float  *, float *, blasint *,
 		     float  *, blasint *, float *, float *, blasint *);
 void BLASFUNC(zsymv) (char *, blasint *, double  *, double *, blasint *,
 		     double  *, blasint *, double *, double *, blasint *);
 void BLASFUNC(xsymv) (char *, blasint *, xdouble  *, xdouble *, blasint *,
 		     xdouble  *, blasint *, xdouble *, xdouble *, blasint *);

 /* spmv family: same as symv except the matrix pointer carries no
    leading-dimension argument (packed storage by BLAS naming). */
 void BLASFUNC(sspmv) (char *, blasint *, float  *, float *,
 		     float  *, blasint *, float *, float *, blasint *);
 void BLASFUNC(dspmv) (char *, blasint *, double  *, double *,
 		     double  *, blasint *, double *, double *, blasint *);
 void BLASFUNC(qspmv) (char *, blasint *, xdouble  *, xdouble *,
 		     xdouble  *, blasint *, xdouble *, xdouble *, blasint *);
 void BLASFUNC(cspmv) (char *, blasint *, float  *, float *,
 		     float  *, blasint *, float *, float *, blasint *);
 void BLASFUNC(zspmv) (char *, blasint *, double  *, double *,
 		     double  *, blasint *, double *, double *, blasint *);
 void BLASFUNC(xspmv) (char *, blasint *, xdouble  *, xdouble *,
 		     xdouble  *, blasint *, xdouble *, xdouble *, blasint *);

 /* syr family: option flag, order, scalar, one (vector, stride) pair, then the
    matrix and its leading dimension. */
 void BLASFUNC(ssyr) (char *, blasint *, float   *, float  *, blasint *,
 		    float  *, blasint *);
 void BLASFUNC(dsyr) (char *, blasint *, double  *, double *, blasint *,
 		    double *, blasint *);
 void BLASFUNC(qsyr) (char *, blasint *, xdouble  *, xdouble *, blasint *,
 		    xdouble *, blasint *);
 void BLASFUNC(csyr) (char *, blasint *, float   *, float  *, blasint *,
 		    float  *, blasint *);
 void BLASFUNC(zsyr) (char *, blasint *, double  *, double *, blasint *,
 		    double *, blasint *);
 void BLASFUNC(xsyr) (char *, blasint *, xdouble  *, xdouble *, blasint *,
 		    xdouble *, blasint *);

 /* syr2 family: as syr but with two (vector, stride) pairs before the matrix. */
 void BLASFUNC(ssyr2) (char *, blasint *, float   *,
 		     float  *, blasint *, float  *, blasint *, float  *, blasint *);
 void BLASFUNC(dsyr2) (char *, blasint *, double  *,
 		     double *, blasint *, double *, blasint *, double *, blasint *);
 void BLASFUNC(qsyr2) (char *, blasint *, xdouble  *,
 		     xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *);
 void BLASFUNC(csyr2) (char *, blasint *, float   *,
 		     float  *, blasint *, float  *, blasint *, float  *, blasint *);
 void BLASFUNC(zsyr2) (char *, blasint *, double  *,
 		     double *, blasint *, double *, blasint *, double *, blasint *);
 void BLASFUNC(xsyr2) (char *, blasint *, xdouble  *,
 		     xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *);

 /* spr family: same leading arguments as syr, but the trailing matrix pointer
    has no leading-dimension argument (packed storage by BLAS naming). */
 void BLASFUNC(sspr) (char *, blasint *, float   *, float  *, blasint *,
 		    float  *);
 void BLASFUNC(dspr) (char *, blasint *, double  *, double *, blasint *,
 		    double *);
 void BLASFUNC(qspr) (char *, blasint *, xdouble  *, xdouble *, blasint *,
 		    xdouble *);
 void BLASFUNC(cspr) (char *, blasint *, float   *, float  *, blasint *,
 		    float  *);
 void BLASFUNC(zspr) (char *, blasint *, double  *, double *, blasint *,
 		    double *);
 void BLASFUNC(xspr) (char *, blasint *, xdouble  *, xdouble *, blasint *,
 		    xdouble *);

 /* spr2 family: two (vector, stride) pairs, then the matrix pointer without a
    leading dimension. */
 void BLASFUNC(sspr2) (char *, blasint *, float   *,
 		     float  *, blasint *, float  *, blasint *, float  *);
 void BLASFUNC(dspr2) (char *, blasint *, double  *,
 		     double *, blasint *, double *, blasint *, double *);
 void BLASFUNC(qspr2) (char *, blasint *, xdouble  *,
 		     xdouble *, blasint *, xdouble *, blasint *, xdouble *);
 void BLASFUNC(cspr2) (char *, blasint *, float   *,
 		     float  *, blasint *, float  *, blasint *, float  *);
 void BLASFUNC(zspr2) (char *, blasint *, double  *,
 		     double *, blasint *, double *, blasint *, double *);
 void BLASFUNC(xspr2) (char *, blasint *, xdouble  *,
 		     xdouble *, blasint *, xdouble *, blasint *, xdouble *);

 /* Hermitian rank-update families, declared for the complex precisions only
    (c/z/x).  Argument lists mirror syr / spr / syr2 / spr2 respectively:
    her and her2 end with a matrix plus leading dimension, hpr and hpr2 end
    with a matrix pointer alone. */
 void BLASFUNC(cher) (char *, blasint *, float   *, float  *, blasint *,
 		    float  *, blasint *);
 void BLASFUNC(zher) (char *, blasint *, double  *, double *, blasint *,
 		    double *, blasint *);
 void BLASFUNC(xher) (char *, blasint *, xdouble  *, xdouble *, blasint *,
 		    xdouble *, blasint *);

 void BLASFUNC(chpr) (char *, blasint *, float   *, float  *, blasint *, float  *);
 void BLASFUNC(zhpr) (char *, blasint *, double  *, double *, blasint *, double *);
 void BLASFUNC(xhpr) (char *, blasint *, xdouble  *, xdouble *, blasint *, xdouble *);

 void BLASFUNC(cher2) (char *, blasint *, float   *,
 		     float  *, blasint *, float  *, blasint *, float  *, blasint *);
 void BLASFUNC(zher2) (char *, blasint *, double  *,
 		     double *, blasint *, double *, blasint *, double *, blasint *);
 void BLASFUNC(xher2) (char *, blasint *, xdouble  *,
 		     xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *);

 void BLASFUNC(chpr2) (char *, blasint *, float   *,
 		     float  *, blasint *, float  *, blasint *, float  *);
 void BLASFUNC(zhpr2) (char *, blasint *, double  *,
 		     double *, blasint *, double *, blasint *, double *);
 void BLASFUNC(xhpr2) (char *, blasint *, xdouble  *,
 		     xdouble *, blasint *, xdouble *, blasint *, xdouble *);

 /* hemv family (complex precisions only): same argument list as symv. */
 void BLASFUNC(chemv) (char *, blasint *, float  *, float *, blasint *,
 		     float  *, blasint *, float *, float *, blasint *);
 void BLASFUNC(zhemv) (char *, blasint *, double  *, double *, blasint *,
 		     double  *, blasint *, double *, double *, blasint *);
 void BLASFUNC(xhemv) (char *, blasint *, xdouble  *, xdouble *, blasint *,
 		     xdouble  *, blasint *, xdouble *, xdouble *, blasint *);

 /* hpmv family (complex precisions only): same argument list as spmv, i.e.
    no leading dimension after the matrix pointer. */
 void BLASFUNC(chpmv) (char *, blasint *, float  *, float *,
 		     float  *, blasint *, float *, float *, blasint *);
 void BLASFUNC(zhpmv) (char *, blasint *, double  *, double *,
 		     double  *, blasint *, double *, double *, blasint *);
 void BLASFUNC(xhpmv) (char *, blasint *, xdouble  *, xdouble *,
 		     xdouble  *, blasint *, xdouble *, xdouble *, blasint *);

 /* ?norm: not a reference BLAS routine.  Each takes an option flag, two
    blasint pointers, a data pointer and one more blasint pointer, and returns
    int.  NOTE(review): purpose and the meaning of the int result are not
    visible in this header -- confirm against the implementation. */
 int BLASFUNC(snorm)(char *, blasint *, blasint *, float  *, blasint *);
 int BLASFUNC(dnorm)(char *, blasint *, blasint *, double *, blasint *);
 int BLASFUNC(cnorm)(char *, blasint *, blasint *, float  *, blasint *);
 int BLASFUNC(znorm)(char *, blasint *, blasint *, double *, blasint *);

 /* gbmv family: the gemv argument list with two extra blasint pointers after
    the dimensions (presumably the band widths kl and ku of reference BLAS). */
 void BLASFUNC(sgbmv)(char *, blasint *, blasint *, blasint *, blasint *, float  *, float  *, blasint *,
 		    float  *, blasint *, float  *, float  *, blasint *);
 void BLASFUNC(dgbmv)(char *, blasint *, blasint *, blasint *, blasint *, double *, double *, blasint *,
 		    double *, blasint *, double *, double *, blasint *);
 void BLASFUNC(qgbmv)(char *, blasint *, blasint *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
 		    xdouble *, blasint *, xdouble *, xdouble *, blasint *);
 void BLASFUNC(cgbmv)(char *, blasint *, blasint *, blasint *, blasint *, float  *, float  *, blasint *,
 		    float  *, blasint *, float  *, float  *, blasint *);
 void BLASFUNC(zgbmv)(char *, blasint *, blasint *, blasint *, blasint *, double *, double *, blasint *,
 		    double *, blasint *, double *, double *, blasint *);
 void BLASFUNC(xgbmv)(char *, blasint *, blasint *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
 		    xdouble *, blasint *, xdouble *, xdouble *, blasint *);

 /* sbmv family: the symv argument list with one extra blasint pointer after
    the order (presumably the band width k). */
 void BLASFUNC(ssbmv)(char *, blasint *, blasint *, float  *, float  *, blasint *,
 		    float  *, blasint *, float  *, float  *, blasint *);
 void BLASFUNC(dsbmv)(char *, blasint *, blasint *, double *, double *, blasint *,
 		    double *, blasint *, double *, double *, blasint *);
 void BLASFUNC(qsbmv)(char *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
 		    xdouble *, blasint *, xdouble *, xdouble *, blasint *);
 void BLASFUNC(csbmv)(char *, blasint *, blasint *, float  *, float  *, blasint *,
 		    float  *, blasint *, float  *, float  *, blasint *);
 void BLASFUNC(zsbmv)(char *, blasint *, blasint *, double *, double *, blasint *,
 		    double *, blasint *, double *, double *, blasint *);
 void BLASFUNC(xsbmv)(char *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
 		    xdouble *, blasint *, xdouble *, xdouble *, blasint *);

 /* hbmv family (complex precisions only): same argument list as sbmv. */
 void BLASFUNC(chbmv)(char *, blasint *, blasint *, float  *, float  *, blasint *,
 		    float  *, blasint *, float  *, float  *, blasint *);
 void BLASFUNC(zhbmv)(char *, blasint *, blasint *, double *, double *, blasint *,
 		    double *, blasint *, double *, double *, blasint *);
 void BLASFUNC(xhbmv)(char *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
 		    xdouble *, blasint *, xdouble *, xdouble *, blasint *);

 /* Level 3 routines */

 /* gemm family: two char* option flags, three dimensions, a scalar, two
    (matrix, leading dimension) pairs, a second scalar and the output matrix
    with its leading dimension -- presumably transa, transb, m, n, k, alpha,
    A, lda, B, ldb, beta, C, ldc as in reference BLAS.
    shgemm and sbgemm are the mixed-precision forms: their two input matrices
    are hfloat16 and bfloat16 respectively, while the scalars and the output
    matrix are float. */
 void BLASFUNC(shgemm)(char *, char *, blasint *, blasint *, blasint *, float *,
 	   hfloat16  *, blasint *, hfloat16 *, blasint *, float  *, float  *, blasint *);
 void BLASFUNC(sbgemm)(char *, char *, blasint *, blasint *, blasint *, float *,
 	   bfloat16 *, blasint *, bfloat16 *, blasint *, float *, float *, blasint *);
 void BLASFUNC(sgemm)(char *, char *, blasint *, blasint *, blasint *, float *,
 	   float  *, blasint *, float  *, blasint *, float  *, float  *, blasint *);
 void BLASFUNC(dgemm)(char *, char *, blasint *, blasint *, blasint *, double *,
 	   double *, blasint *, double *, blasint *, double *, double *, blasint *);
 void BLASFUNC(qgemm)(char *, char *, blasint *, blasint *, blasint *, xdouble *,
 	   xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *);
 void BLASFUNC(cgemm)(char *, char *, blasint *, blasint *, blasint *, float *,
 	   float  *, blasint *, float  *, blasint *, float  *, float  *, blasint *);
 void BLASFUNC(zgemm)(char *, char *, blasint *, blasint *, blasint *, double *,
 	   double *, blasint *, double *, blasint *, double *, double *, blasint *);
 void BLASFUNC(xgemm)(char *, char *, blasint *, blasint *, blasint *, xdouble *,
 	   xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *);

 /* gemm3m family (complex precisions only): prototype-identical to ?gemm.
    The name suggests an alternative complex multiplication algorithm; nothing
    in this header distinguishes it from gemm. */
 void BLASFUNC(cgemm3m)(char *, char *, blasint *, blasint *, blasint *, float *,
 	   float  *, blasint *, float  *, blasint *, float  *, float  *, blasint *);
 void BLASFUNC(zgemm3m)(char *, char *, blasint *, blasint *, blasint *, double *,
 	   double *, blasint *, double *, blasint *, double *, double *, blasint *);
 void BLASFUNC(xgemm3m)(char *, char *, blasint *, blasint *, blasint *, xdouble *,
 	   xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *);

 /* gemmt family: compared with gemm there is one more leading char* flag and
    one fewer dimension argument (three flags, two blasint dimensions).
    Presumably the extra flag selects which triangle of the result is updated
    -- confirm against the implementation. */
 void BLASFUNC(sgemmt)(char*, char *, char *, blasint *, blasint *, float *,
 	   float  *, blasint *, float  *, blasint *, float  *, float  *, blasint *);
 void BLASFUNC(dgemmt)(char*, char *, char *, blasint *, blasint *, double *,
 	   double *, blasint *, double *, blasint *, double *, double *, blasint *);
 void BLASFUNC(cgemmt)(char*, char *, char *, blasint *, blasint *, float *,
 	   float  *, blasint *, float  *, blasint *, float  *, float  *, blasint *);
 void BLASFUNC(zgemmt)(char*, char *, char *, blasint *, blasint *, double *,
 	   double *, blasint *, double *, blasint *, double *, double *, blasint *);

 /* ?ge2mm: not a reference BLAS routine.  Three char* flags, two dimensions,
    a scalar, two (matrix, leading dimension) pairs, a second scalar and an
    output (matrix, leading dimension) pair; unlike the gemm family these
    return int.  NOTE(review): the operation and the meaning of the return
    value are not visible here -- confirm against the implementation. */
 int BLASFUNC(sge2mm)(char *, char *, char *, blasint *, blasint *,
 		     float *, float  *, blasint *, float  *, blasint *,
 		     float *, float  *, blasint *);
 int BLASFUNC(dge2mm)(char *, char *, char *, blasint *, blasint *,
 		     double *, double  *, blasint *, double  *, blasint *,
 		     double *, double  *, blasint *);
 int BLASFUNC(cge2mm)(char *, char *, char *, blasint *, blasint *,
 		     float *, float  *, blasint *, float  *, blasint *,
 		     float *, float  *, blasint *);
 int BLASFUNC(zge2mm)(char *, char *, char *, blasint *, blasint *,
 		     double *, double  *, blasint *, double  *, blasint *,
 		     double *, double  *, blasint *);

 void BLASFUNC(strsm)(char *, char *, char *, char *, blasint *, blasint *,
 	   float *,  float *, blasint *, float *, blasint *);
 void BLASFUNC(dtrsm)(char *, char *, char *, char *, blasint *, blasint *,
 	   double *,  double *, blasint *, double *, blasint *);
 void BLASFUNC(qtrsm)(char *, char *, char *, char *, blasint *, blasint *,
 	   xdouble *,  xdouble *, blasint *, xdouble *, blasint *);
 void BLASFUNC(ctrsm)(char *, char *, char *, char *, blasint *, blasint *,
 	   float *,  float *, blasint *, float *, blasint *);
 void BLASFUNC(ztrsm)(char *, char *, char *, char *, blasint *, blasint *,
 	   double *,  double *, blasint *, double *, blasint *);
 void BLASFUNC(xtrsm)(char *, char *, char *, char *, blasint *, blasint *,
 	   xdouble *,  xdouble *, blasint *, xdouble *, blasint *);

 void BLASFUNC(strmm)(char *, char *, char *, char *, blasint *, blasint *,
 	   float *,  float *, blasint *, float *, blasint *);
 void BLASFUNC(dtrmm)(char *, char *, char *, char *, blasint *, blasint *,
 	   double *,  double *, blasint *, double *, blasint *);
 void BLASFUNC(qtrmm)(char *, char *, char *, char *, blasint *, blasint *,
 	   xdouble *,  xdouble *, blasint *, xdouble *, blasint *);
 void BLASFUNC(ctrmm)(char *, char *, char *, char *, blasint *, blasint *,
 	   float *,  float *, blasint *, float *, blasint *);
 void BLASFUNC(ztrmm)(char *, char *, char *, char *, blasint *, blasint *,
 	   double *,  double *, blasint *, double *, blasint *);
 void BLASFUNC(xtrmm)(char *, char *, char *, char *, blasint *, blasint *,
 	   xdouble *,  xdouble *, blasint *, xdouble *, blasint *);

 void BLASFUNC(ssymm)(char *, char *, blasint *, blasint *, float  *, float  *, blasint *,
 	   float  *, blasint *, float  *, float  *, blasint *);
 void BLASFUNC(dsymm)(char *, char *, blasint *, blasint *, double *, double *, blasint *,
 	   double *, blasint *, double *, double *, blasint *);
 void BLASFUNC(qsymm)(char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
 	   xdouble *, blasint *, xdouble *, xdouble *, blasint *);
 void BLASFUNC(csymm)(char *, char *, blasint *, blasint *, float  *, float  *, blasint *,
 	   float  *, blasint *, float  *, float  *, blasint *);
 void BLASFUNC(zsymm)(char *, char *, blasint *, blasint *, double *, double *, blasint *,
 	   double *, blasint *, double *, double *, blasint *);
 void BLASFUNC(xsymm)(char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
 	   xdouble *, blasint *, xdouble *, xdouble *, blasint *);

 void BLASFUNC(csymm3m)(char *, char *, blasint *, blasint *, float  *, float  *, blasint *,
 	   float  *, blasint *, float  *, float  *, blasint *);
 void BLASFUNC(zsymm3m)(char *, char *, blasint *, blasint *, double *, double *, blasint *,
 	   double *, blasint *, double *, double *, blasint *);
 void BLASFUNC(xsymm3m)(char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
 	   xdouble *, blasint *, xdouble *, xdouble *, blasint *);

 void BLASFUNC(ssyrk)(char *, char *, blasint *, blasint *, float  *, float  *, blasint *,
 	   float  *, float  *, blasint *);
 void BLASFUNC(dsyrk)(char *, char *, blasint *, blasint *, double *, double *, blasint *,
 	   double *, double *, blasint *);
 void BLASFUNC(qsyrk)(char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
 	   xdouble *, xdouble *, blasint *);
 void BLASFUNC(csyrk)(char *, char *, blasint *, blasint *, float  *, float  *, blasint *,
 	   float  *, float  *, blasint *);
 void BLASFUNC(zsyrk)(char *, char *, blasint *, blasint *, double *, double *, blasint *,
 	   double *, double *, blasint *);
 void BLASFUNC(xsyrk)(char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
 	   xdouble *, xdouble *, blasint *);

 void BLASFUNC(ssyr2k)(char *, char *, blasint *, blasint *, float  *, float  *, blasint *,
 	   float *, blasint *, float  *, float  *, blasint *);
 void BLASFUNC(dsyr2k)(char *, char *, blasint *, blasint *, double *, double *, blasint *,
 	   double*, blasint *, double *, double *, blasint *);
 void BLASFUNC(qsyr2k)(char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
 	   xdouble*, blasint *, xdouble *, xdouble *, blasint *);
 void BLASFUNC(csyr2k)(char *, char *, blasint *, blasint *, float  *, float  *, blasint *,
 	   float *, blasint *, float  *, float  *, blasint *);
 void BLASFUNC(zsyr2k)(char *, char *, blasint *, blasint *, double *, double *, blasint *,
 	   double*, blasint *, double *, double *, blasint *);
 void BLASFUNC(xsyr2k)(char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
 	   xdouble*, blasint *, xdouble *, xdouble *, blasint *);

 void BLASFUNC(chemm)(char *, char *, blasint *, blasint *, float  *, float  *, blasint *,
 	   float  *, blasint *, float  *, float  *, blasint *);
 void BLASFUNC(zhemm)(char *, char *, blasint *, blasint *, double *, double *, blasint *,
 	   double *, blasint *, double *, double *, blasint *);
 void BLASFUNC(xhemm)(char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
 	   xdouble *, blasint *, xdouble *, xdouble *, blasint *);

 void BLASFUNC(chemm3m)(char *, char *, blasint *, blasint *, float  *, float  *, blasint *,
 	   float  *, blasint *, float  *, float  *, blasint *);
 void BLASFUNC(zhemm3m)(char *, char *, blasint *, blasint *, double *, double *, blasint *,
 	   double *, blasint *, double *, double *, blasint *);
 void BLASFUNC(xhemm3m)(char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
 	   xdouble *, blasint *, xdouble *, xdouble *, blasint *);

 void BLASFUNC(cherk)(char *, char *, blasint *, blasint *, float  *, float  *, blasint *,
 	   float  *, float  *, blasint *);
 void BLASFUNC(zherk)(char *, char *, blasint *, blasint *, double *, double *, blasint *,
 	   double *, double *, blasint *);
 void BLASFUNC(xherk)(char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
 	   xdouble *, xdouble *, blasint *);

 void BLASFUNC(cher2k)(char *, char *, blasint *, blasint *, float  *, float  *, blasint *,
 	   float *, blasint *, float  *, float  *, blasint *);
 void BLASFUNC(zher2k)(char *, char *, blasint *, blasint *, double *, double *, blasint *,
 	   double*, blasint *, double *, double *, blasint *);
 void BLASFUNC(xher2k)(char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
 	   xdouble*, blasint *, xdouble *, xdouble *, blasint *);

 int BLASFUNC(cher2m)(char *, char *, char *, blasint *, blasint *, float  *, float  *, blasint *,
 	   float *, blasint *, float  *, float  *, blasint *);
 int BLASFUNC(zher2m)(char *, char *, char *, blasint *, blasint *, double *, double *, blasint *,
 	   double*, blasint *, double *, double *, blasint *);
 int BLASFUNC(xher2m)(char *, char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
 	   xdouble*, blasint *, xdouble *, xdouble *, blasint *);

 int BLASFUNC(sgemt)(char *, blasint *, blasint *, float  *, float  *, blasint *,
 		    float  *, blasint *);
 int BLASFUNC(dgemt)(char *, blasint *, blasint *, double *, double *, blasint *,
 		    double *, blasint *);
 int BLASFUNC(cgemt)(char *, blasint *, blasint *, float  *, float  *, blasint *,
 		    float  *, blasint *);
 int BLASFUNC(zgemt)(char *, blasint *, blasint *, double *, double *, blasint *,
 		    double *, blasint *);

 int BLASFUNC(sgema)(char *, char *, blasint *, blasint *, float  *,
 		    float  *, blasint *, float *, float  *, blasint *, float *, blasint *);
 int BLASFUNC(dgema)(char *, char *, blasint *, blasint *, double *,
 		    double *, blasint *, double*, double *, blasint *, double*, blasint *);
 int BLASFUNC(cgema)(char *, char *, blasint *, blasint *, float  *,
 		    float  *, blasint *, float *, float  *, blasint *, float *, blasint *);
 int BLASFUNC(zgema)(char *, char *, blasint *, blasint *, double *,
 		    double *, blasint *, double*, double *, blasint *, double*, blasint *);

 int BLASFUNC(sgems)(char *, char *, blasint *, blasint *, float  *,
 		    float  *, blasint *, float *, float  *, blasint *, float *, blasint *);
 int BLASFUNC(dgems)(char *, char *, blasint *, blasint *, double *,
 		    double *, blasint *, double*, double *, blasint *, double*, blasint *);
 int BLASFUNC(cgems)(char *, char *, blasint *, blasint *, float  *,
 		    float  *, blasint *, float *, float  *, blasint *, float *, blasint *);
 int BLASFUNC(zgems)(char *, char *, blasint *, blasint *, double *,
 		    double *, blasint *, double*, double *, blasint *, double*, blasint *);

 int BLASFUNC(sgemc)(char *, char *, blasint *, blasint *, blasint *, float *,
 	   float  *, blasint *, float  *, blasint *, float  *, blasint *, float  *, float  *, blasint *);
 int BLASFUNC(dgemc)(char *, char *, blasint *, blasint *, blasint *, double *,
 	   double *, blasint *, double *, blasint *, double *, blasint *, double *, double *, blasint *);
 int BLASFUNC(qgemc)(char *, char *, blasint *, blasint *, blasint *, xdouble *,
 	   xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *,  xdouble *, xdouble *, blasint *);
 int BLASFUNC(cgemc)(char *, char *, blasint *, blasint *, blasint *, float *,
 	   float  *, blasint *, float  *, blasint *, float  *, blasint *, float  *, float  *, blasint *);
 int BLASFUNC(zgemc)(char *, char *, blasint *, blasint *, blasint *, double *,
 	   double *, blasint *, double *, blasint *, double *, blasint *, double *, double *, blasint *);
 int BLASFUNC(xgemc)(char *, char *, blasint *, blasint *, blasint *, xdouble *,
 	   xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *);

 /* LAPACK routines (Fortran-style interface: every argument is passed by pointer) */

 /* Prototypes for the LAPACK entry points, one group per routine family.
  * Within a group the leading letter selects the element type of the matrix
  * pointers: s/c use float *, d/z use double *, q/x use xdouble *. All integer
  * arguments are blasint * and every routine is declared to return int.
  *
  * The prototypes carry no parameter names. The argument lists quoted in the
  * per-group comments are taken from the reference-LAPACK routine of the same
  * name and only checked here against the number and types of the parameters
  * -- TODO confirm against the implementations. */

 /* ?getf2: presumably (M, N, A, LDA, IPIV, INFO) */
 int BLASFUNC(sgetf2)(blasint *, blasint *, float  *, blasint *, blasint *, blasint *);
 int BLASFUNC(dgetf2)(blasint *, blasint *, double *, blasint *, blasint *, blasint *);
 int BLASFUNC(qgetf2)(blasint *, blasint *, xdouble *, blasint *, blasint *, blasint *);
 int BLASFUNC(cgetf2)(blasint *, blasint *, float  *, blasint *, blasint *, blasint *);
 int BLASFUNC(zgetf2)(blasint *, blasint *, double *, blasint *, blasint *, blasint *);
 int BLASFUNC(xgetf2)(blasint *, blasint *, xdouble *, blasint *, blasint *, blasint *);

 /* ?getrf: same parameter list as ?getf2, presumably (M, N, A, LDA, IPIV, INFO) */
 int BLASFUNC(sgetrf)(blasint *, blasint *, float  *, blasint *, blasint *, blasint *);
 int BLASFUNC(dgetrf)(blasint *, blasint *, double *, blasint *, blasint *, blasint *);
 int BLASFUNC(qgetrf)(blasint *, blasint *, xdouble *, blasint *, blasint *, blasint *);
 int BLASFUNC(cgetrf)(blasint *, blasint *, float  *, blasint *, blasint *, blasint *);
 int BLASFUNC(zgetrf)(blasint *, blasint *, double *, blasint *, blasint *, blasint *);
 int BLASFUNC(xgetrf)(blasint *, blasint *, xdouble *, blasint *, blasint *, blasint *);

 /* ?laswp: presumably (N, A, LDA, K1, K2, IPIV, INCX) */
 int BLASFUNC(slaswp)(blasint *, float  *, blasint *, blasint *, blasint *, blasint *, blasint *);
 int BLASFUNC(dlaswp)(blasint *, double *, blasint *, blasint *, blasint *, blasint *, blasint *);
 int BLASFUNC(qlaswp)(blasint *, xdouble *, blasint *, blasint *, blasint *, blasint *, blasint *);
 int BLASFUNC(claswp)(blasint *, float  *, blasint *, blasint *, blasint *, blasint *, blasint *);
 int BLASFUNC(zlaswp)(blasint *, double *, blasint *, blasint *, blasint *, blasint *, blasint *);
 int BLASFUNC(xlaswp)(blasint *, xdouble *, blasint *, blasint *, blasint *, blasint *, blasint *);

 /* ?getrs: presumably (TRANS, N, NRHS, A, LDA, IPIV, B, LDB, INFO) */
 int BLASFUNC(sgetrs)(char *, blasint *, blasint *, float  *, blasint *, blasint *, float  *, blasint *, blasint *);
 int BLASFUNC(dgetrs)(char *, blasint *, blasint *, double *, blasint *, blasint *, double *, blasint *, blasint *);
 int BLASFUNC(qgetrs)(char *, blasint *, blasint *, xdouble *, blasint *, blasint *, xdouble *, blasint *, blasint *);
 int BLASFUNC(cgetrs)(char *, blasint *, blasint *, float  *, blasint *, blasint *, float  *, blasint *, blasint *);
 int BLASFUNC(zgetrs)(char *, blasint *, blasint *, double *, blasint *, blasint *, double *, blasint *, blasint *);
 int BLASFUNC(xgetrs)(char *, blasint *, blasint *, xdouble *, blasint *, blasint *, xdouble *, blasint *, blasint *);

 /* ?gesv: presumably (N, NRHS, A, LDA, IPIV, B, LDB, INFO) */
 int BLASFUNC(sgesv)(blasint *, blasint *, float  *, blasint *, blasint *, float *, blasint *, blasint *);
 int BLASFUNC(dgesv)(blasint *, blasint *, double *, blasint *, blasint *, double*, blasint *, blasint *);
 int BLASFUNC(qgesv)(blasint *, blasint *, xdouble *, blasint *, blasint *, xdouble*, blasint *, blasint *);
 int BLASFUNC(cgesv)(blasint *, blasint *, float  *, blasint *, blasint *, float *, blasint *, blasint *);
 int BLASFUNC(zgesv)(blasint *, blasint *, double *, blasint *, blasint *, double*, blasint *, blasint *);
 int BLASFUNC(xgesv)(blasint *, blasint *, xdouble *, blasint *, blasint *, xdouble*, blasint *, blasint *);

 /* ?potf2: presumably (UPLO, N, A, LDA, INFO) */
 int BLASFUNC(spotf2)(char *, blasint *, float  *, blasint *, blasint *);
 int BLASFUNC(dpotf2)(char *, blasint *, double *, blasint *, blasint *);
 int BLASFUNC(qpotf2)(char *, blasint *, xdouble *, blasint *, blasint *);
 int BLASFUNC(cpotf2)(char *, blasint *, float  *, blasint *, blasint *);
 int BLASFUNC(zpotf2)(char *, blasint *, double *, blasint *, blasint *);
 int BLASFUNC(xpotf2)(char *, blasint *, xdouble *, blasint *, blasint *);

 /* ?potrf: same parameter list as ?potf2, presumably (UPLO, N, A, LDA, INFO) */
 int BLASFUNC(spotrf)(char *, blasint *, float  *, blasint *, blasint *);
 int BLASFUNC(dpotrf)(char *, blasint *, double *, blasint *, blasint *);
 int BLASFUNC(qpotrf)(char *, blasint *, xdouble *, blasint *, blasint *);
 int BLASFUNC(cpotrf)(char *, blasint *, float  *, blasint *, blasint *);
 int BLASFUNC(zpotrf)(char *, blasint *, double *, blasint *, blasint *);
 int BLASFUNC(xpotrf)(char *, blasint *, xdouble *, blasint *, blasint *);

 /* ?potri: presumably (UPLO, N, A, LDA, INFO) */
 int BLASFUNC(spotri)(char *, blasint *, float  *, blasint *, blasint *);
 int BLASFUNC(dpotri)(char *, blasint *, double *, blasint *, blasint *);
 int BLASFUNC(qpotri)(char *, blasint *, xdouble *, blasint *, blasint *);
 int BLASFUNC(cpotri)(char *, blasint *, float  *, blasint *, blasint *);
 int BLASFUNC(zpotri)(char *, blasint *, double *, blasint *, blasint *);
 int BLASFUNC(xpotri)(char *, blasint *, xdouble *, blasint *, blasint *);

 /* ?potrs: presumably (UPLO, N, NRHS, A, LDA, B, LDB, INFO) */
 int BLASFUNC(spotrs)(char *, blasint *, blasint *, float   *, blasint *, float   *, blasint *, blasint *);
 int BLASFUNC(dpotrs)(char *, blasint *, blasint *, double  *, blasint *, double  *, blasint *, blasint *);
 int BLASFUNC(qpotrs)(char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, blasint *, blasint *);
 int BLASFUNC(cpotrs)(char *, blasint *, blasint *, float   *, blasint *, float   *, blasint *, blasint *);
 int BLASFUNC(zpotrs)(char *, blasint *, blasint *, double  *, blasint *, double  *, blasint *, blasint *);
 int BLASFUNC(xpotrs)(char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, blasint *, blasint *);

 /* ?lauu2: presumably (UPLO, N, A, LDA, INFO) */
 int BLASFUNC(slauu2)(char *, blasint *, float  *, blasint *, blasint *);
 int BLASFUNC(dlauu2)(char *, blasint *, double *, blasint *, blasint *);
 int BLASFUNC(qlauu2)(char *, blasint *, xdouble *, blasint *, blasint *);
 int BLASFUNC(clauu2)(char *, blasint *, float  *, blasint *, blasint *);
 int BLASFUNC(zlauu2)(char *, blasint *, double *, blasint *, blasint *);
 int BLASFUNC(xlauu2)(char *, blasint *, xdouble *, blasint *, blasint *);

 /* ?lauum: same parameter list as ?lauu2, presumably (UPLO, N, A, LDA, INFO) */
 int BLASFUNC(slauum)(char *, blasint *, float  *, blasint *, blasint *);
 int BLASFUNC(dlauum)(char *, blasint *, double *, blasint *, blasint *);
 int BLASFUNC(qlauum)(char *, blasint *, xdouble *, blasint *, blasint *);
 int BLASFUNC(clauum)(char *, blasint *, float  *, blasint *, blasint *);
 int BLASFUNC(zlauum)(char *, blasint *, double *, blasint *, blasint *);
 int BLASFUNC(xlauum)(char *, blasint *, xdouble *, blasint *, blasint *);

 /* ?trti2: presumably (UPLO, DIAG, N, A, LDA, INFO) */
 int BLASFUNC(strti2)(char *, char *, blasint *, float  *, blasint *, blasint *);
 int BLASFUNC(dtrti2)(char *, char *, blasint *, double *, blasint *, blasint *);
 int BLASFUNC(qtrti2)(char *, char *, blasint *, xdouble *, blasint *, blasint *);
 int BLASFUNC(ctrti2)(char *, char *, blasint *, float  *, blasint *, blasint *);
 int BLASFUNC(ztrti2)(char *, char *, blasint *, double *, blasint *, blasint *);
 int BLASFUNC(xtrti2)(char *, char *, blasint *, xdouble *, blasint *, blasint *);

 /* ?trtri: same parameter list as ?trti2, presumably (UPLO, DIAG, N, A, LDA, INFO) */
 int BLASFUNC(strtri)(char *, char *, blasint *, float  *, blasint *, blasint *);
 int BLASFUNC(dtrtri)(char *, char *, blasint *, double *, blasint *, blasint *);
 int BLASFUNC(qtrtri)(char *, char *, blasint *, xdouble *, blasint *, blasint *);
 int BLASFUNC(ctrtri)(char *, char *, blasint *, float  *, blasint *, blasint *);
 int BLASFUNC(ztrtri)(char *, char *, blasint *, double *, blasint *, blasint *);
 int BLASFUNC(xtrtri)(char *, char *, blasint *, xdouble *, blasint *, blasint *);


 /* ?lamch / ?lamc3 for the real types (s, d, q). Unlike the int-returning
  * routines declared before them, these return a floating-point value
  * directly. The single-precision variants are declared with FLOATRET instead
  * of float; FLOATRET is defined outside this section (presumably to match how
  * the Fortran compiler returns REAL function results -- TODO confirm against
  * its definition). */
 FLOATRET  BLASFUNC(slamch)(char *);
 double    BLASFUNC(dlamch)(char *);
 xdouble   BLASFUNC(qlamch)(char *);

 FLOATRET  BLASFUNC(slamc3)(float *, float *);
 double    BLASFUNC(dlamc3)(double *, double *);
 xdouble   BLASFUNC(qlamc3)(xdouble *, xdouble *);

 /* BLAS extensions: ?axpby, ?omatcopy, ?imatcopy and ?geadd */

 /* ?axpby for the four standard types. All four take seven pointer arguments
  * in the same positions. In the real variants arguments 2 and 5 are typed
  * float * / double *; in the complex variants (c, z) the same two positions
  * are untyped void *, while the vector arguments stay float * / double *.
  * Presumably the void * arguments point at a complex scalar stored as a
  * (real, imaginary) pair -- TODO confirm against the implementation. */
 void    BLASFUNC(saxpby) (blasint *, float  *, float  *, blasint *, float *, float  *, blasint *);
 void    BLASFUNC(daxpby) (blasint *, double  *, double  *, blasint *, double *, double  *, blasint *);
 void    BLASFUNC(caxpby) (blasint *, void  *, float  *, blasint *, void *, float  *, blasint *);
 void    BLASFUNC(zaxpby) (blasint *, void  *, double *, blasint *, void *, double  *, blasint *);

 void    BLASFUNC(somatcopy) (char *, char *, blasint *, blasint *, float  *, float  *, blasint *, float  *, blasint *);
 void    BLASFUNC(domatcopy) (char *, char *, blasint *, blasint *, double  *, double  *, blasint *, double  *, blasint *);
 void    BLASFUNC(comatcopy) (char *, char *, blasint *, blasint *, float  *, float  *, blasint *, float  *, blasint *);
 void    BLASFUNC(zomatcopy) (char *, char *, blasint *, blasint *, double  *, double  *, blasint *, double  *, blasint *);

 void    BLASFUNC(simatcopy) (char *, char *, blasint *, blasint *, float  *, float  *, blasint *, blasint *);
 void    BLASFUNC(dimatcopy) (char *, char *, blasint *, blasint *, double  *, double  *, blasint *, blasint *);
 void    BLASFUNC(cimatcopy) (char *, char *, blasint *, blasint *, float  *, float  *, blasint *, blasint *);
 void    BLASFUNC(zimatcopy) (char *, char *, blasint *, blasint *, double  *, double  *, blasint *, blasint *);

 void    BLASFUNC(sgeadd) (blasint *, blasint *, float *, float *, blasint *, float *, float *, blasint*); 
 void    BLASFUNC(dgeadd) (blasint *, blasint *, double *, double *, blasint *, double *, double *, blasint*); 
 void    BLASFUNC(cgeadd) (blasint *, blasint *, float *, float *, blasint *, float *, float *, blasint*); 
 void    BLASFUNC(zgeadd) (blasint *, blasint *, double *, double *, blasint *, double *, double *, blasint*); 


 #ifdef __cplusplus
 }

 #endif  /* __cplusplus */

 #endif
 #endif
--- a/install/include/lapack.h
+++ b/install/include/lapack.h
--- a/install/include/lapacke.h
+++ b/install/include/lapacke.h
--- a/install/include/lapacke_config.h
+++ b/install/include/lapacke_config.h
@@ -1,159 +0,0 @@
 /*****************************************************************************
  Copyright (c) 2010, Intel Corp.
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the name of Intel Corporation nor the names of its contributors
      may be used to endorse or promote products derived from this software
      without specific prior written permission.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  THE POSSIBILITY OF SUCH DAMAGE.
 ******************************************************************************
 * Contents: Native C interface to LAPACK
 * Author: Intel Corporation
 *****************************************************************************/

 #ifndef _LAPACKE_CONFIG_H_
 #define _LAPACKE_CONFIG_H_

 #ifdef __cplusplus
 #if defined(LAPACK_COMPLEX_CPP)
 #include <complex>
 #endif
 extern "C" {
 #endif /* __cplusplus */

 #include <stdlib.h>
 #include <stdint.h>
 #include <inttypes.h>

 /* Integer type used throughout the LAPACKE interface. Unless the including
  * code has already defined lapack_int, it is int64_t when LAPACK_ILP64 is
  * defined and int32_t otherwise. */
 #ifndef lapack_int
 #if defined(LAPACK_ILP64)
 #define lapack_int        int64_t
 #else
 #define lapack_int        int32_t
 #endif
 #endif

 /*
 * printf conversion specifier that matches the default lapack_int:
 * PRId64 when LAPACK_ILP64 is defined, PRId32 otherwise (both are provided
 * by <inttypes.h>). Like the PRI* macros it expands to a string literal
 * without the leading '%', e.g. printf( "%" LAPACK_IFMT "\n", n ).
 * A definition supplied before this header is included is left untouched.
 */
 #ifndef LAPACK_IFMT
 #if defined(LAPACK_ILP64)
 #define LAPACK_IFMT       PRId64
 #else
 #define LAPACK_IFMT       PRId32
 #endif
 #endif

 /* Logical (boolean) type of the interface; defaults to lapack_int, so it has
  * the same width as the integer type unless the includer overrides it. */
 #ifndef lapack_logical
 #define lapack_logical    lapack_int
 #endif

 /* When both _MSC_VER and __INTEL_CLANG_COMPILER are defined: request the
  * struct-based complex types (LAPACK_COMPLEX_STRUCTURE), make LAPACK_GLOBAL
  * expand to the unmodified lower-case name, and define NOCHANGE, the macro
  * that selects that same naming scheme in lapacke_mangling.h. */
 #if defined(_MSC_VER) && defined(__INTEL_CLANG_COMPILER)
 #define LAPACK_COMPLEX_STRUCTURE
 #define LAPACK_GLOBAL(lcname,UCNAME)  lcname
 #define NOCHANGE
 #endif

 /* Complex type selection.
  *
  * Unless the including code defined LAPACK_COMPLEX_CUSTOM (and with it its own
  * lapack_complex_float / lapack_complex_double), this section defines the two
  * complex types plus the accessor macros lapack_complex_{float,double}_real /
  * _imag, and declares the lapack_make_complex_* constructors.
  *
  * Selection order:
  *   1. MSVC (not the Intel clang-based compiler):
  *        LAPACK_COMPLEX_CPP -> std::complex<T>
  *        otherwise          -> the CRT types _Fcomplex / _Dcomplex; this branch
  *                              also defines LAPACK_COMPLEX_CUSTOM itself.
  *   2. everything else:
  *        LAPACK_COMPLEX_STRUCTURE -> plain { real, imag } structs
  *        LAPACK_COMPLEX_C99       -> C99 _Complex types
  *        LAPACK_COMPLEX_CPP       -> std::complex<T>
  *        none of the above        -> C99 _Complex types (default)
  */
 #ifndef LAPACK_COMPLEX_CUSTOM
 #if defined(_MSC_VER) && !defined(__INTEL_CLANG_COMPILER)
 #if defined(LAPACK_COMPLEX_CPP)
    #include <complex>
    #define lapack_complex_float std::complex<float>
    #define lapack_complex_double std::complex<double>
    #define lapack_complex_float_real(z)       ((z).real())
    #define lapack_complex_float_imag(z)       ((z).imag())
    #define lapack_complex_double_real(z)       ((z).real())
    #define lapack_complex_double_imag(z)       ((z).imag())
    #define _CRT_USE_C_COMPLEX_H
 #else
    #include <complex.h>
    #define LAPACK_COMPLEX_CUSTOM
    #define lapack_complex_float _Fcomplex
    #define lapack_complex_double _Dcomplex
    /* _Fcomplex and _Dcomplex are distinct struct types in the MSVC C runtime,
     * and creal()/cimag() are declared for _Dcomplex only. The float accessors
     * therefore have to use the float-specific crealf()/cimagf(). */
    #define lapack_complex_float_real(z)       (crealf(z))
    #define lapack_complex_float_imag(z)       (cimagf(z))
    #define lapack_complex_double_real(z)       (creal(z))
    #define lapack_complex_double_imag(z)       (cimag(z))
 #endif
 #else

 #if defined(LAPACK_COMPLEX_STRUCTURE)

 /* Plain-struct representation: real part first, imaginary part second. */
 typedef struct { float real, imag; } _lapack_complex_float;
 typedef struct { double real, imag; } _lapack_complex_double;
 #define lapack_complex_float  _lapack_complex_float
 #define lapack_complex_double _lapack_complex_double
 #define lapack_complex_float_real(z)  ((z).real)
 #define lapack_complex_float_imag(z)  ((z).imag)
 #define lapack_complex_double_real(z)  ((z).real)
 #define lapack_complex_double_imag(z)  ((z).imag)

 #elif defined(LAPACK_COMPLEX_C99)

 #include <complex.h>
 #define lapack_complex_float    float _Complex
 #define lapack_complex_double   double _Complex
 #define lapack_complex_float_real(z)       (creal(z))
 #define lapack_complex_float_imag(z)       (cimag(z))
 #define lapack_complex_double_real(z)       (creal(z))
 #define lapack_complex_double_imag(z)       (cimag(z))

 #elif defined(LAPACK_COMPLEX_CPP)

 #define lapack_complex_float std::complex<float>
 #define lapack_complex_double std::complex<double>
 #define lapack_complex_float_real(z)       ((z).real())
 #define lapack_complex_float_imag(z)       ((z).imag())
 #define lapack_complex_double_real(z)       ((z).real())
 #define lapack_complex_double_imag(z)       ((z).imag())

 #else

 /* Default: same definitions as the LAPACK_COMPLEX_C99 branch. */
 #include <complex.h>
 #define lapack_complex_float    float _Complex
 #define lapack_complex_double   double _Complex
 #define lapack_complex_float_real(z)       (creal(z))
 #define lapack_complex_float_imag(z)       (cimag(z))
 #define lapack_complex_double_real(z)       (creal(z))
 #define lapack_complex_double_imag(z)       (cimag(z))

 #endif
 #endif

 /* Build a complex value from its real and imaginary parts. These are only
  * declared when the types above were chosen by this header, i.e. when
  * LAPACK_COMPLEX_CUSTOM was not defined by the includer. */
 lapack_complex_float lapack_make_complex_float( float re, float im );
 lapack_complex_double lapack_make_complex_double( double re, double im );

 #endif

 /* Allocation hooks. They default to malloc()/free() from <stdlib.h>; define
  * LAPACK_malloc and/or LAPACK_free before including this header to substitute
  * another allocator. */
 #ifndef LAPACK_malloc
 #define LAPACK_malloc( size )   malloc( size )
 #endif

 #ifndef LAPACK_free
 #define LAPACK_free( p )        free( p )
 #endif

 #ifdef __cplusplus
 }
 #endif /* __cplusplus */

 #endif /* _LAPACKE_CONFIG_H_ */
--- a/install/include/lapacke_mangling.h
+++ b/install/include/lapacke_mangling.h
@@ -1,17 +0,0 @@
 /* Include guard for this header (note that the guard macro is named
  * LAPACK_HEADER_INCLUDED, not after the file). */
 #ifndef LAPACK_HEADER_INCLUDED
 #define LAPACK_HEADER_INCLUDED

 /* LAPACK_GLOBAL(lcname,UCNAME) produces the external symbol name of a routine
  * from its lower-case and upper-case spellings. A definition supplied by the
  * includer wins; otherwise the first matching case below is used:
  *   LAPACK_GLOBAL_PATTERN_LC or ADD_      -> lcname with a trailing underscore
  *   LAPACK_GLOBAL_PATTERN_UC or UPPER     -> UCNAME
  *   LAPACK_GLOBAL_PATTERN_MC or NOCHANGE  -> lcname unchanged
  *   none of them defined                  -> lcname with a trailing underscore
  */
 #ifndef LAPACK_GLOBAL
 #if defined(LAPACK_GLOBAL_PATTERN_LC) || defined(ADD_)
 #define LAPACK_GLOBAL(lcname,UCNAME)  lcname##_
 #elif defined(LAPACK_GLOBAL_PATTERN_UC) || defined(UPPER)
 #define LAPACK_GLOBAL(lcname,UCNAME)  UCNAME
 #elif defined(LAPACK_GLOBAL_PATTERN_MC) || defined(NOCHANGE)
 #define LAPACK_GLOBAL(lcname,UCNAME)  lcname
 #else
 #define LAPACK_GLOBAL(lcname,UCNAME)  lcname##_
 #endif
 #endif

 #endif

--- a/install/include/lapacke_utils.h
+++ b/install/include/lapacke_utils.h
@@ -1,612 +0,0 @@
 /*****************************************************************************
  Copyright (c) 2014, Intel Corp.
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the name of Intel Corporation nor the names of its contributors
      may be used to endorse or promote products derived from this software
      without specific prior written permission.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  THE POSSIBILITY OF SUCH DAMAGE.
 ******************************************************************************
 * Contents: Native C interface to LAPACK utility functions
 * Author: Intel Corporation
 *****************************************************************************/

 #ifndef _LAPACKE_UTILS_H_
 #define _LAPACKE_UTILS_H_

 #include "lapacke.h"

 #ifdef __cplusplus
 extern "C" {
 #endif /* __cplusplus */

 /* Generic arithmetic helpers. Each one is defined only if the including code
  * has not already provided a macro of that name. They are textual macros: an
  * argument can be expanded more than once, so do not pass expressions with
  * side effects (e.g. MAX(i++, j)). */
 #ifndef ABS
 #define ABS(x) (((x) < 0) ? -(x) : (x))
 #endif
 #ifndef MAX
 #define MAX(x,y) (((x) > (y)) ? (x) : (y))
 #endif
 #ifndef MIN
 #define MIN(x,y) (((x) < (y)) ? (x) : (y))
 #endif
 /* Three-argument forms, built on MAX / MIN. */
 #ifndef MAX3
 #define MAX3(x,y,z) (((x) > MAX(y,z)) ? (x) : MAX(y,z))
 #endif
 #ifndef MIN3
 #define MIN3(x,y,z) (((x) < MIN(y,z)) ? (x) : MIN(y,z))
 #endif

 /* Non-zero tests for real (S, D) and complex (C, Z) values.
  *
  * The real forms use two ordered comparisons rather than `!= 0`, so a value
  * that is neither below nor above zero (including a NaN under IEEE
  * comparison rules) is reported as not non-zero.
  *
  * The complex forms take an lvalue, view its storage as two consecutive
  * floats/doubles (first element, then the one after it) and report true if
  * either is non-zero. The argument is parenthesised before its address is
  * taken so that any lvalue expression can be passed, not just a plain name. */
 #define IS_S_NONZERO(x) ( (x) < 0 || (x) > 0 )
 #define IS_D_NONZERO(x) ( (x) < 0 || (x) > 0 )
 #define IS_C_NONZERO(x) ( IS_S_NONZERO(*((float*)&(x))) ||  \
                          IS_S_NONZERO(*(((float*)&(x))+1)) )
 #define IS_Z_NONZERO(x) ( IS_D_NONZERO(*((double*)&(x))) || \
                          IS_D_NONZERO(*(((double*)&(x))+1)) )

 /* Error handler */
 void LAPACKE_xerbla( const char *name, lapack_int info );

 /* Compare two chars (case-insensitive).
  * When built with a compiler that defines __GNUC__, the declaration carries
  * the `const` function attribute, i.e. the result is declared to depend only
  * on the two argument values. The attribute sits between the parameter list
  * and the terminating ';', which is why the ';' is on a line of its own. */
 lapack_logical LAPACKE_lsame( char ca,  char cb )
 #if defined __GNUC__
  __attribute__((const))
 #endif
 	;

 /* Functions that convert 2D arrays from column-major to row-major layout and vice versa; each reads from `in` and writes to `out`. */
 void LAPACKE_cgb_trans( int matrix_layout, lapack_int m, lapack_int n,
                        lapack_int kl, lapack_int ku,
                        const lapack_complex_float *in, lapack_int ldin,
                        lapack_complex_float *out, lapack_int ldout );
 void LAPACKE_cge_trans( int matrix_layout, lapack_int m, lapack_int n,
                        const lapack_complex_float* in, lapack_int ldin,
                        lapack_complex_float* out, lapack_int ldout );
 void LAPACKE_cgg_trans( int matrix_layout, lapack_int m, lapack_int n,
                        const lapack_complex_float* in, lapack_int ldin,
                        lapack_complex_float* out, lapack_int ldout );
 void LAPACKE_chb_trans( int matrix_layout, char uplo, lapack_int n,
                        lapack_int kd,
                        const lapack_complex_float *in, lapack_int ldin,
                        lapack_complex_float *out, lapack_int ldout );
 void LAPACKE_che_trans( int matrix_layout, char uplo, lapack_int n,
                        const lapack_complex_float *in, lapack_int ldin,
                        lapack_complex_float *out, lapack_int ldout );
 void LAPACKE_chp_trans( int matrix_layout, char uplo, lapack_int n,
                        const lapack_complex_float *in,
                        lapack_complex_float *out );
 void LAPACKE_chs_trans( int matrix_layout, lapack_int n,
                        const lapack_complex_float *in, lapack_int ldin,
                        lapack_complex_float *out, lapack_int ldout );
 void LAPACKE_cpb_trans( int matrix_layout, char uplo, lapack_int n,
                        lapack_int kd,
                        const lapack_complex_float *in, lapack_int ldin,
                        lapack_complex_float *out, lapack_int ldout );
 void LAPACKE_cpf_trans( int matrix_layout, char transr, char uplo,
                        lapack_int n, const lapack_complex_float *in,
                        lapack_complex_float *out );
 void LAPACKE_cpo_trans( int matrix_layout, char uplo, lapack_int n,
                        const lapack_complex_float *in, lapack_int ldin,
                        lapack_complex_float *out, lapack_int ldout );
 void LAPACKE_cpp_trans( int matrix_layout, char uplo, lapack_int n,
                        const lapack_complex_float *in,
                        lapack_complex_float *out );
 void LAPACKE_csp_trans( int matrix_layout, char uplo, lapack_int n,
                        const lapack_complex_float *in,
                        lapack_complex_float *out );
 void LAPACKE_csy_trans( int matrix_layout, char uplo, lapack_int n,
                        const lapack_complex_float *in, lapack_int ldin,
                        lapack_complex_float *out, lapack_int ldout );
 void LAPACKE_ctb_trans( int matrix_layout, char uplo, char diag,
                        lapack_int n, lapack_int kd,
                        const lapack_complex_float *in, lapack_int ldin,
                        lapack_complex_float *out, lapack_int ldout );
 void LAPACKE_ctf_trans( int matrix_layout, char transr, char uplo, char diag,
                        lapack_int n, const lapack_complex_float *in,
                        lapack_complex_float *out );
 void LAPACKE_ctp_trans( int matrix_layout, char uplo, char diag,
                        lapack_int n, const lapack_complex_float *in,
                        lapack_complex_float *out );
 void LAPACKE_ctr_trans( int matrix_layout, char uplo, char diag, lapack_int n,
                        const lapack_complex_float *in, lapack_int ldin,
                        lapack_complex_float *out, lapack_int ldout );
 void LAPACKE_ctz_trans( int matrix_layout, char direct, char uplo,
                        char diag, lapack_int m, lapack_int n,
                        const lapack_complex_float *in, lapack_int ldin,
                        lapack_complex_float *out, lapack_int ldout );

 void LAPACKE_dgb_trans( int matrix_layout, lapack_int m, lapack_int n,
                        lapack_int kl, lapack_int ku,
                        const double *in, lapack_int ldin,
                        double *out, lapack_int ldout );
 void LAPACKE_dge_trans( int matrix_layout, lapack_int m, lapack_int n,
                        const double* in, lapack_int ldin,
                        double* out, lapack_int ldout );
 void LAPACKE_dgg_trans( int matrix_layout, lapack_int m, lapack_int n,
                        const double* in, lapack_int ldin,
                        double* out, lapack_int ldout );
 void LAPACKE_dhs_trans( int matrix_layout, lapack_int n,
                        const double *in, lapack_int ldin,
                        double *out, lapack_int ldout );
 void LAPACKE_dpb_trans( int matrix_layout, char uplo, lapack_int n,
                        lapack_int kd,
                        const double *in, lapack_int ldin,
                        double *out, lapack_int ldout );
 void LAPACKE_dpf_trans( int matrix_layout, char transr, char uplo,
                        lapack_int n, const double *in,
                        double *out );
 void LAPACKE_dpo_trans( int matrix_layout, char uplo, lapack_int n,
                        const double *in, lapack_int ldin,
                        double *out, lapack_int ldout );
 void LAPACKE_dpp_trans( int matrix_layout, char uplo, lapack_int n,
                        const double *in,
                        double *out );
 void LAPACKE_dsb_trans( int matrix_layout, char uplo, lapack_int n,
                        lapack_int kd,
                        const double *in, lapack_int ldin,
                        double *out, lapack_int ldout );
 void LAPACKE_dsp_trans( int matrix_layout, char uplo, lapack_int n,
                        const double *in,
                        double *out );
 void LAPACKE_dsy_trans( int matrix_layout, char uplo, lapack_int n,
                        const double *in, lapack_int ldin,
                        double *out, lapack_int ldout );
 void LAPACKE_dtb_trans( int matrix_layout, char uplo, char diag,
                        lapack_int n, lapack_int kd,
                        const double *in, lapack_int ldin,
                        double *out, lapack_int ldout );
 void LAPACKE_dtf_trans( int matrix_layout, char transr, char uplo, char diag,
                        lapack_int n, const double *in,
                        double *out );
 void LAPACKE_dtp_trans( int matrix_layout, char uplo, char diag,
                        lapack_int n, const double *in,
                        double *out );
 void LAPACKE_dtr_trans( int matrix_layout, char uplo, char diag, lapack_int n,
                        const double *in, lapack_int ldin,
                        double *out, lapack_int ldout );
 void LAPACKE_dtz_trans( int matrix_layout, char direct, char uplo,
                        char diag, lapack_int m, lapack_int n,
                        const double *in, lapack_int ldin,
                        double *out, lapack_int ldout );

 void LAPACKE_sgb_trans( int matrix_layout, lapack_int m, lapack_int n,
                        lapack_int kl, lapack_int ku,
                        const float *in, lapack_int ldin,
                        float *out, lapack_int ldout );
 void LAPACKE_sge_trans( int matrix_layout, lapack_int m, lapack_int n,
                        const float* in, lapack_int ldin,
                        float* out, lapack_int ldout );
 void LAPACKE_sgg_trans( int matrix_layout, lapack_int m, lapack_int n,
                        const float* in, lapack_int ldin,
                        float* out, lapack_int ldout );
 void LAPACKE_shs_trans( int matrix_layout, lapack_int n,
                        const float *in, lapack_int ldin,
                        float *out, lapack_int ldout );
 void LAPACKE_spb_trans( int matrix_layout, char uplo, lapack_int n,
                        lapack_int kd,
                        const float *in, lapack_int ldin,
                        float *out, lapack_int ldout );
 void LAPACKE_spf_trans( int matrix_layout, char transr, char uplo,
                        lapack_int n, const float *in,
                        float *out );
 void LAPACKE_spo_trans( int matrix_layout, char uplo, lapack_int n,
                        const float *in, lapack_int ldin,
                        float *out, lapack_int ldout );
 void LAPACKE_spp_trans( int matrix_layout, char uplo, lapack_int n,
                        const float *in,
                        float *out );
 void LAPACKE_ssb_trans( int matrix_layout, char uplo, lapack_int n,
                        lapack_int kd,
                        const float *in, lapack_int ldin,
                        float *out, lapack_int ldout );
 void LAPACKE_ssp_trans( int matrix_layout, char uplo, lapack_int n,
                        const float *in,
                        float *out );
 void LAPACKE_ssy_trans( int matrix_layout, char uplo, lapack_int n,
                        const float *in, lapack_int ldin,
                        float *out, lapack_int ldout );
 void LAPACKE_stb_trans( int matrix_layout, char uplo, char diag,
                        lapack_int n, lapack_int kd,
                        const float *in, lapack_int ldin,
                        float *out, lapack_int ldout );
 void LAPACKE_stf_trans( int matrix_layout, char transr, char uplo, char diag,
                        lapack_int n, const float *in,
                        float *out );
 void LAPACKE_stp_trans( int matrix_layout, char uplo, char diag,
                        lapack_int n, const float *in,
                        float *out );
 void LAPACKE_str_trans( int matrix_layout, char uplo, char diag, lapack_int n,
                        const float *in, lapack_int ldin,
                        float *out, lapack_int ldout );
 void LAPACKE_stz_trans( int matrix_layout, char direct, char uplo,
                        char diag, lapack_int m, lapack_int n,
                        const float *in, lapack_int ldin,
                        float *out, lapack_int ldout );

 void LAPACKE_zgb_trans( int matrix_layout, lapack_int m, lapack_int n,
                        lapack_int kl, lapack_int ku,
                        const lapack_complex_double *in, lapack_int ldin,
                        lapack_complex_double *out, lapack_int ldout );
 void LAPACKE_zge_trans( int matrix_layout, lapack_int m, lapack_int n,
                        const lapack_complex_double* in, lapack_int ldin,
                        lapack_complex_double* out, lapack_int ldout );
 void LAPACKE_zgg_trans( int matrix_layout, lapack_int m, lapack_int n,
                        const lapack_complex_double* in, lapack_int ldin,
                        lapack_complex_double* out, lapack_int ldout );
 void LAPACKE_zhb_trans( int matrix_layout, char uplo, lapack_int n,
                        lapack_int kd,
                        const lapack_complex_double *in, lapack_int ldin,
                        lapack_complex_double *out, lapack_int ldout );
 void LAPACKE_zhe_trans( int matrix_layout, char uplo, lapack_int n,
                        const lapack_complex_double *in, lapack_int ldin,
                        lapack_complex_double *out, lapack_int ldout );
 void LAPACKE_zhp_trans( int matrix_layout, char uplo, lapack_int n,
                        const lapack_complex_double *in,
                        lapack_complex_double *out );
 void LAPACKE_zhs_trans( int matrix_layout, lapack_int n,
                        const lapack_complex_double *in, lapack_int ldin,
                        lapack_complex_double *out, lapack_int ldout );
 void LAPACKE_zpb_trans( int matrix_layout, char uplo, lapack_int n,
                        lapack_int kd,
                        const lapack_complex_double *in, lapack_int ldin,
                        lapack_complex_double *out, lapack_int ldout );
 void LAPACKE_zpf_trans( int matrix_layout, char transr, char uplo,
                        lapack_int n, const lapack_complex_double *in,
                        lapack_complex_double *out );
 void LAPACKE_zpo_trans( int matrix_layout, char uplo, lapack_int n,
                        const lapack_complex_double *in, lapack_int ldin,
                        lapack_complex_double *out, lapack_int ldout );
 void LAPACKE_zpp_trans( int matrix_layout, char uplo, lapack_int n,
                        const lapack_complex_double *in,
                        lapack_complex_double *out );
 void LAPACKE_zsp_trans( int matrix_layout, char uplo, lapack_int n,
                        const lapack_complex_double *in,
                        lapack_complex_double *out );
 void LAPACKE_zsy_trans( int matrix_layout, char uplo, lapack_int n,
                        const lapack_complex_double *in, lapack_int ldin,
                        lapack_complex_double *out, lapack_int ldout );
 void LAPACKE_ztb_trans( int matrix_layout, char uplo, char diag,
                        lapack_int n, lapack_int kd,
                        const lapack_complex_double *in, lapack_int ldin,
                        lapack_complex_double *out, lapack_int ldout );
 void LAPACKE_ztf_trans( int matrix_layout, char transr, char uplo, char diag,
                        lapack_int n, const lapack_complex_double *in,
                        lapack_complex_double *out );
 void LAPACKE_ztp_trans( int matrix_layout, char uplo, char diag,
                        lapack_int n, const lapack_complex_double *in,
                        lapack_complex_double *out );
 void LAPACKE_ztr_trans( int matrix_layout, char uplo, char diag, lapack_int n,
                        const lapack_complex_double *in, lapack_int ldin,
                        lapack_complex_double *out, lapack_int ldout );
 void LAPACKE_ztz_trans( int matrix_layout, char direct, char uplo,
                        char diag, lapack_int m, lapack_int n,
                        const lapack_complex_double *in, lapack_int ldin,
                        lapack_complex_double *out, lapack_int ldout );

 /* NaN checkers.
  * A NaN is the only value that compares unequal to itself, so every macro
  * below boils down to (x) != (x).  The argument is parenthesized so that an
  * expression containing low-precedence operators (e.g. "c ? a : b") expands
  * as intended.  The argument is still evaluated more than once, so do not
  * pass expressions with side effects.
  * The complex variants read two consecutive float/double values starting at
  * the address of x, hence x must be an lvalue. */
 #define LAPACK_SISNAN( x ) ( (x) != (x) )
 #define LAPACK_DISNAN( x ) ( (x) != (x) )
 #define LAPACK_CISNAN( x ) ( LAPACK_SISNAN(*((float*) &(x))) || \
                              LAPACK_SISNAN(*(((float*) &(x))+1)) )
 #define LAPACK_ZISNAN( x ) ( LAPACK_DISNAN(*((double*)&(x))) || \
                              LAPACK_DISNAN(*(((double*)&(x))+1)) )

 /* NaN checkers for vectors */
 lapack_logical LAPACKE_c_nancheck( lapack_int n,
                                    const lapack_complex_float *x,
                                    lapack_int incx );
 lapack_logical LAPACKE_d_nancheck( lapack_int n,
                                    const double *x,
                                    lapack_int incx );
 lapack_logical LAPACKE_s_nancheck( lapack_int n,
                                    const float *x,
                                    lapack_int incx );
 lapack_logical LAPACKE_z_nancheck( lapack_int n,
                                    const lapack_complex_double *x,
                                    lapack_int incx );
 /* NaN checkers for matrices */
 lapack_logical LAPACKE_cgb_nancheck( int matrix_layout, lapack_int m,
                                      lapack_int n, lapack_int kl,
                                      lapack_int ku,
                                      const lapack_complex_float *ab,
                                      lapack_int ldab );
 lapack_logical LAPACKE_cge_nancheck( int matrix_layout, lapack_int m,
                                      lapack_int n,
                                      const lapack_complex_float *a,
                                      lapack_int lda );
 lapack_logical LAPACKE_cgg_nancheck( int matrix_layout, lapack_int m,
                                      lapack_int n,
                                      const lapack_complex_float *a,
                                      lapack_int lda );
 lapack_logical LAPACKE_cgt_nancheck( lapack_int n,
                                      const lapack_complex_float *dl,
                                      const lapack_complex_float *d,
                                      const lapack_complex_float *du );
 lapack_logical LAPACKE_chb_nancheck( int matrix_layout, char uplo,
                                      lapack_int n, lapack_int kd,
                                      const lapack_complex_float* ab,
                                      lapack_int ldab );
 lapack_logical LAPACKE_che_nancheck( int matrix_layout, char uplo,
                                      lapack_int n,
                                      const lapack_complex_float *a,
                                      lapack_int lda );
 lapack_logical LAPACKE_chp_nancheck( lapack_int n,
                                      const lapack_complex_float *ap );
 lapack_logical LAPACKE_chs_nancheck( int matrix_layout, lapack_int n,
                                      const lapack_complex_float *a,
                                      lapack_int lda );
 lapack_logical LAPACKE_cpb_nancheck( int matrix_layout, char uplo,
                                      lapack_int n, lapack_int kd,
                                      const lapack_complex_float* ab,
                                      lapack_int ldab );
 lapack_logical LAPACKE_cpf_nancheck( lapack_int n,
                                      const lapack_complex_float *a );
 lapack_logical LAPACKE_cpo_nancheck( int matrix_layout, char uplo,
                                      lapack_int n,
                                      const lapack_complex_float *a,
                                      lapack_int lda );
 lapack_logical LAPACKE_cpp_nancheck( lapack_int n,
                                      const lapack_complex_float *ap );
 lapack_logical LAPACKE_cpt_nancheck( lapack_int n,
                                      const float *d,
                                      const lapack_complex_float *e );
 lapack_logical LAPACKE_csp_nancheck( lapack_int n,
                                      const lapack_complex_float *ap );
 lapack_logical LAPACKE_cst_nancheck( lapack_int n,
                                      const lapack_complex_float *d,
                                      const lapack_complex_float *e );
 lapack_logical LAPACKE_csy_nancheck( int matrix_layout, char uplo,
                                      lapack_int n,
                                      const lapack_complex_float *a,
                                      lapack_int lda );
 lapack_logical LAPACKE_ctb_nancheck( int matrix_layout, char uplo, char diag,
                                      lapack_int n, lapack_int kd,
                                      const lapack_complex_float* ab,
                                      lapack_int ldab );
 lapack_logical LAPACKE_ctf_nancheck( int matrix_layout, char transr,
                                      char uplo, char diag,
                                      lapack_int n,
                                      const lapack_complex_float *a );
 lapack_logical LAPACKE_ctp_nancheck( int matrix_layout, char uplo, char diag,
                                      lapack_int n,
                                      const lapack_complex_float *ap );
 lapack_logical LAPACKE_ctr_nancheck( int matrix_layout, char uplo, char diag,
                                      lapack_int n,
                                      const lapack_complex_float *a,
                                      lapack_int lda );
 lapack_logical LAPACKE_ctz_nancheck( int matrix_layout, char direct, char uplo,
                                     char diag, lapack_int m, lapack_int n,
                                     const lapack_complex_float *a,
                                     lapack_int lda );

 lapack_logical LAPACKE_dgb_nancheck( int matrix_layout, lapack_int m,
                                      lapack_int n, lapack_int kl,
                                      lapack_int ku,
                                      const double *ab,
                                      lapack_int ldab );
 lapack_logical LAPACKE_dge_nancheck( int matrix_layout, lapack_int m,
                                      lapack_int n,
                                      const double *a,
                                      lapack_int lda );
 lapack_logical LAPACKE_dgg_nancheck( int matrix_layout, lapack_int m,
                                      lapack_int n,
                                      const double *a,
                                      lapack_int lda );
 lapack_logical LAPACKE_dgt_nancheck( lapack_int n,
                                      const double *dl,
                                      const double *d,
                                      const double *du );
 lapack_logical LAPACKE_dhs_nancheck( int matrix_layout, lapack_int n,
                                      const double *a,
                                      lapack_int lda );
 lapack_logical LAPACKE_dpb_nancheck( int matrix_layout, char uplo,
                                      lapack_int n, lapack_int kd,
                                      const double* ab,
                                      lapack_int ldab );
 lapack_logical LAPACKE_dpf_nancheck( lapack_int n,
                                      const double *a );
 lapack_logical LAPACKE_dpo_nancheck( int matrix_layout, char uplo,
                                      lapack_int n,
                                      const double *a,
                                      lapack_int lda );
 lapack_logical LAPACKE_dpp_nancheck( lapack_int n,
                                      const double *ap );
 lapack_logical LAPACKE_dpt_nancheck( lapack_int n,
                                      const double *d,
                                      const double *e );
 lapack_logical LAPACKE_dsb_nancheck( int matrix_layout, char uplo,
                                      lapack_int n, lapack_int kd,
                                      const double* ab,
                                      lapack_int ldab );
 lapack_logical LAPACKE_dsp_nancheck( lapack_int n,
                                      const double *ap );
 lapack_logical LAPACKE_dst_nancheck( lapack_int n,
                                      const double *d,
                                      const double *e );
 lapack_logical LAPACKE_dsy_nancheck( int matrix_layout, char uplo,
                                      lapack_int n,
                                      const double *a,
                                      lapack_int lda );
 lapack_logical LAPACKE_dtb_nancheck( int matrix_layout, char uplo, char diag,
                                      lapack_int n, lapack_int kd,
                                      const double* ab,
                                      lapack_int ldab );
 lapack_logical LAPACKE_dtf_nancheck( int matrix_layout, char transr,
                                      char uplo, char diag,
                                      lapack_int n,
                                      const double *a );
 lapack_logical LAPACKE_dtp_nancheck( int matrix_layout, char uplo, char diag,
                                      lapack_int n,
                                      const double *ap );
 lapack_logical LAPACKE_dtr_nancheck( int matrix_layout, char uplo, char diag,
                                      lapack_int n,
                                      const double *a,
                                      lapack_int lda );
 lapack_logical LAPACKE_dtz_nancheck( int matrix_layout, char direct, char uplo,
                                     char diag, lapack_int m, lapack_int n,
                                     const double *a, lapack_int lda );

 lapack_logical LAPACKE_sgb_nancheck( int matrix_layout, lapack_int m,
                                      lapack_int n, lapack_int kl,
                                      lapack_int ku,
                                      const float *ab,
                                      lapack_int ldab );
 lapack_logical LAPACKE_sge_nancheck( int matrix_layout, lapack_int m,
                                      lapack_int n,
                                      const float *a,
                                      lapack_int lda );
 lapack_logical LAPACKE_sgg_nancheck( int matrix_layout, lapack_int m,
                                      lapack_int n,
                                      const float *a,
                                      lapack_int lda );
 lapack_logical LAPACKE_sgt_nancheck( lapack_int n,
                                      const float *dl,
                                      const float *d,
                                      const float *du );
 lapack_logical LAPACKE_shs_nancheck( int matrix_layout, lapack_int n,
                                      const float *a,
                                      lapack_int lda );
 lapack_logical LAPACKE_spb_nancheck( int matrix_layout, char uplo,
                                      lapack_int n, lapack_int kd,
                                      const float* ab,
                                      lapack_int ldab );
 lapack_logical LAPACKE_spf_nancheck( lapack_int n,
                                      const float *a );
 lapack_logical LAPACKE_spo_nancheck( int matrix_layout, char uplo,
                                      lapack_int n,
                                      const float *a,
                                      lapack_int lda );
 lapack_logical LAPACKE_spp_nancheck( lapack_int n,
                                      const float *ap );
 lapack_logical LAPACKE_spt_nancheck( lapack_int n,
                                      const float *d,
                                      const float *e );
 lapack_logical LAPACKE_ssb_nancheck( int matrix_layout, char uplo,
                                      lapack_int n, lapack_int kd,
                                      const float* ab,
                                      lapack_int ldab );
 lapack_logical LAPACKE_ssp_nancheck( lapack_int n,
                                      const float *ap );
 lapack_logical LAPACKE_sst_nancheck( lapack_int n,
                                      const float *d,
                                      const float *e );
 lapack_logical LAPACKE_ssy_nancheck( int matrix_layout, char uplo,
                                      lapack_int n,
                                      const float *a,
                                      lapack_int lda );
 lapack_logical LAPACKE_stb_nancheck( int matrix_layout, char uplo, char diag,
                                      lapack_int n, lapack_int kd,
                                      const float* ab,
                                      lapack_int ldab );
 lapack_logical LAPACKE_stf_nancheck( int matrix_layout, char transr,
                                      char uplo, char diag,
                                      lapack_int n,
                                      const float *a );
 lapack_logical LAPACKE_stp_nancheck( int matrix_layout, char uplo, char diag,
                                      lapack_int n,
                                      const float *ap );
 lapack_logical LAPACKE_str_nancheck( int matrix_layout, char uplo, char diag,
                                      lapack_int n,
                                      const float *a,
                                      lapack_int lda );
 lapack_logical LAPACKE_stz_nancheck( int matrix_layout, char direct, char uplo,
                                     char diag, lapack_int m, lapack_int n,
                                     const float *a, lapack_int lda );

 lapack_logical LAPACKE_zgb_nancheck( int matrix_layout, lapack_int m,
                                      lapack_int n, lapack_int kl,
                                      lapack_int ku,
                                      const lapack_complex_double *ab,
                                      lapack_int ldab );
 lapack_logical LAPACKE_zge_nancheck( int matrix_layout, lapack_int m,
                                      lapack_int n,
                                      const lapack_complex_double *a,
                                      lapack_int lda );
 lapack_logical LAPACKE_zgg_nancheck( int matrix_layout, lapack_int m,
                                      lapack_int n,
                                      const lapack_complex_double *a,
                                      lapack_int lda );
 lapack_logical LAPACKE_zgt_nancheck( lapack_int n,
                                      const lapack_complex_double *dl,
                                      const lapack_complex_double *d,
                                      const lapack_complex_double *du );
 lapack_logical LAPACKE_zhb_nancheck( int matrix_layout, char uplo,
                                      lapack_int n, lapack_int kd,
                                      const lapack_complex_double* ab,
                                      lapack_int ldab );
 lapack_logical LAPACKE_zhe_nancheck( int matrix_layout, char uplo,
                                      lapack_int n,
                                      const lapack_complex_double *a,
                                      lapack_int lda );
 lapack_logical LAPACKE_zhp_nancheck( lapack_int n,
                                      const lapack_complex_double *ap );
 lapack_logical LAPACKE_zhs_nancheck( int matrix_layout, lapack_int n,
                                      const lapack_complex_double *a,
                                      lapack_int lda );
 lapack_logical LAPACKE_zpb_nancheck( int matrix_layout, char uplo,
                                      lapack_int n, lapack_int kd,
                                      const lapack_complex_double* ab,
                                      lapack_int ldab );
 lapack_logical LAPACKE_zpf_nancheck( lapack_int n,
                                      const lapack_complex_double *a );
 lapack_logical LAPACKE_zpo_nancheck( int matrix_layout, char uplo,
                                      lapack_int n,
                                      const lapack_complex_double *a,
                                      lapack_int lda );
 lapack_logical LAPACKE_zpp_nancheck( lapack_int n,
                                      const lapack_complex_double *ap );
 lapack_logical LAPACKE_zpt_nancheck( lapack_int n,
                                      const double *d,
                                      const lapack_complex_double *e );
 lapack_logical LAPACKE_zsp_nancheck( lapack_int n,
                                      const lapack_complex_double *ap );
 lapack_logical LAPACKE_zst_nancheck( lapack_int n,
                                      const lapack_complex_double *d,
                                      const lapack_complex_double *e );
 lapack_logical LAPACKE_zsy_nancheck( int matrix_layout, char uplo,
                                      lapack_int n,
                                      const lapack_complex_double *a,
                                      lapack_int lda );
 lapack_logical LAPACKE_ztb_nancheck( int matrix_layout, char uplo, char diag,
                                      lapack_int n, lapack_int kd,
                                      const lapack_complex_double* ab,
                                      lapack_int ldab );
 lapack_logical LAPACKE_ztf_nancheck( int matrix_layout, char transr,
                                      char uplo, char diag,
                                      lapack_int n,
                                      const lapack_complex_double *a );
 lapack_logical LAPACKE_ztp_nancheck( int matrix_layout, char uplo, char diag,
                                      lapack_int n,
                                      const lapack_complex_double *ap );
 lapack_logical LAPACKE_ztr_nancheck( int matrix_layout, char uplo, char diag,
                                      lapack_int n,
                                      const lapack_complex_double *a,
                                      lapack_int lda );
 lapack_logical LAPACKE_ztz_nancheck( int matrix_layout, char direct, char uplo,
                                     char diag, lapack_int m, lapack_int n,
                                     const lapack_complex_double *a,
                                     lapack_int lda );

 #ifdef __cplusplus
 }
 #endif /* __cplusplus */

 #endif  /* _LAPACKE_UTILS_H_ */
--- a/install/include/openblas_config.h
+++ b/install/include/openblas_config.h
@@ -1,136 +0,0 @@
 #ifndef OPENBLAS_CONFIG_H
 #define OPENBLAS_CONFIG_H
 #define OPENBLAS_OS_LINUX 1
 #define OPENBLAS_ARCH_RISCV64 1
 #define OPENBLAS_C_GCC 1
 #define OPENBLAS___64BIT__ 1
 #define OPENBLAS_HAVE_C11 1
 #define OPENBLAS_PTHREAD_CREATE_FUNC pthread_create
 #define OPENBLAS_BUNDERSCORE _
 #define OPENBLAS_NEEDBUNDERSCORE 1
 #define OPENBLAS_RISCV64_ZVL128B 
 #define OPENBLAS_L1_DATA_SIZE 32768
 #define OPENBLAS_L1_DATA_LINESIZE 32
 #define OPENBLAS_L2_SIZE 1048576
 #define OPENBLAS_L2_LINESIZE 32
 #define OPENBLAS_DTB_DEFAULT_ENTRIES 128
 #define OPENBLAS_DTB_SIZE 4096
 #define OPENBLAS_L2_ASSOCIATIVE 4
 #define OPENBLAS_CORE_RISCV64_ZVL128B 
 #define OPENBLAS_CHAR_CORENAME "RISCV64_ZVL128B"
 #define OPENBLAS_GEMM_MULTITHREAD_THRESHOLD 4
 #define OPENBLAS_VERSION " OpenBLAS 0.3.29.dev "
 /*This is only for "make install" target.*/

 /* Windows-family targets (WINNT, Cygwin, Interix): define the Windows ABI and
    Windows OS flags.  OPENBLAS_OS_WINDOWS is tested further down when the
    BLASLONG type is chosen. */
 #if defined(OPENBLAS_OS_WINNT) || defined(OPENBLAS_OS_CYGWIN_NT) || defined(OPENBLAS_OS_INTERIX)
 #define OPENBLAS_WINDOWS_ABI
 #define OPENBLAS_OS_WINDOWS

 /* Remove any DOUBLE macro on these targets.  DOUBLE_DEFINED expands to the
    token DOUBLE itself, so once DOUBLE is undefined it no longer carries the
    previous value; it only records that DOUBLE had been defined.
    NOTE(review): presumably this avoids a clash with an identifier named
    DOUBLE in Windows system headers -- confirm. */
 #ifdef DOUBLE
 #define DOUBLE_DEFINED DOUBLE
 #undef  DOUBLE
 #endif
 #endif

 /* BLASFUNC(FUNC) expands to FUNC with a trailing underscore pasted on when
    OPENBLAS_NEEDBUNDERSCORE is defined, and to FUNC unchanged otherwise.
    NOTE(review): presumably this mirrors the Fortran compiler's symbol naming
    -- confirm. */
 #ifdef OPENBLAS_NEEDBUNDERSCORE
 #define BLASFUNC(FUNC) FUNC##_
 #else
 #define BLASFUNC(FUNC) FUNC
 #endif

 /* xdouble: the extended-precision scalar type.
    - OPENBLAS_QUAD_PRECISION: a struct wrapping two unsigned long words
      (a real typedef);
    - OPENBLAS_EXPRECISION:    long double (a macro, not a typedef);
    - otherwise:               plain double (also a macro). */
 #ifdef OPENBLAS_QUAD_PRECISION
 typedef struct {
  unsigned long x[2];
 }  xdouble;
 #elif defined OPENBLAS_EXPRECISION
 #define xdouble long double
 #else
 #define xdouble double
 #endif

 /* BLASLONG / BLASULONG: signed and unsigned long integer types.  64-bit
    Windows builds use long long; every other configuration uses long.
    NOTE(review): presumably because long is only 32 bits wide under the 64-bit
    Windows ABI -- confirm. */
 #if defined(OPENBLAS_OS_WINDOWS) && defined(OPENBLAS___64BIT__)
 typedef long long BLASLONG;
 typedef unsigned long long BLASULONG;
 #else
 typedef long BLASLONG;
 typedef unsigned long BLASULONG;
 #endif

 /* bfloat16 is declared as a 16-bit unsigned integer unless the including code
    has already defined the BFLOAT16 macro. */
 #ifndef BFLOAT16
 #include <stdint.h>
 typedef uint16_t bfloat16;
 #endif

 /* hfloat16: the compiler's native _Float16 when building with GCC 12 or newer,
    otherwise a 16-bit unsigned integer.
    NOTE(review): the test looks only at the compiler version, not at whether
    the target actually supports _Float16 -- confirm this is sufficient. */
 #if defined(__GNUC__) && (__GNUC__ >= 12)
 typedef _Float16 hfloat16;
 #else
 #include <stdint.h>
 typedef uint16_t hfloat16;
 #endif

 /* blasint: BLASLONG when OPENBLAS_USE64BITINT is defined, plain int otherwise.
    NOTE(review): presumably the integer type used for dimensions and strides
    in the public interface -- confirm against cblas.h. */
 #ifdef OPENBLAS_USE64BITINT
 typedef BLASLONG blasint;
 #else
 typedef int blasint;
 #endif

 /* FLOATRET: expands to FLOAT when XDOUBLE or DOUBLE is defined.  Otherwise it
    is double when NEED_F2CCONV is defined and float when it is not.
    NOTE(review): presumably the return type of real-valued routines, widened
    to double for f2c-style calling conventions -- confirm. */
 #if defined(XDOUBLE) || defined(DOUBLE)
 #define FLOATRET	FLOAT
 #else
 #ifdef NEED_F2CCONV
 #define FLOATRET	double
 #else
 #define FLOATRET	float
 #endif
 #endif

 /* Inclusion of a standard header file is needed for definition of __STDC_*
   predefined macros with some compilers (e.g. GCC 4.7 on Linux).  This occurs
   as a side effect of including either <features.h> or <stdc-predef.h>. */
 #include <stdio.h>

 /* C99 supports complex floating numbers natively, which GCC also offers as an
   extension since version 3.0.  If neither are available, use a compatible
   structure as fallback (see Clause 6.2.5.13 of the C99 standard). */
 /* Select the representation of the openblas_complex_* types.
    The native _Complex form is used when the compiler advertises complex
    support (__STDC_IEC_559_COMPLEX__, a C99-or-later __STDC_VERSION__, or
    GCC >= 3 outside C++), unless FORCE_OPENBLAS_COMPLEX_STRUCT is defined or
    the compiler is MSVC.  Exactly one of OPENBLAS_COMPLEX_C99 and
    OPENBLAS_COMPLEX_STRUCT ends up defined. */
 #if ((defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || \
      (__GNUC__ >= 3 && !defined(__cplusplus))) && !(defined(FORCE_OPENBLAS_COMPLEX_STRUCT))) && !defined(_MSC_VER)
  #define OPENBLAS_COMPLEX_C99
 /* <complex.h> is only pulled in for C; a C++ translation unit taking this
    branch gets the typedefs and macros without that header. */
 #ifndef __cplusplus
  #include <complex.h>
 #endif
  typedef float _Complex openblas_complex_float;
  typedef double _Complex openblas_complex_double;
  typedef xdouble _Complex openblas_complex_xdouble;
  /* Constructors build the value arithmetically as real + imag * _Complex_I. */
  #define openblas_make_complex_float(real, imag)    ((real) + ((imag) * _Complex_I))
  #define openblas_make_complex_double(real, imag)   ((real) + ((imag) * _Complex_I))
  #define openblas_make_complex_xdouble(real, imag)  ((real) + ((imag) * _Complex_I))
  /* Accessors: all three precisions go through creal()/cimag(). */
  #define openblas_complex_float_real(z)             (creal(z))
  #define openblas_complex_float_imag(z)             (cimag(z))
  #define openblas_complex_double_real(z)            (creal(z))
  #define openblas_complex_double_imag(z)            (cimag(z))
  #define openblas_complex_xdouble_real(z)           (creal(z))
  #define openblas_complex_xdouble_imag(z)           (cimag(z))
 #else
  /* Fallback: plain structs with real and imag members. */
  #define OPENBLAS_COMPLEX_STRUCT
  typedef struct { float real, imag; } openblas_complex_float;
  typedef struct { double real, imag; } openblas_complex_double;
  typedef struct { xdouble real, imag; } openblas_complex_xdouble;
  /* These constructors expand to a brace-enclosed list, so they can only be
     used where an initializer is allowed, not inside arbitrary expressions. */
  #define openblas_make_complex_float(real, imag)    {(real), (imag)}
  #define openblas_make_complex_double(real, imag)   {(real), (imag)}
  #define openblas_make_complex_xdouble(real, imag)  {(real), (imag)}
  #define openblas_complex_float_real(z)             ((z).real)
  #define openblas_complex_float_imag(z)             ((z).imag)
  #define openblas_complex_double_real(z)            ((z).real)
  #define openblas_complex_double_imag(z)            ((z).imag)
  #define openblas_complex_xdouble_real(z)           ((z).real)
  #define openblas_complex_xdouble_imag(z)           ((z).imag)
 #endif

 /* Inclusion of Linux-specific header is needed for definition of cpu_set_t. */
 /* NOTE(review): glibc documents that feature-test macros such as _GNU_SOURCE
    must be defined before the first system header is included.  <stdio.h> has
    already been included earlier in this header, so the definition below may
    come too late to expose cpu_set_t unless the including code defined
    _GNU_SOURCE itself -- confirm. */
 #ifdef OPENBLAS_OS_LINUX
 #ifndef _GNU_SOURCE
 #define _GNU_SOURCE
 #endif
 #include <sched.h>
 #endif
 #endif /* OPENBLAS_CONFIG_H */
--- a/install/lib/cmake/openblas/OpenBLASConfig.cmake
+++ b/install/lib/cmake/openblas/OpenBLASConfig.cmake
@@ -1,4 +0,0 @@
 # Version string for this OpenBLAS package.
 SET(OpenBLAS_VERSION "0.3.29.dev")
 # Resolve the package root as the real (symlink-free) path three directories
 # above the directory that contains this file.
 file(REAL_PATH "../../.." _OpenBLAS_ROOT_DIR BASE_DIRECTORY ${CMAKE_CURRENT_LIST_DIR} )
 # Header directory and shared-library path, both derived from the resolved root.
 SET(OpenBLAS_INCLUDE_DIRS ${_OpenBLAS_ROOT_DIR}/include)
 SET(OpenBLAS_LIBRARIES ${_OpenBLAS_ROOT_DIR}/lib/libopenblas.so)
--- a/install/lib/cmake/openblas/OpenBLASConfigVersion.cmake
+++ b/install/lib/cmake/openblas/OpenBLASConfigVersion.cmake
@@ -1,9 +0,0 @@
 # Version of this package, compared below against the version requested
 # through PACKAGE_FIND_VERSION.
 set (PACKAGE_VERSION "0.3.29.dev")
 # A request for a version newer than this package is rejected; any request
 # for an equal or older version is accepted.
 if (PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION)
  set (PACKAGE_VERSION_COMPATIBLE FALSE)
 else ()
  set (PACKAGE_VERSION_COMPATIBLE TRUE)
  # Additionally flag an exact match when the requested version string is
  # identical to this package's version string.
  if (PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION)
    set (PACKAGE_VERSION_EXACT TRUE)
  endif ()
 endif ()
--- a/install/lib/pkgconfig/openblas.pc
+++ b/install/lib/pkgconfig/openblas.pc
@@ -1,16 +0,0 @@
 libdir=/home/da/OpenBLAS/install/lib
 libprefix=
 libnamesuffix=
 libsuffix=
 includedir=/home/da/OpenBLAS/install/include
 omp_opt=
 openblas_config= USE_64BITINT= DYNAMIC_ARCH= DYNAMIC_OLDER= NO_CBLAS= NO_LAPACK= NO_LAPACKE= NO_AFFINITY=1 USE_OPENMP= RISCV64_ZVL128B MAX_THREADS=32
 version=0.3.29.dev
 extralib=-lm -lpthread -lgfortran -lm -lpthread -lgfortran
 Name: openblas
 Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version
 Version: ${version}
 URL: https://github.com/xianyi/OpenBLAS
 Libs: -L${libdir} -l${libprefix}openblas${libsuffix}${libnamesuffix}
 Libs.private: ${extralib}
 Cflags: -I${includedir} ${omp_opt}
--- a/install/test_shgemm
+++ b/install/test_shgemm
--- a/install/test_shgemm.c
+++ b/install/test_shgemm.c
@@ -1,45 +0,0 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <cblas.h>
 #include <riscv_vector.h>

 /*
  * Print an M x N float matrix stored contiguously in row-major order.
  * Each element is written with "%f " and every row ends with a newline.
  */
 void print_matrix(float *C, int M, int N) {
    int row = 0;
    while (row < M) {
        int col = 0;
        while (col < N) {
            /* Row-major addressing: element (row, col) lives at row * N + col. */
            const int idx = row * N + col;
            printf("%f ", C[idx]);
            ++col;
        }
        printf("\n");
        ++row;
    }
 }

 /*
  * Smoke test: run cblas_shgemm on a fixed 2x2 half-precision problem and
  * print the float result matrix.
  */
 int main() {
    /* Problem dimensions: lhs is rows x inner, rhs is inner x cols. */
    const int rows = 2;
    const int cols = 2;
    const int inner = 2;

    /* Scaling factors handed to the GEMM call. */
    const float alpha = 1.0f;
    const float beta = 0.0f;

    /* Left operand, rows x inner, row-major. */
    hfloat16 lhs[4] = {1.0, 2.0, 3.0, 4.0};

    /* Right operand, inner x cols, row-major. */
    hfloat16 rhs[4] = {5.0, 6.0, 7.0, 8.0};

    /* Output, rows x cols, row-major, zero-initialised. */
    float out[4] = {0};

    /*
     * Leading dimensions equal the row lengths of each row-major matrix:
     * inner for lhs, cols for rhs and for out.
     * NOTE(review): presumably this computes out = alpha * lhs * rhs + beta * out,
     * as the printed label suggests -- confirm against the cblas_shgemm contract.
     */
    cblas_shgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                 rows, cols, inner,
                 alpha,
                 lhs, inner,
                 rhs, cols,
                 beta,
                 out, cols);

    printf("Result C = A*B:\n");
    print_matrix(out, rows, cols);
    return 0;
 }

--- a/install/zvl_test
+++ b/install/zvl_test
--- a/install/zvl_test.c
+++ b/install/zvl_test.c
@@ -1,22 +0,0 @@
 #include <riscv_vector.h>
 #include <stdio.h>
 #include <stdlib.h>

 /*
  * Minimal RVV check: load up to 8 floats with a vector load, store them back
  * to a scratch buffer with a vector store, and print what was stored.
  *
  * Returns 0 on success, 1 if the input buffer cannot be allocated.
  */
 int main(){
    enum { ELEMS = 4 * 4, MAX_VL = 8 };

    /* vsetvl yields a size_t and never grants more than the requested count,
       so the MAX_VL-sized scratch buffer below is always large enough. */
    size_t gvl = __riscv_vsetvl_e32m2(MAX_VL);

    float *A = (float *)malloc(ELEMS * sizeof(float));
    if (A == NULL) {
        /* The buffer is filled and loaded from below; bail out instead of
           dereferencing a null pointer. */
        fprintf(stderr, "zvl_test: out of memory\n");
        return 1;
    }
    for (int i = 0; i < ELEMS; i++) {
        A[i] = i % 10;
    }

    /* Round-trip the first gvl elements through a vector register. */
    vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[0], gvl);
    float tmp[MAX_VL];
    __riscv_vse32_v_f32m2(tmp, A0, gvl);

    printf("A0 vector contents:\n");
    /* gvl <= MAX_VL, so narrowing it to int for the loop bound is lossless
       and avoids a signed/unsigned comparison. */
    for (int i = 0; i < (int)gvl; i++) {
        printf("tmp[%d] = %.2f\n", i, tmp[i]);
    }

    free(A);
    return 0;
 }
--- a/kernel/riscv64/shgemm_kernel_16x8_zvl256b.c
+++ b/kernel/riscv64/shgemm_kernel_16x8_zvl256b.c
@@ -4,18 +4,18 @@
 int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B, FLOAT *C, BLASLONG ldc)
 {
    BLASLONG gvl = 0;
    volatile BLASLONG m_top = 0;
    BLASLONG m_top = 0;
    BLASLONG n_top = 0;

    // -- MAIN PASS
    for (BLASLONG j=0; j<N/8; j+=1) {
        m_top = 0;
        BLASLONG gvl = __riscv_vsetvl_e16m1(16);// 设置向量长度为16
        BLASLONG gvl = __riscv_vsetvl_e16m1(16);

        for (BLASLONG i=0; i<M/16; i+=1) {
            BLASLONG ai=m_top*K;	// A矩阵的当前行索引
            BLASLONG bi=n_top*K;	// B矩阵的当前列索引
            // 加载B矩阵的8个元素
            BLASLONG ai=m_top*K;	
            BLASLONG bi=n_top*K;	

            _Float16 B0 = B[bi+0];
            _Float16 B1 = B[bi+1];
            _Float16 B2 = B[bi+2];
@@ -26,10 +26,9 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
            _Float16 B7 = B[bi+7];
            bi += 8;

 		    // 加载A矩阵的16个元素，并与B矩阵元素相乘
            vfloat16m1_t A0 = __riscv_vle16_v_f16m1( &A[ai+0*gvl], gvl );
            ai += 16;
 		    // 执行乘法运算，并转换为32位浮点数进行累加
            
            vfloat32m2_t result0 = __riscv_vfwmul_vf_f32m2( A0, B0, gvl);
            vfloat32m2_t result1 = __riscv_vfwmul_vf_f32m2( A0, B1, gvl);
            vfloat32m2_t result2 = __riscv_vfwmul_vf_f32m2( A0, B2, gvl);
@@ -38,7 +37,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
            vfloat32m2_t result5 = __riscv_vfwmul_vf_f32m2( A0, B5, gvl);
            vfloat32m2_t result6 = __riscv_vfwmul_vf_f32m2( A0, B6, gvl);
            vfloat32m2_t result7 = __riscv_vfwmul_vf_f32m2( A0, B7, gvl);
 		    // 循环处理K维度的剩余部分
            
            for(BLASLONG k=1; k<K; k++) {
                B0 = B[bi+0];
                B1 = B[bi+1];
@@ -51,7 +50,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
                bi += 8;
                A0 = __riscv_vle16_v_f16m1( &A[ai+0*gvl], gvl );
                ai += 16;
                // 执行乘法和累加运算
                
                result0 = __riscv_vfwmacc_vf_f32m2(result0, B0, A0, gvl);
                result1 = __riscv_vfwmacc_vf_f32m2(result1, B1, A0, gvl);
                result2 = __riscv_vfwmacc_vf_f32m2(result2, B2, A0, gvl);
@@ -61,7 +60,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
                result6 = __riscv_vfwmacc_vf_f32m2(result6, B6, A0, gvl);
                result7 = __riscv_vfwmacc_vf_f32m2(result7, B7, A0, gvl);
            }
 	        // 加载C矩阵的元素，并与计算结果相加
            
            BLASLONG ci=n_top*ldc+m_top;

            vfloat32m2_t c0 = __riscv_vle32_v_f32m2( &C[ci], gvl); ci += ldc-gvl*0;
@@ -72,7 +71,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
            vfloat32m2_t c5 = __riscv_vle32_v_f32m2( &C[ci], gvl); ci += ldc-gvl*0;
            vfloat32m2_t c6 = __riscv_vle32_v_f32m2( &C[ci], gvl); ci += ldc-gvl*0;
            vfloat32m2_t c7 = __riscv_vle32_v_f32m2( &C[ci], gvl);
            	// 将C矩阵元素转换为32位单精度浮点数，并与计算结果相加
            
            c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl);
            c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl);
            c2 = __riscv_vfmacc_vf_f32m2(c2, alpha, result2, gvl);
@@ -98,9 +97,9 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,


        // -- tails for main pass
 	    // 处理M维度的剩余部分（如果M不是16的倍数）
        
        if( M & 8 ) {
            gvl = __riscv_vsetvl_e16m1(8);
            gvl = __riscv_vsetvl_e16mf2(8);

            BLASLONG ai=m_top*K;
            BLASLONG bi=n_top*K;
@@ -185,7 +184,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,


        if( M & 4 ) {
            gvl = __riscv_vsetvl_e16m1(4);
            gvl = __riscv_vsetvl_e16mf2(4);

            BLASLONG ai=m_top*K;
            BLASLONG bi=n_top*K;
@@ -276,10 +275,6 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
        }

        if( M & 2 ) {

            BLASLONG ai = m_top * K;
            BLASLONG bi = n_top * K;

            float result0 = 0;
            float result1 = 0;
            float result2 = 0;
@@ -296,7 +291,9 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
            float result13 = 0;
            float result14 = 0;
            float result15 = 0;

            BLASLONG ai = m_top * K;
            BLASLONG bi = n_top * K;
            
            for(BLASLONG k=0; k<K; k++) {
                result0+=(float)(A[ai+0]*B[bi+0]);
                result1+=(float)(A[ai+1]*B[bi+0]);
@@ -387,24 +384,24 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
        m_top = 0;

        for (BLASLONG i=0; i<M/16; i+=1) {
            BLASLONG ai=m_top*K;	// A矩阵的当前行索引
            BLASLONG bi=n_top*K;	// B矩阵的当前列索引
            // 加载B矩阵的4个元素
            BLASLONG ai=m_top*K;	
            BLASLONG bi=n_top*K;	
            
            _Float16 B0 = B[bi+0];
            _Float16 B1 = B[bi+1];
            _Float16 B2 = B[bi+2];
            _Float16 B3 = B[bi+3];
            bi += 4;

 		    // 加载A矩阵的16个元素，并与B矩阵元素相乘
            
            vfloat16m1_t A0 = __riscv_vle16_v_f16m1( &A[ai+0*gvl], gvl );
            ai += 16;
 		    // 执行乘法运算，并转换为32位浮点数进行累加
            
            vfloat32m2_t result0 = __riscv_vfwmul_vf_f32m2( A0, B0, gvl);
            vfloat32m2_t result1 = __riscv_vfwmul_vf_f32m2( A0, B1, gvl);
            vfloat32m2_t result2 = __riscv_vfwmul_vf_f32m2( A0, B2, gvl);
            vfloat32m2_t result3 = __riscv_vfwmul_vf_f32m2( A0, B3, gvl);
 		    // 循环处理K维度的剩余部分
            
            for(BLASLONG k=1; k<K; k++) {
                B0 = B[bi+0];
                B1 = B[bi+1];
@@ -414,20 +411,20 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,

                A0 = __riscv_vle16_v_f16m1( &A[ai+0*gvl], gvl );
                ai += 16;
                // 执行乘法和累加运算
                
                result0 = __riscv_vfwmacc_vf_f32m2(result0, B0, A0, gvl);
                result1 = __riscv_vfwmacc_vf_f32m2(result1, B1, A0, gvl);
                result2 = __riscv_vfwmacc_vf_f32m2(result2, B2, A0, gvl);
                result3 = __riscv_vfwmacc_vf_f32m2(result3, B3, A0, gvl);
            }
 	        // 加载C矩阵的元素，并与计算结果相加
            
            BLASLONG ci=n_top*ldc+m_top;

            vfloat32m2_t c0 = __riscv_vle32_v_f32m2( &C[ci], gvl); ci += ldc-gvl*0;
            vfloat32m2_t c1 = __riscv_vle32_v_f32m2( &C[ci], gvl); ci += ldc-gvl*0;
            vfloat32m2_t c2 = __riscv_vle32_v_f32m2( &C[ci], gvl); ci += ldc-gvl*0;
            vfloat32m2_t c3 = __riscv_vle32_v_f32m2( &C[ci], gvl);
            	// 将C矩阵元素转换为32位单精度浮点数，并与计算结果相加
            
            c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl);
            c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl);
            c2 = __riscv_vfmacc_vf_f32m2(c2, alpha, result2, gvl);
@@ -443,7 +440,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
        }

        if( M & 8 ) {
            gvl = __riscv_vsetvl_e16m1(8);
            gvl = __riscv_vsetvl_e16mf2(8);
            BLASLONG ai=m_top*K;	
            BLASLONG bi=n_top*K;	
            
@@ -500,7 +497,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
        }

        if( M & 4 ) {
            gvl = __riscv_vsetvl_e16m1(4);
            gvl = __riscv_vsetvl_e16mf2(4);

            BLASLONG ai=m_top*K;
            BLASLONG bi=n_top*K;
@@ -560,10 +557,6 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,


        if( M & 2 ) {

            BLASLONG ai = m_top * K;
            BLASLONG bi = n_top * K;

            float result0 = 0;
            float result1 = 0;
            float result2 = 0;
@@ -572,6 +565,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
            float result5 = 0;
            float result6 = 0;
            float result7 = 0;
            BLASLONG ai = m_top * K;
            BLASLONG bi = n_top * K;

            for(BLASLONG k=0; k<K; k++) {
                result0+=(float)(A[ai+0]*B[bi+0]);
@@ -639,20 +634,20 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
        m_top = 0;

        for (BLASLONG i=0; i<M/16; i+=1) {
            BLASLONG ai=m_top*K;	// A矩阵的当前行索引
            BLASLONG bi=n_top*K;	// B矩阵的当前列索引
            // 加载B矩阵的4个元素
            BLASLONG ai=m_top*K;	
            BLASLONG bi=n_top*K;	
            
            _Float16 B0 = B[bi+0];
            _Float16 B1 = B[bi+1];
            bi += 2;

 		    // 加载A矩阵的16个元素，并与B矩阵元素相乘
            
            vfloat16m1_t A0 = __riscv_vle16_v_f16m1( &A[ai+0*gvl], gvl );
            ai += 16;
 		    // 执行乘法运算，并转换为32位浮点数进行累加
            
            vfloat32m2_t result0 = __riscv_vfwmul_vf_f32m2( A0, B0, gvl);
            vfloat32m2_t result1 = __riscv_vfwmul_vf_f32m2( A0, B1, gvl);
 		    // 循环处理K维度的剩余部分
            
            for(BLASLONG k=1; k<K; k++) {
                B0 = B[bi+0];
                B1 = B[bi+1];
@@ -660,16 +655,15 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,

                A0 = __riscv_vle16_v_f16m1( &A[ai+0*gvl], gvl );
                ai += 16;
                // 执行乘法和累加运算

                result0 = __riscv_vfwmacc_vf_f32m2(result0, B0, A0, gvl);
                result1 = __riscv_vfwmacc_vf_f32m2(result1, B1, A0, gvl);
            }
 	        // 加载C矩阵的元素，并与计算结果相加

            BLASLONG ci=n_top*ldc+m_top;

            vfloat32m2_t c0 = __riscv_vle32_v_f32m2( &C[ci], gvl); ci += ldc-gvl*0;
            vfloat32m2_t c1 = __riscv_vle32_v_f32m2( &C[ci], gvl);
            	// 将C矩阵元素转换为32位单精度浮点数，并与计算结果相加
            c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl);
            c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl);

@@ -681,7 +675,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
        }

        if( M & 8 ) {
            gvl = __riscv_vsetvl_e16m1(8);
            gvl = __riscv_vsetvl_e16mf2(8);
            BLASLONG ai=m_top*K;	
            BLASLONG bi=n_top*K;	
            
@@ -724,7 +718,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
        }

        if( M & 4 ) {
            gvl = __riscv_vsetvl_e16m1(4);
            gvl = __riscv_vsetvl_e16mf2(4);

            BLASLONG ai=m_top*K;
            BLASLONG bi=n_top*K;
@@ -826,32 +820,30 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
        m_top = 0;

        for (BLASLONG i=0; i<M/16; i+=1) {
            BLASLONG ai=m_top*K;	// A矩阵的当前行索引
            BLASLONG bi=n_top*K;	// B矩阵的当前列索引
            // 加载B矩阵的4个元素
            BLASLONG ai=m_top*K;	
            BLASLONG bi=n_top*K;	
            _Float16 B0 = B[bi+0];
            bi += 1;

 		    // 加载A矩阵的16个元素，并与B矩阵元素相乘
            vfloat16m1_t A0 = __riscv_vle16_v_f16m1( &A[ai+0*gvl], gvl );
            ai += 16;
 		    // 执行乘法运算，并转换为32位浮点数进行累加

            vfloat32m2_t result0 = __riscv_vfwmul_vf_f32m2( A0, B0, gvl);
 		    // 循环处理K维度的剩余部分

            for(BLASLONG k=1; k<K; k++) {
                B0 = B[bi+0];
                bi += 1;

                A0 = __riscv_vle16_v_f16m1( &A[ai+0*gvl], gvl );
                ai += 16;
                // 执行乘法和累加运算
                
                result0 = __riscv_vfwmacc_vf_f32m2(result0, B0, A0, gvl);
            }
 	        // 加载C矩阵的元素，并与计算结果相加
            
            BLASLONG ci=n_top*ldc+m_top;

            vfloat32m2_t c0 = __riscv_vle32_v_f32m2( &C[ci], gvl);
            	// 将C矩阵元素转换为32位单精度浮点数，并与计算结果相加
            
            c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl);

            ci=n_top*ldc+m_top;
@@ -861,7 +853,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
        }

        if( M & 8 ) {
            gvl = __riscv_vsetvl_e16m1(8);
            gvl = __riscv_vsetvl_e16mf2(8);
            BLASLONG ai=m_top*K;	
            BLASLONG bi=n_top*K;	
            
@@ -897,7 +889,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
        }

        if( M & 4 ) {
            gvl = __riscv_vsetvl_e16m1(4);
            gvl = __riscv_vsetvl_e16mf2(4);

            BLASLONG ai=m_top*K;
            BLASLONG bi=n_top*K;
@@ -932,12 +924,10 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,


        if( M & 2 ) {

            BLASLONG ai = m_top * K;
            BLASLONG bi = n_top * K;

            float result0 = 0;
            float result1 = 0;
            BLASLONG ai = m_top * K;
            BLASLONG bi = n_top * K;

            for(BLASLONG k=0; k<K; k++) {
                result0+=(float)(A[ai+0]*B[bi+0]);