From 89637f87c83156b5ae36992baf40fd50e12af050 Mon Sep 17 00:00:00 2001
From: wernsaar
Date: Mon, 12 Aug 2013 18:04:10 +0200
Subject: [PATCH 001/127] added sgemm and dgemm kernels for HASWELL processor

---
 kernel/x86_64/dgemm_kernel_16x2_haswell.S | 5386 +++++++++++++++++++++
 kernel/x86_64/sgemm_kernel_16x4_haswell.S | 3150 ++++++++++++
 2 files changed, 8536 insertions(+)
 create mode 100644 kernel/x86_64/dgemm_kernel_16x2_haswell.S
 create mode 100644 kernel/x86_64/sgemm_kernel_16x4_haswell.S

diff --git a/kernel/x86_64/dgemm_kernel_16x2_haswell.S b/kernel/x86_64/dgemm_kernel_16x2_haswell.S
new file mode 100644
index 000000000..67a7ed3f0
--- /dev/null
+++ b/kernel/x86_64/dgemm_kernel_16x2_haswell.S
@@ -0,0 +1,5386 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 512*8*4 +#define LB2_OFFSET 512*8*2 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) +#define BUFFER1 128(%rsp) +#define BUFFER2 LB2_OFFSET+128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + +#if defined(BULLDOZER) + +.macro VFMADD231PD_ y0,y1,y2 + vfmaddpd \y0,\y1,\y2,\y0 +.endm + +.macro VFMADD231SD_ x0,x1,x2 + vfmaddsd \x0,\x1,\x2,\x0 +.endm + +#else + +.macro VFMADD231PD_ y0,y1,y2 + vfmadd231pd \y0,\y1,\y2 +.endm + +.macro VFMADD231SD_ x0,x1,x2 + vfmadd231sd \x0,\x1,\x2 +.endm + +#endif + + +#define A_PR1 384 +#define B_PR1 192 + +/******************************************************************************************* +* 3 lines of N +*******************************************************************************************/ + +.macro KERNEL16x3_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 64+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + VFMADD231PD_ %ymm12,%ymm3,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm1 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm15,%ymm3,%ymm0 +.endm + + + + +.macro KERNEL16x3_2 + prefetcht0 128+A_PR1(AO, %rax, SIZE) + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + prefetcht0 A_PR1+64(AO,%rax,SIZE) + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ 
%ymm10,%ymm1,%ymm0 + prefetcht0 192+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + VFMADD231PD_ %ymm12,%ymm3,%ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm15,%ymm3,%ymm0 +.endm + +.macro KERNEL16x3_3 + prefetcht0 256+A_PR1(AO, %rax, SIZE) + vmovups 0 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups 4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 320+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + vmovups 8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + VFMADD231PD_ %ymm12,%ymm3,%ymm0 + vmovups 12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 + vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm1 + vbroadcastsd 4 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm15,%ymm3,%ymm0 +.endm + +.macro KERNEL16x3_4 + prefetcht0 384+A_PR1(AO, %rax, SIZE) + vmovups 16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd 5 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups 20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 448+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + vmovups 24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + addq $12, BI + VFMADD231PD_ %ymm12,%ymm3,%ymm0 + vmovups 28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 + addq $64, %rax + VFMADD231PD_ %ymm15,%ymm3,%ymm0 +.endm + +.macro KERNEL16x3_SUB + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + VFMADD231PD_ %ymm12,%ymm3,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 + VFMADD231PD_ %ymm15,%ymm3,%ymm0 + addq $3 , BI + addq $16, %rax +.endm + +.macro SAVE16x3 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + vmulpd %ymm0 , %ymm10, %ymm10 + vmulpd %ymm0 , %ymm13, %ymm13 + + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm8 , %ymm8 + vmulpd %ymm0 , %ymm11, %ymm11 + vmulpd %ymm0 , %ymm14, %ymm14 + + vmulpd %ymm0 , %ymm6 , %ymm6 + vmulpd %ymm0 , %ymm9 , %ymm9 + vmulpd %ymm0 , %ymm12, %ymm12 + vmulpd %ymm0 , %ymm15, %ymm15 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 + vaddpd 8 * SIZE(CO1), %ymm10,%ymm10 + vaddpd 12 * SIZE(CO1), %ymm13,%ymm13 + + vaddpd (CO1, LDC), %ymm5,%ymm5 + vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8 + vaddpd 8 * SIZE(CO1, LDC), %ymm11,%ymm11 + vaddpd 12 * SIZE(CO1, LDC), %ymm14,%ymm14 + + vaddpd (CO1, LDC, 2), %ymm6,%ymm6 + vaddpd 4 * SIZE(CO1, LDC, 
2), %ymm9,%ymm9 + vaddpd 8 * SIZE(CO1, LDC, 2), %ymm12,%ymm12 + vaddpd 12 * SIZE(CO1, LDC, 2), %ymm15,%ymm15 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm7 , 4 * SIZE(CO1) + vmovups %ymm10, 8 * SIZE(CO1) + vmovups %ymm13,12 * SIZE(CO1) + + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm8 , 4 * SIZE(CO1, LDC) + vmovups %ymm11, 8 * SIZE(CO1, LDC) + vmovups %ymm14,12 * SIZE(CO1, LDC) + + vmovups %ymm6 , (CO1, LDC, 2) + vmovups %ymm9 , 4 * SIZE(CO1, LDC, 2) + vmovups %ymm12, 8 * SIZE(CO1, LDC, 2) + vmovups %ymm15,12 * SIZE(CO1, LDC, 2) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL8x3_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 +.endm + +.macro KERNEL8x3_2 + prefetcht0 64+A_PR1(AO, %rax, SIZE) + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 +.endm + +.macro KERNEL8x3_3 + prefetcht0 128+A_PR1(AO, %rax, SIZE) + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 +.endm + +.macro KERNEL8x3_4 + prefetcht0 192+A_PR1(AO, %rax, SIZE) + vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 4 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd 5 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + addq $12, BI + addq $32, %rax +.endm + +.macro KERNEL8x3_SUB + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + addq $3 , BI + addq $8 , %rax +.endm + +.macro SAVE8x3 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm8 , %ymm8 + + vmulpd %ymm0 , %ymm6 , %ymm6 + vmulpd %ymm0 , %ymm9 , %ymm9 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 + + vaddpd (CO1, LDC), %ymm5,%ymm5 + vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8 + + vaddpd (CO1, LDC, 
2), %ymm6,%ymm6 + vaddpd 4 * SIZE(CO1, LDC, 2), %ymm9,%ymm9 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm7 , 4 * SIZE(CO1) + + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm8 , 4 * SIZE(CO1, LDC) + + vmovups %ymm6 , (CO1, LDC, 2) + vmovups %ymm9 , 4 * SIZE(CO1, LDC, 2) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x3_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 +.endm + +.macro KERNEL4x3_2 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 +.endm + +.macro KERNEL4x3_3 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 +.endm + +.macro KERNEL4x3_4 + vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 4 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd 5 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + addq $12, BI + addq $16, %rax +.endm + +.macro KERNEL4x3_SUB + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + addq $3 , BI + addq $4 , %rax +.endm + +.macro SAVE4x3 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm6 , %ymm6 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd (CO1, LDC), %ymm5,%ymm5 + vaddpd (CO1, LDC, 2), %ymm6,%ymm6 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm6 , (CO1, LDC, 2) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x3_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + VFMADD231SD_ %xmm12,%xmm3,%xmm0 +.endm + +.macro KERNEL2x3_2 + vmovsd -3 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -1 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + VFMADD231SD_ %xmm12,%xmm3,%xmm0 +.endm + +.macro KERNEL2x3_3 + vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -28 * SIZE(AO, %rax, SIZE), %xmm0 + 
VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd 2 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + vmovsd -27 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + VFMADD231SD_ %xmm12,%xmm3,%xmm0 +.endm + +.macro KERNEL2x3_4 + vmovsd 3 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -26 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 4 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd 5 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + vmovsd -25 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + VFMADD231SD_ %xmm12,%xmm3,%xmm0 + addq $12, BI + addq $8, %rax +.endm + +.macro KERNEL2x3_SUB + vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + VFMADD231SD_ %xmm12,%xmm3,%xmm0 + addq $3 , BI + addq $2 , %rax +.endm + +.macro SAVE2x3 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm8 , %xmm8 + vmulsd %xmm0 , %xmm5 , %xmm5 + vmulsd %xmm0 , %xmm10, %xmm10 + vmulsd %xmm0 , %xmm6 , %xmm6 + vmulsd %xmm0 , %xmm12, %xmm12 + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4,%xmm4 + vaddsd 1 * SIZE(CO1), %xmm8,%xmm8 + vaddsd (CO1, LDC), %xmm5,%xmm5 + vaddsd 1 * SIZE(CO1, LDC), %xmm10,%xmm10 + vaddsd (CO1, LDC, 2), %xmm6,%xmm6 + vaddsd 1 * SIZE(CO1, LDC, 2), %xmm12,%xmm12 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm8 , 1 * SIZE(CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm10, 1 * SIZE(CO1, LDC) + vmovsd %xmm6 , (CO1, LDC, 2) + vmovsd %xmm12, 1 * SIZE(CO1, LDC, 2) + +.endm + +/*******************************************************************************************/ + +.macro KERNEL1x3_1 + vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 +.endm + +.macro KERNEL1x3_2 + vmovsd -3 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -1 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 +.endm + +.macro KERNEL1x3_3 + vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd 2 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 +.endm + +.macro KERNEL1x3_4 + vmovsd 3 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 4 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd 5 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + addq $12, BI + addq $4, %rax +.endm + +.macro KERNEL1x3_SUB + vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + addq $3 , BI + addq $1 , %rax 
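+	// the addq pair above advances BI by 3 (one packed B value per column of the
+	// 1x3 tile) and %rax by 1 (one A element); every KERNELmxn_SUB macro ends with
+	// the analogous per-iteration bookkeeping for its m x n tile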
+.endm + +.macro SAVE1x3 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm5 , %xmm5 + vmulsd %xmm0 , %xmm6 , %xmm6 + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4,%xmm4 + vaddsd (CO1, LDC), %xmm5,%xmm5 + vaddsd (CO1, LDC, 2), %xmm6,%xmm6 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm6 , (CO1, LDC, 2) + +.endm + + +/*******************************************************************************************/ + +/******************************************************************************************* +* 2 lines of N +*******************************************************************************************/ + +.macro KERNEL16x2_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 64+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 +.endm + +.macro KERNEL16x2_2 + prefetcht0 128+A_PR1(AO, %rax, SIZE) + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 192+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 +.endm + +.macro KERNEL16x2_3 + prefetcht0 256+A_PR1(AO, %rax, SIZE) + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups 0 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups 4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 320+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + vmovups 8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + vmovups 12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 +.endm + +.macro KERNEL16x2_4 + prefetcht0 384+A_PR1(AO, %rax, SIZE) + vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups 16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups 20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 448+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + vmovups 24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + vmovups 28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 + addq $8, BI + addq $64, %rax +.endm + +.macro KERNEL16x2_SUB + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ 
%ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 + addq $2, BI + addq $16, %rax +.endm + +.macro SAVE16x2 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + vmulpd %ymm0 , %ymm10, %ymm10 + vmulpd %ymm0 , %ymm13, %ymm13 + + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm8 , %ymm8 + vmulpd %ymm0 , %ymm11, %ymm11 + vmulpd %ymm0 , %ymm14, %ymm14 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 + vaddpd 8 * SIZE(CO1), %ymm10,%ymm10 + vaddpd 12 * SIZE(CO1), %ymm13,%ymm13 + + vaddpd (CO1, LDC), %ymm5,%ymm5 + vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8 + vaddpd 8 * SIZE(CO1, LDC), %ymm11,%ymm11 + vaddpd 12 * SIZE(CO1, LDC), %ymm14,%ymm14 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm7 , 4 * SIZE(CO1) + vmovups %ymm10, 8 * SIZE(CO1) + vmovups %ymm13,12 * SIZE(CO1) + + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm8 , 4 * SIZE(CO1, LDC) + vmovups %ymm11, 8 * SIZE(CO1, LDC) + vmovups %ymm14,12 * SIZE(CO1, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL8x2_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 +.endm + +.macro KERNEL8x2_2 + prefetcht0 64+A_PR1(AO, %rax, SIZE) + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 +.endm + +.macro KERNEL8x2_3 + prefetcht0 128+A_PR1(AO, %rax, SIZE) + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 +.endm + +.macro KERNEL8x2_4 + prefetcht0 192+A_PR1(AO, %rax, SIZE) + vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + addq $8, BI + addq $32, %rax +.endm + +.macro KERNEL8x2_SUB + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + addq $2, BI + addq $8 , %rax +.endm + +.macro SAVE8x2 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm8 , %ymm8 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 + + vaddpd (CO1, LDC), %ymm5,%ymm5 
+ vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm7 , 4 * SIZE(CO1) + + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm8 , 4 * SIZE(CO1, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x2_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 +.endm + +.macro KERNEL4x2_2 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 +.endm + +.macro KERNEL4x2_3 + prefetcht0 64+A_PR1(AO, %rax, SIZE) + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 +.endm + +.macro KERNEL4x2_4 + vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + addq $8, BI + addq $16, %rax +.endm + +.macro KERNEL4x2_SUB + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + addq $2, BI + addq $4 , %rax +.endm + +.macro SAVE4x2 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm5 , %ymm5 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd (CO1, LDC), %ymm5,%ymm5 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , (CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x2_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 +.endm + +.macro KERNEL2x2_2 + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -1 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 +.endm + +.macro KERNEL2x2_3 + vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -28 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -27 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 +.endm + +.macro KERNEL2x2_4 + vmovsd 2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -26 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 3 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -25 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + addq $8, BI + addq $8, %rax +.endm + +.macro KERNEL2x2_SUB + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -31 * 
SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + addq $2, BI + addq $2, %rax +.endm + +.macro SAVE2x2 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm8 , %xmm8 + vmulsd %xmm0 , %xmm5 , %xmm5 + vmulsd %xmm0 , %xmm10, %xmm10 + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4,%xmm4 + vaddsd 1 * SIZE(CO1), %xmm8,%xmm8 + vaddsd (CO1, LDC), %xmm5,%xmm5 + vaddsd 1 * SIZE(CO1, LDC), %xmm10,%xmm10 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm8 , 1 * SIZE(CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm10, 1 * SIZE(CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x2_1 + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 +.endm + +.macro KERNEL1x2_2 + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -1 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 +.endm + +.macro KERNEL1x2_3 + vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 +.endm + +.macro KERNEL1x2_4 + vmovsd 2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 3 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + addq $8, BI + addq $4, %rax +.endm + +.macro KERNEL1x2_SUB + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + addq $2, BI + addq $1, %rax +.endm + +.macro SAVE1x2 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm5 , %xmm5 + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4,%xmm4 + vaddsd (CO1, LDC), %xmm5,%xmm5 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +/******************************************************************************************* +* 1 line of N +*******************************************************************************************/ + +.macro KERNEL16x1_1 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 +.endm + +.macro KERNEL16x1_2 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 +.endm + +.macro KERNEL16x1_3 + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups 0 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups 4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + vmovups 8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + vmovups 12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ 
%ymm13,%ymm1,%ymm0 +.endm + +.macro KERNEL16x1_4 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm1 + vmovups 16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups 20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + vmovups 24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + vmovups 28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + addq $4, BI + addq $64, %rax +.endm + +.macro KERNEL16x1_SUB + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + addq $1, BI + addq $16, %rax +.endm + +.macro SAVE16x1 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + vmulpd %ymm0 , %ymm10, %ymm10 + vmulpd %ymm0 , %ymm13, %ymm13 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 + vaddpd 8 * SIZE(CO1), %ymm10,%ymm10 + vaddpd 12 * SIZE(CO1), %ymm13,%ymm13 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm7 , 4 * SIZE(CO1) + vmovups %ymm10, 8 * SIZE(CO1) + vmovups %ymm13,12 * SIZE(CO1) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL8x1_1 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 +.endm + +.macro KERNEL8x1_2 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 +.endm + +.macro KERNEL8x1_3 + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 +.endm + +.macro KERNEL8x1_4 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + addq $4, BI + addq $32, %rax +.endm + +.macro KERNEL8x1_SUB + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + addq $1, BI + addq $8 , %rax +.endm + +.macro SAVE8x1 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm7 , 4 * SIZE(CO1) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x1_1 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 +.endm + +.macro KERNEL4x1_2 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 +.endm + +.macro KERNEL4x1_3 + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 +.endm + +.macro KERNEL4x1_4 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), 
%ymm1 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + addq $4, BI + addq $16, %rax +.endm + +.macro KERNEL4x1_SUB + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + addq $1, BI + addq $4 , %rax +.endm + +.macro SAVE4x1 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + +#endif + + vmovups %ymm4 , (CO1) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x1_1 + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 +.endm + +.macro KERNEL2x1_2 + vmovsd -1 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 +.endm + +.macro KERNEL2x1_3 + vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -28 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -27 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 +.endm + +.macro KERNEL2x1_4 + vmovsd 1 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -26 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -25 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + addq $4, BI + addq $8, %rax +.endm + +.macro KERNEL2x1_SUB + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + addq $1, BI + addq $2 , %rax +.endm + +.macro SAVE2x1 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm8 , %xmm8 + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4,%xmm4 + vaddsd 1 * SIZE(CO1), %xmm8,%xmm8 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm8 , 1 * SIZE(CO1) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x1_1 + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 +.endm + +.macro KERNEL1x1_2 + vmovsd -1 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 +.endm + +.macro KERNEL1x1_3 + vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 +.endm + +.macro KERNEL1x1_4 + vmovsd 1 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + addq $4, BI + addq $4, %rax +.endm + +.macro KERNEL1x1_SUB + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + addq $1, BI + addq $1 , %rax +.endm + +.macro SAVE1x1 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4,%xmm4 + +#endif + + vmovsd %xmm4 , (CO1) + +.endm + + +/*******************************************************************************************/ + +#if !defined(TRMMKERNEL) + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 
96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $6, %rdi + divq %rdi // N / 6 + movq %rax, Ndiv6 // N / 6 + movq %rdx, Nmod6 // N % 6 + + + movq Ndiv6, J + cmpq $0, J + je .L2_0 + ALIGN_4 + +.L6_01: + // copy to sub buffer + movq K, %rax + salq $1,%rax // K * 2 ; read 2 values + movq B, BO1 + leaq (B,%rax, SIZE), BO2 // next offset to BO2 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $3 , %rax // K / 8 + jz .L6_01a_2 + ALIGN_4 + +.L6_01a_1: + + prefetcht0 512(BO1) + prefetcht0 512(BO2) + prefetchw 512(BO) + + + vmovups 0 * SIZE(BO1), %xmm0 + vmovups 2 * SIZE(BO1), %xmm2 + vmovups 4 * SIZE(BO1), %xmm4 + vmovups 6 * SIZE(BO1), %xmm6 + vmovsd 0 * SIZE(BO2), %xmm1 + vmovsd 2 * SIZE(BO2), %xmm3 + vmovsd 4 * SIZE(BO2), %xmm5 + vmovsd 6 * SIZE(BO2), %xmm7 + vmovups %xmm0, 0*SIZE(BO) + vmovsd %xmm1, 2*SIZE(BO) + vmovups %xmm2, 3*SIZE(BO) + vmovsd %xmm3, 5*SIZE(BO) + vmovups %xmm4, 6*SIZE(BO) + vmovsd %xmm5, 8*SIZE(BO) + vmovups %xmm6, 9*SIZE(BO) + vmovsd %xmm7,11*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + vmovups 0 * SIZE(BO1), %xmm0 + vmovups 2 * SIZE(BO1), %xmm2 + vmovups 4 * SIZE(BO1), %xmm4 + vmovups 6 * SIZE(BO1), %xmm6 + vmovsd 0 * SIZE(BO2), %xmm1 + vmovsd 2 * SIZE(BO2), %xmm3 + vmovsd 4 * SIZE(BO2), %xmm5 + vmovsd 6 * SIZE(BO2), %xmm7 + vmovups %xmm0, 0*SIZE(BO) + vmovsd %xmm1, 2*SIZE(BO) + vmovups %xmm2, 3*SIZE(BO) + vmovsd %xmm3, 5*SIZE(BO) + vmovups %xmm4, 6*SIZE(BO) + vmovsd %xmm5, 8*SIZE(BO) + vmovups %xmm6, 9*SIZE(BO) + vmovsd %xmm7,11*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + decq %rax + jnz .L6_01a_1 + + + +.L6_01a_2: + + movq K, %rax + andq $7, %rax // K % 8 + jz .L6_02c + ALIGN_4 + + +.L6_02b: + + vmovups 0 * SIZE(BO1), %xmm0 + vmovsd 0 * SIZE(BO2), %xmm2 + vmovups %xmm0, 0*SIZE(BO) + vmovsd %xmm2, 2*SIZE(BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO2 + addq $3*SIZE,BO + decq %rax + jnz .L6_02b + +.L6_02c: + + movq K, %rax + salq $1,%rax // K * 2 + leaq (B,%rax, SIZE), BO1 // next offset to BO1 + leaq (BO1,%rax, SIZE), BO2 // next offset to BO2 + leaq BUFFER2, BO // second buffer to BO + movq K, %rax + sarq $3 , %rax // K / 8 + jz .L6_02c_2 + ALIGN_4 + +.L6_02c_1: + + prefetcht0 512(BO2) + prefetchw 512(BO) + + vmovups 0 * SIZE(BO2), %xmm0 + vmovups 2 * SIZE(BO2), %xmm2 + vmovups 4 * SIZE(BO2), %xmm4 + vmovups 6 * SIZE(BO2), %xmm6 + vmovsd 1 * SIZE(BO1), %xmm1 + vmovsd 3 * SIZE(BO1), %xmm3 + vmovsd 5 * SIZE(BO1), %xmm5 + vmovsd 7 * SIZE(BO1), %xmm7 + vmovsd %xmm1, 0*SIZE(BO) + vmovups %xmm0, 1*SIZE(BO) + vmovsd %xmm3, 3*SIZE(BO) + vmovups %xmm2, 4*SIZE(BO) + vmovsd %xmm5, 6*SIZE(BO) + vmovups %xmm4, 7*SIZE(BO) + vmovsd %xmm7, 9*SIZE(BO) + vmovups %xmm6,10*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + + vmovups 0 * SIZE(BO2), %xmm0 + vmovups 2 * SIZE(BO2), %xmm2 + vmovups 4 * SIZE(BO2), %xmm4 + vmovups 
6 * SIZE(BO2), %xmm6 + vmovsd 1 * SIZE(BO1), %xmm1 + vmovsd 3 * SIZE(BO1), %xmm3 + vmovsd 5 * SIZE(BO1), %xmm5 + vmovsd 7 * SIZE(BO1), %xmm7 + vmovsd %xmm1, 0*SIZE(BO) + vmovups %xmm0, 1*SIZE(BO) + vmovsd %xmm3, 3*SIZE(BO) + vmovups %xmm2, 4*SIZE(BO) + vmovsd %xmm5, 6*SIZE(BO) + vmovups %xmm4, 7*SIZE(BO) + vmovsd %xmm7, 9*SIZE(BO) + vmovups %xmm6,10*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + decq %rax + jnz .L6_02c_1 + + +.L6_02c_2: + + movq K, %rax + andq $7, %rax // K % 8 + jz .L6_03c + ALIGN_4 + +.L6_03b: + + vmovsd 1*SIZE(BO1), %xmm0 + vmovups 0*SIZE(BO2), %xmm1 + vmovsd %xmm0, 0*SIZE(BO) + vmovups %xmm1, 1*SIZE(BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO2 + addq $3*SIZE,BO + decq %rax + jnz .L6_03b + + +.L6_03c: + + movq BO2, B // next offset of B + +.L6_10: + movq C, CO1 + leaq (C, LDC, 2), C + leaq (C, LDC, 1), C // c += 3 * ldc + + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L6_20 + + ALIGN_4 + +.L6_11: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L6_16 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x3_1 + KERNEL16x3_2 + KERNEL16x3_3 + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL16x3_4 + + KERNEL16x3_1 + KERNEL16x3_2 + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL16x3_3 + KERNEL16x3_4 + + je .L6_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x3_1 + KERNEL16x3_2 + KERNEL16x3_3 + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL16x3_4 + + KERNEL16x3_1 + KERNEL16x3_2 + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL16x3_3 + KERNEL16x3_4 + + je .L6_16 + + jmp .L6_12 + ALIGN_4 + +.L6_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_17: + + KERNEL16x3_SUB + + jl .L6_17 + ALIGN_4 + + +.L6_19: + + SAVE16x3 + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L6_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L6_20: + // Test rest of M + + testq $15, M + jz .L7_10 // to next 3 lines of N + + testq $8, M + jz .L6_21pre + ALIGN_4 + +/**************************************************************************/ + +.L6_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_20_2: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x3_1 + KERNEL8x3_2 + KERNEL8x3_3 + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL8x3_4 + + KERNEL8x3_1 + KERNEL8x3_2 + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL8x3_3 + KERNEL8x3_4 + + je .L6_20_6 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x3_1 + KERNEL8x3_2 + KERNEL8x3_3 + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL8x3_4 + + KERNEL8x3_1 + KERNEL8x3_2 + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL8x3_3 + KERNEL8x3_4 + + je .L6_20_6 + + jmp .L6_20_2 + ALIGN_4 + 
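+// the unrolled loop above works through K in blocks of 8 iterations (the count
+// was rounded down earlier with andq $-8); .L6_20_6 below picks up the K % 8
+// remainder one KERNEL8x3_SUB step at a time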
+.L6_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_20_7: + + KERNEL8x3_SUB + + jl .L6_20_7 + ALIGN_4 + + +.L6_20_9: + + SAVE8x3 + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L6_21pre: + + testq $4, M + jz .L6_30 + ALIGN_4 + +.L6_21: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_26 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x3_1 + KERNEL4x3_2 + KERNEL4x3_3 + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL4x3_4 + + KERNEL4x3_1 + KERNEL4x3_2 + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL4x3_3 + KERNEL4x3_4 + + je .L6_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x3_1 + KERNEL4x3_2 + KERNEL4x3_3 + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL4x3_4 + + KERNEL4x3_1 + KERNEL4x3_2 + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL4x3_3 + KERNEL4x3_4 + + je .L6_26 + + jmp .L6_22 + ALIGN_4 + +.L6_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_27: + + KERNEL4x3_SUB + + jl .L6_27 + ALIGN_4 + + +.L6_29: + + SAVE4x3 + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L6_30: + testq $2, M + jz .L6_40 + + ALIGN_4 + +.L6_31: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_36 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_32: + + KERNEL2x3_1 + KERNEL2x3_2 + KERNEL2x3_3 + KERNEL2x3_4 + + KERNEL2x3_1 + KERNEL2x3_2 + KERNEL2x3_3 + KERNEL2x3_4 + + je .L6_36 + + KERNEL2x3_1 + KERNEL2x3_2 + KERNEL2x3_3 + KERNEL2x3_4 + + KERNEL2x3_1 + KERNEL2x3_2 + KERNEL2x3_3 + KERNEL2x3_4 + + je .L6_36 + + jmp .L6_32 + ALIGN_4 + +.L6_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_37: + + KERNEL2x3_SUB + + jl .L6_37 + ALIGN_4 + + +.L6_39: + + SAVE2x3 + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L6_40: + testq $1, M + jz .L7_10 // to next 3 lines of N + + ALIGN_4 + +.L6_41: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_46 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_42: + + KERNEL1x3_1 + KERNEL1x3_2 + KERNEL1x3_3 + KERNEL1x3_4 + + KERNEL1x3_1 + KERNEL1x3_2 + KERNEL1x3_3 + KERNEL1x3_4 + + je .L6_46 + + KERNEL1x3_1 + KERNEL1x3_2 + KERNEL1x3_3 + KERNEL1x3_4 + + 
KERNEL1x3_1 + KERNEL1x3_2 + KERNEL1x3_3 + KERNEL1x3_4 + + je .L6_46 + + jmp .L6_42 + ALIGN_4 + +.L6_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_47: + + KERNEL1x3_SUB + + jl .L6_47 + ALIGN_4 + + +.L6_49: + + SAVE1x3 + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + +/***************************************************************************************************************/ + +.L7_10: + movq C, CO1 + leaq (C, LDC, 2), C + leaq (C, LDC, 1), C // c += 3 * ldc + + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L7_20 + + ALIGN_4 + +.L7_11: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L7_16 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x3_1 + KERNEL16x3_2 + KERNEL16x3_3 + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL16x3_4 + + KERNEL16x3_1 + KERNEL16x3_2 + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL16x3_3 + KERNEL16x3_4 + + je .L7_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x3_1 + KERNEL16x3_2 + KERNEL16x3_3 + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL16x3_4 + + KERNEL16x3_1 + KERNEL16x3_2 + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL16x3_3 + KERNEL16x3_4 + + je .L7_16 + + jmp .L7_12 + ALIGN_4 + +.L7_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_17: + + KERNEL16x3_SUB + + jl .L7_17 + ALIGN_4 + + +.L7_19: + + SAVE16x3 + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L7_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L7_20: + // Test rest of M + + testq $15, M + jz .L7_60 // to next 3 lines of N + + testq $8, M + jz .L7_21pre + ALIGN_4 + +/**************************************************************************/ + +.L7_20_1: + leaq BUFFER2, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_20_2: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x3_1 + KERNEL8x3_2 + KERNEL8x3_3 + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL8x3_4 + + KERNEL8x3_1 + KERNEL8x3_2 + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL8x3_3 + KERNEL8x3_4 + + je .L7_20_6 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x3_1 + KERNEL8x3_2 + KERNEL8x3_3 + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL8x3_4 + + KERNEL8x3_1 + KERNEL8x3_2 + prefetcht0 B_PR1+128(BO,BI,8) + KERNEL8x3_3 + KERNEL8x3_4 + + je .L7_20_6 + + jmp .L7_20_2 + ALIGN_4 + +.L7_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; 
number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_20_7: + + KERNEL8x3_SUB + + jl .L7_20_7 + ALIGN_4 + +.L7_20_9: + + SAVE8x3 + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L7_21pre: + + testq $4, M + jz .L7_30 + ALIGN_4 + +.L7_21: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_26 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x3_1 + KERNEL4x3_2 + KERNEL4x3_3 + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL4x3_4 + + KERNEL4x3_1 + KERNEL4x3_2 + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL4x3_3 + KERNEL4x3_4 + + je .L7_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x3_1 + KERNEL4x3_2 + KERNEL4x3_3 + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL4x3_4 + + KERNEL4x3_1 + KERNEL4x3_2 + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL4x3_3 + KERNEL4x3_4 + + je .L7_26 + + jmp .L7_22 + ALIGN_4 + +.L7_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_27: + + KERNEL4x3_SUB + + jl .L7_27 + ALIGN_4 + + +.L7_29: + + SAVE4x3 + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L7_30: + testq $2, M + jz .L7_40 + + ALIGN_4 + +.L7_31: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_36 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_32: + + KERNEL2x3_1 + KERNEL2x3_2 + KERNEL2x3_3 + KERNEL2x3_4 + + KERNEL2x3_1 + KERNEL2x3_2 + KERNEL2x3_3 + KERNEL2x3_4 + + je .L7_36 + + KERNEL2x3_1 + KERNEL2x3_2 + KERNEL2x3_3 + KERNEL2x3_4 + + KERNEL2x3_1 + KERNEL2x3_2 + KERNEL2x3_3 + KERNEL2x3_4 + + je .L7_36 + + jmp .L7_32 + ALIGN_4 + +.L7_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_37: + + KERNEL2x3_SUB + + jl .L7_37 + ALIGN_4 + + +.L7_39: + + SAVE2x3 + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L7_40: + testq $1, M + jz .L7_60 // to next 3 lines of N + + ALIGN_4 + +.L7_41: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_46 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_42: + + KERNEL1x3_1 + KERNEL1x3_2 + KERNEL1x3_3 + KERNEL1x3_4 + + KERNEL1x3_1 + KERNEL1x3_2 + KERNEL1x3_3 + KERNEL1x3_4 + + je .L7_46 + + KERNEL1x3_1 + KERNEL1x3_2 + KERNEL1x3_3 + KERNEL1x3_4 + + KERNEL1x3_1 + KERNEL1x3_2 + KERNEL1x3_3 + KERNEL1x3_4 + + je .L7_46 + + jmp .L7_42 + ALIGN_4 + +.L7_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_49 + + movq %rax, BI // Index for BO + leaq 
(BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_47: + + KERNEL1x3_SUB + + jl .L7_47 + ALIGN_4 + + +.L7_49: + + SAVE1x3 + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + +.L7_60: + + decq J // j -- + jg .L6_01 + + +.L2_0: + cmpq $0, Nmod6 // N % 6 == 0 + je .L999 + +/************************************************************************************************ +* Loop for Nmod6 / 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + sarq $1, J // j = j / 2 + je .L1_0 + ALIGN_4 + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L2_01b + ALIGN_4 + +.L2_01a: + prefetcht0 512(BO1) + prefetchw 512(BO) + + vmovups (BO1), %xmm0 + vmovups 2*SIZE(BO1), %xmm1 + vmovups 4*SIZE(BO1), %xmm2 + vmovups 6*SIZE(BO1), %xmm3 + + vmovups %xmm0, (BO) + vmovups %xmm1, 2*SIZE(BO) + vmovups %xmm2, 4*SIZE(BO) + vmovups %xmm3, 6*SIZE(BO) + + addq $8*SIZE,BO1 + addq $8*SIZE,BO + decq %rax + jnz .L2_01a + + +.L2_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L2_02d + ALIGN_4 + +.L2_02c: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02c + +.L2_02d: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + +.L2_11: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + je .L2_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL16x2_SUB + + jl .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE16x2 + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 3 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + 
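+	// AO and BO have just been advanced one full block past the data;
+	// the negq pair below turns %rax and BI into negative offsets, so
+	// the KERNEL macros count them back up toward zero and the loop can
+	// close on the sign flag (jl) without a separate counter register.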
negq BI + negq %rax + ALIGN_4 + +.L2_20_2: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + je .L2_20_6 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + + KERNEL8x2_SUB + + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + SAVE8x2 + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L2_21pre: + + testq $4, M + jz .L2_30 + ALIGN_4 + +.L2_21: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 1 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + je .L2_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB + + jl .L2_27 + ALIGN_4 + + +.L2_29: + + SAVE4x2 + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + je .L2_36 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + jl .L2_37 + ALIGN_4 + + +.L2_39: + + SAVE2x2 + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_46 + 
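+	// K is split into a multiple-of-8 main part (masked off by the
+	// andq $-8 above) that runs the unrolled KERNEL1x2_1..4 groups,
+	// and a 0..7 remainder that .L2_46/.L2_47 below retire one
+	// KERNEL1x2_SUB at a time.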
movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + je .L2_46 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + jl .L2_47 + ALIGN_4 + + +.L2_49: + + SAVE1x2 + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + +.L2_60: + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + je .L1_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB + + jl .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE16x1 + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + KERNEL8x1_1 + 
KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + je .L1_20_6 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB + + jl .L1_20_7 + ALIGN_4 + + +.L1_20_9: + + SAVE8x1 + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + je .L1_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB + + jl .L1_27 + ALIGN_4 + + +.L1_29: + + SAVE4x1 + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + je .L1_36 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + jl .L1_37 + ALIGN_4 + + +.L1_39: + + SAVE2x1 + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + je .L1_46 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, 
SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + jl .L1_47 + ALIGN_4 + + +.L1_49: + + SAVE1x1 + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + +#else +/************************************************************************************* +* TRMM Kernel +*************************************************************************************/ + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $2, %rdi + divq %rdi // N / 6 + movq %rax, Ndiv6 // N / 6 + movq %rdx, Nmod6 // N % 6 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + movq Ndiv6, J + cmpq $0, J + je .L1_0 + ALIGN_4 + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L2_01b + ALIGN_4 + +.L2_01a: + prefetcht0 512(BO1) + prefetchw 512(BO) + + vmovups (BO1), %xmm0 + vmovups 2*SIZE(BO1), %xmm1 + vmovups 4*SIZE(BO1), %xmm2 + vmovups 6*SIZE(BO1), %xmm3 + + vmovups %xmm0, (BO) + vmovups %xmm1, 2*SIZE(BO) + vmovups %xmm2, 4*SIZE(BO) + vmovups %xmm3, 6*SIZE(BO) + + addq $8*SIZE,BO1 + addq $8*SIZE,BO + decq %rax + jnz .L2_01a + + +.L2_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L2_02d + ALIGN_4 + +.L2_02c: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02c + +.L2_02d: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + +.L2_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // 
first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + je .L2_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL16x2_SUB + + jl .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE16x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 3 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + 
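+	/* Effective trip count for this TRMM tile, as a rough C sketch
+	 * (mr = 8 rows, nr = 2 columns in this path):
+	 *
+	 *     if (!TRMMKERNEL)          kkk = K;
+	 *     else if (LEFT != TRANSA)  kkk = K - kk;   // K not yet consumed
+	 *     else                      kkk = kk + (LEFT ? mr : nr);
+	 */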
+ + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_2: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + je .L2_20_6 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + + KERNEL8x2_SUB + + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + SAVE8x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L2_21pre: + + testq $4, M + jz .L2_30 + ALIGN_4 + +.L2_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 1 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + je .L2_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + 
leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB + + jl .L2_27 + ALIGN_4 + + +.L2_29: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + je .L2_36 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + jl .L2_37 + ALIGN_4 + + +.L2_39: + + SAVE2x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef 
TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + je .L2_46 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + jl .L2_47 + ALIGN_4 + + +.L2_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of 
values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + je .L1_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB + + jl .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE16x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + je .L1_20_6 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB + + jl .L1_20_7 + ALIGN_4 + + +.L1_20_9: + + SAVE8x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq 
KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + je .L1_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB + + jl .L1_27 + ALIGN_4 + + +.L1_29: + + SAVE4x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, 
%rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + je .L1_36 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + jl .L1_37 + ALIGN_4 + + +.L1_39: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + je .L1_46 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + jl .L1_47 + ALIGN_4 + + +.L1_49: + + SAVE1x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 
192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + + + + +#endif diff --git a/kernel/x86_64/sgemm_kernel_16x4_haswell.S b/kernel/x86_64/sgemm_kernel_16x4_haswell.S new file mode 100644 index 000000000..1c8a37710 --- /dev/null +++ b/kernel/x86_64/sgemm_kernel_16x4_haswell.S @@ -0,0 +1,3150 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/
+/*********************************************************************/
+
+
+#define ASSEMBLER
+#include "common.h"
+
+#define OLD_M %rdi
+#define OLD_N %rsi
+#define M %r13
+#define J %r14
+#define OLD_K %rdx
+
+#define A %rcx
+#define B %r8
+#define C %r9
+#define LDC %r10
+
+#define I %r11
+#define AO %rdi
+#define BO %rsi
+#define CO1 %r15
+#define K %r12
+#define BI %rbp
+#define SP %rbx
+
+#define BO1 %rdi
+#define BO2 %r15
+#define CO2 %rdx
+
+#ifndef WINDOWS_ABI
+
+#define STACKSIZE 96
+
+#else
+
+#define STACKSIZE 256
+
+#define OLD_A 40 + STACKSIZE(%rsp)
+#define OLD_B 48 + STACKSIZE(%rsp)
+#define OLD_C 56 + STACKSIZE(%rsp)
+#define OLD_LDC 64 + STACKSIZE(%rsp)
+#define OLD_OFFSET 72 + STACKSIZE(%rsp)
+
+#endif
+
+#define L_BUFFER_SIZE 512*8*4
+#define LB2_OFFSET 512*8*2
+
+#define Ndiv6 24(%rsp)
+#define Nmod6 32(%rsp)
+#define N 40(%rsp)
+#define ALPHA 48(%rsp)
+#define OFFSET 56(%rsp)
+#define KK 64(%rsp)
+#define KKK 72(%rsp)
+#define BUFFER1 128(%rsp)
+#define BUFFER2 LB2_OFFSET+128(%rsp)
+
+#if defined(OS_WINDOWS)
+#if L_BUFFER_SIZE > 16384
+#define STACK_TOUCH \
+ movl $0, 4096 * 4(%rsp);\
+ movl $0, 4096 * 3(%rsp);\
+ movl $0, 4096 * 2(%rsp);\
+ movl $0, 4096 * 1(%rsp);
+#elif L_BUFFER_SIZE > 12288
+#define STACK_TOUCH \
+ movl $0, 4096 * 3(%rsp);\
+ movl $0, 4096 * 2(%rsp);\
+ movl $0, 4096 * 1(%rsp);
+#elif L_BUFFER_SIZE > 8192
+#define STACK_TOUCH \
+ movl $0, 4096 * 2(%rsp);\
+ movl $0, 4096 * 1(%rsp);
+#elif L_BUFFER_SIZE > 4096
+#define STACK_TOUCH \
+ movl $0, 4096 * 1(%rsp);
+#else
+#define STACK_TOUCH
+#endif
+#else
+#define STACK_TOUCH
+#endif
+
+#if defined(BULLDOZER)
+
+.macro VFMADD231PS_ y0,y1,y2
+ vfmaddps \y0,\y1,\y2,\y0
+.endm
+
+.macro VFMADD231SS_ x0,x1,x2
+ vfmaddss \x0,\x1,\x2,\x0
+.endm
+
+#else
+
+.macro VFMADD231PS_ y0,y1,y2
+ vfmadd231ps \y0,\y1,\y2
+.endm
+
+.macro VFMADD231SS_ x0,x1,x2
+ vfmadd231ss \x0,\x1,\x2
+.endm
+
+#endif
+
+
+#define A_PR1 384
+#define B_PR1 192
+
+/*******************************************************************************************
+* 4 lines of N
+*******************************************************************************************/
+
+.macro KERNEL16x4_SUB
+ vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0
+ vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1
+ vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2
+ vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3
+ VFMADD231PS_ %ymm4,%ymm2,%ymm0
+ VFMADD231PS_ %ymm5,%ymm2,%ymm1
+ VFMADD231PS_ %ymm6,%ymm3,%ymm0
+ VFMADD231PS_ %ymm7,%ymm3,%ymm1
+ vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2
+ vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3
+ VFMADD231PS_ %ymm8,%ymm2,%ymm0
+ VFMADD231PS_ %ymm9,%ymm2,%ymm1
+ VFMADD231PS_ %ymm10,%ymm3,%ymm0
+ VFMADD231PS_ %ymm11,%ymm3,%ymm1
+ addq $4 , BI
+ addq $16, %rax
+.endm
+
+.macro SAVE16x4
+
+ vbroadcastss ALPHA, %ymm0
+
+ vmulps %ymm0 , %ymm4 , %ymm4
+ vmulps %ymm0 , %ymm5 , %ymm5
+ vmulps %ymm0 , %ymm6 , %ymm6
+ vmulps %ymm0 , %ymm7 , %ymm7
+ vmulps %ymm0 , %ymm8 , %ymm8
+ vmulps %ymm0 , %ymm9 , %ymm9
+ vmulps %ymm0 , %ymm10, %ymm10
+ vmulps %ymm0 , %ymm11, %ymm11
+
+
+#if !defined(TRMMKERNEL)
+
+ vaddps (CO1), %ymm4,%ymm4
+ vaddps 8 * SIZE(CO1), %ymm5,%ymm5
+
+ vaddps (CO1, LDC), %ymm6,%ymm6
+ vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7
+
+ vaddps (CO2), %ymm8,%ymm8
+ vaddps 8 * SIZE(CO2), %ymm9,%ymm9
+
+ vaddps (CO2, LDC), %ymm10,%ymm10
+ vaddps 8 * SIZE(CO2, LDC), %ymm11,%ymm11
+
+#endif
+
+ vmovups %ymm4 , (CO1)
+ vmovups %ymm5 , 8 * SIZE(CO1)
+
+ vmovups %ymm6 , (CO1, LDC)
+ vmovups %ymm7 , 8 * SIZE(CO1, LDC)
+
+ vmovups %ymm8 , (CO2)
+ vmovups 
%ymm9 , 8 * SIZE(CO2) + + vmovups %ymm10, (CO2, LDC) + vmovups %ymm11, 8 * SIZE(CO2, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL8x4_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PS_ %ymm4,%ymm2,%ymm0 + VFMADD231PS_ %ymm6,%ymm3,%ymm0 + vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PS_ %ymm8,%ymm2,%ymm0 + VFMADD231PS_ %ymm10,%ymm3,%ymm0 + addq $4 , BI + addq $8 , %rax +.endm + +.macro SAVE8x4 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm6 , %ymm6 + vmulps %ymm0 , %ymm8 , %ymm8 + vmulps %ymm0 , %ymm10, %ymm10 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps (CO1, LDC), %ymm6,%ymm6 + vaddps (CO2), %ymm8,%ymm8 + vaddps (CO2, LDC), %ymm10,%ymm10 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm6 , (CO1, LDC) + vmovups %ymm8 , (CO2) + vmovups %ymm10, (CO2, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x4_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231PS_ %xmm4,%xmm2,%xmm0 + VFMADD231PS_ %xmm6,%xmm3,%xmm0 + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231PS_ %xmm8,%xmm2,%xmm0 + VFMADD231PS_ %xmm10,%xmm3,%xmm0 + addq $4 , BI + addq $4 , %rax +.endm + +.macro SAVE4x4 + + vbroadcastss ALPHA, %xmm0 + + vmulps %xmm0 , %xmm4 , %xmm4 + vmulps %xmm0 , %xmm6 , %xmm6 + vmulps %xmm0 , %xmm8 , %xmm8 + vmulps %xmm0 , %xmm10, %xmm10 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + vaddps (CO1, LDC), %xmm6,%xmm6 + vaddps (CO2), %xmm8,%xmm8 + vaddps (CO2, LDC), %xmm10,%xmm10 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm6 , (CO1, LDC) + vmovups %xmm8 , (CO2) + vmovups %xmm10, (CO2, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x4_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_ %xmm4,%xmm2,%xmm0 + VFMADD231SS_ %xmm5,%xmm2,%xmm1 + VFMADD231SS_ %xmm6,%xmm3,%xmm0 + VFMADD231SS_ %xmm7,%xmm3,%xmm1 + vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_ %xmm8,%xmm2,%xmm0 + VFMADD231SS_ %xmm9,%xmm2,%xmm1 + VFMADD231SS_ %xmm10,%xmm3,%xmm0 + VFMADD231SS_ %xmm11,%xmm3,%xmm1 + addq $4 , BI + addq $2, %rax +.endm + +.macro SAVE2x4 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm5 , %xmm5 + vmulss %xmm0 , %xmm6 , %xmm6 + vmulss %xmm0 , %xmm7 , %xmm7 + vmulss %xmm0 , %xmm8 , %xmm8 + vmulss %xmm0 , %xmm9 , %xmm9 + vmulss %xmm0 , %xmm10, %xmm10 + vmulss %xmm0 , %xmm11, %xmm11 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + vaddps 1 * SIZE(CO1), %xmm5,%xmm5 + + vaddps (CO1, LDC), %xmm6,%xmm6 + vaddps 1 * SIZE(CO1, LDC), %xmm7,%xmm7 + + vaddps (CO2), %xmm8,%xmm8 + vaddps 1 * SIZE(CO2), %xmm9,%xmm9 + + vaddps (CO2, LDC), %xmm10,%xmm10 + vaddps 1 * SIZE(CO2, LDC), %xmm11,%xmm11 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , 1 * SIZE(CO1) + + vmovss %xmm6 , (CO1, LDC) + vmovss %xmm7 , 1 * SIZE(CO1, LDC) + + vmovss %xmm8 , (CO2) + vmovss %xmm9 , 1 * SIZE(CO2) + + vmovss 
%xmm10, (CO2, LDC) + vmovss %xmm11, 1 * SIZE(CO2, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x4_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_ %xmm4,%xmm2,%xmm0 + VFMADD231SS_ %xmm6,%xmm3,%xmm0 + vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_ %xmm8,%xmm2,%xmm0 + VFMADD231SS_ %xmm10,%xmm3,%xmm0 + addq $4 , BI + addq $1, %rax +.endm + +.macro SAVE1x4 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm6 , %xmm6 + vmulss %xmm0 , %xmm8 , %xmm8 + vmulss %xmm0 , %xmm10, %xmm10 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + vaddps (CO1, LDC), %xmm6,%xmm6 + vaddps (CO2), %xmm8,%xmm8 + vaddps (CO2, LDC), %xmm10,%xmm10 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm6 , (CO1, LDC) + vmovss %xmm8 , (CO2) + vmovss %xmm10, (CO2, LDC) + +.endm + + +/*******************************************************************************************/ + +/******************************************************************************************* +* 2 lines of N +*******************************************************************************************/ + +.macro KERNEL16x2_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PS_ %ymm4,%ymm2,%ymm0 + VFMADD231PS_ %ymm5,%ymm2,%ymm1 + VFMADD231PS_ %ymm6,%ymm3,%ymm0 + VFMADD231PS_ %ymm7,%ymm3,%ymm1 + addq $2 , BI + addq $16, %rax +.endm + +.macro SAVE16x2 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm5 , %ymm5 + vmulps %ymm0 , %ymm6 , %ymm6 + vmulps %ymm0 , %ymm7 , %ymm7 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps 8 * SIZE(CO1), %ymm5,%ymm5 + + vaddps (CO1, LDC), %ymm6,%ymm6 + vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , 8 * SIZE(CO1) + + vmovups %ymm6 , (CO1, LDC) + vmovups %ymm7 , 8 * SIZE(CO1, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL8x2_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PS_ %ymm4,%ymm2,%ymm0 + VFMADD231PS_ %ymm6,%ymm3,%ymm0 + addq $2 , BI + addq $8 , %rax +.endm + +.macro SAVE8x2 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm6 , %ymm6 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps (CO1, LDC), %ymm6,%ymm6 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm6 , (CO1, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x2_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231PS_ %xmm4,%xmm2,%xmm0 + VFMADD231PS_ %xmm6,%xmm3,%xmm0 + addq $2 , BI + addq $4 , %rax +.endm + +.macro SAVE4x2 + + vbroadcastss ALPHA, %xmm0 + + vmulps %xmm0 , %xmm4 , %xmm4 + vmulps %xmm0 , %xmm6 , %xmm6 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + vaddps (CO1, LDC), %xmm6,%xmm6 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm6 , (CO1, LDC) + +.endm + + 
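+
+/* Each SAVE*x* macro is the same epilogue at a different tile size;
+ * in C terms:
+ *
+ *     C[i][j] = alpha * acc[i][j] + C[i][j];   // plain GEMM path
+ *     C[i][j] = alpha * acc[i][j];             // TRMMKERNEL: C overwritten
+ *
+ * which is why every load-and-add from C sits inside
+ * #if !defined(TRMMKERNEL).
+ */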
+/*******************************************************************************************/ + +.macro KERNEL2x2_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_ %xmm4,%xmm2,%xmm0 + VFMADD231SS_ %xmm5,%xmm2,%xmm1 + VFMADD231SS_ %xmm6,%xmm3,%xmm0 + VFMADD231SS_ %xmm7,%xmm3,%xmm1 + addq $2 , BI + addq $2, %rax +.endm + +.macro SAVE2x2 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm5 , %xmm5 + vmulss %xmm0 , %xmm6 , %xmm6 + vmulss %xmm0 , %xmm7 , %xmm7 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + vaddps 1 * SIZE(CO1), %xmm5,%xmm5 + + vaddps (CO1, LDC), %xmm6,%xmm6 + vaddps 1 * SIZE(CO1, LDC), %xmm7,%xmm7 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , 1 * SIZE(CO1) + + vmovss %xmm6 , (CO1, LDC) + vmovss %xmm7 , 1 * SIZE(CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x2_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_ %xmm4,%xmm2,%xmm0 + VFMADD231SS_ %xmm6,%xmm3,%xmm0 + addq $2 , BI + addq $1, %rax +.endm + +.macro SAVE1x2 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm6 , %xmm6 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + vaddps (CO1, LDC), %xmm6,%xmm6 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm6 , (CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +/******************************************************************************************* +* 1 line of N +*******************************************************************************************/ + +.macro KERNEL16x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PS_ %ymm4,%ymm2,%ymm0 + VFMADD231PS_ %ymm5,%ymm2,%ymm1 + addq $1 , BI + addq $16, %rax +.endm + +.macro SAVE16x1 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm5 , %ymm5 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps 8 * SIZE(CO1), %ymm5,%ymm5 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , 8 * SIZE(CO1) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL8x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PS_ %ymm4,%ymm2,%ymm0 + addq $1 , BI + addq $8 , %rax +.endm + +.macro SAVE8x1 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + +#endif + + vmovups %ymm4 , (CO1) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231PS_ %xmm4,%xmm2,%xmm0 + addq $1 , BI + addq $4 , %rax +.endm + +.macro SAVE4x1 + + vbroadcastss ALPHA, %xmm0 + + vmulps %xmm0 , %xmm4 , %xmm4 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + +#endif + + vmovups %xmm4 , (CO1) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x1_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 
+ vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SS_ %xmm4,%xmm2,%xmm0 + VFMADD231SS_ %xmm5,%xmm2,%xmm1 + addq $1 , BI + addq $2, %rax +.endm + +.macro SAVE2x1 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm5 , %xmm5 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + vaddps 1 * SIZE(CO1), %xmm5,%xmm5 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , 1 * SIZE(CO1) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x1_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SS_ %xmm4,%xmm2,%xmm0 + addq $1 , BI + addq $1, %rax +.endm + +.macro SAVE1x1 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + +#endif + + vmovss %xmm4 , (CO1) + +.endm + + +/*******************************************************************************************/ + +/************************************************************************************* +* TRMM Kernel +*************************************************************************************/ + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovss %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $4, %rdi + divq %rdi // N / 4 + movq %rax, Ndiv6 // N / 4 + movq %rdx, Nmod6 // N % 4 + + + +#ifdef TRMMKERNEL + vmovss %xmm12, OFFSET + vmovss %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + movq Ndiv6, J + cmpq $0, J + je .L2_0 + ALIGN_4 + +/*******************************************************************************************/ + +.L4_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L4_01b + ALIGN_4 + +.L4_01a: + prefetcht0 512(BO1) + prefetchw 512(BO) + + vmovups (BO1), %xmm0 + vmovups 4*SIZE(BO1), %xmm1 + vmovups 8*SIZE(BO1), %xmm2 + vmovups 12*SIZE(BO1), %xmm3 + + vmovups %xmm0, (BO) + vmovups %xmm1, 4*SIZE(BO) + vmovups %xmm2, 8*SIZE(BO) + vmovups %xmm3,12*SIZE(BO) + + addq $16*SIZE,BO1 + addq $16*SIZE,BO + decq %rax + jnz .L4_01a + + +.L4_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L4_02d + ALIGN_4 + +.L4_02c: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $4*SIZE,BO1 + addq $4*SIZE,BO + decq %rax + jnz .L4_02c + +.L4_02d: + + movq BO1, B // next offset of B + +.L4_10: + movq C, CO1 + leaq (C, LDC, 2), CO2 + leaq (C, LDC, 4), C // c += 4 * ldc + +#if 
defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L4_20 + + ALIGN_4 + +.L4_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + salq $2 ,BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L4_16 + movq %rax, BI // Index for BO + salq $4, BI // BI = BI * 4 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_12: + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + je .L4_16 + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + je .L4_16 + + jmp .L4_12 + ALIGN_4 + +.L4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_19 + + movq %rax, BI // Index for BO + salq $2, BI // BI = BI * 4 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_17: + + KERNEL16x4_SUB + + jl .L4_17 + ALIGN_4 + + +.L4_19: + + SAVE16x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + salq $2, BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + addq $16 * SIZE, CO2 # coffset += 16 + decq I # i -- + jg .L4_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L4_20: + // Test rest of M + + testq $15, M + 
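/* M % 16 remainder: fall through to the 8-, 4-, 2- and 1-row kernels
   below; with no remainder, jump ahead to .L4_60 and the next 4 columns
   of N. */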
jz .L4_60 // to next 4 lines of N
+
+	testq	$8, M
+	jz	.L4_21pre
+	ALIGN_4
+
+/**************************************************************************/
+
+.L4_20_1:
+#if !defined(TRMMKERNEL) || \
+	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+	leaq	BUFFER1, BO		// first buffer to BO
+	addq	$4 * SIZE, BO
+#else
+	movq	KK, %rax
+	leaq	BUFFER1, BO		// first buffer to BO
+	addq	$4 * SIZE, BO
+	movq	%rax, BI		// Index for BO
+	salq	$2, BI			// BI = BI * 4 ; number of values
+	leaq	(BO, BI, SIZE), BO
+	salq	$3, %rax		// rax = rax * 8 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+#endif
+
+
+	vzeroall
+
+#ifndef TRMMKERNEL
+	movq	K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+	movq	K, %rax
+	subq	KK, %rax
+	movq	%rax, KKK
+#else
+	movq	KK, %rax
+#ifdef LEFT
+	addq	$8, %rax		// number of values in A
+#else
+	addq	$4, %rax		// number of values in BO
+#endif
+	movq	%rax, KKK
+#endif
+
+
+	andq	$-8, %rax
+	je	.L4_20_6
+	movq	%rax, BI		// Index for BO
+	salq	$2, BI			// BI = BI * 4 ; number of values
+
+	salq	$3, %rax		// rax = rax * 8 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L4_20_2:
+
+	KERNEL8x4_SUB
+	KERNEL8x4_SUB
+	KERNEL8x4_SUB
+	KERNEL8x4_SUB
+
+	KERNEL8x4_SUB
+	KERNEL8x4_SUB
+	KERNEL8x4_SUB
+	KERNEL8x4_SUB
+
+	je	.L4_20_6
+
+	KERNEL8x4_SUB
+	KERNEL8x4_SUB
+	KERNEL8x4_SUB
+	KERNEL8x4_SUB
+
+	KERNEL8x4_SUB
+	KERNEL8x4_SUB
+	KERNEL8x4_SUB
+	KERNEL8x4_SUB
+
+	je	.L4_20_6
+
+	jmp	.L4_20_2
+	ALIGN_4
+
+.L4_20_6:
+#ifndef TRMMKERNEL
+	movq	K, %rax
+#else
+	movq	KKK, %rax
+#endif
+
+	andq	$7, %rax		# if (k & 1)
+	je	.L4_20_9
+
+	movq	%rax, BI		// Index for BO
+	salq	$2, BI			// BI = BI * 4 ; number of values
+
+	salq	$3, %rax		// rax = rax * 8 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L4_20_7:
+
+	KERNEL8x4_SUB
+
+	jl	.L4_20_7
+	ALIGN_4
+
+
+.L4_20_9:
+
+	SAVE8x4
+
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+	movq	K, %rax
+	subq	KKK, %rax
+	movq	%rax, BI		// Index for BO
+	salq	$2, BI			// BI = BI * 4 ; number of values
+	leaq	(BO, BI, SIZE), BO
+	salq	$3, %rax		// rax = rax * 8 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+#endif
+
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+	addq	$8, KK
+#endif
+
+	addq	$8 * SIZE, CO1		# coffset += 8
+	addq	$8 * SIZE, CO2		# coffset += 8
+	ALIGN_4
+
+
+
+/**************************************************************************/
+
+.L4_21pre:
+
+	testq	$4, M
+	jz	.L4_30
+	ALIGN_4
+
+.L4_21:
+#if !defined(TRMMKERNEL) || \
+	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+	leaq	BUFFER1, BO		// first buffer to BO
+	addq	$4 * SIZE, BO
+#else
+	movq	KK, %rax
+	leaq	BUFFER1, BO		// first buffer to BO
+	addq	$4 * SIZE, BO
+	movq	%rax, BI		// Index for BO
+	salq	$2, BI			// BI = BI * 4 ; number of values
+	leaq	(BO, BI, SIZE), BO
+	salq	$2, %rax		// rax = rax * 4 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+#endif
+
+
+	vzeroall
+
+#ifndef TRMMKERNEL
+	movq	K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+	movq	K, %rax
+	subq	KK, %rax
+	movq	%rax, KKK
+#else
+	movq	KK, %rax
+#ifdef LEFT
+	addq	$4, %rax		// number of values in A
+#else
+	addq	$4, %rax		// number of values in BO
+#endif
+	movq	%rax, KKK
+#endif
+
+
+	andq	$-8, %rax
+	je
.L4_26 + movq %rax, BI // Index for BO + salq $2, BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_22: + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + je .L4_26 + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + je .L4_26 + + jmp .L4_22 + ALIGN_4 + +.L4_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_29 + + movq %rax, BI // Index for BO + salq $2, BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_27: + + KERNEL4x4_SUB + + jl .L4_27 + ALIGN_4 + + +.L4_29: + + SAVE4x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + salq $2, BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + + +.L4_30: + testq $2, M + jz .L4_40 + + ALIGN_4 + +.L4_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + salq $2, BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_36 + movq %rax, BI // Index for BO + salq $2, BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_32: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + je .L4_36 + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + je .L4_36 + + jmp .L4_32 + ALIGN_4 + +.L4_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_39 + + movq %rax, BI // Index for BO + salq $2, BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_37: + + KERNEL2x4_SUB + + jl .L4_37 + ALIGN_4 + + +.L4_39: + + SAVE2x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax 
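/* TRMM bookkeeping: rax = K - KKK is the number of k steps this
   triangular tile skipped; the pointer arithmetic below moves AO and BO
   past the unused rows of the packed panels (2 A values and 4 B values
   per step for this 2x4 tile). */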
+ movq %rax, BI // Index for BO + salq $2, BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + addq $2 * SIZE, CO2 # coffset += 2 + ALIGN_4 + +.L4_40: + testq $1, M + jz .L4_60 // to next 4 lines of N + + ALIGN_4 + +.L4_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + salq $2, BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L4_46 + movq %rax, BI // Index for BO + salq $2, BI // BI = BI * 4 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_42: + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + je .L4_46 + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + je .L4_46 + + jmp .L4_42 + ALIGN_4 + +.L4_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_49 + + movq %rax, BI // Index for BO + salq $2, BI // BI = BI * 4 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_47: + + KERNEL1x4_SUB + + jl .L4_47 + ALIGN_4 + + +.L4_49: + + SAVE1x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + salq $2, BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + addq $1 * SIZE, CO2 # coffset += 1 + ALIGN_4 + + + + + +.L4_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + decq J // j -- + jg .L4_01 // next 4 lines of N + + + +/*******************************************************************************************/ +.L2_0: + + movq Nmod6, J + andq $3, J // j % 4 + je .L999 + + movq Nmod6, J + andq $2, J // j % 4 + je .L1_0 + +.L2_01: + + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L2_01b + ALIGN_4 + +.L2_01a: + + vmovsd (BO1), %xmm0 + vmovsd 2*SIZE(BO1), %xmm1 + vmovsd 4*SIZE(BO1), %xmm2 + vmovsd 6*SIZE(BO1), %xmm3 + + vmovsd %xmm0, (BO) + vmovsd %xmm1, 2*SIZE(BO) + vmovsd %xmm2, 4*SIZE(BO) + vmovsd %xmm3, 6*SIZE(BO) + + addq $8*SIZE,BO1 + addq $8*SIZE,BO + decq %rax + jnz .L2_01a + + +.L2_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L2_02d + ALIGN_4 + +.L2_02c: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + 
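/* copy-loop tail: the K % 4 leftover rows of the 2-wide B panel are
   moved one vmovsd (two floats) per iteration. */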
jnz .L2_02c + +.L2_02d: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + +.L2_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + je .L2_16 + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL16x2_SUB + + jl .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE16x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 2 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 
SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_2: + + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + je .L2_20_6 + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + + KERNEL8x2_SUB + + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + SAVE8x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L2_21pre: + + testq $4, M + jz .L2_30 + ALIGN_4 + +.L2_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 1 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_26 + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: 
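/* k-loop tail for the 4x2 tile: up to 7 leftover rank-1 updates,
   one KERNEL4x2_SUB per iteration. */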
+#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB + + jl .L2_27 + ALIGN_4 + + +.L2_29: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_36 + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + jl .L2_37 + ALIGN_4 + + +.L2_39: + + SAVE2x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq 
BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_46 + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + jl .L2_47 + ALIGN_4 + + +.L2_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovss (BO1), %xmm0 + vmovss %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, 
KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + je .L1_16 + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB + + jl .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE16x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + je .L1_20_6 + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax 
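/* AO and BO were advanced to the end of the remaining panel data above;
   negating the counts lets the kernels walk both panels with a single
   incrementing index and a plain jl test against zero. */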
+ ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB + + jl .L1_20_7 + ALIGN_4 + + +.L1_20_9: + + SAVE8x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_26 + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB + + jl .L1_27 + ALIGN_4 + + +.L1_29: + + SAVE4x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of 
values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_36 + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + jl .L1_37 + ALIGN_4 + + +.L1_39: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_46 + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + jl .L1_47 + ALIGN_4 + + +.L1_49: + + SAVE1x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), 
%r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + + + + From 26383708442569894c5fd3f14cdf1689f3300568 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Tue, 13 Aug 2013 00:54:59 +0800 Subject: [PATCH 002/127] Init code base for Intel Haswell. --- Makefile.system | 4 +- common_x86.h | 5 ++ common_x86_64.h | 6 ++ cpuid.h | 5 +- cpuid_x86.c | 4 ++ driver/others/dynamic.c | 9 ++- getarch.c | 15 +++++ kernel/x86/KERNEL.HASWELL | 1 + kernel/x86/trsm_kernel_LN_2x4_penryn.S | 2 +- kernel/x86/trsm_kernel_LN_4x4_penryn.S | 2 +- kernel/x86/trsm_kernel_LT_2x4_penryn.S | 2 +- kernel/x86/trsm_kernel_LT_4x4_penryn.S | 2 +- kernel/x86/trsm_kernel_RT_2x4_penryn.S | 2 +- kernel/x86/trsm_kernel_RT_4x4_penryn.S | 2 +- kernel/x86/ztrsm_kernel_LN_2x2_penryn.S | 2 +- kernel/x86/ztrsm_kernel_LT_1x2_penryn.S | 2 +- kernel/x86/ztrsm_kernel_LT_2x2_penryn.S | 2 +- kernel/x86/ztrsm_kernel_RT_1x2_penryn.S | 2 +- kernel/x86/ztrsm_kernel_RT_2x2_penryn.S | 2 +- kernel/x86_64/KERNEL.HASWELL | 84 +++++++++++++++++++++++++ kernel/x86_64/symv_L_sse.S | 2 +- kernel/x86_64/symv_L_sse2.S | 2 +- kernel/x86_64/symv_U_sse.S | 2 +- kernel/x86_64/symv_U_sse2.S | 2 +- kernel/x86_64/zsymv_L_sse.S | 2 +- kernel/x86_64/zsymv_L_sse2.S | 2 +- kernel/x86_64/zsymv_U_sse.S | 2 +- kernel/x86_64/zsymv_U_sse2.S | 2 +- param.h | 76 ++++++++++++++++++++++ 29 files changed, 220 insertions(+), 27 deletions(-) create mode 100644 kernel/x86/KERNEL.HASWELL create mode 100644 kernel/x86_64/KERNEL.HASWELL diff --git a/Makefile.system b/Makefile.system index 9663322fa..2134786c0 100644 --- a/Makefile.system +++ b/Makefile.system @@ -324,14 +324,14 @@ ifeq ($(ARCH), x86) DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO ifneq ($(NO_AVX), 1) -DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER +DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER HASWELL endif endif ifeq ($(ARCH), x86_64) DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO ifneq ($(NO_AVX), 1) -DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER +DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER HASWELL endif endif diff --git a/common_x86.h b/common_x86.h index 48517d900..49e6be29e 100644 --- a/common_x86.h +++ b/common_x86.h @@ -171,6 +171,11 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ #define MMXSTORE movd #endif +#if defined(SANDYBRIDGE) || defined(HASWELL) +//Enable some optimazation for nehalem. +#define NEHALEM_OPTIMIZATION +#endif + #if defined(PILEDRIVER) || defined(BULLDOZER) //Enable some optimazation for barcelona. #define BARCELONA_OPTIMIZATION diff --git a/common_x86_64.h b/common_x86_64.h index 188903848..8e9d79443 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -218,6 +218,12 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ #ifdef ASSEMBLER +#if defined(SANDYBRIDGE) || defined(HASWELL) +//Enable some optimazation for nehalem. +#define NEHALEM_OPTIMIZATION +#endif + + #if defined(PILEDRIVER) || defined(BULLDOZER) //Enable some optimazation for barcelona. 
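/* These umbrella defines let one tuned code path serve several cores:
   kernels test NEHALEM_OPTIMIZATION (or BARCELONA_OPTIMIZATION) next to
   the core's own symbol, e.g. the prefetch selection repeated across
   kernel/x86_64 (a sketch of the pattern; sizes vary per file):

   #if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION)
   #define PREFETCH     prefetcht0
   #define PREFETCHSIZE (16 * 12)
   #endif
*/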
#define BARCELONA_OPTIMIZATION diff --git a/cpuid.h b/cpuid.h index 2cbbd4539..cb4404cb0 100644 --- a/cpuid.h +++ b/cpuid.h @@ -107,7 +107,7 @@ #define CORE_BOBCAT 21 #define CORE_BULLDOZER 22 #define CORE_PILEDRIVER 23 -#define CORE_HASWELL CORE_SANDYBRIDGE +#define CORE_HASWELL 24 #define HAVE_SSE (1 << 0) #define HAVE_SSE2 (1 << 1) @@ -200,7 +200,6 @@ typedef struct { #define CPUTYPE_BOBCAT 45 #define CPUTYPE_BULLDOZER 46 #define CPUTYPE_PILEDRIVER 47 -// this define is because BLAS doesn't have haswell specific optimizations yet -#define CPUTYPE_HASWELL CPUTYPE_SANDYBRIDGE +#define CPUTYPE_HASWELL 48 #endif diff --git a/cpuid_x86.c b/cpuid_x86.c index 9e850a2aa..4ed01b891 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1243,6 +1243,7 @@ static char *cpuname[] = { "BOBCAT", "BULLDOZER", "PILEDRIVER", + "HASWELL", }; static char *lowercpuname[] = { @@ -1293,6 +1294,7 @@ static char *lowercpuname[] = { "bobcat", "bulldozer", "piledriver", + "haswell", }; static char *corename[] = { @@ -1320,6 +1322,7 @@ static char *corename[] = { "BOBCAT", "BULLDOZER", "PILEDRIVER", + "HASWELL", }; static char *corename_lower[] = { @@ -1347,6 +1350,7 @@ static char *corename_lower[] = { "bobcat", "bulldozer", "piledriver", + "haswell", }; diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 197cc2b2d..bc6c386ec 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -65,14 +65,15 @@ extern gotoblas_t gotoblas_BOBCAT; extern gotoblas_t gotoblas_SANDYBRIDGE; extern gotoblas_t gotoblas_BULLDOZER; extern gotoblas_t gotoblas_PILEDRIVER; +extern gotoblas_t gotoblas_HASWELL; #else //Use NEHALEM kernels for sandy bridge #define gotoblas_SANDYBRIDGE gotoblas_NEHALEM +#define gotoblas_HASWELL gotoblas_NEHALEM #define gotoblas_BULLDOZER gotoblas_BARCELONA #define gotoblas_PILEDRIVER gotoblas_BARCELONA #endif -//Use sandy bridge kernels for haswell. -#define gotoblas_HASWELL gotoblas_SANDYBRIDGE + #define VENDOR_INTEL 1 #define VENDOR_AMD 2 @@ -285,6 +286,7 @@ static char *corename[] = { "Bobcat", "Bulldozer", "Piledriver", + "Haswell", }; char *gotoblas_corename(void) { @@ -307,7 +309,8 @@ char *gotoblas_corename(void) { if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16]; if (gotoblas == &gotoblas_BOBCAT) return corename[17]; if (gotoblas == &gotoblas_BULLDOZER) return corename[18]; - if (gotoblas == &gotoblas_PILEDRIVER) return corename[19]; + if (gotoblas == &gotoblas_PILEDRIVER) return corename[19]; + if (gotoblas == &gotoblas_HASWELL) return corename[20]; return corename[0]; } diff --git a/getarch.c b/getarch.c index 3ffda6244..274d2a868 100644 --- a/getarch.c +++ b/getarch.c @@ -298,6 +298,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
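/* getarch: building with TARGET=HASWELL now selects a dedicated
   FORCE_HASWELL block rather than the Sandy Bridge one; it carries the
   Haswell cache geometry plus the feature defines the kernels test for,
   including the new -DFMA3. */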
#define CORENAME "SANDYBRIDGE" #endif +#ifdef FORCE_HASWELL +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "HASWELL" +#define ARCHCONFIG "-DHASWELL " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ + "-DFMA3" +#define LIBNAME "haswell" +#define CORENAME "HASWELL" +#endif + #ifdef FORCE_ATOM #define FORCE #define FORCE_INTEL diff --git a/kernel/x86/KERNEL.HASWELL b/kernel/x86/KERNEL.HASWELL new file mode 100644 index 000000000..65b03ae50 --- /dev/null +++ b/kernel/x86/KERNEL.HASWELL @@ -0,0 +1 @@ +include $(KERNELDIR)/KERNEL.PENRYN diff --git a/kernel/x86/trsm_kernel_LN_2x4_penryn.S b/kernel/x86/trsm_kernel_LN_2x4_penryn.S index ebd1377f1..b620f6d12 100644 --- a/kernel/x86/trsm_kernel_LN_2x4_penryn.S +++ b/kernel/x86/trsm_kernel_LN_2x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_LN_4x4_penryn.S b/kernel/x86/trsm_kernel_LN_4x4_penryn.S index 6fa7d410e..3823aa08b 100644 --- a/kernel/x86/trsm_kernel_LN_4x4_penryn.S +++ b/kernel/x86/trsm_kernel_LN_4x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_LT_2x4_penryn.S b/kernel/x86/trsm_kernel_LT_2x4_penryn.S index 9ce4cd8d4..e60444b08 100644 --- a/kernel/x86/trsm_kernel_LT_2x4_penryn.S +++ b/kernel/x86/trsm_kernel_LT_2x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_LT_4x4_penryn.S b/kernel/x86/trsm_kernel_LT_4x4_penryn.S index a1a35a7a5..f19047d14 100644 --- a/kernel/x86/trsm_kernel_LT_4x4_penryn.S +++ b/kernel/x86/trsm_kernel_LT_4x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_RT_2x4_penryn.S b/kernel/x86/trsm_kernel_RT_2x4_penryn.S index a5333640d..4a87539bf 100644 --- a/kernel/x86/trsm_kernel_RT_2x4_penryn.S +++ b/kernel/x86/trsm_kernel_RT_2x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_RT_4x4_penryn.S b/kernel/x86/trsm_kernel_RT_4x4_penryn.S index c3619ec3d..79222a07a 100644 --- a/kernel/x86/trsm_kernel_RT_4x4_penryn.S +++ b/kernel/x86/trsm_kernel_RT_4x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S 
b/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S index 53e53c3ce..0a646e980 100644 --- a/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S @@ -61,7 +61,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S b/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S index 3c056cdff..58d8b4d6d 100644 --- a/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S @@ -63,7 +63,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S b/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S index 1efa1fd25..1604da17f 100644 --- a/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S @@ -61,7 +61,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S b/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S index 849afed73..fce813eb4 100644 --- a/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S @@ -63,7 +63,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S b/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S index c1833abe2..fda1a9860 100644 --- a/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S @@ -61,7 +61,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL new file mode 100644 index 000000000..c321be752 --- /dev/null +++ b/kernel/x86_64/KERNEL.HASWELL @@ -0,0 +1,84 @@ +SGEMMKERNEL = sgemm_kernel_8x8_sandy.S +SGEMMINCOPY = +SGEMMITCOPY = +SGEMMONCOPY = ../generic/gemm_ncopy_8.c +SGEMMOTCOPY = ../generic/gemm_tcopy_8.c +SGEMMINCOPYOBJ = +SGEMMITCOPYOBJ = +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = dgemm_kernel_4x8_sandy.S +DGEMMINCOPY = ../generic/gemm_ncopy_8.c +DGEMMITCOPY = ../generic/gemm_tcopy_8.c +#DGEMMONCOPY = gemm_ncopy_4.S +DGEMMONCOPY = ../generic/gemm_ncopy_4.c +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +#DGEMMOTCOPY = gemm_tcopy_4.S +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +#CGEMMKERNEL = zgemm_kernel_2x4_nehalem.S +CGEMMKERNEL = cgemm_kernel_4x8_sandy.S +CGEMMINCOPY = ../generic/zgemm_ncopy_8_sandy.c +CGEMMITCOPY = ../generic/zgemm_tcopy_8_sandy.c +CGEMMONCOPY = ../generic/zgemm_ncopy_4_sandy.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_4_sandy.c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +#ZGEMMKERNEL = 
zgemm_kernel_1x4_nehalem.S +ZGEMMKERNEL = zgemm_kernel_4x4_sandy.S +ZGEMMINCOPY = +ZGEMMITCOPY = +ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +ZGEMMINCOPYOBJ = +ZGEMMITCOPYOBJ = +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +#STRSMKERNEL_LN = trsm_kernel_LN_4x8_nehalem.S +#STRSMKERNEL_LT = trsm_kernel_LT_4x8_nehalem.S +#STRSMKERNEL_RN = trsm_kernel_LT_4x8_nehalem.S +#STRSMKERNEL_RT = trsm_kernel_RT_4x8_nehalem.S + +#DTRSMKERNEL_LN = trsm_kernel_LN_2x8_nehalem.S +#DTRSMKERNEL_LT = trsm_kernel_LT_2x8_nehalem.S +#DTRSMKERNEL_RN = trsm_kernel_LT_2x8_nehalem.S +#DTRSMKERNEL_RT = trsm_kernel_RT_2x8_nehalem.S + +#CTRSMKERNEL_LN = ztrsm_kernel_LN_2x4_nehalem.S +#CTRSMKERNEL_LT = ztrsm_kernel_LT_2x4_nehalem.S +#CTRSMKERNEL_RN = ztrsm_kernel_LT_2x4_nehalem.S +#CTRSMKERNEL_RT = ztrsm_kernel_RT_2x4_nehalem.S + +#ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x4_nehalem.S +#ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x4_nehalem.S +#ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x4_nehalem.S +#ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x4_nehalem.S +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + + + +CGEMM3MKERNEL = zgemm3m_kernel_4x8_nehalem.S +ZGEMM3MKERNEL = zgemm3m_kernel_2x8_nehalem.S diff --git a/kernel/x86_64/symv_L_sse.S b/kernel/x86_64/symv_L_sse.S index f56490ee3..aad66b4ac 100644 --- a/kernel/x86_64/symv_L_sse.S +++ b/kernel/x86_64/symv_L_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) diff --git a/kernel/x86_64/symv_L_sse2.S b/kernel/x86_64/symv_L_sse2.S index bb40ac41e..1b28bcd20 100644 --- a/kernel/x86_64/symv_L_sse2.S +++ b/kernel/x86_64/symv_L_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) diff --git a/kernel/x86_64/symv_U_sse.S b/kernel/x86_64/symv_U_sse.S index 653a96ff8..12aab61ed 100644 --- a/kernel/x86_64/symv_U_sse.S +++ b/kernel/x86_64/symv_U_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) diff --git a/kernel/x86_64/symv_U_sse2.S b/kernel/x86_64/symv_U_sse2.S index e26088c16..d044dfbe2 100644 --- a/kernel/x86_64/symv_U_sse2.S +++ b/kernel/x86_64/symv_U_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) #define PREFETCH prefetcht0 #define 
PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/zsymv_L_sse.S b/kernel/x86_64/zsymv_L_sse.S index 7f3b54ddb..f818170a3 100644 --- a/kernel/x86_64/zsymv_L_sse.S +++ b/kernel/x86_64/zsymv_L_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/zsymv_L_sse2.S b/kernel/x86_64/zsymv_L_sse2.S index 852a81820..7b6c5976d 100644 --- a/kernel/x86_64/zsymv_L_sse2.S +++ b/kernel/x86_64/zsymv_L_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/zsymv_U_sse.S b/kernel/x86_64/zsymv_U_sse.S index c79e7f1a2..6dabf0735 100644 --- a/kernel/x86_64/zsymv_U_sse.S +++ b/kernel/x86_64/zsymv_U_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/zsymv_U_sse2.S b/kernel/x86_64/zsymv_U_sse2.S index 6d0afce18..df64d8045 100644 --- a/kernel/x86_64/zsymv_U_sse2.S +++ b/kernel/x86_64/zsymv_U_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/param.h b/param.h index 0c3df6951..e4b3871b1 100644 --- a/param.h +++ b/param.h @@ -1150,6 +1150,82 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif +#ifdef HASWELL + +#define SNUMOPT 8 +#define DNUMOPT 4 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SYMV_P 8 + +#define SWITCH_RATIO 4 + +#ifdef ARCH_X86 +#define SGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 +#else +#define SGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_M 8 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 8 +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 8 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 +#define XGEMM_DEFAULT_UNROLL_N 1 +#endif + +#define SGEMM_DEFAULT_P 512 +#define SGEMM_DEFAULT_R sgemm_r +//#define SGEMM_DEFAULT_R 1024 + +#define DGEMM_DEFAULT_P 512 +#define DGEMM_DEFAULT_R dgemm_r +//#define DGEMM_DEFAULT_R 1024 + +#define QGEMM_DEFAULT_P 504 +#define QGEMM_DEFAULT_R qgemm_r + +#define CGEMM_DEFAULT_P 128 +//#define CGEMM_DEFAULT_R cgemm_r +#define CGEMM_DEFAULT_R 1024 + +#define ZGEMM_DEFAULT_P 512 +#define ZGEMM_DEFAULT_R zgemm_r +//#define ZGEMM_DEFAULT_R 1024 + +#define XGEMM_DEFAULT_P 252 +#define XGEMM_DEFAULT_R xgemm_r + +#define SGEMM_DEFAULT_Q 256 +#define DGEMM_DEFAULT_Q 256 +#define QGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 256 +#define ZGEMM_DEFAULT_Q 192 +#define XGEMM_DEFAULT_Q 128 + +#define GETRF_FACTOR 0.72 + +#endif + #ifdef ATOM From 1cb9579cd08e53e40ade54a77982c9d33310fc1e Mon Sep 17 00:00:00 2001 From: wernsaar Date: Wed, 14 Aug 2013 01:23:15 +0200 Subject: [PATCH 003/127] added zgemm_kernel_4x2_haswell.S and fixed a bug in sgemm_kernel_16x4_haswell.S --- kernel/x86_64/sgemm_kernel_16x4_haswell.S | 23 +- kernel/x86_64/zgemm_kernel_4x2_haswell.S | 2021 +++++++++++++++++++++ 2 files changed, 2033 insertions(+), 11 deletions(-) create mode 100644 kernel/x86_64/zgemm_kernel_4x2_haswell.S diff --git a/kernel/x86_64/sgemm_kernel_16x4_haswell.S b/kernel/x86_64/sgemm_kernel_16x4_haswell.S index 1c8a37710..5a6a9e690 100644 --- a/kernel/x86_64/sgemm_kernel_16x4_haswell.S +++ b/kernel/x86_64/sgemm_kernel_16x4_haswell.S @@ -131,11 +131,11 @@ #else .macro VFMADD231PS_ y0,y1,y2 - vfmadd231pd \y0,\y1,\y2 + vfmadd231ps \y0,\y1,\y2 .endm .macro VFMADD231SS_ x0,x1,x2 - vfmadd231sd \x0,\x1,\x2 + vfmadd231ss \x0,\x1,\x2 .endm #endif @@ -859,6 +859,7 @@ jz .L4_01b ALIGN_4 + .L4_01a: prefetcht0 512(BO1) prefetchw 512(BO) @@ -957,7 +958,7 @@ andq $-8, %rax // K = K - ( K % 8 ) je .L4_16 movq %rax, BI // Index for BO - salq $4, BI // BI = BI * 4 ; number of values + leaq (,BI,4) , BI // BI = BI * 4 ; number of values salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO @@ -1026,7 +1027,7 @@ je .L4_19 movq %rax, BI // Index for BO - salq $2, BI // BI = BI * 4 ; number of values + leaq (,BI,4), BI // BI = BI * 4 ; number of values salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO @@ -1124,7 +1125,7 @@ andq $-8, %rax je .L4_20_6 movq %rax, BI // Index for BO - salq $2, BI // BI = BI * 4 ; number of values + leaq (,BI,4), BI // BI = BI * 4 ; number of values salq $3, %rax // rax = rax * 8 ; number of values leaq 
(AO, %rax, SIZE), AO @@ -1173,7 +1174,7 @@ je .L4_20_9 movq %rax, BI // Index for BO - salq $2, BI // BI = BI * 4 ; number of values + leaq (,BI,4), BI // BI = BI * 4 ; number of values salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO @@ -1264,7 +1265,7 @@ andq $-8, %rax je .L4_26 movq %rax, BI // Index for BO - salq $2, BI // BI = BI * 4 ; number of values + leaq (,BI,4), BI // BI = BI * 4 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO @@ -1313,7 +1314,7 @@ je .L4_29 movq %rax, BI // Index for BO - salq $2, BI // BI = BI * 4 ; number of values + leaq (,BI,4), BI // BI = BI * 4 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO @@ -1401,7 +1402,7 @@ andq $-8, %rax je .L4_36 movq %rax, BI // Index for BO - salq $2, BI // BI = BI * 4 ; number of values + leaq (,BI,4), BI // BI = BI * 4 ; number of values salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, SIZE), AO @@ -1535,7 +1536,7 @@ andq $-8, %rax je .L4_46 movq %rax, BI // Index for BO - salq $2, BI // BI = BI * 4 ; number of values + leaq (,BI,4), BI // BI = BI * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO @@ -1583,7 +1584,7 @@ je .L4_49 movq %rax, BI // Index for BO - salq $2, BI // BI = BI * 4 ; number of values + leaq (,BI,4), BI // BI = BI * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO diff --git a/kernel/x86_64/zgemm_kernel_4x2_haswell.S b/kernel/x86_64/zgemm_kernel_4x2_haswell.S new file mode 100644 index 000000000..814d45179 --- /dev/null +++ b/kernel/x86_64/zgemm_kernel_4x2_haswell.S @@ -0,0 +1,2021 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 320 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 512*8*4 +#define LB2_OFFSET 512*8*2 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA_R 48(%rsp) +#define ALPHA_I 56(%rsp) +#define OFFSET 64(%rsp) +#define KK 72(%rsp) +#define KKK 80(%rsp) +#define BUFFER1 128(%rsp) +#define BUFFER2 LB2_OFFSET+128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + +#if defined(BULLDOZER) + +.macro VFMADD231PD_ y0,y1,y2 + vfmaddpd \y0,\y1,\y2,\y0 +.endm + +.macro VFMADD231SD_ x0,x1,x2 + vfmaddsd \x0,\x1,\x2,\x0 +.endm + +#else + +.macro VFMADD231PD_ y0,y1,y2 + vfmadd231pd \y0,\y1,\y2 +.endm + +.macro VFMADD231SD_ x0,x1,x2 + vfmadd231sd \x0,\x1,\x2 +.endm + +#endif + +#if defined(BULLDOZER) + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + +.macro VFMADDPD_R y0,y1,y2 + vfmaddpd \y0,\y1,\y2,\y0 +.endm + +.macro VFMADDPD_I y0,y1,y2 + vfmaddpd \y0,\y1,\y2,\y0 +.endm + +#define VFMADD_R vfmaddpd +#define VFMADD_I vfmaddpd + +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + +.macro VFMADDPD_R y0,y1,y2 + vfnmaddpd \y0,\y1,\y2,\y0 +.endm + +.macro VFMADDPD_I y0,y1,y2 + vfmaddpd \y0,\y1,\y2,\y0 +.endm + +#define VFMADD_R vfnmaddpd +#define VFMADD_I vfmaddpd + +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + +.macro VFMADDPD_R y0,y1,y2 + vfmaddpd \y0,\y1,\y2,\y0 +.endm + +.macro VFMADDPD_I y0,y1,y2 + vfnmaddpd \y0,\y1,\y2,\y0 +.endm + +#define VFMADD_R vfmaddpd +#define VFMADD_I vfnmaddpd + +#else + +.macro VFMADDPD_R y0,y1,y2 + vfnmaddpd \y0,\y1,\y2,\y0 +.endm + +.macro VFMADDPD_I y0,y1,y2 + vfnmaddpd \y0,\y1,\y2,\y0 +.endm + +#define VFMADD_R vfnmaddpd +#define VFMADD_I vfnmaddpd + +#endif + +#else + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + +.macro VFMADDPD_R y0,y1,y2 + vfmadd231pd \y0,\y1,\y2 +.endm + +.macro VFMADDPD_I y0,y1,y2 + vfmadd231pd \y0,\y1,\y2 +.endm + +#define VFMADD_R vfmadd231pd +#define VFMADD_I vfmadd231pd + +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + +.macro VFMADDPD_R y0,y1,y2 + vfnmadd231pd \y0,\y1,\y2 +.endm + +.macro VFMADDPD_I y0,y1,y2 + vfmadd231pd \y0,\y1,\y2 +.endm + +#define VFMADD_R vfnmadd231pd +#define VFMADD_I vfmadd231pd + +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + +.macro VFMADDPD_R 
y0,y1,y2 + vfmadd231pd \y0,\y1,\y2 +.endm + +.macro VFMADDPD_I y0,y1,y2 + vfnmadd231pd \y0,\y1,\y2 +.endm + +#define VFMADD_R vfmadd231pd +#define VFMADD_I vfnmadd231pd + +#else + +.macro VFMADDPD_R y0,y1,y2 + vfnmadd231pd \y0,\y1,\y2 +.endm + +.macro VFMADDPD_I y0,y1,y2 + vfnmadd231pd \y0,\y1,\y2 +.endm + +#define VFMADD_R vfnmadd231pd +#define VFMADD_I vfnmadd231pd + +#endif + +#endif + +#define A_PR1 384 +#define B_PR1 192 +/***************************************************************************************************/ + +.macro KERNEL4x2_SUB + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm1 + + vbroadcastsd -8 * SIZE(BO, BI, SIZE), %ymm4 + vbroadcastsd -7 * SIZE(BO, BI, SIZE), %ymm5 + VFMADDPD_R %ymm8 ,%ymm4,%ymm0 + VFMADDPD_R %ymm12,%ymm4,%ymm1 + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm6 + VFMADDPD_I %ymm9 ,%ymm5,%ymm0 + VFMADDPD_I %ymm13,%ymm5,%ymm1 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm7 + VFMADDPD_R %ymm10,%ymm6,%ymm0 + VFMADDPD_R %ymm14,%ymm6,%ymm1 + VFMADDPD_I %ymm11,%ymm7,%ymm0 + VFMADDPD_I %ymm15,%ymm7,%ymm1 + + addq $4, BI + addq $8, %rax +.endm + +.macro SAVE4x2 + + vbroadcastsd ALPHA_R, %ymm0 + vbroadcastsd ALPHA_I, %ymm1 + + // swap high and low 8 bytes + vshufpd $0x05, %ymm9 , %ymm9, %ymm9 + vshufpd $0x05, %ymm11, %ymm11, %ymm11 + vshufpd $0x05, %ymm13, %ymm13, %ymm13 + vshufpd $0x05, %ymm15, %ymm15, %ymm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %ymm9, %ymm8 , %ymm8 + vaddsubpd %ymm11,%ymm10, %ymm10 + vaddsubpd %ymm13,%ymm12, %ymm12 + vaddsubpd %ymm15,%ymm14, %ymm14 + + vshufpd $0x05, %ymm8 , %ymm8, %ymm9 + vshufpd $0x05, %ymm10, %ymm10, %ymm11 + vshufpd $0x05, %ymm12, %ymm12, %ymm13 + vshufpd $0x05, %ymm14, %ymm14, %ymm15 + +#else + vaddsubpd %ymm8, %ymm9 ,%ymm9 + vaddsubpd %ymm10, %ymm11,%ymm11 + vaddsubpd %ymm12, %ymm13,%ymm13 + vaddsubpd %ymm14, %ymm15,%ymm15 + + vmovapd %ymm9, %ymm8 + vmovapd %ymm11, %ymm10 + vmovapd %ymm13, %ymm12 + vmovapd %ymm15, %ymm14 + + // swap high and low 8 bytes + vshufpd $0x05, %ymm9 , %ymm9, %ymm9 + vshufpd $0x05, %ymm11, %ymm11, %ymm11 + vshufpd $0x05, %ymm13, %ymm13, %ymm13 + vshufpd $0x05, %ymm15, %ymm15, %ymm15 + +#endif + + // multiply with ALPHA_R + vmulpd %ymm8 , %ymm0, %ymm8 + vmulpd %ymm10, %ymm0, %ymm10 + vmulpd %ymm12, %ymm0, %ymm12 + vmulpd %ymm14, %ymm0, %ymm14 + + // multiply with ALPHA_I + vmulpd %ymm9 , %ymm1, %ymm9 + vmulpd %ymm11, %ymm1, %ymm11 + vmulpd %ymm13, %ymm1, %ymm13 + vmulpd %ymm15, %ymm1, %ymm15 + + vaddsubpd %ymm9, %ymm8 , %ymm8 + vaddsubpd %ymm11,%ymm10, %ymm10 + vaddsubpd %ymm13,%ymm12, %ymm12 + vaddsubpd %ymm15,%ymm14, %ymm14 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %ymm8 , %ymm8 + vaddpd 4 * SIZE(CO1), %ymm12, %ymm12 + + vaddpd (CO1, LDC), %ymm10, %ymm10 + vaddpd 4 * SIZE(CO1, LDC), %ymm14, %ymm14 + +#endif + + vmovups %ymm8 , (CO1) + vmovups %ymm12 , 4 * SIZE(CO1) + + vmovups %ymm10 , (CO1, LDC) + vmovups %ymm14 , 4 * SIZE(CO1, LDC) + + + +.endm + +/***************************************************************************************************/ +#define KERNEL2x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup 
-6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL2x2_2(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL2x2_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL2x2_4(xx) \ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + addq $16, BI ;\ + addq $16, %rax ;\ + + +#define KERNEL2x2_SUB(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + addq $4, BI ;\ + addq $4, %rax ;\ + +/************************************************************************************************/ + +/************************************************************************************************/ + +#define KERNEL1x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_2(xx) \ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * 
SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_3(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_4(xx) \ + vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $16, BI ;\ + addq $8 , %rax ;\ + + +#define KERNEL1x2_SUB(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $4, BI ;\ + addq $2, %rax ;\ + +/************************************************************************************************/ + +.macro KERNEL4x1_SUB + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm1 + vbroadcastsd -4 * SIZE(BO, BI, SIZE) , %ymm4 + vbroadcastsd -3 * SIZE(BO, BI, SIZE) , %ymm5 + VFMADDPD_R %ymm8 ,%ymm4,%ymm0 + VFMADDPD_R %ymm12,%ymm4,%ymm1 + VFMADDPD_I %ymm9 ,%ymm5,%ymm0 + VFMADDPD_I %ymm13,%ymm5,%ymm1 + + addq $2, BI + addq $8, %rax +.endm + +.macro SAVE4x1 + + vbroadcastsd ALPHA_R, %ymm0 + vbroadcastsd ALPHA_I, %ymm1 + + // swap high and low 8 bytes + vshufpd $0x05, %ymm9 , %ymm9, %ymm9 + vshufpd $0x05, %ymm13, %ymm13, %ymm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %ymm9, %ymm8 , %ymm8 + vaddsubpd %ymm13,%ymm12 , %ymm12 + + vshufpd $0x05, %ymm8 , %ymm8, %ymm9 + vshufpd $0x05, %ymm12, %ymm12, %ymm13 + +#else + vaddsubpd %ymm8, %ymm9 , %ymm9 + vaddsubpd %ymm12,%ymm13, %ymm13 + + vmovapd %ymm9, %ymm8 + vmovapd %ymm13, %ymm12 + + // swap high and low 8 bytes + vshufpd $0x05, %ymm9 , %ymm9, %ymm9 + vshufpd $0x05, %ymm13, %ymm13, %ymm13 + +#endif + + // multiply with ALPHA_R + vmulpd %ymm8 , %ymm0, %ymm8 + vmulpd %ymm12, %ymm0, %ymm12 + + // multiply with ALPHA_I + vmulpd %ymm9 , %ymm1, %ymm9 + vmulpd %ymm13, %ymm1, %ymm13 + + vaddsubpd %ymm9, %ymm8 , %ymm8 + vaddsubpd %ymm13, %ymm12, %ymm12 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %ymm8 , %ymm8 + vaddpd 4 * SIZE(CO1), %ymm12, %ymm12 + +#endif + + vmovups %ymm8 , (CO1) + vmovups %ymm12 ,4 * SIZE(CO1) + +.endm + + + +/************************************************************************************************/ + +#define KERNEL2x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R 
%xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL2x1_2(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL2x1_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL2x1_4(xx) \ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + addq $8, BI ;\ + addq $16, %rax ;\ + + +#define KERNEL2x1_SUB(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + addq $2, BI ;\ + addq $4, %rax ;\ + + +/************************************************************************************************/ + +#define KERNEL1x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_2(xx) \ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_3(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_4(xx) \ + vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $8, BI ;\ + addq $8, %rax ;\ + + +#define KERNEL1x1_SUB(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $2, BI ;\ + addq $2, %rax ;\ + + +/************************************************************************************************/ + + + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI 
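+	/* Windows x64 ABI: rdi, rsi and xmm6-xmm15 are callee-saved, */
+	/* so they are spilled to the local frame here and restored */
+	/* from the same slots in the epilogue at .L999. */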
+ movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA_R + vmovsd %xmm1, ALPHA_I + + salq $ZBASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $2, %rdi + divq %rdi // N / 2 + movq %rax, Ndiv6 // N / 2 + movq %rdx, Nmod6 // N % 2 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + +.L2_00_0: + + movq Ndiv6, J + cmpq $0, J + je .L1_2_0 + ALIGN_4 + + + +.L2_00_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_00_02b: + + vmovups (BO1), %xmm0 + vmovups 2 * SIZE(BO1), %xmm1 + vmovups %xmm0, (BO) + vmovups %xmm1, 2 * SIZE(BO) + addq $4*SIZE,BO1 + addq $4*SIZE,BO + decq %rax + jnz .L2_00_02b + +.L2_00_02c: + + movq BO1, B // next offset of B + + +.L2_00_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $8 * SIZE, AO + + movq M, I + sarq $2, I // i = (m >> 2) + je .L2_2_10 + + ALIGN_4 + +/******************************************************************************************************************/ + +.L2_4_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_4_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 
A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + + je .L2_4_16 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + + je .L2_4_16 + + jmp .L2_4_12 + ALIGN_4 + +.L2_4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_4_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_17: + + KERNEL4x2_SUB + + jl .L2_4_17 + ALIGN_4 + + +.L2_4_19: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L2_4_11 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ + + +/******************************************************************************************************************/ +.L2_2_10: + testq $2, M + jz .L2_2_40 // to next 2 lines of N + +.L2_2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_2_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_2_12: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_2_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + 
KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_2_16 + + jmp .L2_2_12 + ALIGN_4 + +.L2_2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_2_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_2_17: + + KERNEL2x2_SUB(xxx) + jl .L2_2_17 + ALIGN_4 + + +.L2_2_19: + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + vshufpd $0x01, %xmm15, %xmm15, %xmm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm13,%xmm12, %xmm12 + vaddsubpd %xmm15,%xmm14, %xmm14 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $0x01, %xmm10, %xmm10, %xmm11 + vshufpd $0x01, %xmm12, %xmm12, %xmm13 + vshufpd $0x01, %xmm14, %xmm14, %xmm15 + +#else + vaddsubpd %xmm8, %xmm9 ,%xmm9 + vaddsubpd %xmm10, %xmm11,%xmm11 + vaddsubpd %xmm12, %xmm13,%xmm13 + vaddsubpd %xmm14, %xmm15,%xmm15 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm11, %xmm10 + vmovapd %xmm13, %xmm12 + vmovapd %xmm15, %xmm14 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + vshufpd $0x01, %xmm15, %xmm15, %xmm15 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm10, %xmm0, %xmm10 + vmulpd %xmm12, %xmm0, %xmm12 + vmulpd %xmm14, %xmm0, %xmm14 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm11, %xmm1, %xmm11 + vmulpd %xmm13, %xmm1, %xmm13 + vmulpd %xmm15, %xmm1, %xmm15 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm13,%xmm12, %xmm12 + vaddsubpd %xmm15,%xmm14, %xmm14 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 + + vaddpd (CO1, LDC), %xmm10, %xmm10 + vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 2 * SIZE(CO1) + + vmovups %xmm10 , (CO1, LDC) + vmovups %xmm14 , 2 * SIZE(CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_2_40: + testq $1, M + jz .L2_2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * 
SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_2_46 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_2_42: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_2_46 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_2_46 + + jmp .L2_2_42 + ALIGN_4 + +.L2_2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_2_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_2_47: + + KERNEL1x2_SUB(xxx) + jl .L2_2_47 + ALIGN_4 + + +.L2_2_49: + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $0x01, %xmm10, %xmm10, %xmm11 + +#else + vaddsubpd %xmm8, %xmm9, %xmm9 + vaddsubpd %xmm10,%xmm11, %xmm11 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm11, %xmm10 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm11, %xmm1, %xmm11 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd (CO1, LDC), %xmm10, %xmm10 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm10 , (CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 
2 + decq I # i -- + jg .L2_2_41 + ALIGN_4 + + + + +.L2_2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + decq J // j -- + jg .L2_00_01 // next 2 lines of N + + + +.L1_2_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_00_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_00_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L1_00_02b + +.L1_00_02c: + + movq BO1, B // next offset of B + +.L1_00_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $8 * SIZE, AO + + movq M, I + sarq $2, I // i = (m >> 2) + je .L1_2_10 + + ALIGN_4 + +/*******************************************************************************************************/ + + +.L1_4_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_4_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_12: + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_4_16 + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_4_16 + + jmp .L1_4_12 + ALIGN_4 + +.L1_4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_4_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_17: + + KERNEL4x1_SUB + + jl .L1_4_17 + ALIGN_4 + + +.L1_4_19: + + SAVE4x1 + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 
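+	/* CO1 now points past the 4x1 tile just stored (4 complex */
+	/* doubles = 8 values); loop while tiles of I = M >> 2 remain. */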
+ decq I # i -- + jg .L1_4_11 + ALIGN_4 + + + + +/*******************************************************************************************************/ +.L1_2_10: + testq $2, M + jz .L1_2_40 + + +.L1_2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_2_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_2_12: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_2_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_2_16 + + jmp .L1_2_12 + ALIGN_4 + +.L1_2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_2_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_2_17: + + KERNEL2x1_SUB(xxx) + jl .L1_2_17 + ALIGN_4 + + +.L1_2_19: + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm13,%xmm12 , %xmm12 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $0x01, %xmm12, %xmm12, %xmm13 + +#else + vaddsubpd %xmm8, %xmm9 , %xmm9 + vaddsubpd %xmm12,%xmm13, %xmm13 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm13, %xmm12 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm12, %xmm0, %xmm12 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm13, %xmm1, %xmm13 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm13, %xmm12, %xmm12 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 2 * SIZE(CO1) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq 
%rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_2_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_2_46 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_2_42: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_2_46 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_2_46 + + jmp .L1_2_42 + ALIGN_4 + +.L1_2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_2_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_2_47: + + KERNEL1x1_SUB(xxx) + jl .L1_2_47 + ALIGN_4 + + +.L1_2_49: + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8, %xmm8 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + +#else + vaddsubpd %xmm8, %xmm9, %xmm9 + + vmovapd %xmm9, %xmm8 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + + vaddsubpd %xmm9 ,%xmm8, %xmm8 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + +#endif + + vmovups %xmm8 , (CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = 
BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + decq I # i -- + jg .L1_2_41 + ALIGN_4 + + + + + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE From 2b8ab8f55b5ff2a3d3234af361ec9a994cabe33a Mon Sep 17 00:00:00 2001 From: wernsaar Date: Wed, 14 Aug 2013 01:44:41 +0200 Subject: [PATCH 004/127] sgemm_kernel_16x4_haswell.S minor changes --- kernel/x86_64/sgemm_kernel_16x4_haswell.S | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/kernel/x86_64/sgemm_kernel_16x4_haswell.S b/kernel/x86_64/sgemm_kernel_16x4_haswell.S index 5a6a9e690..922096115 100644 --- a/kernel/x86_64/sgemm_kernel_16x4_haswell.S +++ b/kernel/x86_64/sgemm_kernel_16x4_haswell.S @@ -930,7 +930,7 @@ leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO - salq $2 ,BI // BI = BI * 4 ; number of values + leaq (,BI, 4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO @@ -1053,7 +1053,7 @@ movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO - salq $2, BI // BI = BI * 4 ; number of values + leaq (,BI, 4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO @@ -1096,7 +1096,7 @@ leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO - salq $2, BI // BI = BI * 4 ; number of values + leaq (,BI, 4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO @@ -1200,7 +1200,7 @@ movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO - salq $2, BI // BI = BI * 4 ; number of values + leaq (,BI, 4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO @@ -1236,7 +1236,7 @@ leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO - salq $2, BI // BI = BI * 4 ; number of values + leaq (,BI, 4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO @@ -1340,7 +1340,7 @@ movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO - salq $2, BI // BI = BI * 4 ; number of values + leaq (,BI, 4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO @@ -1373,7 +1373,7 @@ leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO - salq $2, BI // BI = BI * 4 ; number of values + leaq (,BI, 4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO @@ -1451,7 +1451,7 @@ je .L4_39 movq %rax, BI // Index for BO - salq $2, BI // BI 
= BI * 4 ; number of values + leaq (,BI, 4), BI // BI = BI * 4 ; number of values salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, SIZE), AO @@ -1477,7 +1477,7 @@ movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO - salq $2, BI // BI = BI * 4 ; number of values + leaq (,BI, 4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO @@ -1509,7 +1509,7 @@ leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO - salq $2, BI // BI = BI * 4 ; number of values + leaq (,BI, 4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO leaq (AO, %rax, SIZE), AO #endif @@ -1609,7 +1609,7 @@ movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO - salq $2, BI // BI = BI * 4 ; number of values + leaq (,BI, 4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO leaq (AO, %rax, SIZE), AO #endif From 0b90c0ec64bdced16551dcca7d77bcb15aa70b23 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 15 Aug 2013 18:46:14 +0200 Subject: [PATCH 005/127] added sgemm_kernel_16x4_haswell.S --- kernel/x86/sgemm_kernel_16x4_haswell.S | 3167 ++++++++++++++++++++++++ 1 file changed, 3167 insertions(+) create mode 100644 kernel/x86/sgemm_kernel_16x4_haswell.S diff --git a/kernel/x86/sgemm_kernel_16x4_haswell.S b/kernel/x86/sgemm_kernel_16x4_haswell.S new file mode 100644 index 000000000..9c0334b23 --- /dev/null +++ b/kernel/x86/sgemm_kernel_16x4_haswell.S @@ -0,0 +1,3167 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +/********************************************************************* +* 2013/08/15 Saar +* Parameter: +* SGEMM_DEFAULT_UNROLL_N 4 +* SGEMM_DEFAULT_UNROLL_M 16 +* SGEMM_DEFAULT_P 768 +* SGEMM_DEFAULT_Q 168 +* +* BLASTEST: OK +* +* Performance: +* 1 thread: 2.22 times faster than sandybridge +* 4 threads: 2.26 times faster than sandybridge +* +* Compile for FMA3: OK +* +*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define CO2 %rdx + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 512*8*4 +#define LB2_OFFSET 512*8*2 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) +#define BUFFER1 128(%rsp) +#define BUFFER2 LB2_OFFSET+128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + +#if defined(BULLDOZER) || defined(PILEDRIVER) + +.macro VFMADD231PS_ y0,y1,y2 + vfmaddps \y0,\y1,\y2,\y0 +.endm + +.macro VFMADD231SS_ x0,x1,x2 + vfmaddss \x0,\x1,\x2,\x0 +.endm + +#else + +.macro VFMADD231PS_ y0,y1,y2 + vfmadd231ps \y0,\y1,\y2 +.endm + +.macro VFMADD231SS_ x0,x1,x2 + vfmadd231ss \x0,\x1,\x2 +.endm + +#endif + + +#define A_PR1 384 +#define B_PR1 192 + +/******************************************************************************************* +* 4 lines of N +*******************************************************************************************/ + +.macro KERNEL16x4_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PS_ %ymm4,%ymm2,%ymm0 + VFMADD231PS_ %ymm5,%ymm2,%ymm1 + VFMADD231PS_ %ymm6,%ymm3,%ymm0 + VFMADD231PS_ %ymm7,%ymm3,%ymm1 + vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PS_ %ymm8,%ymm2,%ymm0 + VFMADD231PS_ %ymm9,%ymm2,%ymm1 + VFMADD231PS_ %ymm10,%ymm3,%ymm0 + VFMADD231PS_ %ymm11,%ymm3,%ymm1 + addq $4 , BI + addq $16, %rax +.endm + +.macro SAVE16x4 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm5 , %ymm5 + vmulps %ymm0 , %ymm6 , %ymm6 + vmulps %ymm0 , %ymm7 , %ymm7 + vmulps %ymm0 , %ymm8 , %ymm8 + vmulps %ymm0 , %ymm9 , %ymm9 + vmulps %ymm0 , %ymm10, %ymm10 + vmulps %ymm0 , %ymm11, %ymm11 + + +#if !defined(TRMMKERNEL) + + vaddps 
(CO1), %ymm4,%ymm4 + vaddps 8 * SIZE(CO1), %ymm5,%ymm5 + + vaddps (CO1, LDC), %ymm6,%ymm6 + vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7 + + vaddps (CO2), %ymm8,%ymm8 + vaddps 8 * SIZE(CO2), %ymm9,%ymm9 + + vaddps (CO2, LDC), %ymm10,%ymm10 + vaddps 8 * SIZE(CO2, LDC), %ymm11,%ymm11 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , 8 * SIZE(CO1) + + vmovups %ymm6 , (CO1, LDC) + vmovups %ymm7 , 8 * SIZE(CO1, LDC) + + vmovups %ymm8 , (CO2) + vmovups %ymm9 , 8 * SIZE(CO2) + + vmovups %ymm10, (CO2, LDC) + vmovups %ymm11, 8 * SIZE(CO2, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL8x4_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PS_ %ymm4,%ymm2,%ymm0 + VFMADD231PS_ %ymm6,%ymm3,%ymm0 + vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PS_ %ymm8,%ymm2,%ymm0 + VFMADD231PS_ %ymm10,%ymm3,%ymm0 + addq $4 , BI + addq $8 , %rax +.endm + +.macro SAVE8x4 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm6 , %ymm6 + vmulps %ymm0 , %ymm8 , %ymm8 + vmulps %ymm0 , %ymm10, %ymm10 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps (CO1, LDC), %ymm6,%ymm6 + vaddps (CO2), %ymm8,%ymm8 + vaddps (CO2, LDC), %ymm10,%ymm10 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm6 , (CO1, LDC) + vmovups %ymm8 , (CO2) + vmovups %ymm10, (CO2, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x4_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231PS_ %xmm4,%xmm2,%xmm0 + VFMADD231PS_ %xmm6,%xmm3,%xmm0 + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231PS_ %xmm8,%xmm2,%xmm0 + VFMADD231PS_ %xmm10,%xmm3,%xmm0 + addq $4 , BI + addq $4 , %rax +.endm + +.macro SAVE4x4 + + vbroadcastss ALPHA, %xmm0 + + vmulps %xmm0 , %xmm4 , %xmm4 + vmulps %xmm0 , %xmm6 , %xmm6 + vmulps %xmm0 , %xmm8 , %xmm8 + vmulps %xmm0 , %xmm10, %xmm10 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + vaddps (CO1, LDC), %xmm6,%xmm6 + vaddps (CO2), %xmm8,%xmm8 + vaddps (CO2, LDC), %xmm10,%xmm10 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm6 , (CO1, LDC) + vmovups %xmm8 , (CO2) + vmovups %xmm10, (CO2, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x4_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_ %xmm4,%xmm2,%xmm0 + VFMADD231SS_ %xmm5,%xmm2,%xmm1 + VFMADD231SS_ %xmm6,%xmm3,%xmm0 + VFMADD231SS_ %xmm7,%xmm3,%xmm1 + vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_ %xmm8,%xmm2,%xmm0 + VFMADD231SS_ %xmm9,%xmm2,%xmm1 + VFMADD231SS_ %xmm10,%xmm3,%xmm0 + VFMADD231SS_ %xmm11,%xmm3,%xmm1 + addq $4 , BI + addq $2, %rax +.endm + +.macro SAVE2x4 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm5 , %xmm5 + vmulss %xmm0 , %xmm6 , %xmm6 + vmulss %xmm0 , %xmm7 , %xmm7 + vmulss %xmm0 , %xmm8 , %xmm8 + vmulss %xmm0 , %xmm9 , %xmm9 + vmulss %xmm0 , %xmm10, %xmm10 + vmulss %xmm0 , %xmm11, %xmm11 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + vaddps 1 * 
SIZE(CO1), %xmm5,%xmm5 + + vaddps (CO1, LDC), %xmm6,%xmm6 + vaddps 1 * SIZE(CO1, LDC), %xmm7,%xmm7 + + vaddps (CO2), %xmm8,%xmm8 + vaddps 1 * SIZE(CO2), %xmm9,%xmm9 + + vaddps (CO2, LDC), %xmm10,%xmm10 + vaddps 1 * SIZE(CO2, LDC), %xmm11,%xmm11 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , 1 * SIZE(CO1) + + vmovss %xmm6 , (CO1, LDC) + vmovss %xmm7 , 1 * SIZE(CO1, LDC) + + vmovss %xmm8 , (CO2) + vmovss %xmm9 , 1 * SIZE(CO2) + + vmovss %xmm10, (CO2, LDC) + vmovss %xmm11, 1 * SIZE(CO2, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x4_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_ %xmm4,%xmm2,%xmm0 + VFMADD231SS_ %xmm6,%xmm3,%xmm0 + vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_ %xmm8,%xmm2,%xmm0 + VFMADD231SS_ %xmm10,%xmm3,%xmm0 + addq $4 , BI + addq $1, %rax +.endm + +.macro SAVE1x4 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm6 , %xmm6 + vmulss %xmm0 , %xmm8 , %xmm8 + vmulss %xmm0 , %xmm10, %xmm10 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + vaddps (CO1, LDC), %xmm6,%xmm6 + vaddps (CO2), %xmm8,%xmm8 + vaddps (CO2, LDC), %xmm10,%xmm10 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm6 , (CO1, LDC) + vmovss %xmm8 , (CO2) + vmovss %xmm10, (CO2, LDC) + +.endm + + +/*******************************************************************************************/ + +/******************************************************************************************* +* 2 lines of N +*******************************************************************************************/ + +.macro KERNEL16x2_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PS_ %ymm4,%ymm2,%ymm0 + VFMADD231PS_ %ymm5,%ymm2,%ymm1 + VFMADD231PS_ %ymm6,%ymm3,%ymm0 + VFMADD231PS_ %ymm7,%ymm3,%ymm1 + addq $2 , BI + addq $16, %rax +.endm + +.macro SAVE16x2 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm5 , %ymm5 + vmulps %ymm0 , %ymm6 , %ymm6 + vmulps %ymm0 , %ymm7 , %ymm7 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps 8 * SIZE(CO1), %ymm5,%ymm5 + + vaddps (CO1, LDC), %ymm6,%ymm6 + vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , 8 * SIZE(CO1) + + vmovups %ymm6 , (CO1, LDC) + vmovups %ymm7 , 8 * SIZE(CO1, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL8x2_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PS_ %ymm4,%ymm2,%ymm0 + VFMADD231PS_ %ymm6,%ymm3,%ymm0 + addq $2 , BI + addq $8 , %rax +.endm + +.macro SAVE8x2 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm6 , %ymm6 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps (CO1, LDC), %ymm6,%ymm6 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm6 , (CO1, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x2_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231PS_ 
%xmm4,%xmm2,%xmm0 + VFMADD231PS_ %xmm6,%xmm3,%xmm0 + addq $2 , BI + addq $4 , %rax +.endm + +.macro SAVE4x2 + + vbroadcastss ALPHA, %xmm0 + + vmulps %xmm0 , %xmm4 , %xmm4 + vmulps %xmm0 , %xmm6 , %xmm6 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + vaddps (CO1, LDC), %xmm6,%xmm6 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm6 , (CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x2_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_ %xmm4,%xmm2,%xmm0 + VFMADD231SS_ %xmm5,%xmm2,%xmm1 + VFMADD231SS_ %xmm6,%xmm3,%xmm0 + VFMADD231SS_ %xmm7,%xmm3,%xmm1 + addq $2 , BI + addq $2, %rax +.endm + +.macro SAVE2x2 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm5 , %xmm5 + vmulss %xmm0 , %xmm6 , %xmm6 + vmulss %xmm0 , %xmm7 , %xmm7 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + vaddps 1 * SIZE(CO1), %xmm5,%xmm5 + + vaddps (CO1, LDC), %xmm6,%xmm6 + vaddps 1 * SIZE(CO1, LDC), %xmm7,%xmm7 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , 1 * SIZE(CO1) + + vmovss %xmm6 , (CO1, LDC) + vmovss %xmm7 , 1 * SIZE(CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x2_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_ %xmm4,%xmm2,%xmm0 + VFMADD231SS_ %xmm6,%xmm3,%xmm0 + addq $2 , BI + addq $1, %rax +.endm + +.macro SAVE1x2 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm6 , %xmm6 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + vaddps (CO1, LDC), %xmm6,%xmm6 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm6 , (CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +/******************************************************************************************* +* 1 line of N +*******************************************************************************************/ + +.macro KERNEL16x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PS_ %ymm4,%ymm2,%ymm0 + VFMADD231PS_ %ymm5,%ymm2,%ymm1 + addq $1 , BI + addq $16, %rax +.endm + +.macro SAVE16x1 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm5 , %ymm5 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps 8 * SIZE(CO1), %ymm5,%ymm5 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , 8 * SIZE(CO1) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL8x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PS_ %ymm4,%ymm2,%ymm0 + addq $1 , BI + addq $8 , %rax +.endm + +.macro SAVE8x1 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + +#endif + + vmovups %ymm4 , (CO1) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231PS_ %xmm4,%xmm2,%xmm0 + addq $1 , BI + addq $4 , %rax +.endm + +.macro SAVE4x1 + + 
vbroadcastss ALPHA, %xmm0 + + vmulps %xmm0 , %xmm4 , %xmm4 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + +#endif + + vmovups %xmm4 , (CO1) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x1_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SS_ %xmm4,%xmm2,%xmm0 + VFMADD231SS_ %xmm5,%xmm2,%xmm1 + addq $1 , BI + addq $2, %rax +.endm + +.macro SAVE2x1 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm5 , %xmm5 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + vaddps 1 * SIZE(CO1), %xmm5,%xmm5 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , 1 * SIZE(CO1) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x1_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SS_ %xmm4,%xmm2,%xmm0 + addq $1 , BI + addq $1, %rax +.endm + +.macro SAVE1x1 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + +#endif + + vmovss %xmm4 , (CO1) + +.endm + + +/*******************************************************************************************/ + +/************************************************************************************* +* TRMM Kernel +*************************************************************************************/ + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + vmovsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovss %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $4, %rdi + divq %rdi // N / 4 + movq %rax, Ndiv6 // N / 4 + movq %rdx, Nmod6 // N % 4 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + movq Ndiv6, J + cmpq $0, J + je .L2_0 + ALIGN_4 + +/*******************************************************************************************/ + +.L4_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L4_01b + ALIGN_4 + + +.L4_01a: + prefetcht0 512(BO1) + prefetchw 512(BO) + + vmovups (BO1), %xmm0 + vmovups 4*SIZE(BO1), %xmm1 + vmovups 8*SIZE(BO1), %xmm2 + vmovups 12*SIZE(BO1), %xmm3 + + vmovups %xmm0, (BO) + vmovups %xmm1, 4*SIZE(BO) + vmovups %xmm2, 8*SIZE(BO) + vmovups %xmm3,12*SIZE(BO) + + addq $16*SIZE,BO1 + addq $16*SIZE,BO + decq 
%rax + jnz .L4_01a + + +.L4_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L4_02d + ALIGN_4 + +.L4_02c: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $4*SIZE,BO1 + addq $4*SIZE,BO + decq %rax + jnz .L4_02c + +.L4_02d: + + movq BO1, B // next offset of B + +.L4_10: + movq C, CO1 + leaq (C, LDC, 2), CO2 + leaq (C, LDC, 4), C // c += 4 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L4_20 + + ALIGN_4 + +.L4_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L4_16 + movq %rax, BI // Index for BO + leaq (,BI,4) , BI // BI = BI * 4 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_12: + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + je .L4_16 + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + je .L4_16 + + jmp .L4_12 + ALIGN_4 + +.L4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_19 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_17: + + KERNEL16x4_SUB + + jl .L4_17 + ALIGN_4 + + +.L4_19: + + SAVE16x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if 
defined(TRMMKERNEL) && defined(LEFT)
+        addq    $16, KK
+#endif
+
+        addq    $16 * SIZE, CO1         # coffset += 16
+        addq    $16 * SIZE, CO2         # coffset += 16
+        decq    I                       # i --
+        jg      .L4_11
+        ALIGN_4
+
+/**************************************************************************
+* Rest of M
+***************************************************************************/
+.L4_20:
+        // Test rest of M
+
+        testq   $15, M
+        jz      .L4_60          // to next 4 lines of N
+
+        testq   $8, M
+        jz      .L4_21pre
+        ALIGN_4
+
+/**************************************************************************/
+
+.L4_20_1:
+#if !defined(TRMMKERNEL) || \
+        (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+#else
+        movq    KK, %rax
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+        movq    %rax, BI                // Index for BO
+        leaq    (,BI, 4), BI            // BI = BI * 4 ; number of values
+        leaq    (BO, BI, SIZE), BO
+        salq    $3, %rax                // rax = rax * 8 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif
+
+
+        vzeroall
+
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+        movq    K, %rax
+        subq    KK, %rax
+        movq    %rax, KKK
+#else
+        movq    KK, %rax
+#ifdef LEFT
+        addq    $8, %rax        // number of values in A
+#else
+        addq    $4, %rax        // number of values in BO
+#endif
+        movq    %rax, KKK
+#endif
+
+
+        andq    $-8, %rax
+        je      .L4_20_6
+        movq    %rax, BI        // Index for BO
+        leaq    (,BI,4), BI     // BI = BI * 4 ; number of values
+
+        salq    $3, %rax        // rax = rax * 8 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+        leaq    (BO, BI, SIZE), BO
+        negq    BI
+        negq    %rax
+        ALIGN_4
+
+.L4_20_2:
+
+        KERNEL8x4_SUB
+        KERNEL8x4_SUB
+        KERNEL8x4_SUB
+        KERNEL8x4_SUB
+
+        KERNEL8x4_SUB
+        KERNEL8x4_SUB
+        KERNEL8x4_SUB
+        KERNEL8x4_SUB
+
+        je      .L4_20_6
+
+        KERNEL8x4_SUB
+        KERNEL8x4_SUB
+        KERNEL8x4_SUB
+        KERNEL8x4_SUB
+
+        KERNEL8x4_SUB
+        KERNEL8x4_SUB
+        KERNEL8x4_SUB
+        KERNEL8x4_SUB
+
+        je      .L4_20_6
+
+        jmp     .L4_20_2
+        ALIGN_4
+
+.L4_20_6:
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#else
+        movq    KKK, %rax
+#endif
+
+        andq    $7, %rax                # if (k & 1)
+        je      .L4_20_9
+
+        movq    %rax, BI        // Index for BO
+        leaq    (,BI,4), BI     // BI = BI * 4 ; number of values
+
+        salq    $3, %rax        // rax = rax * 8 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+        leaq    (BO, BI, SIZE), BO
+        negq    BI
+        negq    %rax
+        ALIGN_4
+
+.L4_20_7:
+
+        KERNEL8x4_SUB
+
+        jl      .L4_20_7
+        ALIGN_4
+
+
+.L4_20_9:
+
+        SAVE8x4
+
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    K, %rax
+        subq    KKK, %rax
+        movq    %rax, BI        // Index for BO
+        leaq    (,BI, 4), BI    // BI = BI * 4 ; number of values
+        leaq    (BO, BI, SIZE), BO
+        salq    $3, %rax        // rax = rax * 8 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif
+
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        addq    $8, KK
+#endif
+
+        addq    $8 * SIZE, CO1          # coffset += 8
+        addq    $8 * SIZE, CO2          # coffset += 8
+        ALIGN_4
+
+
+
+/**************************************************************************/
+
+.L4_21pre:
+
+        testq   $4, M
+        jz      .L4_30
+        ALIGN_4
+
+.L4_21:
+#if !defined(TRMMKERNEL) || \
+        (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+#else
+        movq    KK, %rax
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+        movq    %rax, BI                // Index for BO
+        leaq    (,BI, 4), BI            // BI = BI * 4 ; number of values
+        leaq    (BO, BI, SIZE), BO
+        salq    $2, %rax                // rax = rax * 
4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_26 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_22: + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + je .L4_26 + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + je .L4_26 + + jmp .L4_22 + ALIGN_4 + +.L4_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_29 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_27: + + KERNEL4x4_SUB + + jl .L4_27 + ALIGN_4 + + +.L4_29: + + SAVE4x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + + +.L4_30: + testq $2, M + jz .L4_40 + + ALIGN_4 + +.L4_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_36 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_32: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + je .L4_36 + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + je .L4_36 + + jmp .L4_32 + ALIGN_4 + +.L4_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_39 + + movq 
%rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_37: + + KERNEL2x4_SUB + + jl .L4_37 + ALIGN_4 + + +.L4_39: + + SAVE2x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + addq $2 * SIZE, CO2 # coffset += 2 + ALIGN_4 + +.L4_40: + testq $1, M + jz .L4_60 // to next 4 lines of N + + ALIGN_4 + +.L4_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L4_46 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_42: + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + je .L4_46 + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + je .L4_46 + + jmp .L4_42 + ALIGN_4 + +.L4_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_49 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_47: + + KERNEL1x4_SUB + + jl .L4_47 + ALIGN_4 + + +.L4_49: + + SAVE1x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + addq $1 * SIZE, CO2 # coffset += 1 + ALIGN_4 + + + + + +.L4_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $4, KK +#endif + + decq J // j -- + jg .L4_01 // next 4 lines of N + + + +/*******************************************************************************************/ +.L2_0: + + movq Nmod6, J + andq $3, J // j % 4 + je .L999 + + movq Nmod6, J + andq $2, J // j % 4 + je .L1_0 + +.L2_01: + + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz 
.L2_01b + ALIGN_4 + +.L2_01a: + + vmovsd (BO1), %xmm0 + vmovsd 2*SIZE(BO1), %xmm1 + vmovsd 4*SIZE(BO1), %xmm2 + vmovsd 6*SIZE(BO1), %xmm3 + + vmovsd %xmm0, (BO) + vmovsd %xmm1, 2*SIZE(BO) + vmovsd %xmm2, 4*SIZE(BO) + vmovsd %xmm3, 6*SIZE(BO) + + addq $8*SIZE,BO1 + addq $8*SIZE,BO + decq %rax + jnz .L2_01a + + +.L2_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L2_02d + ALIGN_4 + +.L2_02c: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02c + +.L2_02d: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + +.L2_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + je .L2_16 + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL16x2_SUB + + jl .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE16x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 2 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + 
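The two-column path above mirrors the four-column one: the .L2_01a/.L2_02c copy loops stage two B values per k step into BUFFER1, each 16x2 kernel step then consumes sixteen A values (BI advances by 2, %rax by 16 per KERNEL16x2_SUB), and SAVE16x2 applies alpha before writing the tile back. As a reading aid, here is a minimal scalar C model of what one 16x2 tile computes, up to the single rounding of the hardware FMA; the names tile_16x2, packA, packB and ldc are illustrative only and do not appear in the patch:

    /* Scalar sketch of KERNEL16x2_SUB + SAVE16x2 for one 16x2 tile.
       packA holds 16 values of A per k step (as prepared by the level-3
       driver), packB holds 2 values of B per k step, like the BUFFER1
       panel built by .L2_01a/.L2_02c.  ldc is in elements, not bytes. */
    static void tile_16x2(const float *packA, const float *packB, long k,
                          float alpha, float *C, long ldc)
    {
        float acc[16][2] = {{0.0f}};            /* ymm4..ymm7 in the kernel  */
        for (long l = 0; l < k; l++)            /* one KERNEL16x2_SUB:       */
            for (int j = 0; j < 2; j++)         /*   vbroadcastss B[2*l+j]   */
                for (int i = 0; i < 16; i++)    /*   two 8-wide FMAs per col */
                    acc[i][j] += packA[16*l + i] * packB[2*l + j];
        for (int j = 0; j < 2; j++)
            for (int i = 0; i < 16; i++)        /* vmulps by alpha, vaddps C */
                C[i + j*ldc] += alpha * acc[i][j];  /* TRMM stores without the add */
    }

The same model scales down to the 8/4/2/1-row and one-column variants that follow; only the tile extents and the BI/%rax strides change.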
+/**************************************************************************/
+
+.L2_20_1:
+#if !defined(TRMMKERNEL) || \
+        (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+#else
+        movq    KK, %rax
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+        movq    %rax, BI                // Index for BO
+        leaq    (BI,BI,1), BI           // BI = BI * 2 ; number of values
+        leaq    (BO, BI, SIZE), BO
+        salq    $3, %rax                // rax = rax * 8 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif
+
+
+        vzeroall
+
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+        movq    K, %rax
+        subq    KK, %rax
+        movq    %rax, KKK
+#else
+        movq    KK, %rax
+#ifdef LEFT
+        addq    $8, %rax        // number of values in A
+#else
+        addq    $2, %rax        // number of values in BO
+#endif
+        movq    %rax, KKK
+#endif
+
+
+        andq    $-8, %rax
+        je      .L2_20_6
+        movq    %rax, BI        // Index for BO
+        leaq    (BI,BI,1), BI   // BI = BI * 2 ; number of values
+
+        salq    $3, %rax        // rax = rax * 8 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+        leaq    (BO, BI, SIZE), BO
+        negq    BI
+        negq    %rax
+        ALIGN_4
+
+.L2_20_2:
+
+
+        KERNEL8x2_SUB
+        KERNEL8x2_SUB
+        KERNEL8x2_SUB
+        KERNEL8x2_SUB
+
+        KERNEL8x2_SUB
+        KERNEL8x2_SUB
+        KERNEL8x2_SUB
+        KERNEL8x2_SUB
+
+        je      .L2_20_6
+
+        KERNEL8x2_SUB
+        KERNEL8x2_SUB
+        KERNEL8x2_SUB
+        KERNEL8x2_SUB
+
+        KERNEL8x2_SUB
+        KERNEL8x2_SUB
+        KERNEL8x2_SUB
+        KERNEL8x2_SUB
+
+        je      .L2_20_6
+
+        jmp     .L2_20_2
+        ALIGN_4
+
+.L2_20_6:
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#else
+        movq    KKK, %rax
+#endif
+
+        andq    $7, %rax                # if (k & 1)
+        je      .L2_20_9
+
+        movq    %rax, BI        // Index for BO
+        leaq    (BI,BI,1), BI   // BI = BI * 2 ; number of values
+
+        salq    $3, %rax        // rax = rax * 8 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+        leaq    (BO, BI, SIZE), BO
+        negq    BI
+        negq    %rax
+        ALIGN_4
+
+.L2_20_7:
+
+        KERNEL8x2_SUB
+
+        jl      .L2_20_7
+        ALIGN_4
+
+
+.L2_20_9:
+
+        SAVE8x2
+
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    K, %rax
+        subq    KKK, %rax
+        movq    %rax, BI        // Index for BO
+        leaq    (BI,BI,1), BI   // BI = BI * 2 ; number of values
+        leaq    (BO, BI, SIZE), BO
+        salq    $3, %rax        // rax = rax * 8 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif
+
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        addq    $8, KK
+#endif
+
+        addq    $8 * SIZE, CO1          # coffset += 8
+        ALIGN_4
+
+
+
+/**************************************************************************/
+
+.L2_21pre:
+
+        testq   $4, M
+        jz      .L2_30
+        ALIGN_4
+
+.L2_21:
+#if !defined(TRMMKERNEL) || \
+        (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+#else
+        movq    KK, %rax
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $4 * SIZE, BO
+        movq    %rax, BI                // Index for BO
+        leaq    (BI,BI,1), BI           // BI = BI * 2 ; number of values
+        leaq    (BO, BI, SIZE), BO
+        salq    $2, %rax                // rax = rax * 4 ; number of values
+        leaq    (AO, %rax, SIZE), AO
+#endif
+
+
+        vzeroall
+
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+        movq    K, %rax
+        subq    KK, %rax
+        movq    %rax, KKK
+#else
+        movq    KK, %rax
+#ifdef LEFT
+        addq    $4, %rax        // number of values in A
+#else
+        addq    $2, %rax        // number of values in BO
+#endif
+        movq    %rax, KKK
+#endif
+
+
+        andq    $-8, %rax
+        je      .L2_26
+        movq    %rax, BI        // Index for BO
+        leaq    (BI,BI,1), BI   // BI = BI * 2 ; 
number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_26 + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB + + jl .L2_27 + ALIGN_4 + + +.L2_29: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_36 + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + jl .L2_37 + ALIGN_4 + + +.L2_39: + + SAVE2x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; 
number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_46 + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + jl .L2_47 + ALIGN_4 + + +.L2_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovss (BO1), %xmm0 + vmovss %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && 
!defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + je .L1_16 + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB + + jl .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE16x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + 
KERNEL8x1_SUB + KERNEL8x1_SUB + + je .L1_20_6 + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB + + jl .L1_20_7 + ALIGN_4 + + +.L1_20_9: + + SAVE8x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_26 + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB + + jl .L1_27 + ALIGN_4 + + +.L1_29: + + SAVE4x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first 
buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_36 + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + jl .L1_37 + ALIGN_4 + + +.L1_39: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_46 + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + jl .L1_47 + ALIGN_4 + + +.L1_49: + + SAVE1x1 + +#if 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + + + + From 4070d9a1234d36a3925196444d23e0bb5f4c9686 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 15 Aug 2013 19:17:20 +0200 Subject: [PATCH 006/127] added dgemm_kernel_16x2_haswell.S --- kernel/x86/dgemm_kernel_16x2_haswell.S | 5404 ++++++++++++++++++++++++ 1 file changed, 5404 insertions(+) create mode 100644 kernel/x86/dgemm_kernel_16x2_haswell.S diff --git a/kernel/x86/dgemm_kernel_16x2_haswell.S b/kernel/x86/dgemm_kernel_16x2_haswell.S new file mode 100644 index 000000000..27a604855 --- /dev/null +++ b/kernel/x86/dgemm_kernel_16x2_haswell.S @@ -0,0 +1,5404 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +/********************************************************************* +* 2013/08/15 Saar +* Parameter: +* DGEMM_DEFAULT_UNROLL_N 2 +* DGEMM_DEFAULT_UNROLL_M 16 +* DGEMM_DEFAULT_P 384 +* DGEMM_DEFAULT_Q 168 +* +* BLASTEST: OK +* +* Performance: +* 1 thread: 2.31 times faster than sandybridge +* 4 threads: 2.26 times faster than sandybridge +* +* Compile for FMA3: OK +* +*********************************************************************/ + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 512*8*4 +#define LB2_OFFSET 512*8*2 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) +#define BUFFER1 128(%rsp) +#define BUFFER2 LB2_OFFSET+128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + +#if defined(BULLDOZER) + +.macro VFMADD231PD_ y0,y1,y2 + vfmaddpd \y0,\y1,\y2,\y0 +.endm + +.macro VFMADD231SD_ x0,x1,x2 + vfmaddsd \x0,\x1,\x2,\x0 +.endm + +#else + +.macro VFMADD231PD_ y0,y1,y2 + vfmadd231pd \y0,\y1,\y2 +.endm + +.macro VFMADD231SD_ x0,x1,x2 + vfmadd231sd \x0,\x1,\x2 +.endm + +#endif + + +#define A_PR1 384 +#define B_PR1 192 + +/******************************************************************************************* +* 3 lines of N +*******************************************************************************************/ + +.macro KERNEL16x3_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 64+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + VFMADD231PD_ %ymm12,%ymm3,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm1 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm15,%ymm3,%ymm0 +.endm + + + + +.macro KERNEL16x3_2 + prefetcht0 128+A_PR1(AO, 
%rax, SIZE) + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + prefetcht0 A_PR1+64(AO,%rax,SIZE) + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + prefetcht0 192+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + VFMADD231PD_ %ymm12,%ymm3,%ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm15,%ymm3,%ymm0 +.endm + +.macro KERNEL16x3_3 + prefetcht0 256+A_PR1(AO, %rax, SIZE) + vmovups 0 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups 4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 320+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + vmovups 8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + VFMADD231PD_ %ymm12,%ymm3,%ymm0 + vmovups 12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 + vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm1 + vbroadcastsd 4 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm15,%ymm3,%ymm0 +.endm + +.macro KERNEL16x3_4 + prefetcht0 384+A_PR1(AO, %rax, SIZE) + vmovups 16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd 5 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups 20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 448+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + vmovups 24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + addq $12, BI + VFMADD231PD_ %ymm12,%ymm3,%ymm0 + vmovups 28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 + addq $64, %rax + VFMADD231PD_ %ymm15,%ymm3,%ymm0 +.endm + +.macro KERNEL16x3_SUB + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + VFMADD231PD_ %ymm12,%ymm3,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 + VFMADD231PD_ %ymm15,%ymm3,%ymm0 + addq $3 , BI + addq $16, %rax +.endm + +.macro SAVE16x3 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + vmulpd %ymm0 , %ymm10, %ymm10 + vmulpd %ymm0 , %ymm13, %ymm13 + + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm8 , %ymm8 + vmulpd %ymm0 , %ymm11, %ymm11 + vmulpd %ymm0 , %ymm14, %ymm14 + + vmulpd %ymm0 , %ymm6 , %ymm6 + vmulpd %ymm0 , %ymm9 , %ymm9 + vmulpd %ymm0 , %ymm12, 
%ymm12 + vmulpd %ymm0 , %ymm15, %ymm15 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 + vaddpd 8 * SIZE(CO1), %ymm10,%ymm10 + vaddpd 12 * SIZE(CO1), %ymm13,%ymm13 + + vaddpd (CO1, LDC), %ymm5,%ymm5 + vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8 + vaddpd 8 * SIZE(CO1, LDC), %ymm11,%ymm11 + vaddpd 12 * SIZE(CO1, LDC), %ymm14,%ymm14 + + vaddpd (CO1, LDC, 2), %ymm6,%ymm6 + vaddpd 4 * SIZE(CO1, LDC, 2), %ymm9,%ymm9 + vaddpd 8 * SIZE(CO1, LDC, 2), %ymm12,%ymm12 + vaddpd 12 * SIZE(CO1, LDC, 2), %ymm15,%ymm15 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm7 , 4 * SIZE(CO1) + vmovups %ymm10, 8 * SIZE(CO1) + vmovups %ymm13,12 * SIZE(CO1) + + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm8 , 4 * SIZE(CO1, LDC) + vmovups %ymm11, 8 * SIZE(CO1, LDC) + vmovups %ymm14,12 * SIZE(CO1, LDC) + + vmovups %ymm6 , (CO1, LDC, 2) + vmovups %ymm9 , 4 * SIZE(CO1, LDC, 2) + vmovups %ymm12, 8 * SIZE(CO1, LDC, 2) + vmovups %ymm15,12 * SIZE(CO1, LDC, 2) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL8x3_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 +.endm + +.macro KERNEL8x3_2 + prefetcht0 64+A_PR1(AO, %rax, SIZE) + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 +.endm + +.macro KERNEL8x3_3 + prefetcht0 128+A_PR1(AO, %rax, SIZE) + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 +.endm + +.macro KERNEL8x3_4 + prefetcht0 192+A_PR1(AO, %rax, SIZE) + vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 4 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd 5 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + addq $12, BI + addq $32, %rax +.endm + +.macro KERNEL8x3_SUB + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + addq $3 , BI + addq $8 , %rax 
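+ // one k-iteration of the 8x3 tail is done above: two 4-double loads of A + // and three B broadcasts fed six FMAs; BI moves on by 3 (one packed B row), + // the A index by 8 (one 8-wide column of A).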
+.endm + +.macro SAVE8x3 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm8 , %ymm8 + + vmulpd %ymm0 , %ymm6 , %ymm6 + vmulpd %ymm0 , %ymm9 , %ymm9 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 + + vaddpd (CO1, LDC), %ymm5,%ymm5 + vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8 + + vaddpd (CO1, LDC, 2), %ymm6,%ymm6 + vaddpd 4 * SIZE(CO1, LDC, 2), %ymm9,%ymm9 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm7 , 4 * SIZE(CO1) + + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm8 , 4 * SIZE(CO1, LDC) + + vmovups %ymm6 , (CO1, LDC, 2) + vmovups %ymm9 , 4 * SIZE(CO1, LDC, 2) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x3_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 +.endm + +.macro KERNEL4x3_2 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 +.endm + +.macro KERNEL4x3_3 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 +.endm + +.macro KERNEL4x3_4 + vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 4 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd 5 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + addq $12, BI + addq $16, %rax +.endm + +.macro KERNEL4x3_SUB + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + addq $3 , BI + addq $4 , %rax +.endm + +.macro SAVE4x3 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm6 , %ymm6 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd (CO1, LDC), %ymm5,%ymm5 + vaddpd (CO1, LDC, 2), %ymm6,%ymm6 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm6 , (CO1, LDC, 2) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x3_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + VFMADD231SD_ %xmm12,%xmm3,%xmm0 +.endm + +.macro KERNEL2x3_2 + vmovsd -3 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 + 
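// row 0 of A is now in xmm0; the scalar FMAs below fold it into the three + // column accumulators xmm4/xmm5/xmm6, then row 1 of A goes to xmm8/xmm10/xmm12. +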
VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -1 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + VFMADD231SD_ %xmm12,%xmm3,%xmm0 +.endm + +.macro KERNEL2x3_3 + vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -28 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd 2 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + vmovsd -27 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + VFMADD231SD_ %xmm12,%xmm3,%xmm0 +.endm + +.macro KERNEL2x3_4 + vmovsd 3 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -26 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 4 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd 5 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + vmovsd -25 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + VFMADD231SD_ %xmm12,%xmm3,%xmm0 + addq $12, BI + addq $8, %rax +.endm + +.macro KERNEL2x3_SUB + vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + VFMADD231SD_ %xmm12,%xmm3,%xmm0 + addq $3 , BI + addq $2 , %rax +.endm + +.macro SAVE2x3 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm8 , %xmm8 + vmulsd %xmm0 , %xmm5 , %xmm5 + vmulsd %xmm0 , %xmm10, %xmm10 + vmulsd %xmm0 , %xmm6 , %xmm6 + vmulsd %xmm0 , %xmm12, %xmm12 + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4,%xmm4 + vaddsd 1 * SIZE(CO1), %xmm8,%xmm8 + vaddsd (CO1, LDC), %xmm5,%xmm5 + vaddsd 1 * SIZE(CO1, LDC), %xmm10,%xmm10 + vaddsd (CO1, LDC, 2), %xmm6,%xmm6 + vaddsd 1 * SIZE(CO1, LDC, 2), %xmm12,%xmm12 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm8 , 1 * SIZE(CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm10, 1 * SIZE(CO1, LDC) + vmovsd %xmm6 , (CO1, LDC, 2) + vmovsd %xmm12, 1 * SIZE(CO1, LDC, 2) + +.endm + +/*******************************************************************************************/ + +.macro KERNEL1x3_1 + vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 +.endm + +.macro KERNEL1x3_2 + vmovsd -3 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -1 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 +.endm + +.macro KERNEL1x3_3 + vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd 2 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 +.endm + +.macro KERNEL1x3_4 + vmovsd 3 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 4 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ 
%xmm5,%xmm2,%xmm0 + vmovsd 5 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + addq $12, BI + addq $4, %rax +.endm + +.macro KERNEL1x3_SUB + vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + addq $3 , BI + addq $1 , %rax +.endm + +.macro SAVE1x3 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm5 , %xmm5 + vmulsd %xmm0 , %xmm6 , %xmm6 + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4,%xmm4 + vaddsd (CO1, LDC), %xmm5,%xmm5 + vaddsd (CO1, LDC, 2), %xmm6,%xmm6 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm6 , (CO1, LDC, 2) + +.endm + + +/*******************************************************************************************/ + +/******************************************************************************************* +* 2 lines of N +*******************************************************************************************/ + +.macro KERNEL16x2_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 64+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 +.endm + +.macro KERNEL16x2_2 + prefetcht0 128+A_PR1(AO, %rax, SIZE) + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 192+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 +.endm + +.macro KERNEL16x2_3 + prefetcht0 256+A_PR1(AO, %rax, SIZE) + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups 0 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups 4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 320+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + vmovups 8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + vmovups 12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 +.endm + +.macro KERNEL16x2_4 + prefetcht0 384+A_PR1(AO, %rax, SIZE) + vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups 16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups 20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 448+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + vmovups 24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ 
%ymm11,%ymm2,%ymm0 + vmovups 28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 + addq $8, BI + addq $64, %rax +.endm + +.macro KERNEL16x2_SUB + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 + addq $2, BI + addq $16, %rax +.endm + +.macro SAVE16x2 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + vmulpd %ymm0 , %ymm10, %ymm10 + vmulpd %ymm0 , %ymm13, %ymm13 + + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm8 , %ymm8 + vmulpd %ymm0 , %ymm11, %ymm11 + vmulpd %ymm0 , %ymm14, %ymm14 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 + vaddpd 8 * SIZE(CO1), %ymm10,%ymm10 + vaddpd 12 * SIZE(CO1), %ymm13,%ymm13 + + vaddpd (CO1, LDC), %ymm5,%ymm5 + vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8 + vaddpd 8 * SIZE(CO1, LDC), %ymm11,%ymm11 + vaddpd 12 * SIZE(CO1, LDC), %ymm14,%ymm14 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm7 , 4 * SIZE(CO1) + vmovups %ymm10, 8 * SIZE(CO1) + vmovups %ymm13,12 * SIZE(CO1) + + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm8 , 4 * SIZE(CO1, LDC) + vmovups %ymm11, 8 * SIZE(CO1, LDC) + vmovups %ymm14,12 * SIZE(CO1, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL8x2_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 +.endm + +.macro KERNEL8x2_2 + prefetcht0 64+A_PR1(AO, %rax, SIZE) + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 +.endm + +.macro KERNEL8x2_3 + prefetcht0 128+A_PR1(AO, %rax, SIZE) + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 +.endm + +.macro KERNEL8x2_4 + prefetcht0 192+A_PR1(AO, %rax, SIZE) + vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + addq $8, BI + addq $32, %rax +.endm + +.macro KERNEL8x2_SUB + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ 
%ymm5,%ymm2,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + addq $2, BI + addq $8 , %rax +.endm + +.macro SAVE8x2 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm8 , %ymm8 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 + + vaddpd (CO1, LDC), %ymm5,%ymm5 + vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm7 , 4 * SIZE(CO1) + + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm8 , 4 * SIZE(CO1, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x2_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 +.endm + +.macro KERNEL4x2_2 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 +.endm + +.macro KERNEL4x2_3 + prefetcht0 64+A_PR1(AO, %rax, SIZE) + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 +.endm + +.macro KERNEL4x2_4 + vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + addq $8, BI + addq $16, %rax +.endm + +.macro KERNEL4x2_SUB + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + addq $2, BI + addq $4 , %rax +.endm + +.macro SAVE4x2 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm5 , %ymm5 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd (CO1, LDC), %ymm5,%ymm5 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , (CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x2_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 +.endm + +.macro KERNEL2x2_2 + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -1 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 +.endm + +.macro KERNEL2x2_3 + vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -28 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -27 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 +.endm + +.macro KERNEL2x2_4 + vmovsd 2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -26 * SIZE(AO, %rax, SIZE), %xmm0 + 
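// fourth unrolled k-step of the 2x2 tile; afterwards BI and the A index + // jump by a full four-iteration stride (8 values each). +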
VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 3 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -25 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + addq $8, BI + addq $8, %rax +.endm + +.macro KERNEL2x2_SUB + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + addq $2, BI + addq $2, %rax +.endm + +.macro SAVE2x2 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm8 , %xmm8 + vmulsd %xmm0 , %xmm5 , %xmm5 + vmulsd %xmm0 , %xmm10, %xmm10 + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4,%xmm4 + vaddsd 1 * SIZE(CO1), %xmm8,%xmm8 + vaddsd (CO1, LDC), %xmm5,%xmm5 + vaddsd 1 * SIZE(CO1, LDC), %xmm10,%xmm10 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm8 , 1 * SIZE(CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm10, 1 * SIZE(CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x2_1 + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 +.endm + +.macro KERNEL1x2_2 + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -1 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 +.endm + +.macro KERNEL1x2_3 + vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 +.endm + +.macro KERNEL1x2_4 + vmovsd 2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 3 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + addq $8, BI + addq $4, %rax +.endm + +.macro KERNEL1x2_SUB + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + addq $2, BI + addq $1, %rax +.endm + +.macro SAVE1x2 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm5 , %xmm5 + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4,%xmm4 + vaddsd (CO1, LDC), %xmm5,%xmm5 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +/******************************************************************************************* +* 1 line of N +*******************************************************************************************/ + +.macro KERNEL16x1_1 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 +.endm + +.macro KERNEL16x1_2 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), 
%ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 +.endm + +.macro KERNEL16x1_3 + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups 0 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups 4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + vmovups 8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + vmovups 12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 +.endm + +.macro KERNEL16x1_4 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm1 + vmovups 16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups 20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + vmovups 24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + vmovups 28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + addq $4, BI + addq $64, %rax +.endm + +.macro KERNEL16x1_SUB + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + addq $1, BI + addq $16, %rax +.endm + +.macro SAVE16x1 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + vmulpd %ymm0 , %ymm10, %ymm10 + vmulpd %ymm0 , %ymm13, %ymm13 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 + vaddpd 8 * SIZE(CO1), %ymm10,%ymm10 + vaddpd 12 * SIZE(CO1), %ymm13,%ymm13 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm7 , 4 * SIZE(CO1) + vmovups %ymm10, 8 * SIZE(CO1) + vmovups %ymm13,12 * SIZE(CO1) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL8x1_1 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 +.endm + +.macro KERNEL8x1_2 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 +.endm + +.macro KERNEL8x1_3 + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 +.endm + +.macro KERNEL8x1_4 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + addq $4, BI + addq $32, %rax +.endm + +.macro KERNEL8x1_SUB + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + addq $1, BI + addq $8 , %rax +.endm + +.macro SAVE8x1 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm7 , 4 * SIZE(CO1) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x1_1 + vbroadcastsd -2 * 
SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 +.endm + +.macro KERNEL4x1_2 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 +.endm + +.macro KERNEL4x1_3 + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 +.endm + +.macro KERNEL4x1_4 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + addq $4, BI + addq $16, %rax +.endm + +.macro KERNEL4x1_SUB + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + addq $1, BI + addq $4 , %rax +.endm + +.macro SAVE4x1 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + +#endif + + vmovups %ymm4 , (CO1) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x1_1 + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 +.endm + +.macro KERNEL2x1_2 + vmovsd -1 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 +.endm + +.macro KERNEL2x1_3 + vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -28 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -27 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 +.endm + +.macro KERNEL2x1_4 + vmovsd 1 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -26 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -25 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + addq $4, BI + addq $8, %rax +.endm + +.macro KERNEL2x1_SUB + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + addq $1, BI + addq $2 , %rax +.endm + +.macro SAVE2x1 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm8 , %xmm8 + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4,%xmm4 + vaddsd 1 * SIZE(CO1), %xmm8,%xmm8 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm8 , 1 * SIZE(CO1) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x1_1 + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 +.endm + +.macro KERNEL1x1_2 + vmovsd -1 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 +.endm + +.macro KERNEL1x1_3 + vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 +.endm + +.macro KERNEL1x1_4 + vmovsd 1 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + addq $4, BI + addq $4, %rax +.endm + +.macro KERNEL1x1_SUB + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + addq $1, BI + addq $1 , %rax +.endm + +.macro SAVE1x1 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4,%xmm4 + +#endif + + vmovsd %xmm4 , 
(CO1) + +.endm + + +/*******************************************************************************************/ + +#if !defined(TRMMKERNEL) + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $6, %rdi + divq %rdi // N / 6 + movq %rax, Ndiv6 // N / 6 + movq %rdx, Nmod6 // N % 6 + + + movq Ndiv6, J + cmpq $0, J + je .L2_0 + ALIGN_4 + +.L6_01: + // copy to sub buffer + movq K, %rax + salq $1,%rax // K * 2 ; read 2 values + movq B, BO1 + leaq (B,%rax, SIZE), BO2 // next offset to BO2 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $3 , %rax // K / 8 + jz .L6_01a_2 + ALIGN_4 + +.L6_01a_1: + + prefetcht0 512(BO1) + prefetcht0 512(BO2) + prefetchw 512(BO) + + + vmovups 0 * SIZE(BO1), %xmm0 + vmovups 2 * SIZE(BO1), %xmm2 + vmovups 4 * SIZE(BO1), %xmm4 + vmovups 6 * SIZE(BO1), %xmm6 + vmovsd 0 * SIZE(BO2), %xmm1 + vmovsd 2 * SIZE(BO2), %xmm3 + vmovsd 4 * SIZE(BO2), %xmm5 + vmovsd 6 * SIZE(BO2), %xmm7 + vmovups %xmm0, 0*SIZE(BO) + vmovsd %xmm1, 2*SIZE(BO) + vmovups %xmm2, 3*SIZE(BO) + vmovsd %xmm3, 5*SIZE(BO) + vmovups %xmm4, 6*SIZE(BO) + vmovsd %xmm5, 8*SIZE(BO) + vmovups %xmm6, 9*SIZE(BO) + vmovsd %xmm7,11*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + vmovups 0 * SIZE(BO1), %xmm0 + vmovups 2 * SIZE(BO1), %xmm2 + vmovups 4 * SIZE(BO1), %xmm4 + vmovups 6 * SIZE(BO1), %xmm6 + vmovsd 0 * SIZE(BO2), %xmm1 + vmovsd 2 * SIZE(BO2), %xmm3 + vmovsd 4 * SIZE(BO2), %xmm5 + vmovsd 6 * SIZE(BO2), %xmm7 + vmovups %xmm0, 0*SIZE(BO) + vmovsd %xmm1, 2*SIZE(BO) + vmovups %xmm2, 3*SIZE(BO) + vmovsd %xmm3, 5*SIZE(BO) + vmovups %xmm4, 6*SIZE(BO) + vmovsd %xmm5, 8*SIZE(BO) + vmovups %xmm6, 9*SIZE(BO) + vmovsd %xmm7,11*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + decq %rax + jnz .L6_01a_1 + + + +.L6_01a_2: + + movq K, %rax + andq $7, %rax // K % 8 + jz .L6_02c + ALIGN_4 + + +.L6_02b: + + vmovups 0 * SIZE(BO1), %xmm0 + vmovsd 0 * SIZE(BO2), %xmm2 + vmovups %xmm0, 0*SIZE(BO) + vmovsd %xmm2, 2*SIZE(BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO2 + addq $3*SIZE,BO + decq %rax + jnz .L6_02b + +.L6_02c: + + movq K, %rax + salq $1,%rax // K * 2 + leaq (B,%rax, SIZE), BO1 // next offset to BO1 + leaq (BO1,%rax, SIZE), BO2 // next offset to BO2 + leaq BUFFER2, BO // second buffer to BO + movq K, %rax + sarq $3 , %rax // K / 8 + jz .L6_02c_2 + ALIGN_4 + +.L6_02c_1: + + prefetcht0 512(BO2) + prefetchw 512(BO) + + vmovups 0 * SIZE(BO2), %xmm0 + vmovups 2 * SIZE(BO2), %xmm2 + vmovups 4 * SIZE(BO2), %xmm4 + vmovups 6 * SIZE(BO2), %xmm6 + vmovsd 1 * SIZE(BO1), %xmm1 + 
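// regroup the packed B (stored as K-blocks of 2 columns) into 3-wide rows: + // the odd element of each BO1 pair supplies column 3, the BO2 pair columns + // 4 and 5, so BUFFER1 and BUFFER2 split the 6-wide panel into two 3-column halves. +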
vmovsd 3 * SIZE(BO1), %xmm3 + vmovsd 5 * SIZE(BO1), %xmm5 + vmovsd 7 * SIZE(BO1), %xmm7 + vmovsd %xmm1, 0*SIZE(BO) + vmovups %xmm0, 1*SIZE(BO) + vmovsd %xmm3, 3*SIZE(BO) + vmovups %xmm2, 4*SIZE(BO) + vmovsd %xmm5, 6*SIZE(BO) + vmovups %xmm4, 7*SIZE(BO) + vmovsd %xmm7, 9*SIZE(BO) + vmovups %xmm6,10*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + + vmovups 0 * SIZE(BO2), %xmm0 + vmovups 2 * SIZE(BO2), %xmm2 + vmovups 4 * SIZE(BO2), %xmm4 + vmovups 6 * SIZE(BO2), %xmm6 + vmovsd 1 * SIZE(BO1), %xmm1 + vmovsd 3 * SIZE(BO1), %xmm3 + vmovsd 5 * SIZE(BO1), %xmm5 + vmovsd 7 * SIZE(BO1), %xmm7 + vmovsd %xmm1, 0*SIZE(BO) + vmovups %xmm0, 1*SIZE(BO) + vmovsd %xmm3, 3*SIZE(BO) + vmovups %xmm2, 4*SIZE(BO) + vmovsd %xmm5, 6*SIZE(BO) + vmovups %xmm4, 7*SIZE(BO) + vmovsd %xmm7, 9*SIZE(BO) + vmovups %xmm6,10*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + decq %rax + jnz .L6_02c_1 + + +.L6_02c_2: + + movq K, %rax + andq $7, %rax // K % 8 + jz .L6_03c + ALIGN_4 + +.L6_03b: + + vmovsd 1*SIZE(BO1), %xmm0 + vmovups 0*SIZE(BO2), %xmm1 + vmovsd %xmm0, 0*SIZE(BO) + vmovups %xmm1, 1*SIZE(BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO2 + addq $3*SIZE,BO + decq %rax + jnz .L6_03b + + +.L6_03c: + + movq BO2, B // next offset of B + +.L6_10: + movq C, CO1 + leaq (C, LDC, 2), C + leaq (C, LDC, 1), C // c += 3 * ldc + + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L6_20 + + ALIGN_4 + +.L6_11: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L6_16 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x3_1 + KERNEL16x3_2 + KERNEL16x3_3 + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL16x3_4 + + KERNEL16x3_1 + KERNEL16x3_2 + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL16x3_3 + KERNEL16x3_4 + + je .L6_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x3_1 + KERNEL16x3_2 + KERNEL16x3_3 + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL16x3_4 + + KERNEL16x3_1 + KERNEL16x3_2 + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL16x3_3 + KERNEL16x3_4 + + je .L6_16 + + jmp .L6_12 + ALIGN_4 + +.L6_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_17: + + KERNEL16x3_SUB + + jl .L6_17 + ALIGN_4 + + +.L6_19: + + SAVE16x3 + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L6_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L6_20: + // Test rest of M + + testq $15, M + jz .L7_10 // to next 3 lines of N + + testq $8, M + jz .L6_21pre + ALIGN_4 + +/**************************************************************************/ + +.L6_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_20_2: + + 
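// unrolled 8x3 inner loop: each KERNEL8x3_x step handles one value of k; + // per k the six FMAs compute, in effect (a C sketch for orientation, not + // part of the original source): + // for (j = 0; j < 3; j++) + //   for (i = 0; i < 8; i++) + //     acc[j][i] += A[8*k + i] * B[3*k + j]; + // the je below exits once the negated k counter in rax wraps to zero + // (flags come from the addq at the end of KERNEL8x3_4). +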
prefetcht0 B_PR1(BO,BI,8) + KERNEL8x3_1 + KERNEL8x3_2 + KERNEL8x3_3 + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL8x3_4 + + KERNEL8x3_1 + KERNEL8x3_2 + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL8x3_3 + KERNEL8x3_4 + + je .L6_20_6 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x3_1 + KERNEL8x3_2 + KERNEL8x3_3 + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL8x3_4 + + KERNEL8x3_1 + KERNEL8x3_2 + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL8x3_3 + KERNEL8x3_4 + + je .L6_20_6 + + jmp .L6_20_2 + ALIGN_4 + +.L6_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_20_7: + + KERNEL8x3_SUB + + jl .L6_20_7 + ALIGN_4 + + +.L6_20_9: + + SAVE8x3 + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L6_21pre: + + testq $4, M + jz .L6_30 + ALIGN_4 + +.L6_21: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_26 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x3_1 + KERNEL4x3_2 + KERNEL4x3_3 + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL4x3_4 + + KERNEL4x3_1 + KERNEL4x3_2 + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL4x3_3 + KERNEL4x3_4 + + je .L6_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x3_1 + KERNEL4x3_2 + KERNEL4x3_3 + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL4x3_4 + + KERNEL4x3_1 + KERNEL4x3_2 + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL4x3_3 + KERNEL4x3_4 + + je .L6_26 + + jmp .L6_22 + ALIGN_4 + +.L6_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_27: + + KERNEL4x3_SUB + + jl .L6_27 + ALIGN_4 + + +.L6_29: + + SAVE4x3 + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L6_30: + testq $2, M + jz .L6_40 + + ALIGN_4 + +.L6_31: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_36 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_32: + + KERNEL2x3_1 + KERNEL2x3_2 + KERNEL2x3_3 + KERNEL2x3_4 + + KERNEL2x3_1 + KERNEL2x3_2 + KERNEL2x3_3 + KERNEL2x3_4 + + je .L6_36 + + KERNEL2x3_1 + KERNEL2x3_2 + KERNEL2x3_3 + KERNEL2x3_4 + + KERNEL2x3_1 + KERNEL2x3_2 + KERNEL2x3_3 + KERNEL2x3_4 + + je .L6_36 + + jmp .L6_32 + ALIGN_4 + +.L6_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_37: + + KERNEL2x3_SUB + + jl .L6_37 + ALIGN_4 + + +.L6_39: + + SAVE2x3 + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L6_40: + testq $1, M + jz .L7_10 // to next 3 lines of N + + ALIGN_4 + +.L6_41: + leaq BUFFER1, BO // first buffer to 
BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_46 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_42: + + KERNEL1x3_1 + KERNEL1x3_2 + KERNEL1x3_3 + KERNEL1x3_4 + + KERNEL1x3_1 + KERNEL1x3_2 + KERNEL1x3_3 + KERNEL1x3_4 + + je .L6_46 + + KERNEL1x3_1 + KERNEL1x3_2 + KERNEL1x3_3 + KERNEL1x3_4 + + KERNEL1x3_1 + KERNEL1x3_2 + KERNEL1x3_3 + KERNEL1x3_4 + + je .L6_46 + + jmp .L6_42 + ALIGN_4 + +.L6_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_47: + + KERNEL1x3_SUB + + jl .L6_47 + ALIGN_4 + + +.L6_49: + + SAVE1x3 + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + +/***************************************************************************************************************/ + +.L7_10: + movq C, CO1 + leaq (C, LDC, 2), C + leaq (C, LDC, 1), C // c += 3 * ldc + + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L7_20 + + ALIGN_4 + +.L7_11: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L7_16 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x3_1 + KERNEL16x3_2 + KERNEL16x3_3 + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL16x3_4 + + KERNEL16x3_1 + KERNEL16x3_2 + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL16x3_3 + KERNEL16x3_4 + + je .L7_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x3_1 + KERNEL16x3_2 + KERNEL16x3_3 + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL16x3_4 + + KERNEL16x3_1 + KERNEL16x3_2 + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL16x3_3 + KERNEL16x3_4 + + je .L7_16 + + jmp .L7_12 + ALIGN_4 + +.L7_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_17: + + KERNEL16x3_SUB + + jl .L7_17 + ALIGN_4 + + +.L7_19: + + SAVE16x3 + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L7_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L7_20: + // Test rest of M + + testq $15, M + jz .L7_60 // to next 3 lines of N + + testq $8, M + jz .L7_21pre + ALIGN_4 + +/**************************************************************************/ + +.L7_20_1: + leaq BUFFER2, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_20_2: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x3_1 + KERNEL8x3_2 + KERNEL8x3_3 + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL8x3_4 + + KERNEL8x3_1 + KERNEL8x3_2 + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL8x3_3 + KERNEL8x3_4 + + 
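// halfway through the unrolled pass over BUFFER2 (B columns 3 to 5); + // the je below leaves the loop as soon as the k counter is used up. +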
je .L7_20_6 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x3_1 + KERNEL8x3_2 + KERNEL8x3_3 + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL8x3_4 + + KERNEL8x3_1 + KERNEL8x3_2 + prefetcht0 B_PR1+128(BO,BI,8) + KERNEL8x3_3 + KERNEL8x3_4 + + je .L7_20_6 + + jmp .L7_20_2 + ALIGN_4 + +.L7_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_20_7: + + KERNEL8x3_SUB + + jl .L7_20_7 + ALIGN_4 + +.L7_20_9: + + SAVE8x3 + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L7_21pre: + + testq $4, M + jz .L7_30 + ALIGN_4 + +.L7_21: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_26 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x3_1 + KERNEL4x3_2 + KERNEL4x3_3 + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL4x3_4 + + KERNEL4x3_1 + KERNEL4x3_2 + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL4x3_3 + KERNEL4x3_4 + + je .L7_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x3_1 + KERNEL4x3_2 + KERNEL4x3_3 + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL4x3_4 + + KERNEL4x3_1 + KERNEL4x3_2 + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL4x3_3 + KERNEL4x3_4 + + je .L7_26 + + jmp .L7_22 + ALIGN_4 + +.L7_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_27: + + KERNEL4x3_SUB + + jl .L7_27 + ALIGN_4 + + +.L7_29: + + SAVE4x3 + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L7_30: + testq $2, M + jz .L7_40 + + ALIGN_4 + +.L7_31: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_36 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_32: + + KERNEL2x3_1 + KERNEL2x3_2 + KERNEL2x3_3 + KERNEL2x3_4 + + KERNEL2x3_1 + KERNEL2x3_2 + KERNEL2x3_3 + KERNEL2x3_4 + + je .L7_36 + + KERNEL2x3_1 + KERNEL2x3_2 + KERNEL2x3_3 + KERNEL2x3_4 + + KERNEL2x3_1 + KERNEL2x3_2 + KERNEL2x3_3 + KERNEL2x3_4 + + je .L7_36 + + jmp .L7_32 + ALIGN_4 + +.L7_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_37: + + KERNEL2x3_SUB + + jl .L7_37 + ALIGN_4 + + +.L7_39: + + SAVE2x3 + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L7_40: + testq $1, M + jz .L7_60 // to next 3 lines of N + + ALIGN_4 + +.L7_41: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_46 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, SIZE), AO + 
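// AO and (just below) BO are advanced past the whole block, then BI and + // rax are negated: the kernels address upward from negative offsets, and + // the final addq sets ZF exactly when the block has been consumed. +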
leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_42: + + KERNEL1x3_1 + KERNEL1x3_2 + KERNEL1x3_3 + KERNEL1x3_4 + + KERNEL1x3_1 + KERNEL1x3_2 + KERNEL1x3_3 + KERNEL1x3_4 + + je .L7_46 + + KERNEL1x3_1 + KERNEL1x3_2 + KERNEL1x3_3 + KERNEL1x3_4 + + KERNEL1x3_1 + KERNEL1x3_2 + KERNEL1x3_3 + KERNEL1x3_4 + + je .L7_46 + + jmp .L7_42 + ALIGN_4 + +.L7_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_47: + + KERNEL1x3_SUB + + jl .L7_47 + ALIGN_4 + + +.L7_49: + + SAVE1x3 + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + +.L7_60: + + decq J // j -- + jg .L6_01 + + +.L2_0: + cmpq $0, Nmod6 // N % 6 == 0 + je .L999 + +/************************************************************************************************ +* Loop for Nmod6 / 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + sarq $1, J // j = j / 2 + je .L1_0 + ALIGN_4 + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L2_01b + ALIGN_4 + +.L2_01a: + prefetcht0 512(BO1) + prefetchw 512(BO) + + vmovups (BO1), %xmm0 + vmovups 2*SIZE(BO1), %xmm1 + vmovups 4*SIZE(BO1), %xmm2 + vmovups 6*SIZE(BO1), %xmm3 + + vmovups %xmm0, (BO) + vmovups %xmm1, 2*SIZE(BO) + vmovups %xmm2, 4*SIZE(BO) + vmovups %xmm3, 6*SIZE(BO) + + addq $8*SIZE,BO1 + addq $8*SIZE,BO + decq %rax + jnz .L2_01a + + +.L2_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L2_02d + ALIGN_4 + +.L2_02c: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02c + +.L2_02d: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + +.L2_11: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + je .L2_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL16x2_SUB + + jl .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE16x2 + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 3 lines 
+
+	testq $8, M
+	jz .L2_21pre
+	ALIGN_4
+
+/**************************************************************************/
+
+.L2_20_1:
+	leaq BUFFER1, BO // first buffer to BO
+	addq $4 * SIZE, BO
+
+	vzeroall
+
+	movq K, %rax
+
+	andq $-8, %rax
+	je .L2_20_6
+	movq %rax, BI // Index for BO
+	leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
+
+	salq $3, %rax // rax = rax * 8 ; number of values
+	leaq (AO, %rax, SIZE), AO
+	leaq (BO, BI, SIZE), BO
+	negq BI
+	negq %rax
+	ALIGN_4
+
+.L2_20_2:
+
+	prefetcht0 B_PR1(BO,BI,8)
+	KERNEL8x2_1
+	KERNEL8x2_2
+	KERNEL8x2_3
+	KERNEL8x2_4
+
+	prefetcht0 B_PR1(BO,BI,8)
+	KERNEL8x2_1
+	KERNEL8x2_2
+	KERNEL8x2_3
+	KERNEL8x2_4
+
+	je .L2_20_6
+
+	prefetcht0 B_PR1(BO,BI,8)
+	KERNEL8x2_1
+	KERNEL8x2_2
+	KERNEL8x2_3
+	KERNEL8x2_4
+
+	prefetcht0 B_PR1(BO,BI,8)
+	KERNEL8x2_1
+	KERNEL8x2_2
+	KERNEL8x2_3
+	KERNEL8x2_4
+
+	je .L2_20_6
+
+	jmp .L2_20_2
+	ALIGN_4
+
+.L2_20_6:
+	movq K, %rax
+
+	andq $7, %rax # if (k & 1)
+	je .L2_20_9
+
+	movq %rax, BI // Index for BO
+	leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
+
+	salq $3, %rax // rax = rax * 8 ; number of values
+	leaq (AO, %rax, SIZE), AO
+	leaq (BO, BI, SIZE), BO
+	negq BI
+	negq %rax
+	ALIGN_4
+
+.L2_20_7:
+
+	KERNEL8x2_SUB
+
+	jl .L2_20_7
+	ALIGN_4
+
+
+.L2_20_9:
+
+	SAVE8x2
+
+	addq $8 * SIZE, CO1 # coffset += 8
+	ALIGN_4
+
+
+
+/**************************************************************************/
+
+.L2_21pre:
+
+	testq $4, M
+	jz .L2_30
+	ALIGN_4
+
+.L2_21:
+	leaq BUFFER1, BO // first buffer to BO
+	addq $4 * SIZE, BO
+
+	vzeroall
+
+	movq K, %rax
+
+	andq $-8, %rax
+	je .L2_26
+	movq %rax, BI // Index for BO
+	leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
+
+	salq $2, %rax // rax = rax * 4 ; number of values
+	leaq (AO, %rax, SIZE), AO
+	leaq (BO, BI, SIZE), BO
+	negq BI
+	negq %rax
+	ALIGN_4
+
+.L2_22:
+
+	prefetcht0 B_PR1(BO,BI,8)
+	KERNEL4x2_1
+	KERNEL4x2_2
+	KERNEL4x2_3
+	KERNEL4x2_4
+
+	prefetcht0 B_PR1(BO,BI,8)
+	KERNEL4x2_1
+	KERNEL4x2_2
+	KERNEL4x2_3
+	KERNEL4x2_4
+
+	je .L2_26
+
+	prefetcht0 B_PR1(BO,BI,8)
+	KERNEL4x2_1
+	KERNEL4x2_2
+	KERNEL4x2_3
+	KERNEL4x2_4
+
+	prefetcht0 B_PR1(BO,BI,8)
+	KERNEL4x2_1
+	KERNEL4x2_2
+	KERNEL4x2_3
+	KERNEL4x2_4
+
+	je .L2_26
+
+	jmp .L2_22
+	ALIGN_4
+
+.L2_26:
+	movq K, %rax
+
+	andq $7, %rax # if (k & 1)
+	je .L2_29
+
+	movq %rax, BI // Index for BO
+	leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
+
+	salq $2, %rax // rax = rax * 4 ; number of values
+	leaq (AO, %rax, SIZE), AO
+	leaq (BO, BI, SIZE), BO
+	negq BI
+	negq %rax
+	ALIGN_4
+
+.L2_27:
+
+	KERNEL4x2_SUB
+
+	jl .L2_27
+	ALIGN_4
+
+
+.L2_29:
+
+	SAVE4x2
+
+	addq $4 * SIZE, CO1 # coffset += 4
+	ALIGN_4
+
+
+.L2_30:
+	testq $2, M
+	jz .L2_40
+
+	ALIGN_4
+
+.L2_31:
+	leaq BUFFER1, BO // first buffer to BO
+	addq $4 * SIZE, BO
+
+	vzeroall
+
+	movq K, %rax
+
+	andq $-8, %rax
+	je .L2_36
+	movq %rax, BI // Index for BO
+	leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
+
+	salq $1, %rax // rax = rax *2 ; number of values
+	leaq (AO, %rax, SIZE), AO
+	leaq (BO, BI, SIZE), BO
+	negq BI
+	negq %rax
+	ALIGN_4
+
+.L2_32:
+
+	KERNEL2x2_1
+	KERNEL2x2_2
+	KERNEL2x2_3
+	KERNEL2x2_4
+
+	KERNEL2x2_1
+	KERNEL2x2_2
+	KERNEL2x2_3
+	KERNEL2x2_4
+
+	je .L2_36
+
+	KERNEL2x2_1
+	KERNEL2x2_2
+	KERNEL2x2_3
+	KERNEL2x2_4
+
+	KERNEL2x2_1
+	KERNEL2x2_2
+	KERNEL2x2_3
+	KERNEL2x2_4
+
+	je .L2_36
+
+	jmp .L2_32
+	ALIGN_4
+
+.L2_36:
+	movq K, %rax
+
+	andq $7, %rax # if (k & 1)
+	je .L2_39
+
+	movq %rax, BI // Index for BO
+	leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
+
+	salq $1, %rax
// rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + jl .L2_37 + ALIGN_4 + + +.L2_39: + + SAVE2x2 + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + je .L2_46 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + jl .L2_47 + ALIGN_4 + + +.L2_49: + + SAVE1x2 + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + +.L2_60: + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + je .L1_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB + + jl .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE16x1 + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + 
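+/**************************************************************************
+* The rest of M is handled by binary decomposition: the testq $8 / $4 /
+* $2 / $1 cascade that follows runs each power-of-two block at most
+* once, so every remainder 0..15 of M is covered without a scalar loop.
+* A minimal C sketch of the same dispatch (illustrative only; the do_*
+* names are placeholders for the 8x1/4x1/2x1/1x1 blocks below, not
+* real functions):
+*
+*   extern void do_8x1(void), do_4x1(void), do_2x1(void), do_1x1(void);
+*
+*   void tail_m(int m_rest)          // m_rest = M & 15
+*   {
+*       if (m_rest & 8) do_8x1();    // .L1_20_1 block: 8 rows
+*       if (m_rest & 4) do_4x1();    // .L1_21 block:   4 rows
+*       if (m_rest & 2) do_2x1();    // .L1_31 block:   2 rows
+*       if (m_rest & 1) do_1x1();    // .L1_41 block:   1 row
+*   }
+**************************************************************************/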
+/**************************************************************************/ + +.L1_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + je .L1_20_6 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB + + jl .L1_20_7 + ALIGN_4 + + +.L1_20_9: + + SAVE8x1 + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + je .L1_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB + + jl .L1_27 + ALIGN_4 + + +.L1_29: + + SAVE4x1 + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + je .L1_36 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + jl .L1_37 + ALIGN_4 + + +.L1_39: + + SAVE2x1 + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), 
AO
+	leaq (BO, BI, SIZE), BO
+	negq BI
+	negq %rax
+	ALIGN_4
+
+.L1_42:
+
+	KERNEL1x1_1
+	KERNEL1x1_2
+	KERNEL1x1_3
+	KERNEL1x1_4
+
+	KERNEL1x1_1
+	KERNEL1x1_2
+	KERNEL1x1_3
+	KERNEL1x1_4
+
+	je .L1_46
+
+	KERNEL1x1_1
+	KERNEL1x1_2
+	KERNEL1x1_3
+	KERNEL1x1_4
+
+	KERNEL1x1_1
+	KERNEL1x1_2
+	KERNEL1x1_3
+	KERNEL1x1_4
+
+	je .L1_46
+
+	jmp .L1_42
+	ALIGN_4
+
+.L1_46:
+	movq K, %rax
+
+	andq $7, %rax # if (k & 1)
+	je .L1_49
+
+	movq %rax, BI // Index for BO
+
+	leaq (AO, %rax, SIZE), AO
+	leaq (BO, BI, SIZE), BO
+	negq BI
+	negq %rax
+	ALIGN_4
+
+.L1_47:
+
+	KERNEL1x1_SUB
+
+	jl .L1_47
+	ALIGN_4
+
+
+.L1_49:
+
+	SAVE1x1
+
+	addq $1 * SIZE, CO1 # coffset += 1
+	ALIGN_4
+
+
+.L999:
+	movq SP, %rsp
+	movq (%rsp), %rbx
+	movq 8(%rsp), %rbp
+	movq 16(%rsp), %r12
+	movq 24(%rsp), %r13
+	movq 32(%rsp), %r14
+	movq 40(%rsp), %r15
+
+#ifdef WINDOWS_ABI
+	movq 48(%rsp), %rdi
+	movq 56(%rsp), %rsi
+	movups 64(%rsp), %xmm6
+	movups 80(%rsp), %xmm7
+	movups 96(%rsp), %xmm8
+	movups 112(%rsp), %xmm9
+	movups 128(%rsp), %xmm10
+	movups 144(%rsp), %xmm11
+	movups 160(%rsp), %xmm12
+	movups 176(%rsp), %xmm13
+	movups 192(%rsp), %xmm14
+	movups 208(%rsp), %xmm15
+#endif
+
+	addq $STACKSIZE, %rsp
+	ret
+
+	EPILOGUE
+
+
+#else
+/*************************************************************************************
+* TRMM Kernel
+*************************************************************************************/
+
+
+	PROLOGUE
+	PROFCODE
+
+	subq $STACKSIZE, %rsp
+	movq %rbx, (%rsp)
+	movq %rbp, 8(%rsp)
+	movq %r12, 16(%rsp)
+	movq %r13, 24(%rsp)
+	movq %r14, 32(%rsp)
+	movq %r15, 40(%rsp)
+
+	vzeroupper
+
+#ifdef WINDOWS_ABI
+	movq %rdi, 48(%rsp)
+	movq %rsi, 56(%rsp)
+	movups %xmm6, 64(%rsp)
+	movups %xmm7, 80(%rsp)
+	movups %xmm8, 96(%rsp)
+	movups %xmm9, 112(%rsp)
+	movups %xmm10, 128(%rsp)
+	movups %xmm11, 144(%rsp)
+	movups %xmm12, 160(%rsp)
+	movups %xmm13, 176(%rsp)
+	movups %xmm14, 192(%rsp)
+	movups %xmm15, 208(%rsp)
+
+	movq ARG1, OLD_M
+	movq ARG2, OLD_N
+	movq ARG3, OLD_K
+	movq OLD_A, A
+	movq OLD_B, B
+	movq OLD_C, C
+	movq OLD_LDC, LDC
+#ifdef TRMMKERNEL
+	movsd OLD_OFFSET, %xmm12
+#endif
+	vmovaps %xmm3, %xmm0
+
+#else
+	movq STACKSIZE + 8(%rsp), LDC
+#ifdef TRMMKERNEL
+	movsd STACKSIZE + 16(%rsp), %xmm12
+#endif
+
+#endif
+
+	movq %rsp, SP # save old stack
+	subq $128 + L_BUFFER_SIZE, %rsp
+	andq $-4096, %rsp # align stack
+
+	STACK_TOUCH
+
+	cmpq $0, OLD_M
+	je .L999
+
+	cmpq $0, OLD_N
+	je .L999
+
+	cmpq $0, OLD_K
+	je .L999
+
+	movq OLD_M, M
+	movq OLD_N, N
+	movq OLD_K, K
+
+	vmovsd %xmm0, ALPHA
+
+	salq $BASE_SHIFT, LDC
+
+	movq N, %rax
+	xorq %rdx, %rdx
+	movq $2, %rdi
+	divq %rdi // N / 2
+	movq %rax, Ndiv6 // N / 2
+	movq %rdx, Nmod6 // N % 2
+
+
+
+#ifdef TRMMKERNEL
+	vmovsd %xmm12, OFFSET
+	vmovsd %xmm12, KK
+#ifndef LEFT
+	negq KK
+#endif
+#endif
+
+	movq Ndiv6, J
+	cmpq $0, J
+	je .L1_0
+	ALIGN_4
+
+.L2_01:
+	// copy to sub buffer
+	movq B, BO1
+	leaq BUFFER1, BO // first buffer to BO
+	movq K, %rax
+	sarq $2, %rax // K / 4
+	jz .L2_01b
+	ALIGN_4
+
+.L2_01a:
+	prefetcht0 512(BO1)
+	prefetchw 512(BO)
+
+	vmovups (BO1), %xmm0
+	vmovups 2*SIZE(BO1), %xmm1
+	vmovups 4*SIZE(BO1), %xmm2
+	vmovups 6*SIZE(BO1), %xmm3
+
+	vmovups %xmm0, (BO)
+	vmovups %xmm1, 2*SIZE(BO)
+	vmovups %xmm2, 4*SIZE(BO)
+	vmovups %xmm3, 6*SIZE(BO)
+
+	addq $8*SIZE,BO1
+	addq $8*SIZE,BO
+	decq %rax
+	jnz .L2_01a
+
+
+.L2_01b:
+
+	movq K, %rax
+	andq $3, %rax // K % 4
+	jz .L2_02d
+	ALIGN_4
+
+.L2_02c:
+
+	vmovups (BO1), %xmm0
+	vmovups %xmm0, (BO)
+	addq $2*SIZE,BO1
+	addq $2*SIZE,BO
+	decq %rax
+	jnz .L2_02c
+
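+/**************************************************************************
+* The two loops above stage the current two columns of B (already in
+* packed, K-major order) into the aligned on-stack BUFFER1: .L2_01a
+* moves four K-steps (4 x 2 values) per iteration, .L2_02c finishes
+* K % 4 one step at a time. A minimal C sketch of the same copy
+* (illustrative only; 'k' is the K loop count and 'b' walks the source
+* panel exactly as BO1 does):
+*
+*   static void pack_b_2col(double *buffer, const double *b, long k)
+*   {
+*       for (long i = 0; i < k; i++) {
+*           buffer[2*i + 0] = b[2*i + 0];   // column 0 value, step i
+*           buffer[2*i + 1] = b[2*i + 1];   // column 1 value, step i
+*       }
+*   }
+**************************************************************************/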
+.L2_02d:
+
+	movq BO1, B // next offset of B
+
+.L2_10:
+	movq C, CO1
+	leaq (C, LDC, 2), C // c += 2 * ldc
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+	movq OFFSET, %rax
+	movq %rax, KK
+#endif
+
+	movq A, AO // aoffset = a
+	addq $32 * SIZE, AO
+
+	movq M, I
+	sarq $4, I // i = (m >> 4)
+	je .L2_20
+
+	ALIGN_4
+
+.L2_11:
+#if !defined(TRMMKERNEL) || \
+	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+	leaq BUFFER1, BO // first buffer to BO
+	addq $4 * SIZE, BO
+#else
+	movq KK, %rax
+	leaq BUFFER1, BO // first buffer to BO
+	addq $4 * SIZE, BO
+	movq %rax, BI // Index for BO
+	leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
+	leaq (BO, BI, SIZE), BO
+	salq $4, %rax // rax = rax * 16 ; number of values
+	leaq (AO, %rax, SIZE), AO
+#endif
+
+
+	vzeroall
+
+#ifndef TRMMKERNEL
+	movq K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+	movq K, %rax
+	subq KK, %rax
+	movq %rax, KKK
+#else
+	movq KK, %rax
+#ifdef LEFT
+	addq $16, %rax // number of values in AO
+#else
+	addq $2, %rax // number of values in BO
+#endif
+	movq %rax, KKK
+#endif
+
+	andq $-8, %rax // K = K - ( K % 8 )
+	je .L2_16
+	movq %rax, BI // Index for BO
+	leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
+
+	salq $4, %rax // rax = rax * 16 ; number of values
+	leaq (AO, %rax, SIZE), AO
+	leaq (BO, BI, SIZE), BO
+	negq BI
+	negq %rax
+	ALIGN_4
+
+.L2_12:
+
+	prefetcht0 B_PR1(BO,BI,8)
+	KERNEL16x2_1
+	KERNEL16x2_2
+	KERNEL16x2_3
+	KERNEL16x2_4
+
+	prefetcht0 B_PR1(BO,BI,8)
+	KERNEL16x2_1
+	KERNEL16x2_2
+	KERNEL16x2_3
+	KERNEL16x2_4
+
+	je .L2_16
+
+	prefetcht0 B_PR1(BO,BI,8)
+	KERNEL16x2_1
+	KERNEL16x2_2
+	KERNEL16x2_3
+	KERNEL16x2_4
+
+	prefetcht0 B_PR1(BO,BI,8)
+	KERNEL16x2_1
+	KERNEL16x2_2
+	KERNEL16x2_3
+	KERNEL16x2_4
+
+	je .L2_16
+
+	jmp .L2_12
+	ALIGN_4
+
+.L2_16:
+#ifndef TRMMKERNEL
+	movq K, %rax
+#else
+	movq KKK, %rax
+#endif
+
+	andq $7, %rax # if (k & 1)
+	je .L2_19
+
+	movq %rax, BI // Index for BO
+	leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
+
+	salq $4, %rax // rax = rax * 16 ; number of values
+	leaq (AO, %rax, SIZE), AO
+	leaq (BO, BI, SIZE), BO
+	negq BI
+	negq %rax
+	ALIGN_4
+
+.L2_17:
+
+	KERNEL16x2_SUB
+
+	jl .L2_17
+	ALIGN_4
+
+
+.L2_19:
+
+	SAVE16x2
+
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+	movq K, %rax
+	subq KKK, %rax
+	movq %rax, BI // Index for BO
+	leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
+	leaq (BO, BI, SIZE), BO
+	salq $4, %rax // rax = rax * 16 ; number of values
+	leaq (AO, %rax, SIZE), AO
+#endif
+
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+	addq $16, KK
+#endif
+
+	addq $16 * SIZE, CO1 # coffset += 16
+	decq I # i --
+	jg .L2_11
+	ALIGN_4
+
+/**************************************************************************
+* Rest of M
+***************************************************************************/
+.L2_20:
+	// Test rest of M
+
+	testq $15, M
+	jz .L2_60 // to next 2 lines of N
+
+	testq $8, M
+	jz .L2_21pre
+	ALIGN_4
+
+/**************************************************************************/
+
+.L2_20_1:
+#if !defined(TRMMKERNEL) || \
+	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+	leaq BUFFER1, BO // first buffer to BO
+	addq $4 * SIZE, BO
+#else
+	movq KK, %rax
+	leaq BUFFER1, BO // first buffer to BO
+	addq $4 * SIZE, BO
+	movq %rax, BI // Index for BO
+	leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
+	leaq (BO, BI, SIZE), BO
+	salq $3, %rax // rax = rax * 8 ; number of values
+	leaq (AO, %rax, SIZE), AO
+#endif
+
+
+	vzeroall
+
+#ifndef TRMMKERNEL
+	movq K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+	movq K, %rax
+	subq KK, %rax
+	movq %rax, KKK
+#else
+	movq KK, %rax
+#ifdef LEFT
+	addq $8, %rax // number of values in A
+#else
+	addq $2, %rax // number of values in BO
+#endif
+	movq %rax, KKK
+#endif
+
+
+	andq $-8, %rax
+	je .L2_20_6
+	movq %rax, BI // Index for BO
+	leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
+
+	salq $3, %rax // rax = rax * 8 ; number of values
+	leaq (AO, %rax, SIZE), AO
+	leaq (BO, BI, SIZE), BO
+	negq BI
+	negq %rax
+	ALIGN_4
+
+.L2_20_2:
+
+	prefetcht0 B_PR1(BO,BI,8)
+	KERNEL8x2_1
+	KERNEL8x2_2
+	KERNEL8x2_3
+	KERNEL8x2_4
+
+	prefetcht0 B_PR1(BO,BI,8)
+	KERNEL8x2_1
+	KERNEL8x2_2
+	KERNEL8x2_3
+	KERNEL8x2_4
+
+	je .L2_20_6
+
+	prefetcht0 B_PR1(BO,BI,8)
+	KERNEL8x2_1
+	KERNEL8x2_2
+	KERNEL8x2_3
+	KERNEL8x2_4
+
+	prefetcht0 B_PR1(BO,BI,8)
+	KERNEL8x2_1
+	KERNEL8x2_2
+	KERNEL8x2_3
+	KERNEL8x2_4
+
+	je .L2_20_6
+
+	jmp .L2_20_2
+	ALIGN_4
+
+.L2_20_6:
+#ifndef TRMMKERNEL
+	movq K, %rax
+#else
+	movq KKK, %rax
+#endif
+
+	andq $7, %rax # if (k & 1)
+	je .L2_20_9
+
+	movq %rax, BI // Index for BO
+	leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
+
+	salq $3, %rax // rax = rax * 8 ; number of values
+	leaq (AO, %rax, SIZE), AO
+	leaq (BO, BI, SIZE), BO
+	negq BI
+	negq %rax
+	ALIGN_4
+
+.L2_20_7:
+
+	KERNEL8x2_SUB
+
+	jl .L2_20_7
+	ALIGN_4
+
+
+.L2_20_9:
+
+	SAVE8x2
+
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+	movq K, %rax
+	subq KKK, %rax
+	movq %rax, BI // Index for BO
+	leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
+	leaq (BO, BI, SIZE), BO
+	salq $3, %rax // rax = rax * 8 ; number of values
+	leaq (AO, %rax, SIZE), AO
+#endif
+
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+	addq $8, KK
+#endif
+
+	addq $8 * SIZE, CO1 # coffset += 8
+	ALIGN_4
+
+
+
+/**************************************************************************/
+
+.L2_21pre:
+
+	testq $4, M
+	jz .L2_30
+	ALIGN_4
+
+.L2_21:
+#if !defined(TRMMKERNEL) || \
+	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+	leaq BUFFER1, BO // first buffer to BO
+	addq $4 * SIZE, BO
+#else
+	movq KK, %rax
+	leaq BUFFER1, BO // first buffer to BO
+	addq $4 * SIZE, BO
+	movq %rax, BI // Index for BO
+	leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
+	leaq (BO, BI, SIZE), BO
+	salq $2, %rax // rax = rax * 4 ; number of values
+	leaq (AO, %rax, SIZE), AO
+#endif
+
+
+	vzeroall
+
+#ifndef TRMMKERNEL
+	movq K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+	movq K, %rax
+	subq KK, %rax
+	movq %rax, KKK
+#else
+	movq KK, %rax
+#ifdef LEFT
+	addq $4, %rax // number of values in A
+#else
+	addq $2, %rax // number of values in BO
+#endif
+	movq %rax, KKK
+#endif
+
+
+	andq $-8, %rax
+	je .L2_26
+	movq %rax, BI // Index for BO
+	leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
+
+	salq $2, %rax // rax = rax * 4 ; number of values
+	leaq (AO, %rax, SIZE), AO
+	leaq (BO, BI, SIZE), BO
+	negq BI
+	negq %rax
+	ALIGN_4
+
+.L2_22:
+
+	prefetcht0 B_PR1(BO,BI,8)
+	KERNEL4x2_1
+	KERNEL4x2_2
+	KERNEL4x2_3
+	KERNEL4x2_4
+
+	prefetcht0 B_PR1(BO,BI,8)
+	KERNEL4x2_1
+	KERNEL4x2_2
+	KERNEL4x2_3
+	KERNEL4x2_4
+
+	je
.L2_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB + + jl .L2_27 + ALIGN_4 + + +.L2_29: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + je .L2_36 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + jl .L2_37 + ALIGN_4 + + +.L2_39: + + SAVE2x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 
lines of N + + ALIGN_4 + +.L2_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + je .L2_46 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + jl .L2_47 + ALIGN_4 + + +.L2_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of 
values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + je .L1_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB + + jl .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE16x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + je .L1_20_6 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: 
+#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB + + jl .L1_20_7 + ALIGN_4 + + +.L1_20_9: + + SAVE8x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + je .L1_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB + + jl .L1_27 + ALIGN_4 + + +.L1_29: + + SAVE4x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; 
number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + je .L1_36 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + jl .L1_37 + ALIGN_4 + + +.L1_39: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + je .L1_46 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + jl .L1_47 + ALIGN_4 + + +.L1_49: + + SAVE1x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && 
defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + + + + +#endif From d488b1b1aa868bb4efec1fcd2c2fca81786d86b1 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Fri, 16 Aug 2013 10:29:47 +0200 Subject: [PATCH 007/127] added zgemm_kernel_4x2_haswell.S --- kernel/x86_64/zgemm_kernel_4x2_haswell.S | 993 ++++++++++------------- 1 file changed, 410 insertions(+), 583 deletions(-) diff --git a/kernel/x86_64/zgemm_kernel_4x2_haswell.S b/kernel/x86_64/zgemm_kernel_4x2_haswell.S index 814d45179..d189b517b 100644 --- a/kernel/x86_64/zgemm_kernel_4x2_haswell.S +++ b/kernel/x86_64/zgemm_kernel_4x2_haswell.S @@ -36,6 +36,24 @@ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ +/********************************************************************* +* 2013/08/16 Saar +* Parameter: +* ZGEMM_DEFAULT_UNROLL_N 2 +* ZGEMM_DEFAULT_UNROLL_M 4 +* ZGEMM_DEFAULT_P 112 +* ZGEMM_DEFAULT_Q 224 +* +* BLASTEST: OK +* +* Performance: +* 1 thread: 1.80 times faster than sandybridge +* 4 threads: 1.74 times faster than sandybridge +* +* Compile for FMA3: OK +* +*********************************************************************/ + #define ASSEMBLER #include "common.h" @@ -119,29 +137,8 @@ #define STACK_TOUCH #endif -#if defined(BULLDOZER) - -.macro VFMADD231PD_ y0,y1,y2 - vfmaddpd \y0,\y1,\y2,\y0 -.endm - -.macro VFMADD231SD_ x0,x1,x2 - vfmaddsd \x0,\x1,\x2,\x0 -.endm - -#else - -.macro VFMADD231PD_ y0,y1,y2 - vfmadd231pd \y0,\y1,\y2 -.endm - -.macro VFMADD231SD_ x0,x1,x2 - vfmadd231sd \x0,\x1,\x2 -.endm - -#endif -#if defined(BULLDOZER) +#if defined(BULLDOZER) || defined(PILEDRIVER) #if defined(NN) || defined(NT) || defined(TN) || defined(TT) @@ -153,9 +150,6 @@ vfmaddpd \y0,\y1,\y2,\y0 .endm -#define VFMADD_R vfmaddpd -#define VFMADD_I vfmaddpd - #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) .macro VFMADDPD_R y0,y1,y2 @@ -166,9 +160,6 @@ vfmaddpd \y0,\y1,\y2,\y0 .endm -#define VFMADD_R vfnmaddpd -#define VFMADD_I vfmaddpd - #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) .macro VFMADDPD_R y0,y1,y2 @@ -179,9 +170,6 @@ vfnmaddpd \y0,\y1,\y2,\y0 .endm -#define VFMADD_R vfmaddpd -#define VFMADD_I vfnmaddpd - #else .macro VFMADDPD_R y0,y1,y2 @@ -192,9 +180,6 @@ vfnmaddpd \y0,\y1,\y2,\y0 .endm -#define VFMADD_R vfnmaddpd -#define VFMADD_I vfnmaddpd - #endif #else @@ -209,9 +194,6 @@ vfmadd231pd \y0,\y1,\y2 .endm -#define VFMADD_R vfmadd231pd -#define VFMADD_I vfmadd231pd - #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) .macro VFMADDPD_R y0,y1,y2 @@ -222,9 +204,6 @@ vfmadd231pd \y0,\y1,\y2 .endm -#define VFMADD_R vfnmadd231pd -#define VFMADD_I vfmadd231pd - #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) .macro VFMADDPD_R y0,y1,y2 @@ -235,9 +214,6 @@ vfnmadd231pd \y0,\y1,\y2 .endm -#define VFMADD_R vfmadd231pd -#define VFMADD_I vfnmadd231pd - #else .macro VFMADDPD_R y0,y1,y2 @@ -248,9 +224,6 @@ vfnmadd231pd \y0,\y1,\y2 .endm 
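+/**************************************************************************
+* The four VFMADDPD_R/VFMADDPD_I pairs above differ only in the FMA
+* sign: the conjugation variants are realized by picking vfmadd231pd
+* or vfnmadd231pd per stream, combined with the two vaddsubpd
+* orderings in the SAVE* macros further down. A minimal C sketch of
+* the plain NN case (illustrative only; acc_r/acc_i stand for
+* accumulator register pairs such as xmm8/xmm9):
+*
+*   #include <complex.h>
+*
+*   static double complex nn_step(double a_r, double a_i,
+*                                 double b_r, double b_i)
+*   {
+*       double acc_r[2] = { a_r * b_r, a_i * b_r };  // VFMADDPD_R stream
+*       double acc_i[2] = { a_r * b_i, a_i * b_i };  // VFMADDPD_I stream
+*       // SAVE* swaps the 64-bit lanes of acc_i (vshufpd $0x01), then
+*       // vaddsubpd subtracts in the low lane and adds in the high lane:
+*       return CMPLX(acc_r[0] - acc_i[1], acc_r[1] + acc_i[0]);
+*   }
+**************************************************************************/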
-#define VFMADD_R vfnmadd231pd -#define VFMADD_I vfnmadd231pd - #endif #endif @@ -363,158 +336,179 @@ .endm /***************************************************************************************************/ -#define KERNEL2x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - -#define KERNEL2x2_2(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vmovddup -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - -#define KERNEL2x2_3(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - -#define KERNEL2x2_4(xx) \ - vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - addq $16, BI ;\ - addq $16, %rax ;\ - - -#define KERNEL2x2_SUB(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - addq $4, BI ;\ - addq $4, %rax ;\ + +.macro KERNEL2x2_SUB + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 + vmovups 
-6 * SIZE(AO, %rax, SIZE), %xmm1 + VFMADDPD_R %xmm8,%xmm4,%xmm0 + VFMADDPD_R %xmm12,%xmm4,%xmm1 + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPD_I %xmm9,%xmm5,%xmm0 + VFMADDPD_I %xmm13,%xmm5,%xmm1 + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 + VFMADDPD_R %xmm10,%xmm6,%xmm0 + VFMADDPD_R %xmm14,%xmm6,%xmm1 + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 + VFMADDPD_I %xmm11,%xmm7,%xmm0 + VFMADDPD_I %xmm15,%xmm7,%xmm1 + addq $4, BI + addq $4, %rax +.endm + +.macro SAVE2x2 + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + vshufpd $0x01, %xmm15, %xmm15, %xmm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm13,%xmm12, %xmm12 + vaddsubpd %xmm15,%xmm14, %xmm14 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $0x01, %xmm10, %xmm10, %xmm11 + vshufpd $0x01, %xmm12, %xmm12, %xmm13 + vshufpd $0x01, %xmm14, %xmm14, %xmm15 + +#else + vaddsubpd %xmm8, %xmm9 ,%xmm9 + vaddsubpd %xmm10, %xmm11,%xmm11 + vaddsubpd %xmm12, %xmm13,%xmm13 + vaddsubpd %xmm14, %xmm15,%xmm15 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm11, %xmm10 + vmovapd %xmm13, %xmm12 + vmovapd %xmm15, %xmm14 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + vshufpd $0x01, %xmm15, %xmm15, %xmm15 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm10, %xmm0, %xmm10 + vmulpd %xmm12, %xmm0, %xmm12 + vmulpd %xmm14, %xmm0, %xmm14 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm11, %xmm1, %xmm11 + vmulpd %xmm13, %xmm1, %xmm13 + vmulpd %xmm15, %xmm1, %xmm15 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm13,%xmm12, %xmm12 + vaddsubpd %xmm15,%xmm14, %xmm14 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 + + vaddpd (CO1, LDC), %xmm10, %xmm10 + vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 2 * SIZE(CO1) + + vmovups %xmm10 , (CO1, LDC) + vmovups %xmm14 , 2 * SIZE(CO1, LDC) + +.endm /************************************************************************************************/ /************************************************************************************************/ -#define KERNEL1x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL1x2_2(xx) \ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vmovddup -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL1x2_3(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 0 * SIZE(BO, BI, 
SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL1x2_4(xx) \ - vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - addq $16, BI ;\ - addq $8 , %rax ;\ - - -#define KERNEL1x2_SUB(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - addq $4, BI ;\ - addq $2, %rax ;\ + +.macro KERNEL1x2_SUB + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPD_R %xmm8,%xmm4,%xmm0 + VFMADDPD_I %xmm9,%xmm5,%xmm0 + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 + VFMADDPD_R %xmm10,%xmm6,%xmm0 + VFMADDPD_I %xmm11,%xmm7,%xmm0 + addq $4, BI + addq $2, %rax +.endm + +.macro SAVE1x2 + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $0x01, %xmm10, %xmm10, %xmm11 + +#else + vaddsubpd %xmm8, %xmm9, %xmm9 + vaddsubpd %xmm10,%xmm11, %xmm11 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm11, %xmm10 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm11, %xmm1, %xmm11 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd (CO1, LDC), %xmm10, %xmm10 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm10 , (CO1, LDC) + +.endm + /************************************************************************************************/ @@ -592,106 +586,128 @@ /************************************************************************************************/ -#define KERNEL2x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - -#define KERNEL2x1_2(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - 
VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - -#define KERNEL2x1_3(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - -#define KERNEL2x1_4(xx) \ - vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - addq $8, BI ;\ - addq $16, %rax ;\ - - -#define KERNEL2x1_SUB(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - addq $2, BI ;\ - addq $4, %rax ;\ +.macro KERNEL2x1_SUB + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 + VFMADDPD_R %xmm8,%xmm4,%xmm0 + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 + VFMADDPD_R %xmm12,%xmm4,%xmm1 + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPD_I %xmm9,%xmm5,%xmm0 + VFMADDPD_I %xmm13,%xmm5,%xmm1 + addq $2, BI + addq $4, %rax +.endm + +.macro SAVE2x1 + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm13,%xmm12 , %xmm12 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $0x01, %xmm12, %xmm12, %xmm13 + +#else + vaddsubpd %xmm8, %xmm9 , %xmm9 + vaddsubpd %xmm12,%xmm13, %xmm13 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm13, %xmm12 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm12, %xmm0, %xmm12 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm13, %xmm1, %xmm13 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm13, %xmm12, %xmm12 + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 2 * SIZE(CO1) + +.endm /************************************************************************************************/ -#define KERNEL1x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL1x1_2(xx) \ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL1x1_3(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - 
vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL1x1_4(xx) \ - vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - addq $8, BI ;\ - addq $8, %rax ;\ - - -#define KERNEL1x1_SUB(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - addq $2, BI ;\ - addq $2, %rax ;\ +.macro KERNEL1x1_SUB + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 + VFMADDPD_R %xmm8,%xmm4,%xmm0 + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPD_I %xmm9,%xmm5,%xmm0 + addq $2, BI + addq $2, %rax +.endm + +.macro SAVE1x1 + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8, %xmm8 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + +#else + vaddsubpd %xmm8, %xmm9, %xmm9 + + vmovapd %xmm9, %xmm8 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + + vaddsubpd %xmm9 ,%xmm8, %xmm8 + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + +#endif + + vmovups %xmm8 , (CO1) + +.endm /************************************************************************************************/ @@ -1056,35 +1072,43 @@ .L2_2_12: + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) + KERNEL2x2_SUB + KERNEL2x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) + KERNEL2x2_SUB + KERNEL2x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB je .L2_2_16 + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) + KERNEL2x2_SUB + KERNEL2x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) + KERNEL2x2_SUB + KERNEL2x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB je .L2_2_16 @@ -1113,88 +1137,15 @@ .L2_2_17: - KERNEL2x2_SUB(xxx) + KERNEL2x2_SUB + jl .L2_2_17 ALIGN_4 .L2_2_19: - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm11, %xmm11, %xmm11 - vshufpd $0x01, %xmm13, %xmm13, %xmm13 - vshufpd $0x01, %xmm15, %xmm15, %xmm15 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - 
vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - vaddsubpd %xmm13,%xmm12, %xmm12 - vaddsubpd %xmm15,%xmm14, %xmm14 - - vshufpd $0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $0x01, %xmm10, %xmm10, %xmm11 - vshufpd $0x01, %xmm12, %xmm12, %xmm13 - vshufpd $0x01, %xmm14, %xmm14, %xmm15 - -#else - vaddsubpd %xmm8, %xmm9 ,%xmm9 - vaddsubpd %xmm10, %xmm11,%xmm11 - vaddsubpd %xmm12, %xmm13,%xmm13 - vaddsubpd %xmm14, %xmm15,%xmm15 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm11, %xmm10 - vmovapd %xmm13, %xmm12 - vmovapd %xmm15, %xmm14 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm11, %xmm11, %xmm11 - vshufpd $0x01, %xmm13, %xmm13, %xmm13 - vshufpd $0x01, %xmm15, %xmm15, %xmm15 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm10, %xmm0, %xmm10 - vmulpd %xmm12, %xmm0, %xmm12 - vmulpd %xmm14, %xmm0, %xmm14 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm11, %xmm1, %xmm11 - vmulpd %xmm13, %xmm1, %xmm13 - vmulpd %xmm15, %xmm1, %xmm15 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - vaddsubpd %xmm13,%xmm12, %xmm12 - vaddsubpd %xmm15,%xmm14, %xmm14 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 - - vaddpd (CO1, LDC), %xmm10, %xmm10 - vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 2 * SIZE(CO1) - - vmovups %xmm10 , (CO1, LDC) - vmovups %xmm14 , 2 * SIZE(CO1, LDC) + SAVE2x2 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) @@ -1276,35 +1227,39 @@ .L2_2_42: + prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) + KERNEL1x2_SUB + KERNEL1x2_SUB + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) + KERNEL1x2_SUB + KERNEL1x2_SUB je .L2_2_46 + prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) + KERNEL1x2_SUB + KERNEL1x2_SUB + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) + KERNEL1x2_SUB + KERNEL1x2_SUB je .L2_2_46 @@ -1333,64 +1288,15 @@ .L2_2_47: - KERNEL1x2_SUB(xxx) + KERNEL1x2_SUB + jl .L2_2_47 ALIGN_4 .L2_2_49: - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm11, %xmm11, %xmm11 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - - vshufpd $0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $0x01, %xmm10, %xmm10, %xmm11 - -#else - vaddsubpd %xmm8, %xmm9, %xmm9 - vaddsubpd %xmm10,%xmm11, %xmm11 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm11, %xmm10 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm11, %xmm11, %xmm11 - -#endif - - 
// multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm10, %xmm0, %xmm10 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm11, %xmm1, %xmm11 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - vaddpd (CO1, LDC), %xmm10, %xmm10 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm10 , (CO1, LDC) + SAVE1x2 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) @@ -1670,31 +1576,36 @@ .L1_2_12: + prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) + KERNEL2x1_SUB + KERNEL2x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) + KERNEL2x1_SUB + KERNEL2x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB je .L1_2_16 prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) + KERNEL2x1_SUB + KERNEL2x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) + KERNEL2x1_SUB + KERNEL2x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB je .L1_2_16 @@ -1723,65 +1634,15 @@ .L1_2_17: - KERNEL2x1_SUB(xxx) + KERNEL2x1_SUB + jl .L1_2_17 ALIGN_4 .L1_2_19: - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm13, %xmm13, %xmm13 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm13,%xmm12 , %xmm12 - - vshufpd $0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $0x01, %xmm12, %xmm12, %xmm13 - -#else - vaddsubpd %xmm8, %xmm9 , %xmm9 - vaddsubpd %xmm12,%xmm13, %xmm13 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm13, %xmm12 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm13, %xmm13, %xmm13 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm12, %xmm0, %xmm12 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm13, %xmm1, %xmm13 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm13, %xmm12, %xmm12 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 2 * SIZE(CO1) - + SAVE2x1 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) @@ -1864,31 +1725,35 @@ .L1_2_42: + prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB je .L1_2_46 + prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - 
KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB je .L1_2_46 @@ -1917,53 +1782,15 @@ .L1_2_47: - KERNEL1x1_SUB(xxx) + KERNEL1x1_SUB + jl .L1_2_47 ALIGN_4 .L1_2_49: - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8, %xmm8 - - vshufpd $0x01, %xmm8 , %xmm8, %xmm9 - -#else - vaddsubpd %xmm8, %xmm9, %xmm9 - - vmovapd %xmm9, %xmm8 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - - vaddsubpd %xmm9 ,%xmm8, %xmm8 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - -#endif - - vmovups %xmm8 , (CO1) + SAVE1x1 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) From 44ef70420c76b88d15d1478bd665b3188031fafc Mon Sep 17 00:00:00 2001 From: wernsaar Date: Fri, 16 Aug 2013 18:54:56 +0200 Subject: [PATCH 008/127] added cgemm_kernel_8x2_haswell.S --- kernel/x86_64/cgemm_kernel_8x2_haswell.S | 2318 ++++++++++++++++++++++ 1 file changed, 2318 insertions(+) create mode 100644 kernel/x86_64/cgemm_kernel_8x2_haswell.S diff --git a/kernel/x86_64/cgemm_kernel_8x2_haswell.S b/kernel/x86_64/cgemm_kernel_8x2_haswell.S new file mode 100644 index 000000000..0561c0f72 --- /dev/null +++ b/kernel/x86_64/cgemm_kernel_8x2_haswell.S @@ -0,0 +1,2318 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +/********************************************************************* +* 2013/08/16 Saar +* Parameter: +* CGEMM_DEFAULT_UNROLL_N 2 +* CGEMM_DEFAULT_UNROLL_M 8 +* CGEMM_DEFAULT_P 224 +* CGEMM_DEFAULT_Q 224 +* +* BLASTEST: OK +* +* Performance: +* 1 thread: 2.04 times faster than sandybridge +* 4 threads: 1.96 times faster than sandybridge +* +* Compile for FMA3: OK +* +*********************************************************************/ + + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 320 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 512*8*4 +#define LB2_OFFSET 512*8*2 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA_R 48(%rsp) +#define ALPHA_I 56(%rsp) +#define OFFSET 64(%rsp) +#define KK 72(%rsp) +#define KKK 80(%rsp) +#define BUFFER1 128(%rsp) +#define BUFFER2 LB2_OFFSET+128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + + +#if defined(BULLDOZER) || defined(PILEDRIVER) + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + +.macro VFMADDPS_R y0,y1,y2 + vfmaddps \y0,\y1,\y2,\y0 +.endm + +.macro VFMADDPS_I y0,y1,y2 + vfmaddps \y0,\y1,\y2,\y0 +.endm + +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + +.macro VFMADDPS_R y0,y1,y2 + vfnmaddps \y0,\y1,\y2,\y0 +.endm + +.macro VFMADDPS_I y0,y1,y2 + vfmaddps \y0,\y1,\y2,\y0 +.endm + +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + +.macro VFMADDPS_R y0,y1,y2 + vfmaddps \y0,\y1,\y2,\y0 +.endm + +.macro VFMADDPS_I y0,y1,y2 + vfnmaddps \y0,\y1,\y2,\y0 +.endm + +#else + +.macro VFMADDPS_R y0,y1,y2 + vfnmaddps \y0,\y1,\y2,\y0 +.endm + +.macro VFMADDPS_I y0,y1,y2 + vfnmaddps \y0,\y1,\y2,\y0 +.endm + +#endif + +#else + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + +.macro VFMADDPS_R y0,y1,y2 + vfmadd231ps \y0,\y1,\y2 +.endm + +.macro VFMADDPS_I y0,y1,y2 + vfmadd231ps \y0,\y1,\y2 +.endm + +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + +.macro VFMADDPS_R y0,y1,y2 + vfnmadd231ps \y0,\y1,\y2 +.endm + +.macro VFMADDPS_I y0,y1,y2 + vfmadd231ps \y0,\y1,\y2 +.endm + +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + +.macro VFMADDPS_R y0,y1,y2 + vfmadd231ps \y0,\y1,\y2 +.endm + +.macro VFMADDPS_I y0,y1,y2 + vfnmadd231ps \y0,\y1,\y2 +.endm + +#else + +.macro VFMADDPS_R y0,y1,y2 + vfnmadd231ps \y0,\y1,\y2 +.endm + +.macro 
VFMADDPS_I y0,y1,y2 + vfnmadd231ps \y0,\y1,\y2 +.endm + +#endif + +#endif + + +#define A_PR1 384 +#define B_PR1 192 + +/***************************************************************************************************************************/ + +.macro KERNEL8x2_SUB + + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vbroadcastss -8 * SIZE(BO, BI, SIZE), %ymm4 + VFMADDPS_R %ymm8,%ymm4,%ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 + VFMADDPS_R %ymm12,%ymm4,%ymm1 + vbroadcastss -7 * SIZE(BO, BI, SIZE), %ymm5 + VFMADDPS_I %ymm9,%ymm5,%ymm0 + VFMADDPS_I %ymm13,%ymm5,%ymm1 + vbroadcastss -6 * SIZE(BO, BI, SIZE), %ymm6 + VFMADDPS_R %ymm10,%ymm6,%ymm0 + VFMADDPS_R %ymm14,%ymm6,%ymm1 + vbroadcastss -5 * SIZE(BO, BI, SIZE), %ymm7 + VFMADDPS_I %ymm11,%ymm7,%ymm0 + VFMADDPS_I %ymm15,%ymm7,%ymm1 + addq $4 , BI + addq $16, %rax +.endm + +.macro SAVE8x2 + + vbroadcastss ALPHA_R, %ymm0 + vbroadcastss ALPHA_I, %ymm1 + + // swap high and low 64 bytes + vshufps $0xb1, %ymm9 , %ymm9, %ymm9 + vshufps $0xb1, %ymm11, %ymm11, %ymm11 + vshufps $0xb1, %ymm13, %ymm13, %ymm13 + vshufps $0xb1, %ymm15, %ymm15, %ymm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %ymm9, %ymm8 , %ymm8 + vaddsubps %ymm11,%ymm10, %ymm10 + vaddsubps %ymm13,%ymm12, %ymm12 + vaddsubps %ymm15,%ymm14, %ymm14 + + vshufps $0xb1, %ymm8 , %ymm8, %ymm9 + vshufps $0xb1, %ymm10, %ymm10, %ymm11 + vshufps $0xb1, %ymm12, %ymm12, %ymm13 + vshufps $0xb1, %ymm14, %ymm14, %ymm15 + +#else + vaddsubps %ymm8, %ymm9 ,%ymm9 + vaddsubps %ymm10, %ymm11,%ymm11 + vaddsubps %ymm12, %ymm13,%ymm13 + vaddsubps %ymm14, %ymm15,%ymm15 + + vmovaps %ymm9, %ymm8 + vmovaps %ymm11, %ymm10 + vmovaps %ymm13, %ymm12 + vmovaps %ymm15, %ymm14 + + // swap high and low 64 bytes + vshufps $0xb1, %ymm9 , %ymm9, %ymm9 + vshufps $0xb1, %ymm11, %ymm11, %ymm11 + vshufps $0xb1, %ymm13, %ymm13, %ymm13 + vshufps $0xb1, %ymm15, %ymm15, %ymm15 + +#endif + + // multiply with ALPHA_R + vmulps %ymm8 , %ymm0, %ymm8 + vmulps %ymm10, %ymm0, %ymm10 + vmulps %ymm12, %ymm0, %ymm12 + vmulps %ymm14, %ymm0, %ymm14 + + // multiply with ALPHA_I + vmulps %ymm9 , %ymm1, %ymm9 + vmulps %ymm11, %ymm1, %ymm11 + vmulps %ymm13, %ymm1, %ymm13 + vmulps %ymm15, %ymm1, %ymm15 + + vaddsubps %ymm9, %ymm8 , %ymm8 + vaddsubps %ymm11,%ymm10, %ymm10 + vaddsubps %ymm13,%ymm12, %ymm12 + vaddsubps %ymm15,%ymm14, %ymm14 + + + +#ifndef TRMMKERNEL + + vaddps (CO1), %ymm8 , %ymm8 + vaddps 8 * SIZE(CO1), %ymm12, %ymm12 + + vaddps (CO1, LDC), %ymm10, %ymm10 + vaddps 8 * SIZE(CO1, LDC), %ymm14, %ymm14 + +#endif + + vmovups %ymm8 , (CO1) + vmovups %ymm12 , 8 * SIZE(CO1) + + vmovups %ymm10 , (CO1, LDC) + vmovups %ymm14 , 8 * SIZE(CO1, LDC) + +.endm + +/***************************************************************************************************************************/ + +.macro KERNEL4x2_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 + VFMADDPS_R %xmm8,%xmm4,%xmm0 + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 + VFMADDPS_R %xmm12,%xmm4,%xmm1 + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPS_I %xmm9,%xmm5,%xmm0 + VFMADDPS_I %xmm13,%xmm5,%xmm1 + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 + VFMADDPS_R %xmm10,%xmm6,%xmm0 + VFMADDPS_R %xmm14,%xmm6,%xmm1 + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 + VFMADDPS_I %xmm11,%xmm7,%xmm0 + VFMADDPS_I %xmm15,%xmm7,%xmm1 + addq $4, BI + addq $8, %rax +.endm + +.macro SAVE4x2 + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap 
high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + vshufps $0xb1, %xmm13, %xmm13, %xmm13 + vshufps $0xb1, %xmm15, %xmm15, %xmm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + vaddsubps %xmm13,%xmm12, %xmm12 + vaddsubps %xmm15,%xmm14, %xmm14 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $0xb1, %xmm10, %xmm10, %xmm11 + vshufps $0xb1, %xmm12, %xmm12, %xmm13 + vshufps $0xb1, %xmm14, %xmm14, %xmm15 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm10, %xmm11,%xmm11 + vaddsubps %xmm12, %xmm13,%xmm13 + vaddsubps %xmm14, %xmm15,%xmm15 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm11, %xmm10 + vmovaps %xmm13, %xmm12 + vmovaps %xmm15, %xmm14 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + vshufps $0xb1, %xmm13, %xmm13, %xmm13 + vshufps $0xb1, %xmm15, %xmm15, %xmm15 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm10, %xmm0, %xmm10 + vmulps %xmm12, %xmm0, %xmm12 + vmulps %xmm14, %xmm0, %xmm14 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm11, %xmm1, %xmm11 + vmulps %xmm13, %xmm1, %xmm13 + vmulps %xmm15, %xmm1, %xmm15 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + vaddsubps %xmm13,%xmm12, %xmm12 + vaddsubps %xmm15,%xmm14, %xmm14 + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + vaddps 4 * SIZE(CO1), %xmm12, %xmm12 + + vaddps (CO1, LDC), %xmm10, %xmm10 + vaddps 4 * SIZE(CO1, LDC), %xmm14, %xmm14 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 4 * SIZE(CO1) + + vmovups %xmm10 , (CO1, LDC) + vmovups %xmm14 , 4 * SIZE(CO1, LDC) + +.endm + +/************************************************************************************************/ + +.macro KERNEL2x2_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 + VFMADDPS_R %xmm8,%xmm4,%xmm0 + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPS_I %xmm9,%xmm5,%xmm0 + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 + VFMADDPS_R %xmm10,%xmm6,%xmm0 + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 + VFMADDPS_I %xmm11,%xmm7,%xmm0 + addq $4, BI + addq $4, %rax +.endm + +.macro SAVE2x2 + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 4 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $0xb1, %xmm10, %xmm10, %xmm11 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm10, %xmm11,%xmm11 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm11, %xmm10 + + // swap high and low 4 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm11, %xmm1, %xmm11 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + + vaddps (CO1, LDC), %xmm10, %xmm10 + +#endif + + vmovups %xmm8 , (CO1) + + vmovups %xmm10 , (CO1, LDC) + +.endm + 
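
Each SAVE macro above performs the complex update C += alpha*(A*B) without issuing an explicit complex multiply. The `vshufps $0xb1` swaps the 32-bit real and imaginary halves of every packed pair, and `vaddsubps` then subtracts in the even (real) lanes and adds in the odd (imaginary) lanes, which is exactly the sign pattern of a complex product. For reference, a minimal C sketch of that arithmetic with SSE3 intrinsics, for one xmm register holding two single-precision complex values; it covers only the plain (NN-style) case, and the helper name scale_by_alpha is illustrative, not part of the kernel:

    #include <pmmintrin.h>          /* SSE3: _mm_addsub_ps */

    /* acc_r = partial sums of a * b_real, acc_i = partial sums of
     * a * b_imag, both with (re,im) interleaved, as produced by the
     * VFMADDPS_R / VFMADDPS_I accumulation macros above.           */
    static __m128 scale_by_alpha(__m128 acc_r, __m128 acc_i,
                                 float alpha_r, float alpha_i)
    {
        /* 0xb1 selects elements 1,0,3,2: swap re/im inside each pair */
        __m128 swapped = _mm_shuffle_ps(acc_i, acc_i, 0xb1);

        /* addsub: even lanes subtract, odd lanes add, giving
         * (ar*br - ai*bi, ai*br + ar*bi, ...) == the product a*b    */
        __m128 ab      = _mm_addsub_ps(acc_r, swapped);

        /* the same shuffle/addsub trick multiplies by the complex
         * scalar alpha                                              */
        __m128 ab_swap = _mm_shuffle_ps(ab, ab, 0xb1);
        __m128 t_r     = _mm_mul_ps(ab,      _mm_set1_ps(alpha_r));
        __m128 t_i     = _mm_mul_ps(ab_swap, _mm_set1_ps(alpha_i));

        return _mm_addsub_ps(t_r, t_i);   /* == alpha * (a*b) */
    }

The conjugation variants reuse the same pattern: sign selection happens during accumulation through the VFMADDPS_R/VFMADDPS_I pairs (vfmadd231ps vs. vfnmadd231ps per case), and the cases not listed in the `#if` of each SAVE macro take its `#else` arm, which applies vaddsubps with the operand order reversed.
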
+/************************************************************************************************/ + +.macro KERNEL1x2_SUB + vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 + VFMADDPS_R %xmm8,%xmm4,%xmm0 + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPS_I %xmm9,%xmm5,%xmm0 + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 + VFMADDPS_R %xmm10,%xmm6,%xmm0 + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 + VFMADDPS_I %xmm11,%xmm7,%xmm0 + addq $4, BI + addq $2, %rax +.endm + +.macro SAVE1x2 + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $0xb1, %xmm10, %xmm10, %xmm11 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm10, %xmm11,%xmm11 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm11, %xmm10 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm11, %xmm1, %xmm11 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + +#ifndef TRMMKERNEL + + vmovsd (CO1), %xmm14 + vaddps %xmm14, %xmm8 , %xmm8 + + vmovsd (CO1, LDC), %xmm15 + vaddps %xmm15, %xmm10, %xmm10 + +#endif + + vmovsd %xmm8 , (CO1) + vmovsd %xmm10 , (CO1, LDC) + +.endm + +/************************************************************************************************/ + +.macro KERNEL8x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm4 + VFMADDPS_R %ymm8,%ymm4,%ymm0 + VFMADDPS_R %ymm12,%ymm4,%ymm1 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm5 + VFMADDPS_I %ymm9,%ymm5,%ymm0 + VFMADDPS_I %ymm13,%ymm5,%ymm1 + addq $2 , BI + addq $16, %rax +.endm + +.macro SAVE8x1 + + vbroadcastss ALPHA_R, %ymm0 + vbroadcastss ALPHA_I, %ymm1 + + // swap high and low 64 bytes + vshufps $0xb1, %ymm9 , %ymm9, %ymm9 + vshufps $0xb1, %ymm13, %ymm13, %ymm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %ymm9, %ymm8 , %ymm8 + vaddsubps %ymm13,%ymm12, %ymm12 + + vshufps $0xb1, %ymm8 , %ymm8, %ymm9 + vshufps $0xb1, %ymm12, %ymm12, %ymm13 + +#else + vaddsubps %ymm8, %ymm9 ,%ymm9 + vaddsubps %ymm12, %ymm13,%ymm13 + + vmovaps %ymm9, %ymm8 + vmovaps %ymm13, %ymm12 + + // swap high and low 64 bytes + vshufps $0xb1, %ymm9 , %ymm9, %ymm9 + vshufps $0xb1, %ymm13, %ymm13, %ymm13 + +#endif + + // multiply with ALPHA_R + vmulps %ymm8 , %ymm0, %ymm8 + vmulps %ymm12, %ymm0, %ymm12 + + // multiply with ALPHA_I + vmulps %ymm9 , %ymm1, %ymm9 + vmulps %ymm13, %ymm1, %ymm13 + + vaddsubps %ymm9, %ymm8 , %ymm8 + vaddsubps %ymm13,%ymm12, %ymm12 + + + +#ifndef TRMMKERNEL + + vaddps (CO1), %ymm8 , %ymm8 + vaddps 8 * SIZE(CO1), %ymm12, %ymm12 + +#endif + + vmovups %ymm8 , (CO1) + vmovups %ymm12 , 8 * SIZE(CO1) + +.endm + + +/************************************************************************************************/ + +.macro KERNEL4x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 + VFMADDPS_R %xmm8,%xmm4,%xmm0 + 
vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 + VFMADDPS_R %xmm12,%xmm4,%xmm1 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPS_I %xmm9,%xmm5,%xmm0 + VFMADDPS_I %xmm13,%xmm5,%xmm1 + addq $2, BI + addq $8, %rax +.endm + +.macro SAVE4x1 + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 4 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm13, %xmm13, %xmm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm13,%xmm12, %xmm12 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $0xb1, %xmm12, %xmm12, %xmm13 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm12, %xmm13,%xmm13 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm13, %xmm12 + + // swap high and low 4 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm13, %xmm13, %xmm13 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm12, %xmm0, %xmm12 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm13, %xmm1, %xmm13 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm13,%xmm12, %xmm12 + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + vaddps 4 * SIZE(CO1), %xmm12, %xmm12 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 4 * SIZE(CO1) + +.endm + +/************************************************************************************************/ + +.macro KERNEL2x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 + VFMADDPS_R %xmm8,%xmm4,%xmm0 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPS_I %xmm9,%xmm5,%xmm0 + addq $2, BI + addq $4, %rax +.endm + +.macro SAVE2x1 + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + + vmovaps %xmm9, %xmm8 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + + vaddsubps %xmm9, %xmm8 , %xmm8 + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + +#endif + + vmovups %xmm8 , (CO1) + +.endm + +/************************************************************************************************/ + +.macro KERNEL1x1_SUB + vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 + VFMADDPS_R %xmm8,%xmm4,%xmm0 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPS_I %xmm9,%xmm5,%xmm0 + addq $2, BI + addq $2, %rax +.endm + +.macro SAVE1x1 + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + + vmovaps %xmm9, %xmm8 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + + vaddsubps %xmm9, %xmm8 , %xmm8 + +#ifndef TRMMKERNEL + + vmovsd (CO1), %xmm14 + vaddps %xmm14, 
%xmm8 , %xmm8 + +#endif + + vmovsd %xmm8 , (CO1) + +.endm + +/************************************************************************************************/ + + + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovss %xmm0, ALPHA_R + vmovss %xmm1, ALPHA_I + + salq $ZBASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $2, %rdi + divq %rdi // N / 2 + movq %rax, Ndiv6 // N / 2 + movq %rdx, Nmod6 // N % 2 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + +.L2_0: + + movq Ndiv6, J + cmpq $0, J + je .L1_0 + ALIGN_4 + + + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $4*SIZE,BO1 + addq $4*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L2_4_10 + + ALIGN_4 +/**********************************************************************************************************/ + +.L2_8_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_8_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_8_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL8x2_SUB + 
prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL8x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x2_SUB + + je .L2_8_16 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL8x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL8x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x2_SUB + + je .L2_8_16 + + jmp .L2_8_12 + ALIGN_4 + +.L2_8_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_8_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_8_17: + + KERNEL8x2_SUB + + jl .L2_8_17 + ALIGN_4 + + +.L2_8_19: + + SAVE8x2 + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_8_11 + ALIGN_4 + + +/**********************************************************************************************************/ + + + + +.L2_4_10: + testq $7, M + jz .L2_4_60 // to next 2 lines of N + + testq $4, M + jz .L2_4_20 + ALIGN_4 + + +.L2_4_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_4_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + + 
je .L2_4_16 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_4_16 + + jmp .L2_4_12 + ALIGN_4 + +.L2_4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_4_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_17: + + KERNEL4x2_SUB + + jl .L2_4_17 + ALIGN_4 + + +.L2_4_19: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/************************************************************************** +* Rest of M +***************************************************************************/ + +.L2_4_20: + + testq $2, M + jz .L2_4_40 + ALIGN_4 + +.L2_4_21: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_4_26 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_22: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_4_26 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_4_26 + + jmp .L2_4_22 + ALIGN_4 + +.L2_4_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_4_29 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO 
+ leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_27: + + KERNEL2x2_SUB + + jl .L2_4_27 + ALIGN_4 + + +.L2_4_29: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $0xb1, %xmm10, %xmm10, %xmm11 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm10, %xmm11,%xmm11 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm11, %xmm10 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm11, %xmm1, %xmm11 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + + vaddps (CO1, LDC), %xmm10, %xmm10 + +#endif + + vmovups %xmm8 , (CO1) + + vmovups %xmm10 , (CO1, LDC) + + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L2_4_21 + ALIGN_4 + + + +/**************************************************************************/ +.L2_4_40: + testq $1, M + jz .L2_4_60 // to next 2 lines of N + + ALIGN_4 + +.L2_4_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_4_46 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_42: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_4_46 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_4_46 
+ + jmp .L2_4_42 + ALIGN_4 + +.L2_4_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_4_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_47: + + KERNEL1x2_SUB + + jl .L2_4_47 + ALIGN_4 + + +.L2_4_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + decq I # i -- + jg .L2_4_41 + ALIGN_4 + + + + +.L2_4_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L1_4_10 + + ALIGN_4 + +/**************************************************************************************************/ + +.L1_8_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_8_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_8_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB 
+ prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + + je .L1_8_16 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + + je .L1_8_16 + + jmp .L1_8_12 + ALIGN_4 + +.L1_8_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_8_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 4 ; number of values + + salq $4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_8_17: + + KERNEL8x1_SUB + + jl .L1_8_17 + ALIGN_4 + + +.L1_8_19: + + SAVE8x1 + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_8_11 + ALIGN_4 + + + +/**************************************************************************************************/ +.L1_4_10: + + testq $7, M + jz .L999 + + testq $4, M + jz .L1_4_20 + + +.L1_4_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_4_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_4_16 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_4_16 + + jmp .L1_4_12 + ALIGN_4 + +.L1_4_16: +#ifndef TRMMKERNEL + 
movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_4_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_17: + + KERNEL4x1_SUB + + jl .L1_4_17 + ALIGN_4 + + +.L1_4_19: + + SAVE4x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/************************************************************************** +* Rest of M +***************************************************************************/ + +.L1_4_20: + + testq $2, M + jz .L1_4_40 + ALIGN_4 + +.L1_4_21: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_4_26 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_22: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_4_26 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_4_26 + + jmp .L1_4_22 + ALIGN_4 + +.L1_4_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_4_29 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_27: + + KERNEL2x1_SUB + + jl .L1_4_27 + ALIGN_4 + + +.L1_4_29: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq 
(AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + + +/**************************************************************************/ +.L1_4_40: + testq $1, M + jz .L999 // to next 2 lines of N + + ALIGN_4 + +.L1_4_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_4_46 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_42: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_4_46 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_4_46 + + jmp .L1_4_42 + ALIGN_4 + +.L1_4_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_4_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_47: + + KERNEL1x1_SUB + + jl .L1_4_47 + ALIGN_4 + + +.L1_4_49: + + SAVE1x1 + + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE From 6c4a7d0828c708b32e4f989662fd9ce4cbfd4c31 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Sun, 25 Aug 2013 10:16:01 -0300 Subject: [PATCH 009/127] Import AMD Piledriver DGEMM 
kernel generated by AUGEM. So far, this kernel doesn't deal with edge. AUGEM: Automatically Generate High Performance Dense Linear Algebra Kernels on x86 CPUs. Qian Wang, Xianyi Zhang, Yunquan Zhang, and Qing Yi. In the International Conference for High Performance Computing, Networking, Storage and Analysis (SC'13). Denver, CO. Nov, 2013. --- kernel/generic/gemm_ncopy_6.c | 230 +++ kernel/generic/gemm_tcopy_6.c | 281 +++ kernel/generic/symm_lcopy_6.c | 138 ++ kernel/generic/symm_ucopy_6.c | 136 ++ kernel/generic/trmm_lncopy_6.c | 484 ++++++ kernel/generic/trmm_ltcopy_6.c | 488 ++++++ kernel/generic/trmm_uncopy_6.c | 785 +++++++++ kernel/generic/trmm_utcopy_6.c | 472 +++++ kernel/generic/trsm_kernel_LN.c | 4 + kernel/generic/trsm_kernel_LT.c | 4 + kernel/generic/trsm_kernel_RN.c | 4 + kernel/generic/trsm_kernel_RT.c | 5 + kernel/generic/trsm_lncopy_6.c | 326 ++++ kernel/generic/trsm_ltcopy_6.c | 346 ++++ kernel/generic/trsm_uncopy_6.c | 350 ++++ kernel/generic/trsm_utcopy_6.c | 322 ++++ kernel/x86_64/KERNEL.PILEDRIVER | 12 +- kernel/x86_64/dgemm_kernel_6x4_piledriver.S | 1734 +++++++++++++++++++ param.h | 10 +- 19 files changed, 6121 insertions(+), 10 deletions(-) create mode 100644 kernel/generic/gemm_ncopy_6.c create mode 100644 kernel/generic/gemm_tcopy_6.c create mode 100644 kernel/generic/symm_lcopy_6.c create mode 100644 kernel/generic/symm_ucopy_6.c create mode 100644 kernel/generic/trmm_lncopy_6.c create mode 100644 kernel/generic/trmm_ltcopy_6.c create mode 100644 kernel/generic/trmm_uncopy_6.c create mode 100644 kernel/generic/trmm_utcopy_6.c create mode 100644 kernel/generic/trsm_lncopy_6.c create mode 100644 kernel/generic/trsm_ltcopy_6.c create mode 100644 kernel/generic/trsm_uncopy_6.c create mode 100644 kernel/generic/trsm_utcopy_6.c create mode 100644 kernel/x86_64/dgemm_kernel_6x4_piledriver.S diff --git a/kernel/generic/gemm_ncopy_6.c b/kernel/generic/gemm_ncopy_6.c new file mode 100644 index 000000000..1ecb93c65 --- /dev/null +++ b/kernel/generic/gemm_ncopy_6.c @@ -0,0 +1,230 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG i, j; + + FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; + FLOAT *b_offset; + FLOAT ctemp1, ctemp2, ctemp3, ctemp4; + FLOAT ctemp5, ctemp6, ctemp7, ctemp8; + FLOAT ctemp9, ctemp10, ctemp11, ctemp12; + FLOAT ctemp13, ctemp14, ctemp15, ctemp16; + + a_offset = a; + b_offset = b; + + j = (n >> 2); + if (j > 0){ + do{ + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset += 4 * lda; + + i = (m >> 2); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + ctemp3 = *(a_offset1 + 2); + ctemp4 = *(a_offset1 + 3); + + ctemp5 = *(a_offset2 + 0); + ctemp6 = *(a_offset2 + 1); + ctemp7 = *(a_offset2 + 2); + ctemp8 = *(a_offset2 + 3); + + ctemp9 = *(a_offset3 + 0); + ctemp10 = *(a_offset3 + 1); + ctemp11 = *(a_offset3 + 2); + ctemp12 = *(a_offset3 + 3); + + ctemp13 = *(a_offset4 + 0); + ctemp14 = *(a_offset4 + 1); + ctemp15 = *(a_offset4 + 2); + ctemp16 = *(a_offset4 + 3); + + *(b_offset + 0) = ctemp1; + *(b_offset + 1) = ctemp5; + *(b_offset + 2) = ctemp9; + *(b_offset + 3) = ctemp13; + + *(b_offset + 4) = ctemp2; + *(b_offset + 5) = ctemp6; + *(b_offset + 6) = ctemp10; + *(b_offset + 7) = ctemp14; + + *(b_offset + 8) = ctemp3; + *(b_offset + 9) = ctemp7; + *(b_offset + 10) = ctemp11; + *(b_offset + 11) = ctemp15; + + *(b_offset + 12) = ctemp4; + *(b_offset + 13) = ctemp8; + *(b_offset + 14) = ctemp12; + *(b_offset + 15) = ctemp16; + + a_offset1 += 4; + a_offset2 += 4; + a_offset3 += 4; + a_offset4 += 4; + + b_offset += 16; + i --; + }while(i > 0); + } + + i = (m & 3); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp5 = *(a_offset2 + 0); + ctemp9 = *(a_offset3 + 0); + ctemp13 = *(a_offset4 + 0); + + *(b_offset + 0) = ctemp1; + *(b_offset + 1) = ctemp5; + *(b_offset + 2) = ctemp9; + *(b_offset + 3) = ctemp13; + + a_offset1 ++; + a_offset2 ++; + a_offset3 ++; + a_offset4 ++; + + b_offset += 4; + i --; + }while(i > 0); + } + j--; + }while(j > 0); + } /* end of if(j > 0) */ + + if (n & 2){ + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset += 2 * lda; + + i = (m >> 2); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + ctemp3 = *(a_offset1 + 2); + ctemp4 = *(a_offset1 + 3); + + ctemp5 = *(a_offset2 + 0); + ctemp6 = *(a_offset2 + 1); + ctemp7 = *(a_offset2 + 2); + ctemp8 = *(a_offset2 + 3); + + *(b_offset + 0) = ctemp1; + *(b_offset + 1) = ctemp5; + *(b_offset + 2) = ctemp2; + *(b_offset + 3) = ctemp6; + + *(b_offset 
+ 4) = ctemp3; + *(b_offset + 5) = ctemp7; + *(b_offset + 6) = ctemp4; + *(b_offset + 7) = ctemp8; + + a_offset1 += 4; + a_offset2 += 4; + b_offset += 8; + i --; + }while(i > 0); + } + + i = (m & 3); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp5 = *(a_offset2 + 0); + + *(b_offset + 0) = ctemp1; + *(b_offset + 1) = ctemp5; + + a_offset1 ++; + a_offset2 ++; + b_offset += 2; + i --; + }while(i > 0); + } + } /* end of if(j > 0) */ + + if (n & 1){ + a_offset1 = a_offset; + + i = (m >> 2); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + ctemp3 = *(a_offset1 + 2); + ctemp4 = *(a_offset1 + 3); + + *(b_offset + 0) = ctemp1; + *(b_offset + 1) = ctemp2; + *(b_offset + 2) = ctemp3; + *(b_offset + 3) = ctemp4; + + a_offset1 += 4; + b_offset += 4; + i --; + }while(i > 0); + } + + i = (m & 3); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + *(b_offset + 0) = ctemp1; + a_offset1 ++; + b_offset += 1; + i --; + }while(i > 0); + } + } /* end of if(j > 0) */ + + return 0; +} diff --git a/kernel/generic/gemm_tcopy_6.c b/kernel/generic/gemm_tcopy_6.c new file mode 100644 index 000000000..bd32090e7 --- /dev/null +++ b/kernel/generic/gemm_tcopy_6.c @@ -0,0 +1,281 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + + BLASLONG i, j; + + FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; + FLOAT *b_offset, *b_offset1, *b_offset2, *b_offset3; + FLOAT ctemp1, ctemp2, ctemp3, ctemp4; + FLOAT ctemp5, ctemp6, ctemp7, ctemp8; + FLOAT ctemp9, ctemp10, ctemp11, ctemp12; + FLOAT ctemp13, ctemp14, ctemp15, ctemp16; + + a_offset = a; + b_offset = b; + + b_offset2 = b + m * (n & ~3); + b_offset3 = b + m * (n & ~1); + + j = (m >> 2); + if (j > 0){ + do{ + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset += 4 * lda; + + b_offset1 = b_offset; + b_offset += 16; + + i = (n >> 2); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + ctemp3 = *(a_offset1 + 2); + ctemp4 = *(a_offset1 + 3); + + ctemp5 = *(a_offset2 + 0); + ctemp6 = *(a_offset2 + 1); + ctemp7 = *(a_offset2 + 2); + ctemp8 = *(a_offset2 + 3); + + ctemp9 = *(a_offset3 + 0); + ctemp10 = *(a_offset3 + 1); + ctemp11 = *(a_offset3 + 2); + ctemp12 = *(a_offset3 + 3); + + ctemp13 = *(a_offset4 + 0); + ctemp14 = *(a_offset4 + 1); + ctemp15 = *(a_offset4 + 2); + ctemp16 = *(a_offset4 + 3); + + a_offset1 += 4; + a_offset2 += 4; + a_offset3 += 4; + a_offset4 += 4; + + *(b_offset1 + 0) = ctemp1; + *(b_offset1 + 1) = ctemp2; + *(b_offset1 + 2) = ctemp3; + *(b_offset1 + 3) = ctemp4; + + *(b_offset1 + 4) = ctemp5; + *(b_offset1 + 5) = ctemp6; + *(b_offset1 + 6) = ctemp7; + *(b_offset1 + 7) = ctemp8; + + *(b_offset1 + 8) = ctemp9; + *(b_offset1 + 9) = ctemp10; + *(b_offset1 + 10) = ctemp11; + *(b_offset1 + 11) = ctemp12; + + *(b_offset1 + 12) = ctemp13; + *(b_offset1 + 13) = ctemp14; + *(b_offset1 + 14) = ctemp15; + *(b_offset1 + 15) = ctemp16; + + b_offset1 += m * 4; + i --; + }while(i > 0); + } + + if (n & 2) { + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + + ctemp3 = *(a_offset2 + 0); + ctemp4 = *(a_offset2 + 1); + + ctemp5 = *(a_offset3 + 0); + ctemp6 = *(a_offset3 + 1); + + ctemp7 = *(a_offset4 + 0); + ctemp8 = *(a_offset4 + 1); + + a_offset1 += 2; + a_offset2 += 2; + a_offset3 += 2; + a_offset4 += 2; + + *(b_offset2 + 0) = ctemp1; + *(b_offset2 + 1) = ctemp2; + *(b_offset2 + 2) = ctemp3; + *(b_offset2 + 3) = ctemp4; + + *(b_offset2 + 4) = ctemp5; + *(b_offset2 + 5) = ctemp6; + *(b_offset2 + 6) = ctemp7; + *(b_offset2 + 7) = ctemp8; + + b_offset2 += 8; + } + + if (n & 1) { + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset2 + 0); + ctemp3 = *(a_offset3 + 0); + ctemp4 = *(a_offset4 + 0); + + *(b_offset3 + 0) = ctemp1; + *(b_offset3 + 1) = ctemp2; + *(b_offset3 + 2) = ctemp3; + *(b_offset3 + 3) = ctemp4; + + b_offset3 += 4; + } + + j--; + }while(j > 0); + } + + if (m & 2){ + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset += 2 * lda; + + b_offset1 = b_offset; + b_offset += 8; + + i = (n >> 2); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + ctemp3 = *(a_offset1 + 2); + ctemp4 = *(a_offset1 + 3); + + ctemp5 = *(a_offset2 + 0); + ctemp6 = *(a_offset2 + 1); + ctemp7 = *(a_offset2 + 2); + ctemp8 = *(a_offset2 + 3); + + a_offset1 += 4; + a_offset2 += 4; + + *(b_offset1 + 0) = ctemp1; + *(b_offset1 + 1) = ctemp2; + *(b_offset1 + 2) = ctemp3; + *(b_offset1 + 3) = ctemp4; + + *(b_offset1 + 4) = ctemp5; + *(b_offset1 + 5) = ctemp6; + *(b_offset1 + 6) = ctemp7; + *(b_offset1 + 7) = ctemp8; + + b_offset1 += m * 4; + i --; + 
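	  /* packed layout note: full 4-wide column panels come first;
+	     the (n & 2) remainder panel starts at b + m*(n & ~3) and
+	     the final odd column at b + m*(n & ~1), matching the
+	     b_offset2/b_offset3 initialization above */
+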
}while(i > 0); + } + + if (n & 2) { + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + + ctemp3 = *(a_offset2 + 0); + ctemp4 = *(a_offset2 + 1); + + a_offset1 += 2; + a_offset2 += 2; + + *(b_offset2 + 0) = ctemp1; + *(b_offset2 + 1) = ctemp2; + *(b_offset2 + 2) = ctemp3; + *(b_offset2 + 3) = ctemp4; + + b_offset2 += 4; + } + + if (n & 1) { + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset2 + 0); + + *(b_offset3 + 0) = ctemp1; + *(b_offset3 + 1) = ctemp2; + b_offset3 += 2; + } + } + + if (m & 1){ + a_offset1 = a_offset; + b_offset1 = b_offset; + + i = (n >> 2); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + ctemp3 = *(a_offset1 + 2); + ctemp4 = *(a_offset1 + 3); + + a_offset1 += 4; + + *(b_offset1 + 0) = ctemp1; + *(b_offset1 + 1) = ctemp2; + *(b_offset1 + 2) = ctemp3; + *(b_offset1 + 3) = ctemp4; + + b_offset1 += 4 * m; + + i --; + }while(i > 0); + } + + if (n & 2) { + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + a_offset1 += 2; + + *(b_offset2 + 0) = ctemp1; + *(b_offset2 + 1) = ctemp2; + } + + if (n & 1) { + ctemp1 = *(a_offset1 + 0); + *(b_offset3 + 0) = ctemp1; + } + } + + return 0; +} diff --git a/kernel/generic/symm_lcopy_6.c b/kernel/generic/symm_lcopy_6.c new file mode 100644 index 000000000..ac04943e2 --- /dev/null +++ b/kernel/generic/symm_lcopy_6.c @@ -0,0 +1,138 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04; + FLOAT *ao1, *ao2, *ao3, *ao4; + + js = (n >> 2); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; + if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda; + if (offset > -2) ao3 = a + posX + 2 + posY * lda; else ao3 = a + posY + (posX + 2) * lda; + if (offset > -3) ao4 = a + posX + 3 + posY * lda; else ao4 = a + posY + (posX + 3) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + data03 = *(ao3 + 0); + data04 = *(ao4 + 0); + + if (offset > 0) ao1 += lda; else ao1 ++; + if (offset > -1) ao2 += lda; else ao2 ++; + if (offset > -2) ao3 += lda; else ao3 ++; + if (offset > -3) ao4 += lda; else ao4 ++; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b += 4; + + offset --; + i --; + } + + posX += 4; + js --; + } + + if (n & 2) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; + if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + + if (offset > 0) ao1 += lda; else ao1 ++; + if (offset > -1) ao2 += lda; else ao2 ++; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + posX += 2; + } + + if (n & 1) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + + if (offset > 0) ao1 += lda; else ao1 ++; + + b[ 0] = data01; + + b ++; + + offset --; + i --; + } + } + + return 0; +} diff --git a/kernel/generic/symm_ucopy_6.c b/kernel/generic/symm_ucopy_6.c new file mode 100644 index 000000000..9b9cff820 --- /dev/null +++ b/kernel/generic/symm_ucopy_6.c @@ -0,0 +1,136 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04; + FLOAT *ao1, *ao2, *ao3, *ao4; + + js = (n >> 2); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; + if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda; + if (offset > -2) ao3 = a + posY + (posX + 2) * lda; else ao3 = a + posX + 2 + posY * lda; + if (offset > -3) ao4 = a + posY + (posX + 3) * lda; else ao4 = a + posX + 3 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + data03 = *(ao3 + 0); + data04 = *(ao4 + 0); + + if (offset > 0) ao1 ++; else ao1 += lda; + if (offset > -1) ao2 ++; else ao2 += lda; + if (offset > -2) ao3 ++; else ao3 += lda; + if (offset > -3) ao4 ++; else ao4 += lda; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b += 4; + + offset --; + i --; + } + + posX += 4; + js --; + } + + if (n & 2) { + offset = posX - posY; + + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; + if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + + if (offset > 0) ao1 ++; else ao1 += lda; + if (offset > -1) ao2 ++; else ao2 += lda; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + posX += 2; + } + + if (n & 1) { + offset = posX - posY; + + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + + if (offset > 0) ao1 ++; else ao1 += lda; + + b[ 0] = data01; + + b ++; + + offset --; + i --; + } + } + + return 0; +} diff --git a/kernel/generic/trmm_lncopy_6.c b/kernel/generic/trmm_lncopy_6.c new file mode 100644 index 000000000..6cd16673a --- /dev/null +++ b/kernel/generic/trmm_lncopy_6.c @@ -0,0 +1,484 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT *ao1, *ao2, *ao3, *ao4; + + js = (n >> 2); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + ao3 = a + posY + (posX + 2) * lda; + ao4 = a + posY + (posX + 3) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + ao3 = a + posX + (posY + 2) * lda; + ao4 = a + posX + (posY + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + data09 = *(ao3 + 0); + data10 = *(ao3 + 1); + data11 = *(ao3 + 2); + data12 = *(ao3 + 3); + + data13 = *(ao4 + 0); + data14 = *(ao4 + 1); + data15 = *(ao4 + 2); + data16 = *(ao4 + 3); + + b[ 0] = data01; + b[ 1] = data05; + b[ 2] = data09; + b[ 3] = data13; + b[ 4] = data02; + b[ 5] = data06; + b[ 6] = data10; + b[ 7] = data14; + + b[ 8] = data03; + b[ 9] = data07; + b[10] = data11; + b[11] = data15; + b[12] = data04; + b[13] = data08; + b[14] = data12; + b[15] = data16; + + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + b += 16; + + } else + if (X < posY) { + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + b += 16; + + } else { +#ifdef UNIT + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + data12 = *(ao3 + 3); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = data02; + b[ 5] = ONE; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = data03; + b[ 9] = data07; + b[10] = ONE; + b[11] = ZERO; + b[12] = data04; + b[13] = data08; + b[14] = data12; + b[15] = ONE; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 
1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data06 = *(ao2 + 1); + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + data11 = *(ao3 + 2); + data12 = *(ao3 + 3); + + data16 = *(ao4 + 3); + + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = data02; + b[ 5] = data06; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = data03; + b[ 9] = data07; + b[10] = data11; + b[11] = ZERO; + b[12] = data04; + b[13] = data08; + b[14] = data12; + b[15] = data16; +#endif + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + b += 16; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i) { + + if (X > posY) { + + if (m & 2) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + + b[ 0] = data01; + b[ 1] = data03; + b[ 2] = data05; + b[ 3] = data07; + b[ 4] = data02; + b[ 5] = data04; + b[ 6] = data06; + b[ 7] = data08; + + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + b += 8; + } + + if (m & 1) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + data03 = *(ao3 + 0); + data04 = *(ao4 + 0); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + ao1 += 1; + ao2 += 1; + ao3 += 1; + ao4 += 1; + b += 4; + } + + } else + if (X < posY) { + if (m & 2) { + ao1 += 2 * lda; + ao2 += 2 * lda; + + b += 8; + } + + if (m & 1) { + ao1 += lda; + b += 4; + } + + } else { +#ifdef UNIT + data05 = *(ao2 + 0); + data09 = *(ao3 + 0); + data13 = *(ao4 + 0); + + if (i >= 2) { + data10 = *(ao3 + 1); + data14 = *(ao4 + 1); + } + + if (i >= 3) { + data15 = *(ao4 + 2); + } + + b[ 0] = ONE; + b[ 1] = data05; + b[ 2] = data09; + b[ 3] = data13; + b += 4; + + if(i >= 2) { + b[ 0] = ZERO; + b[ 1] = ONE; + b[ 2] = data10; + b[ 3] = data14; + b += 4; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ONE; + b[ 3] = data15; + b += 4; + } +#else + data01 = *(ao1 + 0); + data05 = *(ao2 + 0); + data09 = *(ao3 + 0); + data13 = *(ao4 + 0); + + if (i >= 2) { + data06 = *(ao2 + 1); + data10 = *(ao3 + 1); + data14 = *(ao4 + 1); + } + + if (i >= 3) { + data11 = *(ao3 + 2); + data15 = *(ao4 + 2); + } + + b[ 0] = data01; + b[ 1] = data05; + b[ 2] = data09; + b[ 3] = data13; + b += 4; + + if(i >= 2) { + b[ 0] = ZERO; + b[ 1] = data06; + b[ 2] = data10; + b[ 3] = data14; + b += 4; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = data11; + b[ 3] = data15; + b += 4; + } +#endif + } + } + + posY += 4; + js --; + } while (js > 0); + } /* End of main loop */ + + + if (n & 2){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data05; + b[ 2] = data02; + b[ 3] = data06; + + ao1 += 2; + ao2 += 2; + b += 4; + + } else + if (X < posY) { + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 4; + } else { +#ifdef UNIT + data02 = *(ao1 + 1); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = data02; + b[ 3] = ONE; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data06 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data02; + b[ 3] = data06; +#endif + ao1 += 2; + ao2 += 2; + + b += 4; + } + + X += 2; + i --; + } while (i > 0); + } + + i = (m & 1); + if (i) { + + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + b[ 0] 
= data01; + b[ 1] = data02; + + ao1 += 1; + ao2 += 1; + b += 2; + } else + if (X < posY) { + ao1 += lda; + b += 2; + } else { +#ifdef UNIT + data05 = *(ao2 + 0); + + b[ 0] = ONE; + b[ 1] = data05; +#else + data01 = *(ao1 + 0); + data05 = *(ao2 + 0); + + b[ 0] = data01; + b[ 1] = data05; +#endif + b += 2; + } + } + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + } + + i = m; + if (i > 0) { + do { + if (X > posY) { + data01 = *(ao1 + 0); + b[ 0] = data01; + b += 1; + ao1 += 1; + } else + if (X < posY) { + b += 1; + ao1 += lda; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + data01 = *(ao1 + 0); + b[ 0] = data01; +#endif + b += 1; + ao1 += 1; + } + + X ++; + i --; + } while (i > 0); + } + + posY += 1; + } + + return 0; +} diff --git a/kernel/generic/trmm_ltcopy_6.c b/kernel/generic/trmm_ltcopy_6.c new file mode 100644 index 000000000..69a233be6 --- /dev/null +++ b/kernel/generic/trmm_ltcopy_6.c @@ -0,0 +1,488 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT *ao1, *ao2, *ao3, *ao4; + + js = (n >> 2); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + ao3 = a + posY + (posX + 2) * lda; + ao4 = a + posY + (posX + 3) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + ao3 = a + posX + (posY + 2) * lda; + ao4 = a + posX + (posY + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X > posY) { + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + b += 16; + + } else + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + data09 = *(ao3 + 0); + data10 = *(ao3 + 1); + data11 = *(ao3 + 2); + data12 = *(ao3 + 3); + + data13 = *(ao4 + 0); + data14 = *(ao4 + 1); + data15 = *(ao4 + 2); + data16 = *(ao4 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + b += 16; + + } else { + +#ifdef UNIT + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + data12 = *(ao3 + 3); + + b[ 0] = ONE; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b[ 4] = ZERO; + b[ 5] = ONE; + b[ 6] = data07; + b[ 7] = data08; + + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ONE; + b[11] = data12; + + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ONE; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data06 = *(ao2 + 1); + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + data11 = *(ao3 + 2); + data12 = *(ao3 + 3); + + data16 = *(ao4 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = ZERO; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = data11; + b[11] = data12; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = data16; +#endif + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + b += 16; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i) { + + if (X > posY) { + + if (m & 2) { + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + b += 8; + } + + if (m & 1) { + ao1 += 1; + ao2 += 1; + ao3 += 1; + ao4 += 1; + b += 4; + } + + } else + if (X < posY) { + if (m & 2) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + ao1 += 2 * lda; + ao2 += 2 * lda; + + b += 8; + } + + if (m & 1) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + b[ 0] = data01; + 
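				/* X < posY: this sub-block is taken from A as-is */
+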
b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + ao1 += lda; + b += 4; + } + + } else { + +#ifdef UNIT + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + if (i >= 2) { + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + } + + if (i >= 3) { + data12 = *(ao3 + 3); + } + + b[ 0] = ONE; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b += 4; + + if(i >= 2) { + b[ 0] = ZERO; + b[ 1] = ONE; + b[ 2] = data07; + b[ 3] = data08; + b += 4; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ONE; + b[ 3] = data12; + b += 4; + } +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + if (i >= 2) { + data06 = *(ao2 + 1); + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + } + + if (i >= 3) { + data11 = *(ao3 + 2); + data12 = *(ao3 + 3); + } + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b += 4; + + if(i >= 2) { + b[ 0] = ZERO; + b[ 1] = data06; + b[ 2] = data07; + b[ 3] = data08; + b += 4; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = data11; + b[ 3] = data12; + b += 4; + } +#endif + } + } + + posY += 4; + js --; + } while (js > 0); + } /* End of main loop */ + + + if (n & 2){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X > posY) { + ao1 += 2; + ao2 += 2; + b += 4; + + } else + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data05; + b[ 3] = data06; + + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 4; + } else { +#ifdef UNIT + data02 = *(ao1 + 1); + + b[ 0] = ONE; + b[ 1] = data02; + b[ 2] = ZERO; + b[ 3] = ONE; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data06 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = ZERO; + b[ 3] = data06; +#endif + ao1 += 2; + ao2 += 2; + b += 4; + } + + X += 2; + i --; + } while (i > 0); + } + + i = (m & 1); + if (i) { + + if (X > posY) { + ao1 += 1; + ao2 += 1; + + b += 2; + } else + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; + ao1 += lda; + b += 2; + } else { +#ifdef UNIT + data02 = *(ao1 + 1); + + b[ 0] = ONE; + b[ 1] = data02; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; +#endif + b += 2; + } + } + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + } + + i = m; + if (i > 0) { + do { + if (X > posY) { + b += 1; + ao1 += 1; + } else + if (X < posY) { + data01 = *(ao1 + 0); + b[ 0] = data01; + ao1 += lda; + b += 1; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + data01 = *(ao1 + 0); + b[ 0] = data01; +#endif + ao1 += 1; + b += 1; + } + + X ++; + i --; + } while (i > 0); + } + + posY += 1; + } + + return 0; +} diff --git a/kernel/generic/trmm_uncopy_6.c b/kernel/generic/trmm_uncopy_6.c new file mode 100644 index 000000000..70945a246 --- /dev/null +++ b/kernel/generic/trmm_uncopy_6.c @@ -0,0 +1,785 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X, mm; + + FLOAT data01, data02, data03, data04, data05, data06; + FLOAT data07, data08, data09, data10, data11, data12; + FLOAT data13, data14, data15, data16, data17, data18; + FLOAT data19, data20, data21, data22, data23, data24; + FLOAT data25, data26, data27, data28, data29, data30; + FLOAT data31, data32, data33, data34, data35, data36; + + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6; + + //js = (n >> 2); + js = n/6; + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + ao3 = a + posX + (posY + 2) * lda; + ao4 = a + posX + (posY + 3) * lda; + ao5 = a + posX + (posY + 4) * lda; + ao6 = a + posX + (posY + 5) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + ao3 = a + posY + (posX + 2) * lda; + ao4 = a + posY + (posX + 3) * lda; + ao5 = a + posY + (posX + 4) * lda; + ao6 = a + posY + (posX + 5) * lda; + } + + i = m/6; + if (i > 0) { + do { + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao1 + 4); + data06 = *(ao1 + 5); + + data07 = *(ao2 + 0); + data08 = *(ao2 + 1); + data09 = *(ao2 + 2); + data10 = *(ao2 + 3); + data11 = *(ao2 + 4); + data12 = *(ao2 + 5); + + data13 = *(ao3 + 0); + data14 = *(ao3 + 1); + data15 = *(ao3 + 2); + data16 = *(ao3 + 3); + data17 = *(ao3 + 4); + data18 = *(ao3 + 5); + + data19 = *(ao4 + 0); + data20 = *(ao4 + 1); + data21 = *(ao4 + 2); + data22 = *(ao4 + 3); + data23 = *(ao4 + 4); + data24 = *(ao4 + 5); + + data25 = *(ao5 + 0); + data26 = *(ao5 + 1); + data27 = 
*(ao5 + 2); + data28 = *(ao5 + 3); + data29 = *(ao5 + 4); + data30 = *(ao5 + 5); + + data31 = *(ao6 + 0); + data32 = *(ao6 + 1); + data33 = *(ao6 + 2); + data34 = *(ao6 + 3); + data35 = *(ao6 + 4); + data36 = *(ao6 + 5); + + b[ 0] = data01; + b[ 1] = data07; + b[ 2] = data13; + b[ 3] = data19; + b[ 4] = data25; + b[ 5] = data31; + + b[ 6] = data02; + b[ 7] = data08; + b[ 8] = data14; + b[ 9] = data20; + b[10] = data26; + b[11] = data32; + + b[12] = data03; + b[13] = data09; + b[14] = data15; + b[15] = data21; + b[16] = data27; + b[17] = data33; + + b[18] = data04; + b[19] = data10; + b[20] = data16; + b[21] = data22; + b[22] = data28; + b[23] = data34; + + b[24] = data05; + b[25] = data11; + b[26] = data17; + b[27] = data23; + b[28] = data29; + b[29] = data35; + + b[30] = data06; + b[31] = data12; + b[32] = data18; + b[33] = data24; + b[34] = data30; + b[35] = data36; + + ao1 += 6; + ao2 += 6; + ao3 += 6; + ao4 += 6; + ao5 += 6; + ao6 += 6; + b += 36; + } else + if (X > posY) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b[16] = ZERO; + b[17] = ZERO; + b[18] = ZERO; + b[19] = ZERO; + b[20] = ZERO; + b[21] = ZERO; + b[22] = ZERO; + b[23] = ZERO; + b[24] = ZERO; + b[25] = ZERO; + b[26] = ZERO; + b[27] = ZERO; + b[28] = ZERO; + b[29] = ZERO; + b[30] = ZERO; + b[31] = ZERO; + b[32] = ZERO; + b[33] = ZERO; + b[34] = ZERO; + b[35] = ZERO; + + ao1 += 6 * lda; + ao2 += 6 * lda; + ao3 += 6 * lda; + ao4 += 6 * lda; + ao5 += 6 * lda; + ao6 += 6 * lda; + + b += 36; + } else { + data01 = *(ao1 + 0); + data07 = *(ao2 + 0); + data13 = *(ao3 + 0); + data19 = *(ao4 + 0); + data25 = *(ao5 + 0); + data31 = *(ao6 + 0); + + data08 = *(ao2 + 1); + data14 = *(ao3 + 1); + data20 = *(ao4 + 1); + data26 = *(ao5 + 1); + data32 = *(ao6 + 1); + + data15 = *(ao3 + 2); + data21 = *(ao4 + 2); + data27 = *(ao5 + 2); + data33 = *(ao6 + 2); + + data22 = *(ao4 + 3); + data28 = *(ao5 + 3); + data34 = *(ao6 + 3); + + data29 = *(ao5 + 4); + data35 = *(ao6 + 4); + + data36 = *(ao6 + 5); + +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = data07; + b[ 2] = data13; + b[ 3] = data19; + b[ 4] = data25; + b[ 5] = data31; + + b[ 6] = ZERO; + b[ 7] = ONE; + b[ 8] = data14; + b[ 9] = data20; + b[10] = data26; + b[11] = data32; + + b[12] = ZERO; + b[13] = ZERO; + b[14] = ONE; + b[15] = data21; + b[16] = data27; + b[17] = data33; + + b[18] = ZERO; + b[19] = ZERO; + b[20] = ZERO; + b[21] = ONE; + b[22] = data28; + b[23] = data34; + + b[24] = ZERO; + b[25] = ZERO; + b[26] = ZERO; + b[27] = ZERO; + b[28] = ONE; + b[29] = data35; + + b[30] = ZERO; + b[31] = ZERO; + b[32] = ZERO; + b[33] = ZERO; + b[34] = ZERO; + b[35] = ONE; +#else + b[ 0] = data01; + b[ 1] = data07; + b[ 2] = data13; + b[ 3] = data19; + b[ 4] = data25; + b[ 5] = data31; + + b[ 6] = ZERO; + b[ 7] = data08; + b[ 8] = data14; + b[ 9] = data20; + b[10] = data26; + b[11] = data32; + + b[12] = ZERO; + b[13] = ZERO; + b[14] = data15; + b[15] = data21; + b[16] = data27; + b[17] = data33; + + b[18] = ZERO; + b[19] = ZERO; + b[20] = ZERO; + b[21] = data22; + b[22] = data28; + b[23] = data34; + + b[24] = ZERO; + b[25] = ZERO; + b[26] = ZERO; + b[27] = ZERO; + b[28] = data29; + b[29] = data35; + + b[30] = ZERO; + b[31] = ZERO; + b[32] = ZERO; + b[33] = ZERO; + b[34] = ZERO; + b[35] = data36; +#endif + + ao1 += 6; + ao2 += 6; + ao3 += 6; + ao4 += 6; + ao5 += 6; + ao6 += 7; + + b += 36; + } 
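+			/* shape of the 6x6 diagonal block just packed into b
+			   (non-unit case; with UNIT the diagonal becomes ONE):
+
+			       a11 a12 a13 a14 a15 a16
+			        0  a22 a23 a24 a25 a26
+			        0   0  a33 a34 a35 a36
+			        0   0   0  a44 a45 a46
+			        0   0   0   0  a55 a56
+			        0   0   0   0   0  a66                        */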
+ X += 6; + i --; + } while (i > 0); + } + mm = m - m/6; + if (mm & 4) { + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + data09 = *(ao3 + 0); + data10 = *(ao3 + 1); + data11 = *(ao3 + 2); + data12 = *(ao3 + 3); + + data13 = *(ao4 + 0); + data14 = *(ao4 + 1); + data15 = *(ao4 + 2); + data16 = *(ao4 + 3); + + b[ 0] = data01; + b[ 1] = data05; + b[ 2] = data09; + b[ 3] = data13; + b[ 4] = data02; + b[ 5] = data06; + b[ 6] = data10; + b[ 7] = data14; + + b[ 8] = data03; + b[ 9] = data07; + b[10] = data11; + b[11] = data15; + b[12] = data04; + b[13] = data08; + b[14] = data12; + b[15] = data16; + + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + b += 16; + } else + if (X > posY) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b[16] = ZERO; + b[17] = ZERO; + b[18] = ZERO; + b[19] = ZERO; + b[20] = ZERO; + b[21] = ZERO; + b[22] = ZERO; + b[23] = ZERO; + + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + + b += 16; + } else { +#ifdef UNIT + data05 = *(ao2 + 0); + + data09 = *(ao3 + 0); + data10 = *(ao3 + 1); + + data13 = *(ao4 + 0); + data14 = *(ao4 + 1); + data15 = *(ao4 + 2); + + b[ 0] = ONE; + b[ 1] = data05; + b[ 2] = data09; + b[ 3] = data13; + + b[ 4] = ZERO; + b[ 5] = ONE; + b[ 6] = data10; + b[ 7] = data14; + + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ONE; + b[11] = data15; + + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ONE; +#else + data01 = *(ao1 + 0); + + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + + data09 = *(ao3 + 0); + data10 = *(ao3 + 1); + data11 = *(ao3 + 2); + + data13 = *(ao4 + 0); + data14 = *(ao4 + 1); + data15 = *(ao4 + 2); + data16 = *(ao4 + 3); + + b[ 0] = data01; + b[ 1] = data05; + b[ 2] = data09; + b[ 3] = data13; + + b[ 4] = ZERO; + b[ 5] = data06; + b[ 6] = data10; + b[ 7] = data14; + + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = data11; + b[11] = data15; + + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = data16; +#endif + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + + b += 16; + } + X += 4; + } + + if (mm & 3) { + if (X < posY) { + if (mm & 2) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + + b[ 0] = data01; + b[ 1] = data03; + b[ 2] = data05; + b[ 3] = data07; + b[ 4] = data02; + b[ 5] = data04; + b[ 6] = data06; + b[ 7] = data08; + + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + b += 8; + } + + if (mm & 1) { + data01 = *(ao1 + 0); + data03 = *(ao2 + 0); + data05 = *(ao3 + 0); + data07 = *(ao4 + 0); + + b[ 0] = data01; + b[ 1] = data03; + b[ 2] = data05; + b[ 3] = data07; + + ao1 += 1; + ao2 += 1; + ao3 += 1; + ao4 += 1; + b += 4; + } + + } else + if (X > posY) { + if (m & 2) { + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 8; + } + + if (m & 1) { + ao1 += lda; + b += 4; + } + + } else { +#ifdef UNIT + data05 = *(ao2 + 0); + data09 = *(ao3 + 0); + data13 = *(ao4 + 0); + + if (i >= 2) { + data10 = *(ao3 + 1); + data14 = *(ao4 + 1); + } + + if (i >= 3) { + data15 = *(ao4 + 2); + } + + b[ 0] = ONE; + b[ 1] = data05; + b[ 2] = data09; + b[ 3] = data13; + b += 4; + + if(i >= 2) { + b[ 0] = ZERO; + b[ 1] = ONE; + b[ 2] = 
data10; + b[ 3] = data14; + b += 4; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ONE; + b[ 3] = data15; + b += 4; + } +#else + data01 = *(ao1 + 0); + data05 = *(ao2 + 0); + data09 = *(ao3 + 0); + data13 = *(ao4 + 0); + + if (i >= 2) { + data06 = *(ao2 + 1); + data10 = *(ao3 + 1); + data14 = *(ao4 + 1); + } + + if (i >= 3) { + data11 = *(ao3 + 2); + data15 = *(ao4 + 2); + } + + b[ 0] = data01; + b[ 1] = data05; + b[ 2] = data09; + b[ 3] = data13; + b += 4; + + if(i >= 2) { + b[ 0] = ZERO; + b[ 1] = data06; + b[ 2] = data10; + b[ 3] = data14; + b += 4; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = data11; + b[ 3] = data15; + b += 4; + } +#endif + } + } + + posY += 4; + js --; + } while (js > 0); + } /* End of main loop */ + + if (n & 2){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data05; + b[ 2] = data02; + b[ 3] = data06; + + ao1 += 2; + ao2 += 2; + b += 4; + + } else + if (X > posY) { + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 4; + + } else { +#ifdef UNIT + data05 = *(ao2 + 0); + + b[ 0] = ONE; + b[ 1] = data05; + b[ 2] = ZERO; + b[ 3] = ONE; +#else + data01 = *(ao1 + 0); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data05; + b[ 2] = ZERO; + b[ 3] = data06; +#endif + + ao1 += 2 * lda; + ao2 += 2 * lda; + + b += 4; + } + + X += 2; + i --; + } while (i > 0); + } + + i = (m & 1); + if (i) { + + if (X < posY) { + data01 = *(ao1 + 0); + data05 = *(ao2 + 0); + + b[ 0] = data01; + b[ 1] = data05; + ao1 += 1; + ao2 += 1; + b += 2; + } else + if (X > posY) { + ao1 += lda; + ao2 += lda; + b += 2; + } else { +#ifdef UNIT + data05 = *(ao2 + 0); + b[ 0] = ONE; + b[ 1] = data05; +#else + data01 = *(ao1 + 0); + data05 = *(ao2 + 0); + + b[ 0] = data01; + b[ 1] = data05; +#endif + ao1 += lda; + ao2 += lda; + b += 2; + } + } + + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + } + + i = m; + if (m > 0) { + do { + if (X < posY) { + data01 = *(ao1 + 0); + b[ 0] = data01; + ao1 += 1; + b += 1; + } else + if (X > posY) { + ao1 += lda; + b += 1; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + data01 = *(ao1 + 0); + b[ 0] = data01; +#endif + ao1 += lda; + b += 1; + } + + X += 1; + i --; + } while (i > 0); + } + } + + return 0; +} diff --git a/kernel/generic/trmm_utcopy_6.c b/kernel/generic/trmm_utcopy_6.c new file mode 100644 index 000000000..7d4dba34b --- /dev/null +++ b/kernel/generic/trmm_utcopy_6.c @@ -0,0 +1,472 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT *ao1, *ao2, *ao3, *ao4; + + js = (n >> 2); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + ao3 = a + posX + (posY + 2) * lda; + ao4 = a + posX + (posY + 3) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + ao3 = a + posY + (posX + 2) * lda; + ao4 = a + posY + (posX + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X < posY) { + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + b += 16; + } else + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + data09 = *(ao3 + 0); + data10 = *(ao3 + 1); + data11 = *(ao3 + 2); + data12 = *(ao3 + 3); + + data13 = *(ao4 + 0); + data14 = *(ao4 + 1); + data15 = *(ao4 + 2); + data16 = *(ao4 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + b += 16; + + } else { +#ifdef UNIT + data05 = *(ao2 + 0); + data09 = *(ao3 + 0); + data10 = *(ao3 + 1); + data13 = *(ao4 + 0); + data14 = *(ao4 + 1); + data15 = *(ao4 + 2); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + + b[ 4] = data05; + b[ 5] = ONE; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = data09; + b[ 9] = data10; + b[10] = ONE; + b[11] = ZERO; + + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = ONE; +#else + data01 = *(ao1 + 0); + data05 = *(ao2 + 0); 
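+				/* non-unit diagonal: the diagonal entries are read
+				   from A instead of being forced to ONE as in the
+				   UNIT branch */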
+ data06 = *(ao2 + 1); + data09 = *(ao3 + 0); + data10 = *(ao3 + 1); + data11 = *(ao3 + 2); + data13 = *(ao4 + 0); + data14 = *(ao4 + 1); + data15 = *(ao4 + 2); + data16 = *(ao4 + 3); + + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = ZERO; + + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; +#endif + + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + + b += 16; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i) { + + if (X < posY) { + + if (m & 2) { + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + b += 8; + } + + if (m & 1) { + ao1 += 1; + ao2 += 1; + ao3 += 1; + ao4 += 1; + b += 4; + } + + } else + if (X > posY) { + if (m & 2) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 8; + } + + if (m & 1) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + ao1 += lda; + b += 4; + } + + } else { + +#ifdef UNIT + if (i >= 2) { + data05 = *(ao2 + 0); + } + + if (i >= 3) { + data09 = *(ao3 + 0); + data10 = *(ao3 + 1); + } + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b += 4; + + if(i >= 2) { + b[ 0] = data05; + b[ 1] = ONE; + b[ 2] = ZERO; + b[ 3] = ZERO; + b += 4; + } + + if (i >= 3) { + b[ 0] = data09; + b[ 1] = data10; + b[ 2] = ONE; + b[ 3] = ZERO; + b += 4; + } +#else + data01 = *(ao1 + 0); + + if (i >= 2) { + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + } + + if (i >= 3) { + data09 = *(ao3 + 0); + data10 = *(ao3 + 1); + data11 = *(ao3 + 2); + } + + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b += 4; + + if(i >= 2) { + b[ 0] = data05; + b[ 1] = data06; + b[ 2] = ZERO; + b[ 3] = ZERO; + b += 4; + } + + if (i >= 3) { + b[ 0] = data09; + b[ 1] = data10; + b[ 2] = data11; + b[ 3] = ZERO; + b += 4; + } +#endif + } + } + + posY += 4; + js --; + } while (js > 0); + } /* End of main loop */ + + if (n & 2){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X < posY) { + ao1 += 2; + ao2 += 2; + b += 4; + + } else + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data05; + b[ 3] = data06; + + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 4; + } else { +#ifdef UNIT + data05 = *(ao2 + 0); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = data05; + b[ 3] = ONE; +#else + data01 = *(ao1 + 0); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data05; + b[ 3] = data06; + +#endif + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 4; + } + + X += 2; + i --; + } while (i > 0); + } + + i = (m & 1); + if (i) { + + if (X < posY) { + ao1 += 2; + b += 2; + } else + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; + + ao1 += lda; + b += 2; + } else { 
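+			/* on-diagonal 1x2 tail: the first slot is the diagonal
+			   element (ONE when UNIT), the second is always ZERO */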
+#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + data01 = *(ao1 + 0); + + b[ 0] = data01; + b[ 1] = ZERO; +#endif + b += 2; + } + } + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + } + + i = m; + if (m > 0) { + do { + + if (X < posY) { + b += 1; + ao1 += 1; + } else + if (X > posY) { + data01 = *(ao1 + 0); + b[ 0] = data01; + ao1 += lda; + b += 1; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + data01 = *(ao1 + 0); + b[ 0] = data01; +#endif + ao1 += lda; + b += 1; + } + + X += 1; + i --; + } while (i > 0); + } + } + + return 0; +} diff --git a/kernel/generic/trsm_kernel_LN.c b/kernel/generic/trsm_kernel_LN.c index 068a202b8..931cba377 100644 --- a/kernel/generic/trsm_kernel_LN.c +++ b/kernel/generic/trsm_kernel_LN.c @@ -58,6 +58,10 @@ static FLOAT dm1 = -1.; #define GEMM_UNROLL_M_SHIFT 2 #endif +#if GEMM_DEFAULT_UNROLL_M == 6 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + #if GEMM_DEFAULT_UNROLL_M == 8 #define GEMM_UNROLL_M_SHIFT 3 #endif diff --git a/kernel/generic/trsm_kernel_LT.c b/kernel/generic/trsm_kernel_LT.c index 300fdd483..099624252 100644 --- a/kernel/generic/trsm_kernel_LT.c +++ b/kernel/generic/trsm_kernel_LT.c @@ -58,6 +58,10 @@ static FLOAT dm1 = -1.; #define GEMM_UNROLL_M_SHIFT 2 #endif +#if GEMM_DEFAULT_UNROLL_M == 6 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + #if GEMM_DEFAULT_UNROLL_M == 8 #define GEMM_UNROLL_M_SHIFT 3 #endif diff --git a/kernel/generic/trsm_kernel_RN.c b/kernel/generic/trsm_kernel_RN.c index b85c3c1e9..d7e650e0c 100644 --- a/kernel/generic/trsm_kernel_RN.c +++ b/kernel/generic/trsm_kernel_RN.c @@ -58,6 +58,10 @@ static FLOAT dm1 = -1.; #define GEMM_UNROLL_M_SHIFT 2 #endif +#if GEMM_DEFAULT_UNROLL_M == 6 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + #if GEMM_DEFAULT_UNROLL_M == 8 #define GEMM_UNROLL_M_SHIFT 3 #endif diff --git a/kernel/generic/trsm_kernel_RT.c b/kernel/generic/trsm_kernel_RT.c index 2adb3a4f7..a46945330 100644 --- a/kernel/generic/trsm_kernel_RT.c +++ b/kernel/generic/trsm_kernel_RT.c @@ -58,6 +58,11 @@ static FLOAT dm1 = -1.; #define GEMM_UNROLL_M_SHIFT 2 #endif +#if GEMM_DEFAULT_UNROLL_M == 6 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + + #if GEMM_DEFAULT_UNROLL_M == 8 #define GEMM_UNROLL_M_SHIFT 3 #endif diff --git a/kernel/generic/trsm_lncopy_6.c b/kernel/generic/trsm_lncopy_6.c new file mode 100644 index 000000000..9f7bcc2dd --- /dev/null +++ b/kernel/generic/trsm_lncopy_6.c @@ -0,0 +1,326 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+#ifndef UNIT
+#define INV(a) (ONE / (a))
+#else
+#define INV(a) (ONE)
+#endif
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){
+
+  BLASLONG i, ii, j, jj;
+
+  FLOAT data01, data02, data03, data04, data05, data06, data07, data08;
+  FLOAT data09, data10, data11, data12, data13, data14, data15, data16;
+  FLOAT *a1, *a2, *a3, *a4;
+
+  jj = offset;
+
+  j = (n >> 2);
+  while (j > 0){
+
+    a1 = a + 0 * lda;
+    a2 = a + 1 * lda;
+    a3 = a + 2 * lda;
+    a4 = a + 3 * lda;
+
+    i = (m >> 2);
+    ii = 0;
+    while (i > 0) {
+
+      if (ii == jj) {
+#ifndef UNIT
+        data01 = *(a1 + 0);
+#endif
+        data02 = *(a1 + 1);
+        data03 = *(a1 + 2);
+        data04 = *(a1 + 3);
+
+#ifndef UNIT
+        data06 = *(a2 + 1);
+#endif
+        data07 = *(a2 + 2);
+        data08 = *(a2 + 3);
+
+#ifndef UNIT
+        data11 = *(a3 + 2);
+#endif
+        data12 = *(a3 + 3);
+
+#ifndef UNIT
+        data16 = *(a4 + 3);
+#endif
+
+        *(b + 0) = INV(data01);
+
+        *(b + 4) = data02;
+        *(b + 5) = INV(data06);
+
+        *(b + 8) = data03;
+        *(b + 9) = data07;
+        *(b + 10) = INV(data11);
+
+        *(b + 12) = data04;
+        *(b + 13) = data08;
+        *(b + 14) = data12;
+        *(b + 15) = INV(data16);
+      }
+
+      if (ii > jj) {
+
+        data01 = *(a1 + 0);
+        data02 = *(a1 + 1);
+        data03 = *(a1 + 2);
+        data04 = *(a1 + 3);
+
+        data05 = *(a2 + 0);
+        data06 = *(a2 + 1);
+        data07 = *(a2 + 2);
+        data08 = *(a2 + 3);
+
+        data09 = *(a3 + 0);
+        data10 = *(a3 + 1);
+        data11 = *(a3 + 2);
+        data12 = *(a3 + 3);
+
+        data13 = *(a4 + 0);
+        data14 = *(a4 + 1);
+        data15 = *(a4 + 2);
+        data16 = *(a4 + 3);
+
+        *(b + 0) = data01;
+        *(b + 1) = data05;
+        *(b + 2) = data09;
+        *(b + 3) = data13;
+        *(b + 4) = data02;
+        *(b + 5) = data06;
+        *(b + 6) = data10;
+        *(b + 7) = data14;
+
+        *(b + 8) = data03;
+        *(b + 9) = data07;
+        *(b + 10) = data11;
+        *(b + 11) = data15;
+        *(b + 12) = data04;
+        *(b + 13) = data08;
+        *(b + 14) = data12;
+        *(b + 15) = data16;
+      }
+
+      a1 += 4;
+      a2 += 4;
+      a3 += 4;
+      a4 += 4;
+      b += 16;
+
+      i --;
+      ii += 4;
+    }
+
+    if ((m & 2) != 0) {
+
+      if (ii == jj) {
+#ifndef UNIT
+        data01 = *(a1 + 0);
+#endif
+        data02 = *(a1 + 1);
+
+#ifndef UNIT
+        data06 = *(a2 + 1);
+#endif
+
+        *(b + 0) = INV(data01);
+
+        *(b + 4) = data02;
+        *(b + 5) = INV(data06);
+      }
+
+      if (ii > jj) {
+        data01 = *(a1 + 0);
+        data02 = *(a1 + 1);
+        data03 = *(a2 + 0);
+        data04 = *(a2 + 1);
+        data05 = *(a3 + 0);
+        data06 = *(a3 + 1);
+        data07 = *(a4 + 0);
+        data08 = *(a4 + 1);
+
+        *(b + 0) = data01;
+        *(b + 1) = data03;
+        *(b + 2) = data05;
+        *(b + 3) = data07;
+        *(b + 4) = data02;
+        *(b + 5) = data04;
+        *(b + 6) = data06;
+        *(b + 7) = data08;
+      }
+
+      a1 += 2;
+      a2 += 2;
+      a3 += 2;
+      a4 +=
2; + b += 8; + + ii += 2; + } + + if ((m & 1) != 0) { + + if (ii== jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a2 + 0); + data03 = *(a3 + 0); + data04 = *(a4 + 0); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + } + b += 4; + } + + a += 4 * lda; + jj += 4; + j --; + } + + if (n & 2) { + a1 = a + 0 * lda; + a2 = a + 1 * lda; + + i = (m >> 1); + ii = 0; + while (i > 0) { + + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + +#ifndef UNIT + data04 = *(a2 + 1); +#endif + + *(b + 0) = INV(data01); + *(b + 2) = data02; + *(b + 3) = INV(data04); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a2 + 0); + data04 = *(a2 + 1); + + *(b + 0) = data01; + *(b + 1) = data03; + *(b + 2) = data02; + *(b + 3) = data04; + } + + a1 += 2; + a2 += 2; + b += 4; + + i --; + ii += 2; + } + + if ((m & 1) != 0) { + + if (ii== jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a2 + 0); + *(b + 0) = data01; + *(b + 1) = data02; + } + b += 2; + } + a += 2 * lda; + jj += 2; + } + + if (n & 1) { + a1 = a + 0 * lda; + + i = m; + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii > jj) { + data01 = *(a1 + 0); + *(b + 0) = data01; + } + + a1+= 1; + b += 1; + i --; + ii += 1; + } + } + + return 0; +} diff --git a/kernel/generic/trsm_ltcopy_6.c b/kernel/generic/trsm_ltcopy_6.c new file mode 100644 index 000000000..d891468a4 --- /dev/null +++ b/kernel/generic/trsm_ltcopy_6.c @@ -0,0 +1,346 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+#ifndef UNIT
+#define INV(a) (ONE / (a))
+#else
+#define INV(a) (ONE)
+#endif
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){
+
+  BLASLONG i, ii, j, jj;
+
+  FLOAT data01, data02, data03, data04, data05, data06, data07, data08;
+  FLOAT data09, data10, data11, data12, data13, data14, data15, data16;
+  FLOAT *a1, *a2, *a3, *a4;
+
+  jj = offset;
+
+  j = (n >> 2);
+  while (j > 0){
+
+    a1 = a + 0 * lda;
+    a2 = a + 1 * lda;
+    a3 = a + 2 * lda;
+    a4 = a + 3 * lda;
+
+    i = (m >> 2);
+    ii = 0;
+    while (i > 0) {
+
+      if (ii == jj) {
+
+#ifndef UNIT
+        data01 = *(a1 + 0);
+#endif
+        data02 = *(a1 + 1);
+        data03 = *(a1 + 2);
+        data04 = *(a1 + 3);
+
+#ifndef UNIT
+        data06 = *(a2 + 1);
+#endif
+        data07 = *(a2 + 2);
+        data08 = *(a2 + 3);
+
+#ifndef UNIT
+        data11 = *(a3 + 2);
+#endif
+        data12 = *(a3 + 3);
+
+#ifndef UNIT
+        data16 = *(a4 + 3);
+#endif
+
+        *(b + 0) = INV(data01);
+        *(b + 1) = data02;
+        *(b + 2) = data03;
+        *(b + 3) = data04;
+
+        *(b + 5) = INV(data06);
+        *(b + 6) = data07;
+        *(b + 7) = data08;
+
+        *(b + 10) = INV(data11);
+        *(b + 11) = data12;
+
+        *(b + 15) = INV(data16);
+      }
+
+      if (ii < jj) {
+        data01 = *(a1 + 0);
+        data02 = *(a1 + 1);
+        data03 = *(a1 + 2);
+        data04 = *(a1 + 3);
+
+        data05 = *(a2 + 0);
+        data06 = *(a2 + 1);
+        data07 = *(a2 + 2);
+        data08 = *(a2 + 3);
+
+        data09 = *(a3 + 0);
+        data10 = *(a3 + 1);
+        data11 = *(a3 + 2);
+        data12 = *(a3 + 3);
+
+        data13 = *(a4 + 0);
+        data14 = *(a4 + 1);
+        data15 = *(a4 + 2);
+        data16 = *(a4 + 3);
+
+        *(b + 0) = data01;
+        *(b + 1) = data02;
+        *(b + 2) = data03;
+        *(b + 3) = data04;
+        *(b + 4) = data05;
+        *(b + 5) = data06;
+        *(b + 6) = data07;
+        *(b + 7) = data08;
+
+        *(b + 8) = data09;
+        *(b + 9) = data10;
+        *(b + 10) = data11;
+        *(b + 11) = data12;
+        *(b + 12) = data13;
+        *(b + 13) = data14;
+        *(b + 14) = data15;
+        *(b + 15) = data16;
+      }
+
+      a1 += 4 * lda;
+      a2 += 4 * lda;
+      a3 += 4 * lda;
+      a4 += 4 * lda;
+      b += 16;
+
+      i --;
+      ii += 4;
+    }
+
+    if ((m & 2) != 0) {
+
+      if (ii == jj) {
+
+#ifndef UNIT
+        data01 = *(a1 + 0);
+#endif
+        data02 = *(a1 + 1);
+        data03 = *(a1 + 2);
+        data04 = *(a1 + 3);
+
+#ifndef UNIT
+        data06 = *(a2 + 1);
+#endif
+        data07 = *(a2 + 2);
+        data08 = *(a2 + 3);
+
+        *(b + 0) = INV(data01);
+        *(b + 1) = data02;
+        *(b + 2) = data03;
+        *(b + 3) = data04;
+
+        *(b + 5) = INV(data06);
+        *(b + 6) = data07;
+        *(b + 7) = data08;
+
+      }
+
+      if (ii < jj) {
+        data01 = *(a1 + 0);
+        data02 = *(a1 + 1);
+        data03 = *(a1 + 2);
+        data04 = *(a1 + 3);
+
+        data05 = *(a2 + 0);
+        data06 = *(a2 + 1);
+        data07 = *(a2 + 2);
+        data08 = *(a2 + 3);
+
+        *(b + 0) = data01;
+        *(b + 1) = data02;
+        *(b + 2) = data03;
+        *(b + 3) = data04;
+        *(b + 4) = data05;
+        *(b + 5) = data06;
+        *(b + 6) = data07;
+        *(b + 7) = data08;
+      }
+
+      a1 += 2 * lda;
+      a2 += 2 * lda;
+      b += 8;
+
+      ii += 2;
+    }
+
+    if ((m & 1) != 0) {
+
+      if (ii == jj) {
+
+#ifndef UNIT
+        data01 = *(a1 + 0);
+#endif
+        data02 = *(a1 + 1);
+        data03 = *(a1 + 2);
+        data04 = *(a1 + 3);
+
+        *(b + 0) = INV(data01);
+        *(b + 1) = data02;
+        *(b + 2) = data03;
+        *(b + 3) = data04;
+      }
+
+      if (ii < jj) {
+        data01 = *(a1 + 0);
+        data02 = *(a1 + 1);
+        data03 = *(a1 + 2);
+        data04 = *(a1 + 3);
+
+        *(b + 0) =
data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + } + b += 4; + } + + a += 4; + jj += 4; + j --; + } + + if (n & 2) { + a1 = a + 0 * lda; + a2 = a + 1 * lda; + + i = (m >> 1); + ii = 0; + while (i > 0) { + + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + +#ifndef UNIT + data04 = *(a2 + 1); +#endif + + *(b + 0) = INV(data01); + *(b + 1) = data02; + + *(b + 3) = INV(data04); + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a2 + 0); + data04 = *(a2 + 1); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + } + + a1 += 2 * lda; + a2 += 2 * lda; + b += 4; + + i --; + ii += 2; + } + + if ((m & 1) != 0) { + + if (ii== jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + *(b + 0) = data01; + *(b + 1) = data02; + } + b += 2; + } + a += 2; + jj += 2; + } + + if (n & 1) { + a1 = a + 0 * lda; + + i = m; + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii < jj) { + data01 = *(a1 + 0); + *(b + 0) = data01; + } + + a1 += 1 * lda; + b += 1; + + i --; + ii += 1; + } + } + + return 0; +} diff --git a/kernel/generic/trsm_uncopy_6.c b/kernel/generic/trsm_uncopy_6.c new file mode 100644 index 000000000..837a25019 --- /dev/null +++ b/kernel/generic/trsm_uncopy_6.c @@ -0,0 +1,350 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/
+/*********************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+#ifndef UNIT
+#define INV(a) (ONE / (a))
+#else
+#define INV(a) (ONE)
+#endif
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){
+
+  BLASLONG i, ii, j, jj;
+
+  FLOAT data01, data02, data03, data04, data05, data06, data07, data08;
+  FLOAT data09, data10, data11, data12, data13, data14, data15, data16;
+  FLOAT *a1, *a2, *a3, *a4;
+
+  jj = offset;
+
+  j = (n >> 2);
+  while (j > 0){
+
+    a1 = a + 0 * lda;
+    a2 = a + 1 * lda;
+    a3 = a + 2 * lda;
+    a4 = a + 3 * lda;
+
+    i = (m >> 2);
+    ii = 0;
+    while (i > 0) {
+
+      if (ii == jj) {
+
+#ifndef UNIT
+        data01 = *(a1 + 0);
+#endif
+
+        data05 = *(a2 + 0);
+#ifndef UNIT
+        data06 = *(a2 + 1);
+#endif
+
+        data09 = *(a3 + 0);
+        data10 = *(a3 + 1);
+#ifndef UNIT
+        data11 = *(a3 + 2);
+#endif
+
+        data13 = *(a4 + 0);
+        data14 = *(a4 + 1);
+        data15 = *(a4 + 2);
+#ifndef UNIT
+        data16 = *(a4 + 3);
+#endif
+
+        *(b + 0) = INV(data01);
+        *(b + 1) = data05;
+        *(b + 2) = data09;
+        *(b + 3) = data13;
+
+        *(b + 5) = INV(data06);
+        *(b + 6) = data10;
+        *(b + 7) = data14;
+
+        *(b + 10) = INV(data11);
+        *(b + 11) = data15;
+
+        *(b + 15) = INV(data16);
+      }
+
+      if (ii < jj) {
+
+        data01 = *(a1 + 0);
+        data02 = *(a1 + 1);
+        data03 = *(a1 + 2);
+        data04 = *(a1 + 3);
+
+        data05 = *(a2 + 0);
+        data06 = *(a2 + 1);
+        data07 = *(a2 + 2);
+        data08 = *(a2 + 3);
+
+        data09 = *(a3 + 0);
+        data10 = *(a3 + 1);
+        data11 = *(a3 + 2);
+        data12 = *(a3 + 3);
+
+        data13 = *(a4 + 0);
+        data14 = *(a4 + 1);
+        data15 = *(a4 + 2);
+        data16 = *(a4 + 3);
+
+        *(b + 0) = data01;
+        *(b + 1) = data05;
+        *(b + 2) = data09;
+        *(b + 3) = data13;
+        *(b + 4) = data02;
+        *(b + 5) = data06;
+        *(b + 6) = data10;
+        *(b + 7) = data14;
+
+        *(b + 8) = data03;
+        *(b + 9) = data07;
+        *(b + 10) = data11;
+        *(b + 11) = data15;
+        *(b + 12) = data04;
+        *(b + 13) = data08;
+        *(b + 14) = data12;
+        *(b + 15) = data16;
+      }
+
+      a1 += 4;
+      a2 += 4;
+      a3 += 4;
+      a4 += 4;
+      b += 16;
+
+      i --;
+      ii += 4;
+    }
+
+    if ((m & 2) != 0) {
+
+      if (ii == jj) {
+
+#ifndef UNIT
+        data01 = *(a1 + 0);
+#endif
+
+        data05 = *(a2 + 0);
+#ifndef UNIT
+        data06 = *(a2 + 1);
+#endif
+
+        data09 = *(a3 + 0);
+        data10 = *(a3 + 1);
+
+        data13 = *(a4 + 0);
+        data14 = *(a4 + 1);
+
+        *(b + 0) = INV(data01);
+        *(b + 1) = data05;
+        *(b + 2) = data09;
+        *(b + 3) = data13;
+
+        *(b + 5) = INV(data06);
+        *(b + 6) = data10;
+        *(b + 7) = data14;
+      }
+
+      if (ii < jj) {
+        data01 = *(a1 + 0);
+        data02 = *(a1 + 1);
+        data03 = *(a2 + 0);
+        data04 = *(a2 + 1);
+        data05 = *(a3 + 0);
+        data06 = *(a3 + 1);
+        data07 = *(a4 + 0);
+        data08 = *(a4 + 1);
+
+        *(b + 0) = data01;
+        *(b + 1) = data02;
+        *(b + 2) = data03;
+        *(b + 3) = data04;
+        *(b + 4) = data05;
+        *(b + 5) = data06;
+        *(b + 6) = data07;
+        *(b + 7) = data08;
+      }
+
+      a1 += 2;
+      a2 += 2;
+      b += 8;
+
+      ii += 2;
+    }
+
+    if ((m & 1) != 0) {
+
+      if (ii == jj) {
+#ifndef UNIT
+        data01 = *(a1 + 0);
+#endif
+
+        data05 = *(a2 + 0);
+        data09 = *(a3 + 0);
+        data13 = *(a4 + 0);
+
+        *(b + 0) = INV(data01);
+        *(b + 1) = data05;
+        *(b + 2) = data09;
+        *(b + 3) = data13;
+      }
+
+      if (ii < jj) {
+        data01 = *(a1 + 0);
+        data02 = *(a2 + 0);
+        data03 = *(a3 + 0);
+        data04 = *(a4 + 0);
+
+        *(b + 0) = data01;
+        *(b + 1) = data02;
+        *(b + 2) = data03;
+        *(b + 3) = data04;
+      }
+      b += 4;
+    }
+
+    a += 4 * lda;
+    jj += 4;
+    j --;
+  }
+
+  if (n & 2) {
+    a1 = a + 0 * lda;
+    a2 = a + 1 * lda;
+
+    i = (m >> 1);
+    ii = 0;
+    while (i > 0) {
+
+      if (ii == jj) {
+
+#ifndef UNIT
+        data01 = *(a1 + 0);
+#endif + + data03 = *(a2 + 0); +#ifndef UNIT + data04 = *(a2 + 1); +#endif + + *(b + 0) = INV(data01); + *(b + 1) = data03; + *(b + 3) = INV(data04); + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a2 + 0); + data04 = *(a2 + 1); + + *(b + 0) = data01; + *(b + 1) = data03; + *(b + 2) = data02; + *(b + 3) = data04; + } + + a1 += 2; + a2 += 2; + b += 4; + + i --; + ii += 2; + } + + if ((m & 1) != 0) { + + if (ii== jj) { + + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + + data03 = *(a2 + 0); + + *(b + 0) = INV(data01); + *(b + 1) = data03; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a2 + 0); + *(b + 0) = data01; + *(b + 1) = data02; + } + b += 2; + } + a += 2 * lda; + jj += 2; + } + + if (n & 1) { + a1 = a + 0 * lda; + + i = m; + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii < jj) { + data01 = *(a1 + 0); + *(b + 0) = data01; + } + + a1+= 1; + b += 1; + i --; + ii += 1; + } + } + + return 0; +} diff --git a/kernel/generic/trsm_utcopy_6.c b/kernel/generic/trsm_utcopy_6.c new file mode 100644 index 000000000..bbba78d53 --- /dev/null +++ b/kernel/generic/trsm_utcopy_6.c @@ -0,0 +1,322 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/
+/*********************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+#ifndef UNIT
+#define INV(a) (ONE / (a))
+#else
+#define INV(a) (ONE)
+#endif
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){
+
+  BLASLONG i, ii, j, jj;
+
+  FLOAT data01, data02, data03, data04, data05, data06, data07, data08;
+  FLOAT data09, data10, data11, data12, data13, data14, data15, data16;
+  FLOAT *a1, *a2, *a3, *a4;
+
+  jj = offset;
+
+  j = (n >> 2);
+  while (j > 0){
+
+    a1 = a + 0 * lda;
+    a2 = a + 1 * lda;
+    a3 = a + 2 * lda;
+    a4 = a + 3 * lda;
+
+    i = (m >> 2);
+    ii = 0;
+    while (i > 0) {
+
+      if (ii == jj) {
+#ifndef UNIT
+        data01 = *(a1 + 0);
+#endif
+
+        data05 = *(a2 + 0);
+#ifndef UNIT
+        data06 = *(a2 + 1);
+#endif
+
+        data09 = *(a3 + 0);
+        data10 = *(a3 + 1);
+#ifndef UNIT
+        data11 = *(a3 + 2);
+#endif
+
+        data13 = *(a4 + 0);
+        data14 = *(a4 + 1);
+        data15 = *(a4 + 2);
+#ifndef UNIT
+        data16 = *(a4 + 3);
+#endif
+
+        *(b + 0) = INV(data01);
+
+        *(b + 4) = data05;
+        *(b + 5) = INV(data06);
+
+        *(b + 8) = data09;
+        *(b + 9) = data10;
+        *(b + 10) = INV(data11);
+
+        *(b + 12) = data13;
+        *(b + 13) = data14;
+        *(b + 14) = data15;
+        *(b + 15) = INV(data16);
+      }
+
+      if (ii > jj) {
+        data01 = *(a1 + 0);
+        data02 = *(a1 + 1);
+        data03 = *(a1 + 2);
+        data04 = *(a1 + 3);
+
+        data05 = *(a2 + 0);
+        data06 = *(a2 + 1);
+        data07 = *(a2 + 2);
+        data08 = *(a2 + 3);
+
+        data09 = *(a3 + 0);
+        data10 = *(a3 + 1);
+        data11 = *(a3 + 2);
+        data12 = *(a3 + 3);
+
+        data13 = *(a4 + 0);
+        data14 = *(a4 + 1);
+        data15 = *(a4 + 2);
+        data16 = *(a4 + 3);
+
+        *(b + 0) = data01;
+        *(b + 1) = data02;
+        *(b + 2) = data03;
+        *(b + 3) = data04;
+        *(b + 4) = data05;
+        *(b + 5) = data06;
+        *(b + 6) = data07;
+        *(b + 7) = data08;
+
+        *(b + 8) = data09;
+        *(b + 9) = data10;
+        *(b + 10) = data11;
+        *(b + 11) = data12;
+        *(b + 12) = data13;
+        *(b + 13) = data14;
+        *(b + 14) = data15;
+        *(b + 15) = data16;
+      }
+
+      a1 += 4 * lda;
+      a2 += 4 * lda;
+      a3 += 4 * lda;
+      a4 += 4 * lda;
+      b += 16;
+
+      i --;
+      ii += 4;
+    }
+
+    if ((m & 2) != 0) {
+
+      if (ii == jj) {
+#ifndef UNIT
+        data01 = *(a1 + 0);
+#endif
+        data05 = *(a2 + 0);
+#ifndef UNIT
+        data06 = *(a2 + 1);
+#endif
+
+        *(b + 0) = INV(data01);
+
+        *(b + 4) = data05;
+        *(b + 5) = INV(data06);
+      }
+
+      if (ii > jj) {
+        data01 = *(a1 + 0);
+        data02 = *(a1 + 1);
+        data03 = *(a1 + 2);
+        data04 = *(a1 + 3);
+
+        data05 = *(a2 + 0);
+        data06 = *(a2 + 1);
+        data07 = *(a2 + 2);
+        data08 = *(a2 + 3);
+
+        *(b + 0) = data01;
+        *(b + 1) = data02;
+        *(b + 2) = data03;
+        *(b + 3) = data04;
+        *(b + 4) = data05;
+        *(b + 5) = data06;
+        *(b + 6) = data07;
+        *(b + 7) = data08;
+      }
+
+      a1 += 2 * lda;
+      a2 += 2 * lda;
+      b += 8;
+
+      ii += 2;
+    }
+
+    if ((m & 1) != 0) {
+
+      if (ii == jj) {
+#ifndef UNIT
+        data01 = *(a1 + 0);
+#endif
+        *(b + 0) = INV(data01);
+      }
+
+      if (ii > jj) {
+        data01 = *(a1 + 0);
+        data02 = *(a1 + 1);
+        data03 = *(a1 + 2);
+        data04 = *(a1 + 3);
+
+        *(b + 0) = data01;
+        *(b + 1) = data02;
+        *(b + 2) = data03;
+        *(b + 3) = data04;
+      }
+      b += 4;
+    }
+
+    a += 4;
+    jj += 4;
+    j --;
+  }
+
+  if (n & 2) {
+    a1 = a + 0 * lda;
+    a2 = a + 1 * lda;
+
+    i = (m >> 1);
+    ii = 0;
+    while (i > 0) {
+
+      if (ii == jj) {
+#ifndef UNIT
+        data01 = *(a1 + 0);
+#endif
+        data03 = *(a2 + 0);
+#ifndef UNIT
+        data04 = *(a2 + 1);
+#endif
+
+        *(b + 0) = INV(data01);
+        *(b + 2) = data03;
+        *(b + 3) = INV(data04);
+      }
+
+      if (ii > jj) {
+        data01 = *(a1 + 0);
+        data02 = *(a1 + 1);
+        data03 = *(a2 + 0);
+        data04 = *(a2 + 1);
+
+        *(b + 0) = data01;
+        *(b +
1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + } + + a1 += 2 * lda; + a2 += 2 * lda; + b += 4; + + i --; + ii += 2; + } + + if ((m & 1) != 0) { + + if (ii== jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + *(b + 0) = data01; + *(b + 1) = data02; + } + b += 2; + } + a += 2; + jj += 2; + } + + if (n & 1) { + a1 = a + 0 * lda; + + i = m; + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii > jj) { + data01 = *(a1 + 0); + *(b + 0) = data01; + } + + a1 += 1 * lda; + b += 1; + + i --; + ii += 1; + } + } + + return 0; +} diff --git a/kernel/x86_64/KERNEL.PILEDRIVER b/kernel/x86_64/KERNEL.PILEDRIVER index 8ebd42244..bbec23ccf 100644 --- a/kernel/x86_64/KERNEL.PILEDRIVER +++ b/kernel/x86_64/KERNEL.PILEDRIVER @@ -16,15 +16,17 @@ SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) -DGEMMKERNEL = dgemm_kernel_8x2_bulldozer.S -DGEMMINCOPY = dgemm_ncopy_8_bulldozer.S -DGEMMITCOPY = dgemm_tcopy_8_bulldozer.S -DGEMMONCOPY = gemm_ncopy_2_bulldozer.S -DGEMMOTCOPY = gemm_tcopy_2_bulldozer.S + +DGEMMKERNEL = dgemm_kernel_6x4_piledriver.S +DGEMMINCOPY = ../generic/gemm_ncopy_6.c +DGEMMITCOPY = ../generic/gemm_tcopy_6.c +DGEMMONCOPY = ../generic/gemm_ncopy_4.c +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + CGEMMKERNEL = cgemm_kernel_4x2_bulldozer.S CGEMMINCOPY = ../generic/zgemm_ncopy_4.c CGEMMITCOPY = ../generic/zgemm_tcopy_4.c diff --git a/kernel/x86_64/dgemm_kernel_6x4_piledriver.S b/kernel/x86_64/dgemm_kernel_6x4_piledriver.S new file mode 100644 index 000000000..7b5dd1587 --- /dev/null +++ b/kernel/x86_64/dgemm_kernel_6x4_piledriver.S @@ -0,0 +1,1734 @@ +/**************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+// Register blocking = 6x4; the k loop is unrolled by 4.
+// Use FMA3 on Piledriver.
+// TODO: 1) handle the edge cases. 2) Add Windows ABI support.
+
+#define ASSEMBLER
+#include "common.h"
+
+#define STACKSIZE 128
+#define oldbk_i %rdi
+#define oldbk_j %rsi
+#define oldbk_l %rdx
+
+#define _bk_i %r13
+#define _bk_j %r14
+#define _bk_l %r15
+
+#define ALPHA %xmm0
+#define _ptr_A %rcx
+#define _ptr_B %r8
+#define _ptr_C %r9
+#define LDC %r10
+
+#define i %r11
+#define k %rax
+#define _pre_B %r12
+#define _ptr__A_0 %rdi
+#define _ptr__B_0 %rsi
+#define _ptr__C_0 %rbx
+#define _ptr__C_1 %rbp
+
+#define old_ldc 8+STACKSIZE(%rsp)
+#define alpha 48(%rsp)
+#define j 56(%rsp)
+
+#define MOVQ2560(s,d) movq s,d
+#define LEAQ2560(s,d) leaq s,d
+#define SARQ2560(imm,n) sarq imm,n
+#define ADDQ2560(off,addr) addq off,addr
+#define SUBQ2560(off,addr) subq off,addr
+#define DIVQ2560(off,addr) divq off,addr
+#define MULQ2560(s,d) mulq s,d
+#define DECQ2560(addr) decq addr
+#define NEGQ2560(s) negq s
+#define TESTQ2560(n,addr) testq n,addr
+#define SALQ2560(imm,n) salq imm,n
+
+#define MOVQ1280(s,d) movq s,d
+#define LEAQ1280(s,d) leaq s,d
+#define SARQ1280(imm,n) sarq imm,n
+#define ADDQ1280(off,addr) addq off,addr
+#define SUBQ1280(off,addr) subq off,addr
+#define DIVQ1280(off,addr) divq off,addr
+#define CMPQ1280(off,addr) cmpq off,addr
+#define MULQ1280(s,d) mulq s,d
+#define DECQ1280(addr) decq addr
+#define NEGQ1280(s) negq s
+#define TESTQ1280(n,addr) testq n,addr
+#define SALQ1280(imm,n) salq imm,n
+
+#define JG jg
+#define JLE jle
+
+#define VLD2560(addr,reg) vmovapd addr,reg
+#define VST2560(reg,addr) vmovapd reg,addr
+#define VMUL2560(a,b,c) vmulpd a,b,c
+#define MVMUL2560(a,b,c) vmulpd b,a,c
+#define VADD2560(a,b,c) vaddpd a,b,c
+#define MVADD2560(a,b,c) vaddpd b,a,c
+#define VSHUF2560(imm,s,d) vpermilpd imm,s,d
+#define VSHUF2F2560(imm,s1,s2,d) vperm2f128 imm,s1,s2,d
+#define BROAD2560(addr,reg) vbroadcastsd addr,reg
+#define MOVRR2560(a,b) vmovapd a,b
+#define REVS2560(imm,s1,s2,d) vshufpd imm,s1,s2,d
+#define EXTR2561(imm,a,b) vextractf128 imm,a,b
+#define LDL2561(addr,reg) vmovlpd addr,reg,reg
+#define LDH2561(addr,reg) vmovhpd addr,reg,reg
+#define STL2561(reg,addr) vmovlpd reg,addr
+#define STH2561(reg,addr) vmovhpd reg,addr
+#define VADD2561(a,b,c) vaddpd a,b,c
+#define VXOR2560(a,b,c) vxorpd a,b,c
+#define PREFETCH02560(addr,b) prefetcht0 addr
+#define PREFETCH12560(addr,b) prefetcht0 addr
+#define PREFETCH22560(addr,b) prefetcht2 addr
+#define PREFETCHW2560(addr,b) prefetchw addr
+#define PREFETCHN2560(addr,b) prefetchnta addr
+#define VMA2560(a,b,c,d) vfmaddpd d,a,b,c
+#define MVMA2560(a,b,c,d) vfmaddpd d,a,b,c
+
+#define VLD1280(addr,reg) vmovapd addr,reg
+#define VLD1282(addr,reg) vmovapd addr,reg
+#define VLD1281(addr,reg) movsd addr,reg
+#define VST1280(reg,addr) vmovapd reg,addr
+#define VST1282(reg,addr) vmovapd reg,addr
+#define VST1281(reg,addr) movsd reg,addr
+#define VLDU1282(addr,reg) vmovupd addr,reg
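+/* Naming note, inferred from the macro definitions above and below: the
+   2560-suffixed wrappers name 256-bit (ymm) forms and the 128x-suffixed
+   ones 128-bit (xmm) forms; a trailing 2 appears to mark a packed
+   two-double variant and a trailing 1 a scalar variant (e.g. VLD1282 ->
+   vmovapd on an xmm, VLD1281 -> movsd). The 6x4 kernel body below appears
+   to use only the 128-bit FMA3 variants. */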
+#define VLDU1281(addr,reg) movsd addr,reg +#define VSTU1282(reg,addr) vmovupd reg,addr +#define VSTU1281(reg,addr) movsd reg,addr +#define VMUL1280(a,b,c) vmulpd a,b,c +#define VMUL1282(a,b,c) vmulpd a,b,c +#define VMUL1281(a,b,c) vmulpd a,b,c +#define MVMUL1280(a,b,c) vmulpd b,a,c +#define VADD1280(a,b,c) vaddpd a,b,c +#define MVADD1280(a,b,c) vaddpd b,a,c +#define VSHUF1280(imm,s,d) vpermilpd imm,s,d +#define VSHUF2F1280(imm,s1,s2,d) vperm2f128 imm,s1,s2,d +#define BROAD1280(addr,reg) vmovddup addr,reg +#define BROAD1282(addr,reg) vmovddup addr,reg +#define BROAD1281(addr,reg) movddup addr,reg +#define MOVRR1280(a,b) vmovapd a,b +#define REVS1280(imm,s1,s2,d) vshufpd imm,s1,s2,d +#define EXTR1281(imm,a,b) vextractf128 imm,a,b +#define LDL1281(addr,reg) vmovlpd addr,reg,reg +#define LDH1281(addr,reg) vmovhpd addr,reg,reg +#define STL1281(reg,addr) vmovlpd reg,addr +#define STH1281(reg,addr) vmovhpd reg,addr +#define VADD1281(a,b,c) vaddpd a,b,c +#define VXOR1280(a,b,c) vxorpd a,b,c +#define VXOR1282(a,b,c) vxorpd a,b,c +#define VXOR1281(a,b,c) vxorpd a,b,c +#define PREFETCH01280(addr,b) prefetcht0 addr +#define PREFETCH11280(addr,b) prefetcht0 addr +#define PREFETCH21280(addr,b) prefetcht2 addr +#define PREFETCHW1280(addr,b) prefetchw addr +#define PREFETCHN1280(addr,b) prefetchnta addr +#define VMA1280(a,b,c,d) vfmaddpd d,a,b,c +#define VMA1282(a,b,c,d) vfmadd231pd a,b,c +#define VMA1281(a,b,c,d) vfmadd231pd a,b,c +#define VMA21282(a,b,c,d) vfmadd231pd a,b,c +#define VMA21281(a,b,c,d) vfmadd231pd a,b,c +//#define VMA1282(a,b,c,d) nop +//#define VMA1281(a,b,c,d) nop +//#define VMA21282(a,b,c,d) nop +//#define VMA21281(a,b,c,d) nop +#define MVMA1280(a,b,c,d) vfmaddpd d,a,b,c + +#define imm1 $0x05 +#define imm3 $0x05 +#define imm100 $0x05 +#define imm200 $0x0a + +#define XMM0 %xmm0 +#define XMM1 %xmm1 +#define XMM2 %xmm2 +#define XMM3 %xmm3 +#define XMM4 %xmm4 +#define XMM5 %xmm5 +#define XMM6 %xmm6 +#define XMM7 %xmm7 +#define XMM8 %xmm8 +#define XMM9 %xmm9 +#define XMM10 %xmm10 +#define XMM11 %xmm11 +#define XMM12 %xmm12 +#define XMM13 %xmm13 +#define XMM14 %xmm14 +#define XMM15 %xmm15 + +#define YMM0 %ymm0 +#define YMM1 %ymm1 +#define YMM2 %ymm2 +#define YMM3 %ymm3 +#define YMM4 %ymm4 +#define YMM5 %ymm5 +#define YMM6 %ymm6 +#define YMM7 %ymm7 +#define YMM8 %ymm8 +#define YMM9 %ymm9 +#define YMM10 %ymm10 +#define YMM11 %ymm11 +#define YMM12 %ymm12 +#define YMM13 %ymm13 +#define YMM14 %ymm14 +#define YMM15 %ymm15 +PROLOGUE + +subq $STACKSIZE, %rsp; +movq %rbx, 0(%rsp); +movq %rbp, 8(%rsp); +movq %r12, 16(%rsp); +movq %r13, 24(%rsp); +movq %r14, 32(%rsp); +movq %r15, 40(%rsp); +vzeroupper +movl old_ldc, %eax +movq %rax, LDC +movlps ALPHA, alpha +movq oldbk_i, _bk_i +movq oldbk_j, _bk_j +movq oldbk_l, _bk_l +leaq (, LDC, SIZE), LDC + +MOVQ1280(_bk_j,j); +SARQ1280($2,j); +JLE ._L_0_loopE; +ALIGN_4; +._L_0_bodyB:; +MOVQ1280(_ptr_A,_ptr__A_0); +MOVQ1280(_ptr_C,_ptr__C_0); +LEAQ1280((_ptr_C,LDC,2),_ptr__C_1); +MOVQ1280(_bk_l,%rax); +SALQ1280($5,%rax); +ADDQ1280(%rax,_pre_B); +MOVQ1280(_bk_i,i); +CMPQ1280($6,i); +JL ._L_1_loopE; +._L_1_bodyB:; +MOVQ1280(_ptr_B,_ptr__B_0); +VXOR1282(XMM0,XMM0,XMM0); +VXOR1282(XMM1,XMM1,XMM1); +VXOR1282(XMM2,XMM2,XMM2); +VXOR1282(XMM3,XMM3,XMM3); +VXOR1282(XMM4,XMM4,XMM4); +VXOR1282(XMM5,XMM5,XMM5); +VXOR1282(XMM6,XMM6,XMM6); +VXOR1282(XMM7,XMM7,XMM7); +VXOR1282(XMM8,XMM8,XMM8); +VXOR1282(XMM9,XMM9,XMM9); +VXOR1282(XMM10,XMM10,XMM10); +VXOR1282(XMM11,XMM11,XMM11); +PREFETCHN1280(3*SIZE(_ptr__C_0),N); +PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); 
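+/* Accumulator layout of the 6x4 block, as set up by the FMA sequence in
+   the k loop below: xmm12-xmm14 hold the six A values as three two-double
+   pairs, xmm15 holds the broadcast B value, and xmm0-xmm2, xmm3-xmm5,
+   xmm6-xmm8, xmm9-xmm11 accumulate the three C pairs for B columns 0..3. */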
+PREFETCHN1280(3*SIZE(_ptr__C_1),N); +PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); +MOVQ1280(_bk_l,k); +SARQ1280($2,k); +JLE ._L_2_loopE; +ALIGN_4; +._L_2_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(4*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM3,XMM3); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM6,XMM6); +VMA1282(XMM13,XMM15,XMM7,XMM7); +VMA1282(XMM14,XMM15,XMM8,XMM8); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM9,XMM9); +VMA1282(XMM13,XMM15,XMM10,XMM10); +VMA1282(XMM14,XMM15,XMM11,XMM11); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(4*SIZE(_ptr__B_0),XMM15); +VLD1282(6*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(8*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(10*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(5*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM3,XMM3); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +BROAD1282(6*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM6,XMM6); +VMA1282(XMM13,XMM15,XMM7,XMM7); +VMA1282(XMM14,XMM15,XMM8,XMM8); +BROAD1282(7*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM9,XMM9); +VMA1282(XMM13,XMM15,XMM10,XMM10); +VMA1282(XMM14,XMM15,XMM11,XMM11); +PREFETCH01280(176*SIZE(_ptr__A_0),0); +BROAD1282(8*SIZE(_ptr__B_0),XMM15); +VLD1282(12*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(14*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(16*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(9*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM3,XMM3); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +BROAD1282(10*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM6,XMM6); +VMA1282(XMM13,XMM15,XMM7,XMM7); +VMA1282(XMM14,XMM15,XMM8,XMM8); +BROAD1282(11*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM9,XMM9); +VMA1282(XMM13,XMM15,XMM10,XMM10); +VMA1282(XMM14,XMM15,XMM11,XMM11); +PREFETCH21280(184*SIZE(_ptr__A_0),2); +BROAD1282(12*SIZE(_ptr__B_0),XMM15); +VLD1282(18*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(20*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(22*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(13*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM3,XMM3); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +BROAD1282(14*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM6,XMM6); +VMA1282(XMM13,XMM15,XMM7,XMM7); +VMA1282(XMM14,XMM15,XMM8,XMM8); +BROAD1282(15*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM9,XMM9); +VMA1282(XMM13,XMM15,XMM10,XMM10); +VMA1282(XMM14,XMM15,XMM11,XMM11); +ADDQ1280($24*SIZE,_ptr__A_0); +ADDQ1280($16*SIZE,_ptr__B_0); +._L_2_bodyE:; +DECQ1280(k); +JG ._L_2_bodyB; +ALIGN_4; +._L_2_loopE:; +TESTQ1280($2,_bk_l); +JLE ._L_3_loopE; +ALIGN_4; +._L_3_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(4*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM3,XMM3); +VMA1282(XMM13,XMM15,XMM4,XMM4); 
+VMA1282(XMM14,XMM15,XMM5,XMM5); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM6,XMM6); +VMA1282(XMM13,XMM15,XMM7,XMM7); +VMA1282(XMM14,XMM15,XMM8,XMM8); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM9,XMM9); +VMA1282(XMM13,XMM15,XMM10,XMM10); +VMA1282(XMM14,XMM15,XMM11,XMM11); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(4*SIZE(_ptr__B_0),XMM15); +VLD1282(6*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(8*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(10*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(5*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM3,XMM3); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +BROAD1282(6*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM6,XMM6); +VMA1282(XMM13,XMM15,XMM7,XMM7); +VMA1282(XMM14,XMM15,XMM8,XMM8); +BROAD1282(7*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM9,XMM9); +VMA1282(XMM13,XMM15,XMM10,XMM10); +VMA1282(XMM14,XMM15,XMM11,XMM11); +ADDQ1280($12*SIZE,_ptr__A_0); +ADDQ1280($8*SIZE,_ptr__B_0); +._L_3_loopE:; +TESTQ1280($1,_bk_l); +JLE ._L_4_loopE; +ALIGN_4; +._L_4_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(4*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM3,XMM3); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM6,XMM6); +VMA1282(XMM13,XMM15,XMM7,XMM7); +VMA1282(XMM14,XMM15,XMM8,XMM8); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM9,XMM9); +VMA1282(XMM13,XMM15,XMM10,XMM10); +VMA1282(XMM14,XMM15,XMM11,XMM11); +ADDQ1280($6*SIZE,_ptr__A_0); +ADDQ1280($4*SIZE,_ptr__B_0); +._L_4_loopE:; +BROAD1282(alpha,XMM12); +VLDU1282(0*SIZE(_ptr__C_0),XMM13); +VMA21282(XMM12,XMM0,XMM13,XMM0); +VSTU1282(XMM13,0*SIZE(_ptr__C_0)); +VLDU1282(2*SIZE(_ptr__C_0),XMM14); +VMA21282(XMM12,XMM1,XMM14,XMM1); +VSTU1282(XMM14,2*SIZE(_ptr__C_0)); +VLDU1282(4*SIZE(_ptr__C_0),XMM15); +VMA21282(XMM12,XMM2,XMM15,XMM2); +VSTU1282(XMM15,4*SIZE(_ptr__C_0)); +VLDU1282(0*SIZE(_ptr__C_0,LDC,1),XMM13); +VMA21282(XMM12,XMM3,XMM13,XMM3); +VSTU1282(XMM13,0*SIZE(_ptr__C_0,LDC,1)); +VLDU1282(2*SIZE(_ptr__C_0,LDC,1),XMM14); +VMA21282(XMM12,XMM4,XMM14,XMM4); +VSTU1282(XMM14,2*SIZE(_ptr__C_0,LDC,1)); +VLDU1282(4*SIZE(_ptr__C_0,LDC,1),XMM15); +VMA21282(XMM12,XMM5,XMM15,XMM5); +VSTU1282(XMM15,4*SIZE(_ptr__C_0,LDC,1)); +VLDU1282(0*SIZE(_ptr__C_1),XMM13); +VMA21282(XMM12,XMM6,XMM13,XMM6); +VSTU1282(XMM13,0*SIZE(_ptr__C_1)); +VLDU1282(2*SIZE(_ptr__C_1),XMM14); +VMA21282(XMM12,XMM7,XMM14,XMM7); +VSTU1282(XMM14,2*SIZE(_ptr__C_1)); +VLDU1282(4*SIZE(_ptr__C_1),XMM15); +VMA21282(XMM12,XMM8,XMM15,XMM8); +VSTU1282(XMM15,4*SIZE(_ptr__C_1)); +VLDU1282(0*SIZE(_ptr__C_1,LDC,1),XMM13); +VMA21282(XMM12,XMM9,XMM13,XMM9); +VSTU1282(XMM13,0*SIZE(_ptr__C_1,LDC,1)); +VLDU1282(2*SIZE(_ptr__C_1,LDC,1),XMM14); +VMA21282(XMM12,XMM10,XMM14,XMM10); +VSTU1282(XMM14,2*SIZE(_ptr__C_1,LDC,1)); +VLDU1282(4*SIZE(_ptr__C_1,LDC,1),XMM15); +VMA21282(XMM12,XMM11,XMM15,XMM11); +VSTU1282(XMM15,4*SIZE(_ptr__C_1,LDC,1)); +ADDQ1280($6*SIZE,_ptr__C_0); +ADDQ1280($6*SIZE,_ptr__C_1); +._L_1_bodyE:; +SUBQ1280($6,i); +JG ._L_1_bodyB; +ALIGN_4; +._L_1_loopE:; +TESTQ1280($4,i); +JLE ._L_5_loopE; +ALIGN_4; +._L_5_bodyB:; +MOVQ1280(_ptr_B,_ptr__B_0); +VXOR1282(XMM0,XMM0,XMM0); 
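+/* M&4 tail: judging from the loads below, eight xmm accumulators cover a
+   4x4 sub-block, two two-double pairs (xmm13/xmm14) per broadcast B value. */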
+VXOR1282(XMM1,XMM1,XMM1); +VXOR1282(XMM2,XMM2,XMM2); +VXOR1282(XMM3,XMM3,XMM3); +VXOR1282(XMM4,XMM4,XMM4); +VXOR1282(XMM5,XMM5,XMM5); +VXOR1282(XMM6,XMM6,XMM6); +VXOR1282(XMM7,XMM7,XMM7); +PREFETCHN1280(3*SIZE(_ptr__C_0),N); +PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); +PREFETCHN1280(3*SIZE(_ptr__C_1),N); +PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); +MOVQ1280(_bk_l,k); +SARQ1280($2,k); +JLE ._L_6_loopE; +ALIGN_4; +._L_6_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM2,XMM2); +VMA1282(XMM14,XMM15,XMM3,XMM3); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM6,XMM6); +VMA1282(XMM14,XMM15,XMM7,XMM7); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(4*SIZE(_ptr__B_0),XMM15); +VLD1282(4*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(6*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(5*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM2,XMM2); +VMA1282(XMM14,XMM15,XMM3,XMM3); +BROAD1282(6*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +BROAD1282(7*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM6,XMM6); +VMA1282(XMM14,XMM15,XMM7,XMM7); +PREFETCH01280(176*SIZE(_ptr__A_0),0); +BROAD1282(8*SIZE(_ptr__B_0),XMM15); +VLD1282(8*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(10*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(9*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM2,XMM2); +VMA1282(XMM14,XMM15,XMM3,XMM3); +BROAD1282(10*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +BROAD1282(11*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM6,XMM6); +VMA1282(XMM14,XMM15,XMM7,XMM7); +PREFETCH21280(184*SIZE(_ptr__A_0),2); +BROAD1282(12*SIZE(_ptr__B_0),XMM15); +VLD1282(12*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(14*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(13*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM2,XMM2); +VMA1282(XMM14,XMM15,XMM3,XMM3); +BROAD1282(14*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +BROAD1282(15*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM6,XMM6); +VMA1282(XMM14,XMM15,XMM7,XMM7); +ADDQ1280($16*SIZE,_ptr__A_0); +ADDQ1280($16*SIZE,_ptr__B_0); +._L_6_bodyE:; +DECQ1280(k); +JG ._L_6_bodyB; +ALIGN_4; +._L_6_loopE:; +TESTQ1280($2,_bk_l); +JLE ._L_7_loopE; +ALIGN_4; +._L_7_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM2,XMM2); +VMA1282(XMM14,XMM15,XMM3,XMM3); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM6,XMM6); +VMA1282(XMM14,XMM15,XMM7,XMM7); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(4*SIZE(_ptr__B_0),XMM15); +VLD1282(4*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(6*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(5*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM2,XMM2); 
+VMA1282(XMM14,XMM15,XMM3,XMM3); +BROAD1282(6*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +BROAD1282(7*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM6,XMM6); +VMA1282(XMM14,XMM15,XMM7,XMM7); +ADDQ1280($8*SIZE,_ptr__A_0); +ADDQ1280($8*SIZE,_ptr__B_0); +._L_7_loopE:; +TESTQ1280($1,_bk_l); +JLE ._L_8_loopE; +ALIGN_4; +._L_8_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM2,XMM2); +VMA1282(XMM14,XMM15,XMM3,XMM3); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM6,XMM6); +VMA1282(XMM14,XMM15,XMM7,XMM7); +ADDQ1280($4*SIZE,_ptr__A_0); +ADDQ1280($4*SIZE,_ptr__B_0); +._L_8_loopE:; +BROAD1282(alpha,XMM8); +VLDU1282(0*SIZE(_ptr__C_0),XMM9); +VMA21282(XMM8,XMM0,XMM9,XMM0); +VSTU1282(XMM9,0*SIZE(_ptr__C_0)); +VLDU1282(2*SIZE(_ptr__C_0),XMM10); +VMA21282(XMM8,XMM1,XMM10,XMM1); +VSTU1282(XMM10,2*SIZE(_ptr__C_0)); +VLDU1282(0*SIZE(_ptr__C_0,LDC,1),XMM11); +VMA21282(XMM8,XMM2,XMM11,XMM2); +VSTU1282(XMM11,0*SIZE(_ptr__C_0,LDC,1)); +VLDU1282(2*SIZE(_ptr__C_0,LDC,1),XMM12); +VMA21282(XMM8,XMM3,XMM12,XMM3); +VSTU1282(XMM12,2*SIZE(_ptr__C_0,LDC,1)); +VLDU1282(0*SIZE(_ptr__C_1),XMM13); +VMA21282(XMM8,XMM4,XMM13,XMM4); +VSTU1282(XMM13,0*SIZE(_ptr__C_1)); +VLDU1282(2*SIZE(_ptr__C_1),XMM14); +VMA21282(XMM8,XMM5,XMM14,XMM5); +VSTU1282(XMM14,2*SIZE(_ptr__C_1)); +VLDU1282(0*SIZE(_ptr__C_1,LDC,1),XMM15); +VMA21282(XMM8,XMM6,XMM15,XMM6); +VSTU1282(XMM15,0*SIZE(_ptr__C_1,LDC,1)); +VLDU1282(2*SIZE(_ptr__C_1,LDC,1),XMM9); +VMA21282(XMM8,XMM7,XMM9,XMM7); +VSTU1282(XMM9,2*SIZE(_ptr__C_1,LDC,1)); +ADDQ1280($4*SIZE,_ptr__C_0); +ADDQ1280($4*SIZE,_ptr__C_1); +._L_5_loopE:; +TESTQ1280($2,i); +JLE ._L_9_loopE; +ALIGN_4; +._L_9_bodyB:; +MOVQ1280(_ptr_B,_ptr__B_0); +VXOR1282(XMM0,XMM0,XMM0); +VXOR1282(XMM1,XMM1,XMM1); +VXOR1282(XMM2,XMM2,XMM2); +VXOR1282(XMM3,XMM3,XMM3); +PREFETCHN1280(3*SIZE(_ptr__C_0),N); +PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); +PREFETCHN1280(3*SIZE(_ptr__C_1),N); +PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); +MOVQ1280(_bk_l,k); +SARQ1280($2,k); +JLE ._L_10_loopE; +ALIGN_4; +._L_10_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM3,XMM3); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(4*SIZE(_ptr__B_0),XMM15); +VLD1282(2*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +BROAD1282(5*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(6*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(7*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM3,XMM3); +PREFETCH01280(176*SIZE(_ptr__A_0),0); +BROAD1282(8*SIZE(_ptr__B_0),XMM15); +VLD1282(4*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +BROAD1282(9*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(10*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(11*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM3,XMM3); +PREFETCH21280(184*SIZE(_ptr__A_0),2); +BROAD1282(12*SIZE(_ptr__B_0),XMM15); 
+VLD1282(6*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +BROAD1282(13*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(14*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(15*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM3,XMM3); +ADDQ1280($8*SIZE,_ptr__A_0); +ADDQ1280($16*SIZE,_ptr__B_0); +._L_10_bodyE:; +DECQ1280(k); +JG ._L_10_bodyB; +ALIGN_4; +._L_10_loopE:; +TESTQ1280($2,_bk_l); +JLE ._L_11_loopE; +ALIGN_4; +._L_11_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM3,XMM3); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(4*SIZE(_ptr__B_0),XMM15); +VLD1282(2*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +BROAD1282(5*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(6*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(7*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM3,XMM3); +ADDQ1280($4*SIZE,_ptr__A_0); +ADDQ1280($8*SIZE,_ptr__B_0); +._L_11_loopE:; +TESTQ1280($1,_bk_l); +JLE ._L_12_loopE; +ALIGN_4; +._L_12_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM3,XMM3); +ADDQ1280($2*SIZE,_ptr__A_0); +ADDQ1280($4*SIZE,_ptr__B_0); +._L_12_loopE:; +BROAD1282(alpha,XMM4); +VLDU1282(0*SIZE(_ptr__C_0),XMM5); +VMA21282(XMM4,XMM0,XMM5,XMM0); +VSTU1282(XMM5,0*SIZE(_ptr__C_0)); +VLDU1282(0*SIZE(_ptr__C_0,LDC,1),XMM6); +VMA21282(XMM4,XMM1,XMM6,XMM1); +VSTU1282(XMM6,0*SIZE(_ptr__C_0,LDC,1)); +VLDU1282(0*SIZE(_ptr__C_1),XMM7); +VMA21282(XMM4,XMM2,XMM7,XMM2); +VSTU1282(XMM7,0*SIZE(_ptr__C_1)); +VLDU1282(0*SIZE(_ptr__C_1,LDC,1),XMM8); +VMA21282(XMM4,XMM3,XMM8,XMM3); +VSTU1282(XMM8,0*SIZE(_ptr__C_1,LDC,1)); +ADDQ1280($2*SIZE,_ptr__C_0); +ADDQ1280($2*SIZE,_ptr__C_1); +._L_9_loopE:; +TESTQ1280($1,i); +JLE ._L_13_loopE; +ALIGN_4; +._L_13_bodyB:; +MOVQ1280(_ptr_B,_ptr__B_0); +VXOR1281(XMM0,XMM0,XMM0); +VXOR1281(XMM1,XMM1,XMM1); +VXOR1281(XMM2,XMM2,XMM2); +VXOR1281(XMM3,XMM3,XMM3); +PREFETCHN1280(3*SIZE(_ptr__C_0),N); +PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); +PREFETCHN1280(3*SIZE(_ptr__C_1),N); +PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); +MOVQ1280(_bk_l,k); +SARQ1280($2,k); +JLE ._L_14_loopE; +ALIGN_4; +._L_14_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1281(0*SIZE(_ptr__B_0),XMM15); +VLD1281(0*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +BROAD1281(1*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM1,XMM1); +BROAD1281(2*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM2,XMM2); +BROAD1281(3*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM3,XMM3); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1281(4*SIZE(_ptr__B_0),XMM15); +VLD1281(1*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +BROAD1281(5*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM1,XMM1); +BROAD1281(6*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM2,XMM2); +BROAD1281(7*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM3,XMM3); +PREFETCH01280(176*SIZE(_ptr__A_0),0); +BROAD1281(8*SIZE(_ptr__B_0),XMM15); 
+VLD1281(2*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +BROAD1281(9*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM1,XMM1); +BROAD1281(10*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM2,XMM2); +BROAD1281(11*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM3,XMM3); +PREFETCH21280(184*SIZE(_ptr__A_0),2); +BROAD1281(12*SIZE(_ptr__B_0),XMM15); +VLD1281(3*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +BROAD1281(13*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM1,XMM1); +BROAD1281(14*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM2,XMM2); +BROAD1281(15*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM3,XMM3); +ADDQ1280($4*SIZE,_ptr__A_0); +ADDQ1280($16*SIZE,_ptr__B_0); +._L_14_bodyE:; +DECQ1280(k); +JG ._L_14_bodyB; +ALIGN_4; +._L_14_loopE:; +TESTQ1280($2,_bk_l); +JLE ._L_15_loopE; +ALIGN_4; +._L_15_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1281(0*SIZE(_ptr__B_0),XMM15); +VLD1281(0*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +BROAD1281(1*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM1,XMM1); +BROAD1281(2*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM2,XMM2); +BROAD1281(3*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM3,XMM3); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1281(4*SIZE(_ptr__B_0),XMM15); +VLD1281(1*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +BROAD1281(5*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM1,XMM1); +BROAD1281(6*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM2,XMM2); +BROAD1281(7*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM3,XMM3); +ADDQ1280($2*SIZE,_ptr__A_0); +ADDQ1280($8*SIZE,_ptr__B_0); +._L_15_loopE:; +TESTQ1280($1,_bk_l); +JLE ._L_16_loopE; +ALIGN_4; +._L_16_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1281(0*SIZE(_ptr__B_0),XMM15); +VLD1281(0*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +BROAD1281(1*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM1,XMM1); +BROAD1281(2*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM2,XMM2); +BROAD1281(3*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM3,XMM3); +ADDQ1280($1*SIZE,_ptr__A_0); +ADDQ1280($4*SIZE,_ptr__B_0); +._L_16_loopE:; +BROAD1281(alpha,XMM4); +VLDU1281(0*SIZE(_ptr__C_0),XMM5); +VMA21281(XMM4,XMM0,XMM5,XMM0); +VSTU1281(XMM5,0*SIZE(_ptr__C_0)); +VLDU1281(0*SIZE(_ptr__C_0,LDC,1),XMM6); +VMA21281(XMM4,XMM1,XMM6,XMM1); +VSTU1281(XMM6,0*SIZE(_ptr__C_0,LDC,1)); +VLDU1281(0*SIZE(_ptr__C_1),XMM7); +VMA21281(XMM4,XMM2,XMM7,XMM2); +VSTU1281(XMM7,0*SIZE(_ptr__C_1)); +VLDU1281(0*SIZE(_ptr__C_1,LDC,1),XMM8); +VMA21281(XMM4,XMM3,XMM8,XMM3); +VSTU1281(XMM8,0*SIZE(_ptr__C_1,LDC,1)); +ADDQ1280($1*SIZE,_ptr__C_0); +ADDQ1280($1*SIZE,_ptr__C_1); +._L_13_loopE:; +MOVQ1280(LDC,%rax); +SALQ1280($2,%rax); +ADDQ1280(%rax,_ptr_C); +MOVQ1280(_bk_l,%rax); +SALQ1280($5,%rax); +ADDQ1280(%rax,_ptr_B); +._L_0_bodyE:; +DECQ1280(j); +JG ._L_0_bodyB; +ALIGN_4; +._L_0_loopE:; +TESTQ1280($2,_bk_j); +JLE ._L_17_loopE; +ALIGN_4; +._L_17_bodyB:; +MOVQ1280(_ptr_A,_ptr__A_0); +MOVQ1280(_ptr_C,_ptr__C_0); +LEAQ1280((_ptr_C,LDC,1),_ptr__C_1); +MOVQ1280(_bk_l,%rax); +SALQ1280($4,%rax); +ADDQ1280(%rax,_pre_B); +MOVQ1280(_bk_i,i); +CMPQ1280($6,i); +JL ._L_18_loopE; +._L_18_bodyB:; +MOVQ1280(_ptr_B,_ptr__B_0); +VXOR1282(XMM0,XMM0,XMM0); +VXOR1282(XMM1,XMM1,XMM1); +VXOR1282(XMM2,XMM2,XMM2); +VXOR1282(XMM3,XMM3,XMM3); +VXOR1282(XMM4,XMM4,XMM4); +VXOR1282(XMM5,XMM5,XMM5); +PREFETCHN1280(3*SIZE(_ptr__C_0),N); +PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); +PREFETCHN1280(3*SIZE(_ptr__C_1),N); +PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); +MOVQ1280(_bk_l,k); +SARQ1280($2,k); +JLE 
._L_19_loopE; +ALIGN_4; +._L_19_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(4*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM3,XMM3); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VLD1282(6*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(8*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(10*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM3,XMM3); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +PREFETCH01280(176*SIZE(_ptr__A_0),0); +BROAD1282(4*SIZE(_ptr__B_0),XMM15); +VLD1282(12*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(14*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(16*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(5*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM3,XMM3); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +PREFETCH21280(184*SIZE(_ptr__A_0),2); +BROAD1282(6*SIZE(_ptr__B_0),XMM15); +VLD1282(18*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(20*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(22*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(7*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM3,XMM3); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +ADDQ1280($24*SIZE,_ptr__A_0); +ADDQ1280($8*SIZE,_ptr__B_0); +._L_19_bodyE:; +DECQ1280(k); +JG ._L_19_bodyB; +ALIGN_4; +._L_19_loopE:; +TESTQ1280($2,_bk_l); +JLE ._L_20_loopE; +ALIGN_4; +._L_20_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(4*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM3,XMM3); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VLD1282(6*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(8*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(10*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM3,XMM3); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +ADDQ1280($12*SIZE,_ptr__A_0); +ADDQ1280($4*SIZE,_ptr__B_0); +._L_20_loopE:; +TESTQ1280($1,_bk_l); +JLE ._L_21_loopE; +ALIGN_4; +._L_21_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(4*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM3,XMM3); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +ADDQ1280($6*SIZE,_ptr__A_0); +ADDQ1280($2*SIZE,_ptr__B_0); +._L_21_loopE:; +BROAD1282(alpha,XMM6); +VLDU1282(0*SIZE(_ptr__C_0),XMM7); +VMA21282(XMM6,XMM0,XMM7,XMM0); +VSTU1282(XMM7,0*SIZE(_ptr__C_0)); +VLDU1282(2*SIZE(_ptr__C_0),XMM8); 
+VMA21282(XMM6,XMM1,XMM8,XMM1); +VSTU1282(XMM8,2*SIZE(_ptr__C_0)); +VLDU1282(4*SIZE(_ptr__C_0),XMM9); +VMA21282(XMM6,XMM2,XMM9,XMM2); +VSTU1282(XMM9,4*SIZE(_ptr__C_0)); +VLDU1282(0*SIZE(_ptr__C_0,LDC,1),XMM10); +VMA21282(XMM6,XMM3,XMM10,XMM3); +VSTU1282(XMM10,0*SIZE(_ptr__C_0,LDC,1)); +VLDU1282(2*SIZE(_ptr__C_0,LDC,1),XMM11); +VMA21282(XMM6,XMM4,XMM11,XMM4); +VSTU1282(XMM11,2*SIZE(_ptr__C_0,LDC,1)); +VLDU1282(4*SIZE(_ptr__C_0,LDC,1),XMM12); +VMA21282(XMM6,XMM5,XMM12,XMM5); +VSTU1282(XMM12,4*SIZE(_ptr__C_0,LDC,1)); +ADDQ1280($6*SIZE,_ptr__C_0); +ADDQ1280($6*SIZE,_ptr__C_1); +._L_18_bodyE:; +SUBQ1280($6,i); +JG ._L_18_bodyB; +ALIGN_4; +._L_18_loopE:; +TESTQ1280($4,i); +JLE ._L_22_loopE; +ALIGN_4; +._L_22_bodyB:; +MOVQ1280(_ptr_B,_ptr__B_0); +VXOR1282(XMM0,XMM0,XMM0); +VXOR1282(XMM1,XMM1,XMM1); +VXOR1282(XMM2,XMM2,XMM2); +VXOR1282(XMM3,XMM3,XMM3); +PREFETCHN1280(3*SIZE(_ptr__C_0),N); +PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); +PREFETCHN1280(3*SIZE(_ptr__C_1),N); +PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); +MOVQ1280(_bk_l,k); +SARQ1280($2,k); +JLE ._L_23_loopE; +ALIGN_4; +._L_23_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM2,XMM2); +VMA1282(XMM14,XMM15,XMM3,XMM3); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VLD1282(4*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(6*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM2,XMM2); +VMA1282(XMM14,XMM15,XMM3,XMM3); +PREFETCH01280(176*SIZE(_ptr__A_0),0); +BROAD1282(4*SIZE(_ptr__B_0),XMM15); +VLD1282(8*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(10*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(5*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM2,XMM2); +VMA1282(XMM14,XMM15,XMM3,XMM3); +PREFETCH21280(184*SIZE(_ptr__A_0),2); +BROAD1282(6*SIZE(_ptr__B_0),XMM15); +VLD1282(12*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(14*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(7*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM2,XMM2); +VMA1282(XMM14,XMM15,XMM3,XMM3); +ADDQ1280($16*SIZE,_ptr__A_0); +ADDQ1280($8*SIZE,_ptr__B_0); +._L_23_bodyE:; +DECQ1280(k); +JG ._L_23_bodyB; +ALIGN_4; +._L_23_loopE:; +TESTQ1280($2,_bk_l); +JLE ._L_24_loopE; +ALIGN_4; +._L_24_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM2,XMM2); +VMA1282(XMM14,XMM15,XMM3,XMM3); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VLD1282(4*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(6*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM2,XMM2); +VMA1282(XMM14,XMM15,XMM3,XMM3); +ADDQ1280($8*SIZE,_ptr__A_0); +ADDQ1280($4*SIZE,_ptr__B_0); +._L_24_loopE:; +TESTQ1280($1,_bk_l); +JLE ._L_25_loopE; +ALIGN_4; +._L_25_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); 
+BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM2,XMM2); +VMA1282(XMM14,XMM15,XMM3,XMM3); +ADDQ1280($4*SIZE,_ptr__A_0); +ADDQ1280($2*SIZE,_ptr__B_0); +._L_25_loopE:; +BROAD1282(alpha,XMM4); +VLDU1282(0*SIZE(_ptr__C_0),XMM5); +VMA21282(XMM4,XMM0,XMM5,XMM0); +VSTU1282(XMM5,0*SIZE(_ptr__C_0)); +VLDU1282(2*SIZE(_ptr__C_0),XMM6); +VMA21282(XMM4,XMM1,XMM6,XMM1); +VSTU1282(XMM6,2*SIZE(_ptr__C_0)); +VLDU1282(0*SIZE(_ptr__C_0,LDC,1),XMM7); +VMA21282(XMM4,XMM2,XMM7,XMM2); +VSTU1282(XMM7,0*SIZE(_ptr__C_0,LDC,1)); +VLDU1282(2*SIZE(_ptr__C_0,LDC,1),XMM8); +VMA21282(XMM4,XMM3,XMM8,XMM3); +VSTU1282(XMM8,2*SIZE(_ptr__C_0,LDC,1)); +ADDQ1280($4*SIZE,_ptr__C_0); +ADDQ1280($4*SIZE,_ptr__C_1); +._L_22_loopE:; +TESTQ1280($2,i); +JLE ._L_26_loopE; +ALIGN_4; +._L_26_bodyB:; +MOVQ1280(_ptr_B,_ptr__B_0); +VXOR1282(XMM0,XMM0,XMM0); +VXOR1282(XMM1,XMM1,XMM1); +PREFETCHN1280(3*SIZE(_ptr__C_0),N); +PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); +PREFETCHN1280(3*SIZE(_ptr__C_1),N); +PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); +MOVQ1280(_bk_l,k); +SARQ1280($2,k); +JLE ._L_27_loopE; +ALIGN_4; +._L_27_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM1,XMM1); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VLD1282(2*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM1,XMM1); +PREFETCH01280(176*SIZE(_ptr__A_0),0); +BROAD1282(4*SIZE(_ptr__B_0),XMM15); +VLD1282(4*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +BROAD1282(5*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM1,XMM1); +PREFETCH21280(184*SIZE(_ptr__A_0),2); +BROAD1282(6*SIZE(_ptr__B_0),XMM15); +VLD1282(6*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +BROAD1282(7*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM1,XMM1); +ADDQ1280($8*SIZE,_ptr__A_0); +ADDQ1280($8*SIZE,_ptr__B_0); +._L_27_bodyE:; +DECQ1280(k); +JG ._L_27_bodyB; +ALIGN_4; +._L_27_loopE:; +TESTQ1280($2,_bk_l); +JLE ._L_28_loopE; +ALIGN_4; +._L_28_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM1,XMM1); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VLD1282(2*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM1,XMM1); +ADDQ1280($4*SIZE,_ptr__A_0); +ADDQ1280($4*SIZE,_ptr__B_0); +._L_28_loopE:; +TESTQ1280($1,_bk_l); +JLE ._L_29_loopE; +ALIGN_4; +._L_29_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM1,XMM1); +ADDQ1280($2*SIZE,_ptr__A_0); +ADDQ1280($2*SIZE,_ptr__B_0); +._L_29_loopE:; +BROAD1282(alpha,XMM2); +VLDU1282(0*SIZE(_ptr__C_0),XMM3); +VMA21282(XMM2,XMM0,XMM3,XMM0); +VSTU1282(XMM3,0*SIZE(_ptr__C_0)); +VLDU1282(0*SIZE(_ptr__C_0,LDC,1),XMM4); +VMA21282(XMM2,XMM1,XMM4,XMM1); +VSTU1282(XMM4,0*SIZE(_ptr__C_0,LDC,1)); +ADDQ1280($2*SIZE,_ptr__C_0); +ADDQ1280($2*SIZE,_ptr__C_1); +._L_26_loopE:; +TESTQ1280($1,i); +JLE ._L_30_loopE; +ALIGN_4; +._L_30_bodyB:; +MOVQ1280(_ptr_B,_ptr__B_0); +VXOR1281(XMM0,XMM0,XMM0); +VXOR1281(XMM1,XMM1,XMM1); +PREFETCHN1280(3*SIZE(_ptr__C_0),N); 
+PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); +PREFETCHN1280(3*SIZE(_ptr__C_1),N); +PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); +MOVQ1280(_bk_l,k); +SARQ1280($2,k); +JLE ._L_31_loopE; +ALIGN_4; +._L_31_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1281(0*SIZE(_ptr__B_0),XMM15); +VLD1281(0*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +BROAD1281(1*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM1,XMM1); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1281(2*SIZE(_ptr__B_0),XMM15); +VLD1281(1*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +BROAD1281(3*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM1,XMM1); +PREFETCH01280(176*SIZE(_ptr__A_0),0); +BROAD1281(4*SIZE(_ptr__B_0),XMM15); +VLD1281(2*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +BROAD1281(5*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM1,XMM1); +PREFETCH21280(184*SIZE(_ptr__A_0),2); +BROAD1281(6*SIZE(_ptr__B_0),XMM15); +VLD1281(3*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +BROAD1281(7*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM1,XMM1); +ADDQ1280($4*SIZE,_ptr__A_0); +ADDQ1280($8*SIZE,_ptr__B_0); +._L_31_bodyE:; +DECQ1280(k); +JG ._L_31_bodyB; +ALIGN_4; +._L_31_loopE:; +TESTQ1280($2,_bk_l); +JLE ._L_32_loopE; +ALIGN_4; +._L_32_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1281(0*SIZE(_ptr__B_0),XMM15); +VLD1281(0*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +BROAD1281(1*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM1,XMM1); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1281(2*SIZE(_ptr__B_0),XMM15); +VLD1281(1*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +BROAD1281(3*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM1,XMM1); +ADDQ1280($2*SIZE,_ptr__A_0); +ADDQ1280($4*SIZE,_ptr__B_0); +._L_32_loopE:; +TESTQ1280($1,_bk_l); +JLE ._L_33_loopE; +ALIGN_4; +._L_33_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1281(0*SIZE(_ptr__B_0),XMM15); +VLD1281(0*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +BROAD1281(1*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM1,XMM1); +ADDQ1280($1*SIZE,_ptr__A_0); +ADDQ1280($2*SIZE,_ptr__B_0); +._L_33_loopE:; +BROAD1281(alpha,XMM2); +VLDU1281(0*SIZE(_ptr__C_0),XMM3); +VMA21281(XMM2,XMM0,XMM3,XMM0); +VSTU1281(XMM3,0*SIZE(_ptr__C_0)); +VLDU1281(0*SIZE(_ptr__C_0,LDC,1),XMM4); +VMA21281(XMM2,XMM1,XMM4,XMM1); +VSTU1281(XMM4,0*SIZE(_ptr__C_0,LDC,1)); +ADDQ1280($1*SIZE,_ptr__C_0); +ADDQ1280($1*SIZE,_ptr__C_1); +._L_30_loopE:; +MOVQ1280(LDC,%rax); +SALQ1280($1,%rax); +ADDQ1280(%rax,_ptr_C); +MOVQ1280(_bk_l,%rax); +SALQ1280($4,%rax); +ADDQ1280(%rax,_ptr_B); +._L_17_loopE:; +TESTQ1280($1,_bk_j); +JLE ._L_34_loopE; +ALIGN_4; +._L_34_bodyB:; +MOVQ1280(_ptr_A,_ptr__A_0); +MOVQ1280(_ptr_C,_ptr__C_0); +MOVQ1280(_bk_l,%rax); +SALQ1280($3,%rax); +ADDQ1280(%rax,_pre_B); +MOVQ1280(_bk_i,i); +CMPQ1280($6,i); +JL ._L_35_loopE; +._L_35_bodyB:; +MOVQ1280(_ptr_B,_ptr__B_0); +VXOR1282(XMM0,XMM0,XMM0); +VXOR1282(XMM1,XMM1,XMM1); +VXOR1282(XMM2,XMM2,XMM2); +PREFETCHN1280(3*SIZE(_ptr__C_0),N); +PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); +PREFETCHN1280(3*SIZE(_ptr__C_1),N); +PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); +MOVQ1280(_bk_l,k); +SARQ1280($2,k); +JLE ._L_36_loopE; +ALIGN_4; +._L_36_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(4*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +PREFETCH21280(168*SIZE(_ptr__A_0),2); 
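+/* k+1 step of the 4x-unrolled k-loop: broadcast the next element of B and
+   accumulate the six doubles of the next A strip (three 2-double vectors)
+   into the same XMM0-XMM2 accumulators as the k+0 step above. */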
+BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VLD1282(6*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(8*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(10*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +PREFETCH01280(176*SIZE(_ptr__A_0),0); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VLD1282(12*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(14*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(16*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +PREFETCH21280(184*SIZE(_ptr__A_0),2); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VLD1282(18*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(20*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(22*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +ADDQ1280($24*SIZE,_ptr__A_0); +ADDQ1280($4*SIZE,_ptr__B_0); +._L_36_bodyE:; +DECQ1280(k); +JG ._L_36_bodyB; +ALIGN_4; +._L_36_loopE:; +TESTQ1280($2,_bk_l); +JLE ._L_37_loopE; +ALIGN_4; +._L_37_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(4*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VLD1282(6*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(8*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(10*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +ADDQ1280($12*SIZE,_ptr__A_0); +ADDQ1280($2*SIZE,_ptr__B_0); +._L_37_loopE:; +TESTQ1280($1,_bk_l); +JLE ._L_38_loopE; +ALIGN_4; +._L_38_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(4*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +ADDQ1280($6*SIZE,_ptr__A_0); +ADDQ1280($1*SIZE,_ptr__B_0); +._L_38_loopE:; +BROAD1282(alpha,XMM3); +VLDU1282(0*SIZE(_ptr__C_0),XMM4); +VMA21282(XMM3,XMM0,XMM4,XMM0); +VSTU1282(XMM4,0*SIZE(_ptr__C_0)); +VLDU1282(2*SIZE(_ptr__C_0),XMM5); +VMA21282(XMM3,XMM1,XMM5,XMM1); +VSTU1282(XMM5,2*SIZE(_ptr__C_0)); +VLDU1282(4*SIZE(_ptr__C_0),XMM6); +VMA21282(XMM3,XMM2,XMM6,XMM2); +VSTU1282(XMM6,4*SIZE(_ptr__C_0)); +ADDQ1280($6*SIZE,_ptr__C_0); +ADDQ1280($6*SIZE,_ptr__C_1); +._L_35_bodyE:; +SUBQ1280($6,i); +JG ._L_35_bodyB; +ALIGN_4; +._L_35_loopE:; +TESTQ1280($4,i); +JLE ._L_39_loopE; +ALIGN_4; +._L_39_bodyB:; +MOVQ1280(_ptr_B,_ptr__B_0); +VXOR1282(XMM0,XMM0,XMM0); +VXOR1282(XMM1,XMM1,XMM1); +PREFETCHN1280(3*SIZE(_ptr__C_0),N); +PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); +PREFETCHN1280(3*SIZE(_ptr__C_1),N); +PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); +MOVQ1280(_bk_l,k); +SARQ1280($2,k); +JLE ._L_40_loopE; +ALIGN_4; +._L_40_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VLD1282(4*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(6*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +PREFETCH01280(176*SIZE(_ptr__A_0),0); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VLD1282(8*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(10*SIZE(_ptr__A_0),XMM14); 
+VMA1282(XMM14,XMM15,XMM1,XMM1); +PREFETCH21280(184*SIZE(_ptr__A_0),2); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VLD1282(12*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(14*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +ADDQ1280($16*SIZE,_ptr__A_0); +ADDQ1280($4*SIZE,_ptr__B_0); +._L_40_bodyE:; +DECQ1280(k); +JG ._L_40_bodyB; +ALIGN_4; +._L_40_loopE:; +TESTQ1280($2,_bk_l); +JLE ._L_41_loopE; +ALIGN_4; +._L_41_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VLD1282(4*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(6*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +ADDQ1280($8*SIZE,_ptr__A_0); +ADDQ1280($2*SIZE,_ptr__B_0); +._L_41_loopE:; +TESTQ1280($1,_bk_l); +JLE ._L_42_loopE; +ALIGN_4; +._L_42_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +ADDQ1280($4*SIZE,_ptr__A_0); +ADDQ1280($1*SIZE,_ptr__B_0); +._L_42_loopE:; +BROAD1282(alpha,XMM2); +VLDU1282(0*SIZE(_ptr__C_0),XMM3); +VMA21282(XMM2,XMM0,XMM3,XMM0); +VSTU1282(XMM3,0*SIZE(_ptr__C_0)); +VLDU1282(2*SIZE(_ptr__C_0),XMM4); +VMA21282(XMM2,XMM1,XMM4,XMM1); +VSTU1282(XMM4,2*SIZE(_ptr__C_0)); +ADDQ1280($4*SIZE,_ptr__C_0); +ADDQ1280($4*SIZE,_ptr__C_1); +._L_39_loopE:; +TESTQ1280($2,i); +JLE ._L_43_loopE; +ALIGN_4; +._L_43_bodyB:; +MOVQ1280(_ptr_B,_ptr__B_0); +VXOR1282(XMM0,XMM0,XMM0); +PREFETCHN1280(3*SIZE(_ptr__C_0),N); +PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); +PREFETCHN1280(3*SIZE(_ptr__C_1),N); +PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); +MOVQ1280(_bk_l,k); +SARQ1280($2,k); +JLE ._L_44_loopE; +ALIGN_4; +._L_44_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VLD1282(2*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +PREFETCH01280(176*SIZE(_ptr__A_0),0); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VLD1282(4*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +PREFETCH21280(184*SIZE(_ptr__A_0),2); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VLD1282(6*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +ADDQ1280($8*SIZE,_ptr__A_0); +ADDQ1280($4*SIZE,_ptr__B_0); +._L_44_bodyE:; +DECQ1280(k); +JG ._L_44_bodyB; +ALIGN_4; +._L_44_loopE:; +TESTQ1280($2,_bk_l); +JLE ._L_45_loopE; +ALIGN_4; +._L_45_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VLD1282(2*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +ADDQ1280($4*SIZE,_ptr__A_0); +ADDQ1280($2*SIZE,_ptr__B_0); +._L_45_loopE:; +TESTQ1280($1,_bk_l); +JLE ._L_46_loopE; +ALIGN_4; +._L_46_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +ADDQ1280($2*SIZE,_ptr__A_0); +ADDQ1280($1*SIZE,_ptr__B_0); +._L_46_loopE:; +BROAD1282(alpha,XMM1); +VLDU1282(0*SIZE(_ptr__C_0),XMM2); +VMA21282(XMM1,XMM0,XMM2,XMM0); +VSTU1282(XMM2,0*SIZE(_ptr__C_0)); +ADDQ1280($2*SIZE,_ptr__C_0); 
+ADDQ1280($2*SIZE,_ptr__C_1); +._L_43_loopE:; +TESTQ1280($1,i); +JLE ._L_47_loopE; +ALIGN_4; +._L_47_bodyB:; +MOVQ1280(_ptr_B,_ptr__B_0); +VXOR1281(XMM0,XMM0,XMM0); +PREFETCHN1280(3*SIZE(_ptr__C_0),N); +PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); +PREFETCHN1280(3*SIZE(_ptr__C_1),N); +PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); +MOVQ1280(_bk_l,k); +SARQ1280($2,k); +JLE ._L_48_loopE; +ALIGN_4; +._L_48_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1281(0*SIZE(_ptr__B_0),XMM15); +VLD1281(0*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1281(1*SIZE(_ptr__B_0),XMM15); +VLD1281(1*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +PREFETCH01280(176*SIZE(_ptr__A_0),0); +BROAD1281(2*SIZE(_ptr__B_0),XMM15); +VLD1281(2*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +PREFETCH21280(184*SIZE(_ptr__A_0),2); +BROAD1281(3*SIZE(_ptr__B_0),XMM15); +VLD1281(3*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +ADDQ1280($4*SIZE,_ptr__A_0); +ADDQ1280($4*SIZE,_ptr__B_0); +._L_48_bodyE:; +DECQ1280(k); +JG ._L_48_bodyB; +ALIGN_4; +._L_48_loopE:; +TESTQ1280($2,_bk_l); +JLE ._L_49_loopE; +ALIGN_4; +._L_49_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1281(0*SIZE(_ptr__B_0),XMM15); +VLD1281(0*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1281(1*SIZE(_ptr__B_0),XMM15); +VLD1281(1*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +ADDQ1280($2*SIZE,_ptr__A_0); +ADDQ1280($2*SIZE,_ptr__B_0); +._L_49_loopE:; +TESTQ1280($1,_bk_l); +JLE ._L_50_loopE; +ALIGN_4; +._L_50_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1281(0*SIZE(_ptr__B_0),XMM15); +VLD1281(0*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +ADDQ1280($1*SIZE,_ptr__A_0); +ADDQ1280($1*SIZE,_ptr__B_0); +._L_50_loopE:; +BROAD1281(alpha,XMM1); +VLDU1281(0*SIZE(_ptr__C_0),XMM2); +VMA21281(XMM1,XMM0,XMM2,XMM0); +VSTU1281(XMM2,0*SIZE(_ptr__C_0)); +ADDQ1280($1*SIZE,_ptr__C_0); +ADDQ1280($1*SIZE,_ptr__C_1); +._L_47_loopE:; +MOVQ1280(LDC,%rax); +ADDQ1280(%rax,_ptr_C); +MOVQ1280(_bk_l,%rax); +SALQ1280($3,%rax); +ADDQ1280(%rax,_ptr_B); +._L_34_loopE:; +vzeroupper +movq 0(%rsp), %rbx; +movq 8(%rsp), %rbp; +movq 16(%rsp), %r12; +movq 24(%rsp), %r13; +movq 32(%rsp), %r14; +movq 40(%rsp), %r15; +addq $STACKSIZE, %rsp; +ret + +EPILOGUE diff --git a/param.h b/param.h index 0c3df6951..c7a9635a6 100644 --- a/param.h +++ b/param.h @@ -330,9 +330,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define XGEMM_DEFAULT_UNROLL_M 1 #else #define SGEMM_DEFAULT_UNROLL_N 2 -#define DGEMM_DEFAULT_UNROLL_N 2 +#define DGEMM_DEFAULT_UNROLL_N 4 #define SGEMM_DEFAULT_UNROLL_M 16 -#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_M 6 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 4 #define ZGEMM_DEFAULT_UNROLL_M 2 @@ -347,10 +347,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(ARCH_X86_64) #define SGEMM_DEFAULT_P 768 -#define DGEMM_DEFAULT_P 384 +#define DGEMM_DEFAULT_P 480 #else #define SGEMM_DEFAULT_P 448 -#define DGEMM_DEFAULT_P 224 +#define DGEMM_DEFAULT_P 480 #endif #define QGEMM_DEFAULT_P 112 #define CGEMM_DEFAULT_P 224 @@ -359,7 +359,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(ARCH_X86_64) #define SGEMM_DEFAULT_Q 168 -#define DGEMM_DEFAULT_Q 168 +#define DGEMM_DEFAULT_Q 128 #else #define SGEMM_DEFAULT_Q 224 #define DGEMM_DEFAULT_Q 224 From 70411af888f220c205d457ab7e653a10a0e18108 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 28 Sep 2013 19:02:25 +0200 Subject: [PATCH 010/127] initial checkin of kernel/arm --- kernel/arm/KERNEL | 46 + kernel/arm/KERNEL.ARMV7 | 143 +++ kernel/arm/amax.c | 73 ++ kernel/arm/amin.c | 73 ++ kernel/arm/asum.c | 67 ++ kernel/arm/axpy.c | 64 ++ kernel/arm/copy.c | 59 ++ kernel/arm/dgemm_kernel_8x2_vfpv3.S | 1223 +++++++++++++++++++++ kernel/arm/dot.c | 64 ++ kernel/arm/dtrmm_kernel_8x2_vfpv3.S | 1521 +++++++++++++++++++++++++++ kernel/arm/gemv_n.c | 67 ++ kernel/arm/gemv_t.c | 67 ++ kernel/arm/iamax.c | 75 ++ kernel/arm/iamin.c | 75 ++ kernel/arm/imax.c | 67 ++ kernel/arm/imin.c | 65 ++ kernel/arm/izamax.c | 81 ++ kernel/arm/izamin.c | 81 ++ kernel/arm/max.c | 63 ++ kernel/arm/min.c | 63 ++ kernel/arm/nrm2.c | 88 ++ kernel/arm/rot.c | 62 ++ kernel/arm/scal.c | 58 + kernel/arm/swap.c | 62 ++ kernel/arm/zamax.c | 81 ++ kernel/arm/zamin.c | 81 ++ kernel/arm/zasum.c | 71 ++ kernel/arm/zaxpy.c | 72 ++ kernel/arm/zcopy.c | 63 ++ kernel/arm/zdot.c | 78 ++ kernel/arm/zgemv_n.c | 125 +++ kernel/arm/zgemv_t.c | 131 +++ kernel/arm/znrm2.c | 106 ++ kernel/arm/zrot.c | 68 ++ kernel/arm/zscal.c | 64 ++ kernel/arm/zswap.c | 70 ++ 36 files changed, 5317 insertions(+) create mode 100644 kernel/arm/KERNEL create mode 100644 kernel/arm/KERNEL.ARMV7 create mode 100644 kernel/arm/amax.c create mode 100644 kernel/arm/amin.c create mode 100644 kernel/arm/asum.c create mode 100644 kernel/arm/axpy.c create mode 100644 kernel/arm/copy.c create mode 100644 kernel/arm/dgemm_kernel_8x2_vfpv3.S create mode 100644 kernel/arm/dot.c create mode 100644 kernel/arm/dtrmm_kernel_8x2_vfpv3.S create mode 100644 kernel/arm/gemv_n.c create mode 100644 kernel/arm/gemv_t.c create mode 100644 kernel/arm/iamax.c create mode 100644 kernel/arm/iamin.c create mode 100644 kernel/arm/imax.c create mode 100644 kernel/arm/imin.c create mode 100644 kernel/arm/izamax.c create mode 100644 kernel/arm/izamin.c create mode 100644 kernel/arm/max.c create mode 100644 kernel/arm/min.c create mode 100644 kernel/arm/nrm2.c create mode 100644 kernel/arm/rot.c create mode 100644 kernel/arm/scal.c create mode 100644 kernel/arm/swap.c create mode 100644 kernel/arm/zamax.c create mode 100644 kernel/arm/zamin.c create mode 100644 kernel/arm/zasum.c create mode 100644 kernel/arm/zaxpy.c create mode 100644 kernel/arm/zcopy.c create mode 100644 kernel/arm/zdot.c create mode 100644 kernel/arm/zgemv_n.c create mode 100644 kernel/arm/zgemv_t.c create mode 100644 kernel/arm/znrm2.c create mode 100644 kernel/arm/zrot.c create mode 100644 kernel/arm/zscal.c create mode 100644 kernel/arm/zswap.c diff --git a/kernel/arm/KERNEL b/kernel/arm/KERNEL new file mode 100644 index 000000000..aeccfbf4c --- /dev/null +++ b/kernel/arm/KERNEL @@ -0,0 +1,46 @@ +ifndef SNRM2KERNEL +SNRM2KERNEL = nrm2.c +endif + +ifndef DNRM2KERNEL +DNRM2KERNEL = nrm2.c +endif + +ifndef CNRM2KERNEL +CNRM2KERNEL = znrm2.c +endif + +ifndef ZNRM2KERNEL +ZNRM2KERNEL = znrm2.c +endif + +ifndef SCABS_KERNEL +SCABS_KERNEL = ../generic/cabs.c +endif + +ifndef DCABS_KERNEL +DCABS_KERNEL = ../generic/cabs.c +endif + +ifndef QCABS_KERNEL +QCABS_KERNEL = ../generic/cabs.c +endif + +ifndef LSAME_KERNEL +LSAME_KERNEL = ../generic/lsame.c +endif + +ifndef SGEMM_BETA +SGEMM_BETA = ../generic/gemm_beta.c +endif +ifndef DGEMM_BETA +DGEMM_BETA = 
../generic/gemm_beta.c +endif +ifndef CGEMM_BETA +CGEMM_BETA = ../generic/zgemm_beta.c +endif +ifndef ZGEMM_BETA +ZGEMM_BETA = ../generic/zgemm_beta.c +endif + + diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 new file mode 100644 index 000000000..a60701002 --- /dev/null +++ b/kernel/arm/KERNEL.ARMV7 @@ -0,0 +1,143 @@ +SAMAXKERNEL = ../arm/amax.c +DAMAXKERNEL = ../arm/amax.c +CAMAXKERNEL = ../arm/zamax.c +ZAMAXKERNEL = ../arm/zamax.c + +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMAXKERNEL = ../arm/iamax.c +IDAMAXKERNEL = ../arm/iamax.c +ICAMAXKERNEL = ../arm/izamax.c +IZAMAXKERNEL = ../arm/izamax.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +SSWAPKERNEL = ../arm/swap.c +DSWAPKERNEL = ../arm/swap.c +CSWAPKERNEL = ../arm/zswap.c +ZSWAPKERNEL = ../arm/zswap.c + +SASUMKERNEL = ../arm/asum.c +DASUMKERNEL = ../arm/asum.c +CASUMKERNEL = ../arm/zasum.c +ZASUMKERNEL = ../arm/zasum.c + +SAXPYKERNEL = ../arm/axpy.c +DAXPYKERNEL = ../arm/axpy.c +CAXPYKERNEL = ../arm/zaxpy.c +ZAXPYKERNEL = ../arm/zaxpy.c + +SCOPYKERNEL = ../arm/copy.c +DCOPYKERNEL = ../arm/copy.c +CCOPYKERNEL = ../arm/zcopy.c +ZCOPYKERNEL = ../arm/zcopy.c + +SDOTKERNEL = ../arm/dot.c +DDOTKERNEL = ../arm/dot.c +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c + +SNRM2KERNEL = ../arm/nrm2.c +DNRM2KERNEL = ../arm/nrm2.c +CNRM2KERNEL = ../arm/znrm2.c +ZNRM2KERNEL = ../arm/znrm2.c + +SROTKERNEL = ../arm/rot.c +DROTKERNEL = ../arm/rot.c +CROTKERNEL = ../arm/zrot.c +ZROTKERNEL = ../arm/zrot.c + +SSCALKERNEL = ../arm/scal.c +DSCALKERNEL = ../arm/scal.c +CSCALKERNEL = ../arm/zscal.c +ZSCALKERNEL = ../arm/zscal.c + +SGEMVNKERNEL = gemv_n.c +DGEMVNKERNEL = gemv_n.c +CGEMVNKERNEL = zgemv_n.c +ZGEMVNKERNEL = zgemv_n.c + +SGEMVTKERNEL = gemv_t.c +DGEMVTKERNEL = gemv_t.c +CGEMVTKERNEL = zgemv_t.c +ZGEMVTKERNEL = zgemv_t.c + +STRMMKERNEL = ../generic/trmmkernel_2x2.c +DTRMMKERNEL = dtrmm_kernel_8x2_vfpv3.S +CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c + +SGEMMKERNEL = ../generic/gemmkernel_2x2.c +SGEMMONCOPY = ../generic/gemm_ncopy_2.c +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +#DGEMMKERNEL = ../generic/gemmkernel_2x2.c +#DGEMMKERNEL = dgemm_kernel_4x2_vfpv2.S +DGEMMKERNEL = dgemm_kernel_8x2_vfpv3.S +DGEMMINCOPY = ../generic/gemm_ncopy_8.c +DGEMMITCOPY = ../generic/gemm_tcopy_8.c +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMINCOPYOBJ = dgemm_incopy.o +DGEMMITCOPYOBJ = dgemm_itcopy.o +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = 
../generic/trsm_kernel_RT.c
+
+DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+CGEMM3MKERNEL = zgemm3m_kernel_8x4_core2.S
+ZGEMM3MKERNEL = zgemm3m_kernel_4x4_core2.S
+
+
+
+
diff --git a/kernel/arm/amax.c b/kernel/arm/amax.c
new file mode 100644
index 000000000..55107ca4f
--- /dev/null
+++ b/kernel/arm/amax.c
@@ -0,0 +1,73 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+* BLASTEST float : OK
+* BLASTEST double : OK
+* CTEST : NoTest
+* TEST : NoTest
+*
+**************************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+
+#else
+
+#define ABS fabsf
+
+#endif
+
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+ BLASLONG i=0;
+ BLASLONG ix=0;
+ FLOAT maxf=0.0;
+
+ if (n <= 0 || inc_x < 1 ) return(maxf);
+
+ maxf=ABS(x[0]);
+
+ while(i < n)
+ {
+ if( ABS(x[ix]) > ABS(maxf) )
+ {
+ maxf = ABS(x[ix]);
+ }
+ ix += inc_x;
+ i++;
+ }
+ return(maxf);
+}
+
+
diff --git a/kernel/arm/amin.c b/kernel/arm/amin.c
new file mode 100644
index 000000000..3f7e97be6
--- /dev/null
+++ b/kernel/arm/amin.c
@@ -0,0 +1,73 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+* BLASTEST float : OK
+* BLASTEST double : OK
+* CTEST : NoTest
+* TEST : NoTest
+*
+**************************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+
+#else
+
+#define ABS fabsf
+
+#endif
+
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+ BLASLONG i=0;
+ BLASLONG ix=0;
+ FLOAT minf=0.0;
+
+ if (n <= 0 || inc_x < 1 ) return(minf);
+
+ minf=ABS(x[0]);
+
+ while(i < n)
+ {
+ if( ABS(x[ix]) < ABS(minf) )
+ {
+ minf = ABS(x[ix]);
+ }
+ ix += inc_x;
+ i++;
+ }
+ return(minf);
+}
+
+
diff --git a/kernel/arm/asum.c b/kernel/arm/asum.c
new file mode 100644
index 000000000..5ac6936a0
--- /dev/null
+++ b/kernel/arm/asum.c
@@ -0,0 +1,67 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+* BLASTEST float : OK
+* BLASTEST double : OK
+* CTEST : OK
+* TEST : OK
+*
+**************************************************************************************/
+
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+
+#else
+
+#define ABS fabsf
+
+#endif
+
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+ BLASLONG i=0;
+ FLOAT sumf = 0.0;
+ if (n < 0 || inc_x < 1 ) return(sumf);
+
+ n *= inc_x;
+ while(i < n)
+ {
+ sumf += ABS(x[i]);
+ i += inc_x;
+ }
+ return(sumf);
+}
+
+
diff --git a/kernel/arm/axpy.c b/kernel/arm/axpy.c
new file mode 100644
index 000000000..dceddf78a
--- /dev/null
+++ b/kernel/arm/axpy.c
@@ -0,0 +1,64 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + + +#include "common.h" + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix,iy; + + if ( n < 0 ) return(0); + if ( da == 0.0 ) return(0); + + ix = 0; + iy = 0; + + while(i < n) + { + + y[iy] += da * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/arm/copy.c b/kernel/arm/copy.c new file mode 100644 index 000000000..f742a4a33 --- /dev/null +++ b/kernel/arm/copy.c @@ -0,0 +1,59 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + if ( n < 0 ) return(0); + + while(i < n) + { + + y[iy] = x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/arm/dgemm_kernel_8x2_vfpv3.S b/kernel/arm/dgemm_kernel_8x2_vfpv3.S new file mode 100644 index 000000000..3c474a172 --- /dev/null +++ b/kernel/arm/dgemm_kernel_8x2_vfpv3.S @@ -0,0 +1,1223 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/28 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* 2013/09/22 Saar +* UNROLL_N 2 +* UNROLL_M 8 +* DGEMM_P 64 +* DGEMM_Q 64 +* DGEMM_R 512 +* A_PRE 128 +* B_PRE 128 +* +* Performance on Odroid U2: +* +* 1 Core: 0.92 GFLOPS ATLAS: 0.81 GFLOPS +* 2 Cores: 1.83 GFLOPS ATLAS: 1.51 GFLOPS +* 3 Cores: 2.67 GFLOPS ATLAS: 1.51 GFLOPS +* 4 Cores: 3.52 GFLOPS ATLAS: 1.51 GFLOPS +* +* 2013/09/28 Saar +* UNROLL_N 2 +* UNROLL_M 8 +* DGEMM_P 128 +* DGEMM_Q 128 +* DGEMM_R 2048 +* A_PRE 128 +* B_PRE 128 +* C_PRE 32 +* +* Performance on Odroid U2: +* +* 1 Core: 0.99 GFLOPS ATLAS: 0.82 GFLOPS +* 2 Cores: 1.97 GFLOPS ATLAS: 1.59 GFLOPS +* 3 Cores: 2.86 GFLOPS ATLAS: 1.59 GFLOPS +* 4 Cores: 3.79 GFLOPS ATLAS: 1.59 GFLOPS +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 252 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA d0 + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define C [fp, #-248 ] +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] +#define A [fp, #-268 ] + +#define ALPHA [fp, #-276 ] + +#define B [fp, #4 ] +#define OLD_C [fp, #8 ] +#define OLD_LDC [fp, #12 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 + +#define A_PRE 128 +#define A_PRE1 160 +#define B_PRE 128 +#define C_PRE 32 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT8x2 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9 , d8 + 
vmov.f64 d10, d8 + vmov.f64 d11, d8 + vmov.f64 d12, d8 + vmov.f64 d13, d8 + vmov.f64 d14, d8 + vmov.f64 d15, d8 + vmov.f64 d16, d8 + vmov.f64 d17, d8 + vmov.f64 d18, d8 + vmov.f64 d19, d8 + vmov.f64 d20, d8 + vmov.f64 d21, d8 + vmov.f64 d22, d8 + vmov.f64 d23, d8 + +.endm + +.macro KERNEL8x2_START + + vldm BO!, { d24 , d25} + vldm AO!, { d0, d1 , d2, d3, d4 , d5 , d6 , d7 } + + vmul.f64 d26 , d0 , d24 + vmul.f64 d29 , d0 , d25 + vmul.f64 d27 , d1 , d24 + vmul.f64 d30 , d1 , d25 + vmul.f64 d28 , d2 , d24 + vmul.f64 d31 , d2 , d25 + + pld [AO , #A_PRE] + + vadd.f64 d8 , d8 , d26 + vadd.f64 d16 , d16, d29 + vadd.f64 d9 , d9 , d27 + vadd.f64 d17 , d17, d30 + vadd.f64 d10 , d10, d28 + vadd.f64 d18 , d18, d31 + + vmul.f64 d26 , d3 , d24 + vmul.f64 d27 , d4 , d24 + vmul.f64 d28 , d5 , d24 + vmul.f64 d29 , d3 , d25 + vmul.f64 d30 , d4 , d25 + vmul.f64 d31 , d5 , d25 + + pld [AO , #A_PRE1] + + vadd.f64 d11 , d11, d26 + vadd.f64 d12 , d12, d27 + vadd.f64 d13 , d13, d28 + vadd.f64 d19 , d19, d29 + vadd.f64 d20 , d20, d30 + vadd.f64 d21 , d21, d31 + + vmul.f64 d26 , d6 , d24 + vmul.f64 d27 , d7 , d24 + vmul.f64 d29 , d6 , d25 + vmul.f64 d30 , d7 , d25 + + vldm BO!, { d24 , d25} + vldm AO!, { d0, d1 , d2, d3, d4 , d5 , d6 , d7 } + + vadd.f64 d14 , d14, d26 + vadd.f64 d15 , d15, d27 + vadd.f64 d22 , d22, d29 + vadd.f64 d23 , d23, d30 + +.endm + + +.macro KERNEL8x2_M + + vmul.f64 d26 , d0 , d24 + vmul.f64 d29 , d0 , d25 + vmul.f64 d27 , d1 , d24 + vmul.f64 d30 , d1 , d25 + vmul.f64 d28 , d2 , d24 + vmul.f64 d31 , d2 , d25 + + pld [AO , #A_PRE] + + vadd.f64 d8 , d8 , d26 + vadd.f64 d16 , d16, d29 + vadd.f64 d9 , d9 , d27 + vadd.f64 d17 , d17, d30 + vadd.f64 d10 , d10, d28 + vadd.f64 d18 , d18, d31 + + vmul.f64 d26 , d3 , d24 + vmul.f64 d27 , d4 , d24 + vmul.f64 d28 , d5 , d24 + vmul.f64 d29 , d3 , d25 + vmul.f64 d30 , d4 , d25 + vmul.f64 d31 , d5 , d25 + + pld [AO , #A_PRE1] + + vadd.f64 d11 , d11, d26 + vadd.f64 d12 , d12, d27 + vadd.f64 d13 , d13, d28 + vadd.f64 d19 , d19, d29 + vadd.f64 d20 , d20, d30 + vadd.f64 d21 , d21, d31 + + vmul.f64 d26 , d6 , d24 + vmul.f64 d27 , d7 , d24 + vmul.f64 d29 , d6 , d25 + vmul.f64 d30 , d7 , d25 + + vldm BO!, { d24 , d25} + vldm AO!, { d0, d1 , d2, d3, d4 , d5 , d6 , d7 } + + vadd.f64 d14 , d14, d26 + vadd.f64 d15 , d15, d27 + vadd.f64 d22 , d22, d29 + vadd.f64 d23 , d23, d30 + +.endm + + +.macro KERNEL8x2_END + + vmul.f64 d26 , d0 , d24 + vmul.f64 d29 , d0 , d25 + vmul.f64 d27 , d1 , d24 + vmul.f64 d30 , d1 , d25 + vmul.f64 d28 , d2 , d24 + vmul.f64 d31 , d2 , d25 + + pld [AO , #A_PRE] + + vadd.f64 d8 , d8 , d26 + vadd.f64 d16 , d16, d29 + vadd.f64 d9 , d9 , d27 + vadd.f64 d17 , d17, d30 + vadd.f64 d10 , d10, d28 + vadd.f64 d18 , d18, d31 + + vmul.f64 d26 , d3 , d24 + vmul.f64 d27 , d4 , d24 + vmul.f64 d28 , d5 , d24 + vmul.f64 d29 , d3 , d25 + vmul.f64 d30 , d4 , d25 + vmul.f64 d31 , d5 , d25 + + pld [AO , #A_PRE1] + + vadd.f64 d11 , d11, d26 + vadd.f64 d12 , d12, d27 + vadd.f64 d13 , d13, d28 + vadd.f64 d19 , d19, d29 + vadd.f64 d20 , d20, d30 + vadd.f64 d21 , d21, d31 + + vmul.f64 d26 , d6 , d24 + vmul.f64 d27 , d7 , d24 + vmul.f64 d29 , d6 , d25 + vmul.f64 d30 , d7 , d25 + + vadd.f64 d14 , d14, d26 + vadd.f64 d15 , d15, d27 + vadd.f64 d22 , d22, d29 + vadd.f64 d23 , d23, d30 + +.endm + + + + +.macro KERNEL8x2 + + vldm BO!, { d24 , d25} + vldm AO!, { d0, d1 , d2, d3, d4 , d5 , d6 , d7 } + + vmul.f64 d26 , d0 , d24 + vmul.f64 d29 , d0 , d25 + vmul.f64 d27 , d1 , d24 + vmul.f64 d30 , d1 , d25 + vmul.f64 d28 , d2 , d24 + vmul.f64 d31 , d2 , d25 + + pld [AO , #A_PRE] 
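+
+// The multiplies above and the accumulating adds below are issued as
+// separate vmul/vadd pairs rather than vmla, presumably so the in-order
+// VFPv3 pipeline can overlap the adds with the next block of multiplies.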
+ + vadd.f64 d8 , d8 , d26 + vadd.f64 d16 , d16, d29 + vadd.f64 d9 , d9 , d27 + vadd.f64 d17 , d17, d30 + vadd.f64 d10 , d10, d28 + vadd.f64 d18 , d18, d31 + + vmul.f64 d26 , d3 , d24 + vmul.f64 d27 , d4 , d24 + vmul.f64 d28 , d5 , d24 + vmul.f64 d29 , d3 , d25 + vmul.f64 d30 , d4 , d25 + vmul.f64 d31 , d5 , d25 + + pld [AO , #A_PRE1] + + vadd.f64 d11 , d11, d26 + vadd.f64 d12 , d12, d27 + vadd.f64 d13 , d13, d28 + vadd.f64 d19 , d19, d29 + vadd.f64 d20 , d20, d30 + vadd.f64 d21 , d21, d31 + + vmul.f64 d26 , d6 , d24 + vmul.f64 d27 , d7 , d24 + vmul.f64 d29 , d6 , d25 + vmul.f64 d30 , d7 , d25 + + vadd.f64 d14 , d14, d26 + vadd.f64 d15 , d15, d27 + vadd.f64 d22 , d22, d29 + vadd.f64 d23 , d23, d30 + +.endm + +.macro SAVE8x2 + + vldr d0, ALPHA + vldm CO2, { d24, d25, d26 , d27 , d28 , d29 , d30 , d31 } + + vmul.f64 d8 , d0 , d8 + vmul.f64 d9 , d0 , d9 + vmul.f64 d10, d0 , d10 + vmul.f64 d11, d0 , d11 + vmul.f64 d12, d0 , d12 + vmul.f64 d13, d0 , d13 + vmul.f64 d14, d0 , d14 + vmul.f64 d15, d0 , d15 + + vmul.f64 d16, d0 , d16 + vmul.f64 d17, d0 , d17 + vmul.f64 d18, d0 , d18 + vmul.f64 d19, d0 , d19 + vmul.f64 d20, d0 , d20 + vmul.f64 d21, d0 , d21 + vmul.f64 d22, d0 , d22 + vmul.f64 d23, d0 , d23 + + vldm CO1, { d0 , d1 , d2 , d3 , d4 , d5 , d6 , d7 } + + vadd.f64 d16, d16, d24 + vadd.f64 d17, d17, d25 + vadd.f64 d18, d18, d26 + vadd.f64 d19, d19, d27 + + vadd.f64 d20, d20, d28 + vadd.f64 d21, d21, d29 + vadd.f64 d22, d22, d30 + vadd.f64 d23, d23, d31 + + vadd.f64 d8 , d8 , d0 + vadd.f64 d9 , d9 , d1 + vadd.f64 d10, d10, d2 + vadd.f64 d11, d11, d3 + + vadd.f64 d12, d12, d4 + vadd.f64 d13, d13, d5 + vadd.f64 d14, d14, d6 + vadd.f64 d15, d15, d7 + + vstm CO2!, { d16, d17, d18 , d19 , d20 , d21 , d22 , d23 } + vstm CO1!, { d8 , d9 , d10 , d11 , d12, d13 , d14 , d15 } + +.endm + + +/*************************************************************************************/ + + +.macro INIT4x2 + + vsub.f64 d8 , d8 , d8 + vsub.f64 d9 , d9 , d9 + vsub.f64 d10, d10, d10 + vsub.f64 d11, d11, d11 + vsub.f64 d12, d12, d12 + vsub.f64 d13, d13, d13 + vsub.f64 d14, d14, d14 + vsub.f64 d15, d15, d15 + +.endm + +.macro KERNEL4x2 + + vldm AO!, { d0, d1 , d2, d3 } + vldm BO!, { d4, d5 } + + vmul.f64 d6 , d0 , d4 + vmul.f64 d7 , d1 , d4 + vadd.f64 d8 , d8 , d6 + vadd.f64 d9 , d9 , d7 + + vmul.f64 d6 , d2 , d4 + vmul.f64 d7 , d3 , d4 + vadd.f64 d10, d10, d6 + vadd.f64 d11, d11, d7 + + vmul.f64 d6 , d0 , d5 + vmul.f64 d7 , d1 , d5 + vadd.f64 d12, d12, d6 + vadd.f64 d13, d13, d7 + + vmul.f64 d6 , d2 , d5 + vmul.f64 d7 , d3 , d5 + vadd.f64 d14, d14, d6 + vadd.f64 d15, d15, d7 + +.endm + +.macro SAVE4x2 + + vldr d0, ALPHA + + vmul.f64 d8 , d0 , d8 + vmul.f64 d9 , d0 , d9 + vmul.f64 d10, d0 , d10 + vmul.f64 d11, d0 , d11 + vmul.f64 d12, d0 , d12 + vmul.f64 d13, d0 , d13 + vmul.f64 d14, d0 , d14 + vmul.f64 d15, d0 , d15 + + vldm CO1, { d0, d1 , d2 , d3 } + vldm CO2, { d4, d5 , d6 , d7 } + + vadd.f64 d8 , d8 , d0 + vadd.f64 d9 , d9 , d1 + vadd.f64 d10, d10, d2 + vadd.f64 d11, d11, d3 + + vadd.f64 d12, d12, d4 + vadd.f64 d13, d13, d5 + vadd.f64 d14, d14, d6 + vadd.f64 d15, d15, d7 + + vstm CO1!, { d8 , d9 , d10 , d11 } + vstm CO2!, { d12, d13 ,d14 , d15 } + +.endm + + + +/*************************************************************************************/ + +.macro INIT2x2 + + vsub.f64 d8 , d8 , d8 + vsub.f64 d9 , d9 , d9 + vsub.f64 d12, d12, d12 + vsub.f64 d13, d13, d13 + +.endm + +.macro KERNEL2x2 + + vldm AO!, { d0, d1 } + vldm BO!, { d4, d5 } + + vmul.f64 d6 , d0 , d4 + vmul.f64 d7 , d1 , d4 + vadd.f64 d8 , d8 , d6 + 
vadd.f64 d9 , d9 , d7 + + vmul.f64 d6 , d0 , d5 + vmul.f64 d7 , d1 , d5 + vadd.f64 d12, d12, d6 + vadd.f64 d13, d13, d7 + +.endm + +.macro SAVE2x2 + + vldr d0, ALPHA + + vmul.f64 d8 , d0 , d8 + vmul.f64 d9 , d0 , d9 + vmul.f64 d12, d0 , d12 + vmul.f64 d13, d0 , d13 + + vldm CO1, { d0, d1 } + vldm CO2, { d4, d5 } + + vadd.f64 d8 , d8 , d0 + vadd.f64 d9 , d9 , d1 + + vadd.f64 d12, d12, d4 + vadd.f64 d13, d13, d5 + + vstm CO1!, { d8 , d9 } + vstm CO2!, { d12, d13 } + +.endm + +/*************************************************************************************/ + +.macro INIT1x2 + + vsub.f64 d8 , d8 , d8 + vsub.f64 d12, d12, d12 + +.endm + +.macro KERNEL1x2 + + vldm AO!, { d0 } + vldm BO!, { d4, d5 } + + vmul.f64 d6 , d0 , d4 + vadd.f64 d8 , d8 , d6 + + vmul.f64 d6 , d0 , d5 + vadd.f64 d12, d12, d6 + +.endm + +.macro SAVE1x2 + + vldr d0, ALPHA + + vmul.f64 d8 , d0 , d8 + vmul.f64 d12, d0 , d12 + + vldm CO1, { d0 } + vldm CO2, { d4 } + + vadd.f64 d8 , d8 , d0 + vadd.f64 d12, d12, d4 + + vstm CO1!, { d8 } + vstm CO2!, { d12} + +.endm + +/*************************************************************************************/ + +.macro INIT8x1 + + vsub.f64 d8 , d8 , d8 + vsub.f64 d9 , d9 , d9 + vsub.f64 d10, d10, d10 + vsub.f64 d11, d11, d11 + vsub.f64 d12, d12, d12 + vsub.f64 d13, d13, d13 + vsub.f64 d14, d14, d14 + vsub.f64 d15, d15, d15 + +.endm + +.macro KERNEL8x1 + + vldm AO!, { d0, d1 , d2, d3, d4 , d5 , d6 , d7 } + vldm BO!, { d24 } + + vmul.f64 d26 , d0 , d24 + vmul.f64 d27 , d1 , d24 + vadd.f64 d8 , d8 , d26 + vadd.f64 d9 , d9 , d27 + + vmul.f64 d28 , d2 , d24 + vmul.f64 d29 , d3 , d24 + vadd.f64 d10 , d10, d28 + vadd.f64 d11 , d11, d29 + + vmul.f64 d26 , d4 , d24 + vmul.f64 d27 , d5 , d24 + vadd.f64 d12 , d12, d26 + vadd.f64 d13 , d13, d27 + + vmul.f64 d28 , d6 , d24 + vmul.f64 d29 , d7 , d24 + vadd.f64 d14 , d14, d28 + vadd.f64 d15 , d15, d29 + + +.endm + +.macro SAVE8x1 + + vldr d0, ALPHA + + vmul.f64 d8 , d0 , d8 + vmul.f64 d9 , d0 , d9 + vmul.f64 d10, d0 , d10 + vmul.f64 d11, d0 , d11 + vmul.f64 d12, d0 , d12 + vmul.f64 d13, d0 , d13 + vmul.f64 d14, d0 , d14 + vmul.f64 d15, d0 , d15 + + vldm CO1, { d0, d1 , d2 , d3 , d4 , d5 , d6 , d7 } + + vadd.f64 d8 , d8 , d0 + vadd.f64 d9 , d9 , d1 + vadd.f64 d10, d10, d2 + vadd.f64 d11, d11, d3 + + vadd.f64 d12, d12, d4 + vadd.f64 d13, d13, d5 + vadd.f64 d14, d14, d6 + vadd.f64 d15, d15, d7 + + vstm CO1!, { d8 , d9 , d10 , d11 , d12, d13 ,d14 , d15 } + +.endm + + +/*************************************************************************************/ + +.macro INIT4x1 + + vsub.f64 d8 , d8 , d8 + vsub.f64 d9 , d9 , d9 + vsub.f64 d10, d10, d10 + vsub.f64 d11, d11, d11 + +.endm + +.macro KERNEL4x1 + + vldm AO!, { d0, d1 , d2, d3 } + vldm BO!, { d4 } + + vmul.f64 d6 , d0 , d4 + vmul.f64 d7 , d1 , d4 + vadd.f64 d8 , d8 , d6 + vadd.f64 d9 , d9 , d7 + + vmul.f64 d6 , d2 , d4 + vmul.f64 d7 , d3 , d4 + vadd.f64 d10, d10, d6 + vadd.f64 d11, d11, d7 + +.endm + +.macro SAVE4x1 + + vldr d0, ALPHA + + vmul.f64 d8 , d0 , d8 + vmul.f64 d9 , d0 , d9 + vmul.f64 d10, d0 , d10 + vmul.f64 d11, d0 , d11 + + vldm CO1, { d0, d1 , d2 , d3 } + + vadd.f64 d8 , d8 , d0 + vadd.f64 d9 , d9 , d1 + vadd.f64 d10, d10, d2 + vadd.f64 d11, d11, d3 + + vstm CO1!, { d8 , d9 , d10 , d11 } + +.endm + +/*************************************************************************************/ + +.macro INIT2x1 + + vsub.f64 d8 , d8 , d8 + vsub.f64 d9 , d9 , d9 + +.endm + +.macro KERNEL2x1 + + vldm AO!, { d0, d1 } + vldm BO!, { d4 } + + vmul.f64 d6 , d0 , d4 + vmul.f64 d7 , d1 , d4 + 
vadd.f64 d8 , d8 , d6 + vadd.f64 d9 , d9 , d7 + +.endm + +.macro SAVE2x1 + + vldr d0, ALPHA + + vmul.f64 d8 , d0 , d8 + vmul.f64 d9 , d0 , d9 + + vldm CO1, { d0, d1 } + + vadd.f64 d8 , d8 , d0 + vadd.f64 d9 , d9 , d1 + + vstm CO1!, { d8 , d9 } + +.endm + +/*************************************************************************************/ + +.macro INIT1x1 + + vsub.f64 d8 , d8 , d8 + +.endm + +.macro KERNEL1x1 + + vldm AO!, { d0 } + vldm BO!, { d4 } + + vmul.f64 d6 , d0 , d4 + vadd.f64 d8 , d8 , d6 + +.endm + +.macro SAVE1x1 + + vldr d0, ALPHA + + vmul.f64 d8 , d0 , d8 + + vldm CO1, { d0 } + + vadd.f64 d8 , d8 , d0 + + vstm CO1!, { d8 } + +.endm + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA, ALPHA + + sub r3, fp, #128 + vstm r3, { d8 - d15} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #3 // ldc = ldc * 8 + str r3, LDC + + ldr r3, OLD_C + str r3, C + + ldr K1, K + ldr BC, B + + ldr J, N + asrs J, J, #1 // J = J / 2 + ble _L1_BEGIN + +_L2_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add CO2, CO1, r4 // CO2 = C + LDC + add r3 , CO2, r4 // C = CO2 + LDC + str r3 , C // store C + + ldr AO, A // AO = A + +_L2_M8_BEGIN: + + ldr I, M + asrs I, I, #3 // I = I / 8 + ble _L2_M4_BEGIN + +_L2_M8_20: + + pld [CO1, #C_PRE] + pld [CO2, #C_PRE] + INIT8x2 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble _L2_M8_40 + .align 5 + +_L2_M8_22: + + pld [BO , #B_PRE] + KERNEL8x2_START + KERNEL8x2_M + pld [BO , #B_PRE] + KERNEL8x2_M + KERNEL8x2_M + + pld [BO , #B_PRE] + KERNEL8x2_M + KERNEL8x2_M + pld [BO , #B_PRE] + KERNEL8x2_M + KERNEL8x2_END + + subs L, L, #1 + bgt _L2_M8_22 + + +_L2_M8_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M8_100 + +_L2_M8_42: + + KERNEL8x2 + + subs L, L, #1 + bgt _L2_M8_42 + +_L2_M8_100: + + SAVE8x2 + +_L2_M8_END: + + subs I, I, #1 + bgt _L2_M8_20 + + +_L2_M4_BEGIN: + + ldr I, M + tst I , #7 + ble _L2_END + + tst I , #4 + ble _L2_M2_BEGIN + +_L2_M4_20: + + INIT4x2 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble _L2_M4_40 + .align 5 + +_L2_M4_22: + + KERNEL4x2 + KERNEL4x2 + KERNEL4x2 + KERNEL4x2 + + KERNEL4x2 + KERNEL4x2 + KERNEL4x2 + KERNEL4x2 + + subs L, L, #1 + bgt _L2_M4_22 + + +_L2_M4_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M4_100 + +_L2_M4_42: + + KERNEL4x2 + + subs L, L, #1 + bgt _L2_M4_42 + +_L2_M4_100: + + SAVE4x2 + +_L2_M4_END: + + + +_L2_M2_BEGIN: + + tst I, #2 // I = I / 2 + ble _L2_M1_BEGIN + +_L2_M2_20: + + INIT2x2 + + mov BO, BC + asrs L , K1, #2 // L = L / 4 + ble _L2_M2_40 + +_L2_M2_22: + + KERNEL2x2 + KERNEL2x2 + KERNEL2x2 + KERNEL2x2 + + subs L, L, #1 + bgt _L2_M2_22 + + +_L2_M2_40: + + ands L , K1, #3 // L = L % 4 + ble _L2_M2_100 + +_L2_M2_42: + + KERNEL2x2 + + subs L, L, #1 + bgt _L2_M2_42 + +_L2_M2_100: + + SAVE2x2 + +_L2_M2_END: + + +_L2_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble _L2_END + +_L2_M1_20: + + INIT1x2 + + mov BO, BC + asrs L , K1, #2 // L = L / 4 + ble _L2_M1_40 + +_L2_M1_22: + + KERNEL1x2 + KERNEL1x2 + KERNEL1x2 + KERNEL1x2 + + subs L, L, #1 + bgt _L2_M1_22 + + +_L2_M1_40: + + ands L , K1, #3 // L = L % 4 + ble _L2_M1_100 + +_L2_M1_42: + + KERNEL1x2 + + subs L, L, #1 + bgt _L2_M1_42 + +_L2_M1_100: + + SAVE1x2 + + +_L2_END: + + mov r3, BC + mov r4, K1 + lsl 
r4, r4, #4 // k * 2 * 8 + add r3, r3, r4 // B = B + K * 2 * 8 + mov BC, r3 + + subs J , #1 // j-- + bgt _L2_BEGIN + + +_L1_BEGIN: + + ldr J, N + tst J , #1 // J = J % 2 + ble _L999 + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , CO1, r4 // C = CO1 + LDC + str r3 , C // store C + + ldr AO, A // AO = A + + + +_L1_M8_BEGIN: + + ldr I, M + asrs I, I, #3 // I = I / 8 + ble _L1_M4_BEGIN + +_L1_M8_20: + + INIT8x1 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble _L1_M8_40 + +_L1_M8_22: + + KERNEL8x1 + KERNEL8x1 + KERNEL8x1 + KERNEL8x1 + KERNEL8x1 + KERNEL8x1 + KERNEL8x1 + KERNEL8x1 + + subs L, L, #1 + bgt _L1_M8_22 + + +_L1_M8_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M8_100 + +_L1_M8_42: + + KERNEL8x1 + + subs L, L, #1 + bgt _L1_M8_42 + +_L1_M8_100: + + SAVE8x1 + +_L1_M8_END: + + subs I, I, #1 + bgt _L1_M8_20 + + + + +_L1_M4_BEGIN: + + ldr I, M + tst I, #7 // I = I % 8 + ble _L1_END + + tst I, #4 // I = I % 8 + ble _L1_M2_BEGIN + +_L1_M4_20: + + INIT4x1 + + mov BO, BC + asrs L , K1, #2 // L = L / 4 + ble _L1_M4_40 + +_L1_M4_22: + + KERNEL4x1 + KERNEL4x1 + KERNEL4x1 + KERNEL4x1 + + subs L, L, #1 + bgt _L1_M4_22 + + +_L1_M4_40: + + ands L , K1, #3 // L = L % 4 + ble _L1_M4_100 + +_L1_M4_42: + + KERNEL4x1 + + subs L, L, #1 + bgt _L1_M4_42 + +_L1_M4_100: + + SAVE4x1 + +_L1_M4_END: + + + + +_L1_M2_BEGIN: + + tst I, #2 // I = I % 4 + ble _L1_M1_BEGIN + +_L1_M2_20: + + INIT2x1 + + mov BO, BC + asrs L , K1, #2 // L = L / 4 + ble _L1_M2_40 + +_L1_M2_22: + + KERNEL2x1 + KERNEL2x1 + KERNEL2x1 + KERNEL2x1 + + subs L, L, #1 + bgt _L1_M2_22 + + +_L1_M2_40: + + ands L , K1, #3 // L = L % 4 + ble _L1_M2_100 + +_L1_M2_42: + + KERNEL2x1 + + subs L, L, #1 + bgt _L1_M2_42 + +_L1_M2_100: + + SAVE2x1 + +_L1_M2_END: + + + +_L1_M1_BEGIN: + + tst I, #1 // I = I % 4 + ble _L1_END + +_L1_M1_20: + + INIT1x1 + + mov BO, BC + asrs L , K1, #2 // L = L / 4 + ble _L1_M1_40 + +_L1_M1_22: + + KERNEL1x1 + KERNEL1x1 + KERNEL1x1 + KERNEL1x1 + + subs L, L, #1 + bgt _L1_M1_22 + + +_L1_M1_40: + + ands L , K1, #3 // L = L % 4 + ble _L1_M1_100 + +_L1_M1_42: + + KERNEL1x1 + + subs L, L, #1 + bgt _L1_M1_42 + +_L1_M1_100: + + SAVE1x1 + + +_L1_END: + + + +_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/dot.c b/kernel/arm/dot.c new file mode 100644 index 000000000..30490e291 --- /dev/null +++ b/kernel/arm/dot.c @@ -0,0 +1,64 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#include "common.h" + +#if defined(DSDOT) +double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +#else +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +#endif +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + double dot = 0.0 ; + + if ( n < 0 ) return(dot); + + while(i < n) + { + + dot += y[iy] * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(dot); + +} + + diff --git a/kernel/arm/dtrmm_kernel_8x2_vfpv3.S b/kernel/arm/dtrmm_kernel_8x2_vfpv3.S new file mode 100644 index 000000000..930616635 --- /dev/null +++ b/kernel/arm/dtrmm_kernel_8x2_vfpv3.S @@ -0,0 +1,1521 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/09/28 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 252 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA d0 + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + + +#define KK [fp, #-240 ] +#define KKK [fp, #-244] +#define C [fp, #-248 ] +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] +#define A [fp, #-268 ] + +#define ALPHA [fp, #-276 ] + +#define B [fp, #4 ] +#define OLD_C [fp, #8 ] +#define OLD_LDC [fp, #12 ] +#define OFFSET [fp, #16 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 + +#define A_PRE 128 +#define A_PRE1 160 +#define B_PRE 128 +#define C_PRE 32 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT8x2 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9 , d8 + vmov.f64 d10, d8 + vmov.f64 d11, d8 + vmov.f64 d12, d8 + vmov.f64 d13, d8 + vmov.f64 d14, d8 + vmov.f64 d15, d8 + vmov.f64 d16, d8 + vmov.f64 d17, d8 + vmov.f64 d18, d8 + vmov.f64 d19, d8 + vmov.f64 d20, d8 + vmov.f64 d21, d8 + vmov.f64 d22, d8 + vmov.f64 d23, d8 + +.endm + +.macro KERNEL8x2_START + + vldm BO!, { d24 , d25} + vldm AO!, { d0, d1 , d2, d3, d4 , d5 , d6 , d7 } + + vmul.f64 d26 , d0 , d24 + vmul.f64 d29 , d0 , d25 + vmul.f64 d27 , d1 , d24 + vmul.f64 d30 , d1 , d25 + vmul.f64 d28 , d2 , d24 + vmul.f64 d31 , d2 , d25 + + pld [AO , #A_PRE] + + vadd.f64 d8 , d8 , d26 + vadd.f64 d16 , d16, d29 + vadd.f64 d9 , d9 , d27 + vadd.f64 d17 , d17, d30 + vadd.f64 d10 , d10, d28 + vadd.f64 d18 , d18, d31 + + vmul.f64 d26 , d3 , d24 + vmul.f64 d27 , d4 , d24 + vmul.f64 d28 , d5 , d24 + vmul.f64 d29 , d3 , d25 + vmul.f64 d30 , d4 , d25 + vmul.f64 d31 , d5 , d25 + + pld [AO , #A_PRE1] + + vadd.f64 d11 , d11, d26 + vadd.f64 d12 , d12, d27 + vadd.f64 d13 , d13, d28 + vadd.f64 d19 , d19, d29 + vadd.f64 d20 , d20, d30 + vadd.f64 d21 , d21, d31 + + vmul.f64 d26 , d6 , d24 + vmul.f64 d27 , d7 , d24 + vmul.f64 d29 , d6 , d25 + vmul.f64 d30 , d7 , d25 + + vldm BO!, { d24 , d25} + vldm AO!, { d0, d1 , d2, d3, d4 , d5 , d6 , d7 } + + vadd.f64 d14 , d14, d26 + vadd.f64 d15 , d15, d27 + vadd.f64 d22 , d22, d29 + vadd.f64 d23 , d23, d30 + +.endm + + +.macro KERNEL8x2_M + + vmul.f64 d26 , d0 , d24 + vmul.f64 d29 , d0 , d25 + vmul.f64 d27 , d1 , d24 + vmul.f64 d30 , d1 , d25 + vmul.f64 d28 , d2 , d24 + vmul.f64 d31 , d2 , d25 + + pld [AO , #A_PRE] + + vadd.f64 d8 , d8 , d26 + vadd.f64 d16 , d16, d29 + vadd.f64 d9 , d9 , d27 + vadd.f64 d17 , d17, d30 + vadd.f64 d10 , d10, d28 + vadd.f64 d18 , d18, d31 + + vmul.f64 d26 , d3 , d24 + vmul.f64 d27 , d4 , d24 + vmul.f64 d28 , d5 , d24 + vmul.f64 d29 , d3 , d25 + vmul.f64 d30 , d4 , d25 + vmul.f64 d31 , d5 , d25 + + pld [AO , #A_PRE1] + + vadd.f64 d11 , d11, d26 + vadd.f64 d12 , d12, d27 + vadd.f64 d13 , d13, d28 + vadd.f64 d19 , d19, d29 + vadd.f64 d20 , d20, d30 + 
vadd.f64 d21 , d21, d31 + + vmul.f64 d26 , d6 , d24 + vmul.f64 d27 , d7 , d24 + vmul.f64 d29 , d6 , d25 + vmul.f64 d30 , d7 , d25 + + vldm BO!, { d24 , d25} + vldm AO!, { d0, d1 , d2, d3, d4 , d5 , d6 , d7 } + + vadd.f64 d14 , d14, d26 + vadd.f64 d15 , d15, d27 + vadd.f64 d22 , d22, d29 + vadd.f64 d23 , d23, d30 + +.endm + + +.macro KERNEL8x2_END + + vmul.f64 d26 , d0 , d24 + vmul.f64 d29 , d0 , d25 + vmul.f64 d27 , d1 , d24 + vmul.f64 d30 , d1 , d25 + vmul.f64 d28 , d2 , d24 + vmul.f64 d31 , d2 , d25 + + pld [AO , #A_PRE] + + vadd.f64 d8 , d8 , d26 + vadd.f64 d16 , d16, d29 + vadd.f64 d9 , d9 , d27 + vadd.f64 d17 , d17, d30 + vadd.f64 d10 , d10, d28 + vadd.f64 d18 , d18, d31 + + vmul.f64 d26 , d3 , d24 + vmul.f64 d27 , d4 , d24 + vmul.f64 d28 , d5 , d24 + vmul.f64 d29 , d3 , d25 + vmul.f64 d30 , d4 , d25 + vmul.f64 d31 , d5 , d25 + + pld [AO , #A_PRE1] + + vadd.f64 d11 , d11, d26 + vadd.f64 d12 , d12, d27 + vadd.f64 d13 , d13, d28 + vadd.f64 d19 , d19, d29 + vadd.f64 d20 , d20, d30 + vadd.f64 d21 , d21, d31 + + vmul.f64 d26 , d6 , d24 + vmul.f64 d27 , d7 , d24 + vmul.f64 d29 , d6 , d25 + vmul.f64 d30 , d7 , d25 + + vadd.f64 d14 , d14, d26 + vadd.f64 d15 , d15, d27 + vadd.f64 d22 , d22, d29 + vadd.f64 d23 , d23, d30 + +.endm + + + + +.macro KERNEL8x2 + + vldm BO!, { d24 , d25} + vldm AO!, { d0, d1 , d2, d3, d4 , d5 , d6 , d7 } + + vmul.f64 d26 , d0 , d24 + vmul.f64 d29 , d0 , d25 + vmul.f64 d27 , d1 , d24 + vmul.f64 d30 , d1 , d25 + vmul.f64 d28 , d2 , d24 + vmul.f64 d31 , d2 , d25 + + pld [AO , #A_PRE] + + vadd.f64 d8 , d8 , d26 + vadd.f64 d16 , d16, d29 + vadd.f64 d9 , d9 , d27 + vadd.f64 d17 , d17, d30 + vadd.f64 d10 , d10, d28 + vadd.f64 d18 , d18, d31 + + vmul.f64 d26 , d3 , d24 + vmul.f64 d27 , d4 , d24 + vmul.f64 d28 , d5 , d24 + vmul.f64 d29 , d3 , d25 + vmul.f64 d30 , d4 , d25 + vmul.f64 d31 , d5 , d25 + + pld [AO , #A_PRE1] + + vadd.f64 d11 , d11, d26 + vadd.f64 d12 , d12, d27 + vadd.f64 d13 , d13, d28 + vadd.f64 d19 , d19, d29 + vadd.f64 d20 , d20, d30 + vadd.f64 d21 , d21, d31 + + vmul.f64 d26 , d6 , d24 + vmul.f64 d27 , d7 , d24 + vmul.f64 d29 , d6 , d25 + vmul.f64 d30 , d7 , d25 + + vadd.f64 d14 , d14, d26 + vadd.f64 d15 , d15, d27 + vadd.f64 d22 , d22, d29 + vadd.f64 d23 , d23, d30 + +.endm + +.macro SAVE8x2 + + vldr d0, ALPHA + + vmul.f64 d8 , d0 , d8 + vmul.f64 d9 , d0 , d9 + vmul.f64 d10, d0 , d10 + vmul.f64 d11, d0 , d11 + vmul.f64 d12, d0 , d12 + vmul.f64 d13, d0 , d13 + vmul.f64 d14, d0 , d14 + vmul.f64 d15, d0 , d15 + + vmul.f64 d16, d0 , d16 + vmul.f64 d17, d0 , d17 + vmul.f64 d18, d0 , d18 + vmul.f64 d19, d0 , d19 + vmul.f64 d20, d0 , d20 + vmul.f64 d21, d0 , d21 + vmul.f64 d22, d0 , d22 + vmul.f64 d23, d0 , d23 + + vstm CO2!, { d16, d17, d18 , d19 , d20 , d21 , d22 , d23 } + vstm CO1!, { d8 , d9 , d10 , d11 , d12, d13 , d14 , d15 } + +.endm + + +/*************************************************************************************/ + + +.macro INIT4x2 + + vsub.f64 d8 , d8 , d8 + vsub.f64 d9 , d9 , d9 + vsub.f64 d10, d10, d10 + vsub.f64 d11, d11, d11 + vsub.f64 d12, d12, d12 + vsub.f64 d13, d13, d13 + vsub.f64 d14, d14, d14 + vsub.f64 d15, d15, d15 + +.endm + +.macro KERNEL4x2 + + vldm AO!, { d0, d1 , d2, d3 } + vldm BO!, { d4, d5 } + + vmul.f64 d6 , d0 , d4 + vmul.f64 d7 , d1 , d4 + vadd.f64 d8 , d8 , d6 + vadd.f64 d9 , d9 , d7 + + vmul.f64 d6 , d2 , d4 + vmul.f64 d7 , d3 , d4 + vadd.f64 d10, d10, d6 + vadd.f64 d11, d11, d7 + + vmul.f64 d6 , d0 , d5 + vmul.f64 d7 , d1 , d5 + vadd.f64 d12, d12, d6 + vadd.f64 d13, d13, d7 + + vmul.f64 d6 , d2 , d5 + vmul.f64 d7 , d3 
, d5 + vadd.f64 d14, d14, d6 + vadd.f64 d15, d15, d7 + +.endm + +.macro SAVE4x2 + + vldr d0, ALPHA + + vmul.f64 d8 , d0 , d8 + vmul.f64 d9 , d0 , d9 + vmul.f64 d10, d0 , d10 + vmul.f64 d11, d0 , d11 + vmul.f64 d12, d0 , d12 + vmul.f64 d13, d0 , d13 + vmul.f64 d14, d0 , d14 + vmul.f64 d15, d0 , d15 + + vstm CO1!, { d8 , d9 , d10 , d11 } + vstm CO2!, { d12, d13 ,d14 , d15 } + +.endm + + + +/*************************************************************************************/ + +.macro INIT2x2 + + vsub.f64 d8 , d8 , d8 + vsub.f64 d9 , d9 , d9 + vsub.f64 d12, d12, d12 + vsub.f64 d13, d13, d13 + +.endm + +.macro KERNEL2x2 + + vldm AO!, { d0, d1 } + vldm BO!, { d4, d5 } + + vmul.f64 d6 , d0 , d4 + vmul.f64 d7 , d1 , d4 + vadd.f64 d8 , d8 , d6 + vadd.f64 d9 , d9 , d7 + + vmul.f64 d6 , d0 , d5 + vmul.f64 d7 , d1 , d5 + vadd.f64 d12, d12, d6 + vadd.f64 d13, d13, d7 + +.endm + +.macro SAVE2x2 + + vldr d0, ALPHA + + vmul.f64 d8 , d0 , d8 + vmul.f64 d9 , d0 , d9 + vmul.f64 d12, d0 , d12 + vmul.f64 d13, d0 , d13 + + vstm CO1!, { d8 , d9 } + vstm CO2!, { d12, d13 } + +.endm + +/*************************************************************************************/ + +.macro INIT1x2 + + vsub.f64 d8 , d8 , d8 + vsub.f64 d12, d12, d12 + +.endm + +.macro KERNEL1x2 + + vldm AO!, { d0 } + vldm BO!, { d4, d5 } + + vmul.f64 d6 , d0 , d4 + vadd.f64 d8 , d8 , d6 + + vmul.f64 d6 , d0 , d5 + vadd.f64 d12, d12, d6 + +.endm + +.macro SAVE1x2 + + vldr d0, ALPHA + + vmul.f64 d8 , d0 , d8 + vmul.f64 d12, d0 , d12 + + vstm CO1!, { d8 } + vstm CO2!, { d12} + +.endm + +/*************************************************************************************/ + +.macro INIT8x1 + + vsub.f64 d8 , d8 , d8 + vsub.f64 d9 , d9 , d9 + vsub.f64 d10, d10, d10 + vsub.f64 d11, d11, d11 + vsub.f64 d12, d12, d12 + vsub.f64 d13, d13, d13 + vsub.f64 d14, d14, d14 + vsub.f64 d15, d15, d15 + +.endm + +.macro KERNEL8x1 + + vldm AO!, { d0, d1 , d2, d3, d4 , d5 , d6 , d7 } + vldm BO!, { d24 } + + vmul.f64 d26 , d0 , d24 + vmul.f64 d27 , d1 , d24 + vadd.f64 d8 , d8 , d26 + vadd.f64 d9 , d9 , d27 + + vmul.f64 d28 , d2 , d24 + vmul.f64 d29 , d3 , d24 + vadd.f64 d10 , d10, d28 + vadd.f64 d11 , d11, d29 + + vmul.f64 d26 , d4 , d24 + vmul.f64 d27 , d5 , d24 + vadd.f64 d12 , d12, d26 + vadd.f64 d13 , d13, d27 + + vmul.f64 d28 , d6 , d24 + vmul.f64 d29 , d7 , d24 + vadd.f64 d14 , d14, d28 + vadd.f64 d15 , d15, d29 + + +.endm + +.macro SAVE8x1 + + vldr d0, ALPHA + + vmul.f64 d8 , d0 , d8 + vmul.f64 d9 , d0 , d9 + vmul.f64 d10, d0 , d10 + vmul.f64 d11, d0 , d11 + vmul.f64 d12, d0 , d12 + vmul.f64 d13, d0 , d13 + vmul.f64 d14, d0 , d14 + vmul.f64 d15, d0 , d15 + + vstm CO1!, { d8 , d9 , d10 , d11 , d12, d13 ,d14 , d15 } + +.endm + + +/*************************************************************************************/ + +.macro INIT4x1 + + vsub.f64 d8 , d8 , d8 + vsub.f64 d9 , d9 , d9 + vsub.f64 d10, d10, d10 + vsub.f64 d11, d11, d11 + +.endm + +.macro KERNEL4x1 + + vldm AO!, { d0, d1 , d2, d3 } + vldm BO!, { d4 } + + vmul.f64 d6 , d0 , d4 + vmul.f64 d7 , d1 , d4 + vadd.f64 d8 , d8 , d6 + vadd.f64 d9 , d9 , d7 + + vmul.f64 d6 , d2 , d4 + vmul.f64 d7 , d3 , d4 + vadd.f64 d10, d10, d6 + vadd.f64 d11, d11, d7 + +.endm + +.macro SAVE4x1 + + vldr d0, ALPHA + + vmul.f64 d8 , d0 , d8 + vmul.f64 d9 , d0 , d9 + vmul.f64 d10, d0 , d10 + vmul.f64 d11, d0 , d11 + + vstm CO1!, { d8 , d9 , d10 , d11 } + +.endm + +/*************************************************************************************/ + +.macro INIT2x1 + + vsub.f64 d8 , d8 , d8 + vsub.f64 d9 , d9 , d9 + 
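+	// the INIT macros clear their accumulators with vsub x,x,x instead of
+	// loading a zero constant; note x - x only yields +0 for finite x, so
+	// this assumes the saved d-registers do not hold NaN or Inf on entry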
+.endm + +.macro KERNEL2x1 + + vldm AO!, { d0, d1 } + vldm BO!, { d4 } + + vmul.f64 d6 , d0 , d4 + vmul.f64 d7 , d1 , d4 + vadd.f64 d8 , d8 , d6 + vadd.f64 d9 , d9 , d7 + +.endm + +.macro SAVE2x1 + + vldr d0, ALPHA + + vmul.f64 d8 , d0 , d8 + vmul.f64 d9 , d0 , d9 + + vstm CO1!, { d8 , d9 } + +.endm + +/*************************************************************************************/ + +.macro INIT1x1 + + vsub.f64 d8 , d8 , d8 + +.endm + +.macro KERNEL1x1 + + vldm AO!, { d0 } + vldm BO!, { d4 } + + vmul.f64 d6 , d0 , d4 + vadd.f64 d8 , d8 , d6 + +.endm + +.macro SAVE1x1 + + vldr d0, ALPHA + + vmul.f64 d8 , d0 , d8 + + vstm CO1!, { d8 } + +.endm + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA, ALPHA + + sub r3, fp, #128 + vstm r3, { d8 - d15} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #3 // ldc = ldc * 8 + str r3, LDC + + ldr r3, OLD_C + str r3, C + + ldr BC, B + + ldr r3, OFFSET +#ifndef LEFT + neg r3 , r3 +#endif + str r3 , KK + + ldr J, N + asrs J, J, #1 // J = J / 2 + ble _L1_BEGIN + +_L2_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add CO2, CO1, r4 // CO2 = C + LDC + add r3 , CO2, r4 // C = CO2 + LDC + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + ldr AO, A // AO = A + +_L2_M8_BEGIN: + + ldr I, M + asrs I, I, #3 // I = I / 8 + ble _L2_M4_BEGIN + +_L2_M8_20: + + pld [CO1, #C_PRE] + pld [CO2, #C_PRE] + + INIT8x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 2 double values + add BO , BO , r4 + lsls r4 , r3 , #6 // 8 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #8 // number of values in AO +#else + add L , L , #2 // number of values in BO +#endif + str L , KKK +#endif + + asrs K1 , L, #3 // L = L / 8 + ble _L2_M8_40 + .align 5 + +_L2_M8_22: + + pld [BO , #B_PRE] + KERNEL8x2_START + KERNEL8x2_M + pld [BO , #B_PRE] + KERNEL8x2_M + KERNEL8x2_M + + pld [BO , #B_PRE] + KERNEL8x2_M + KERNEL8x2_M + pld [BO , #B_PRE] + KERNEL8x2_M + KERNEL8x2_END + + subs K1, K1, #1 + bgt _L2_M8_22 + + +_L2_M8_40: + + ands K1 , L, #7 // L = L % 8 + ble _L2_M8_100 + +_L2_M8_42: + + KERNEL8x2 + + subs K1, K1, #1 + bgt _L2_M8_42 + +_L2_M8_100: + + SAVE8x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 2 double values + add BO , BO , r4 + lsls r4 , r3 , #6 // 8 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #8 // number of values in AO + str r3 , KK +#endif + + +_L2_M8_END: + + subs I, I, #1 + bgt _L2_M8_20 + + +_L2_M4_BEGIN: + + ldr I, M + tst I , #7 + ble _L2_END + + tst I , #4 + ble _L2_M2_BEGIN + +_L2_M4_20: + + INIT4x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 2 double values + add BO , BO , r4 + lsls r4 , 
r3 , #5 // 4 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #4 // number of values in AO +#else + add L , L , #2 // number of values in BO +#endif + str L , KKK +#endif + + asrs K1, L, #3 // L = L / 8 + ble _L2_M4_40 + .align 5 + +_L2_M4_22: + + KERNEL4x2 + KERNEL4x2 + KERNEL4x2 + KERNEL4x2 + + KERNEL4x2 + KERNEL4x2 + KERNEL4x2 + KERNEL4x2 + + subs K1, K1, #1 + bgt _L2_M4_22 + + +_L2_M4_40: + + ands K1, L, #7 // L = L % 8 + ble _L2_M4_100 + +_L2_M4_42: + + KERNEL4x2 + + subs K1, K1, #1 + bgt _L2_M4_42 + +_L2_M4_100: + + SAVE4x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 2 double values + add BO , BO , r4 + lsls r4 , r3 , #5 // 4 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #4 // number of values in AO + str r3 , KK +#endif + +_L2_M4_END: + + + +_L2_M2_BEGIN: + + tst I, #2 // I = I / 2 + ble _L2_M1_BEGIN + +_L2_M2_20: + + INIT2x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 2 double values + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #2 // number of values in AO +#else + add L , L , #2 // number of values in BO +#endif + str L , KKK +#endif + + asrs K1, L, #2 // L = L / 4 + ble _L2_M2_40 + +_L2_M2_22: + + KERNEL2x2 + KERNEL2x2 + KERNEL2x2 + KERNEL2x2 + + subs K1, K1, #1 + bgt _L2_M2_22 + + +_L2_M2_40: + + ands K1, L, #3 // L = L % 4 + ble _L2_M2_100 + +_L2_M2_42: + + KERNEL2x2 + + subs K1, K1, #1 + bgt _L2_M2_42 + +_L2_M2_100: + + SAVE2x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 2 double values + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + +_L2_M2_END: + + +_L2_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble _L2_END + +_L2_M1_20: + + INIT1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 2 double values + add BO , BO , r4 + lsls r4 , r3 , #3 // 1 double value + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #1 // number of values in AO +#else + add L , L , #2 // number of values in BO +#endif + str L , KKK +#endif + + asrs K1, L, #2 // L = L / 4 + ble _L2_M1_40 + +_L2_M1_22: + + KERNEL1x2 + KERNEL1x2 + KERNEL1x2 + KERNEL1x2 + + subs K1, K1, #1 + bgt _L2_M1_22 + + +_L2_M1_40: + + ands K1, L, #3 // L = L % 4 + ble _L2_M1_100 + +_L2_M1_42: + + KERNEL1x2 + + subs K1, K1, #1 + bgt _L2_M1_42 + +_L2_M1_100: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 
, K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 2 double values + add BO , BO , r4 + lsls r4 , r3 , #3 // 1 double value + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #1 // number of values in AO + str r3 , KK +#endif + + + +_L2_END: + + mov r3, BC + ldr r4, K + lsl r4, r4, #4 // k * 2 * 8 + add r3, r3, r4 // B = B + K * 2 * 8 + mov BC, r3 + +#if !defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in BO + str r3 , KK +#endif + + subs J , #1 // j-- + bgt _L2_BEGIN + + +_L1_BEGIN: + + ldr J, N + tst J , #1 // J = J % 2 + ble _L999 + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , CO1, r4 // C = CO1 + LDC + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + ldr AO, A // AO = A + + + +_L1_M8_BEGIN: + + ldr I, M + asrs I, I, #3 // I = I / 8 + ble _L1_M4_BEGIN + +_L1_M8_20: + + INIT8x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 1 double value + add BO , BO , r4 + lsls r4 , r3 , #6 // 8 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #8 // number of values in AO +#else + add L , L , #1 // number of values in BO +#endif + str L , KKK +#endif + + asrs K1, L, #3 // L = L / 8 + ble _L1_M8_40 + +_L1_M8_22: + + KERNEL8x1 + KERNEL8x1 + KERNEL8x1 + KERNEL8x1 + KERNEL8x1 + KERNEL8x1 + KERNEL8x1 + KERNEL8x1 + + subs K1, K1, #1 + bgt _L1_M8_22 + + +_L1_M8_40: + + ands K1, L, #7 // L = L % 8 + ble _L1_M8_100 + +_L1_M8_42: + + KERNEL8x1 + + subs K1, K1, #1 + bgt _L1_M8_42 + +_L1_M8_100: + + SAVE8x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #3 // 1 double value + add BO , BO , r4 + lsls r4 , r3 , #6 // 8 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #8 // number of values in AO + str r3 , KK +#endif + + +_L1_M8_END: + + subs I, I, #1 + bgt _L1_M8_20 + + + + +_L1_M4_BEGIN: + + ldr I, M + tst I, #7 // I = I % 8 + ble _L1_END + + tst I, #4 // I = I % 8 + ble _L1_M2_BEGIN + +_L1_M4_20: + + INIT4x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 1 double value + add BO , BO , r4 + lsls r4 , r3 , #5 // 4 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #4 // number of values in AO +#else + add L , L , #1 // number of values in BO +#endif + str L , KKK +#endif + + asrs K1, L, #2 // L = L / 4 + ble _L1_M4_40 + +_L1_M4_22: + + KERNEL4x1 + KERNEL4x1 + KERNEL4x1 + KERNEL4x1 + + subs K1, K1, #1 + bgt _L1_M4_22 + + +_L1_M4_40: + + ands K1, L, #3 // L = L % 4 + ble _L1_M4_100 + +_L1_M4_42: + + KERNEL4x1 + + subs K1, K1, #1 + bgt _L1_M4_42 + +_L1_M4_100: + + SAVE4x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #3 // 1 double value + add BO , BO , r4 + lsls r4 , r3 , #5 // 4 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + 
ldr r3 , KK + add r3 , r3 , #4 // number of values in AO + str r3 , KK +#endif + + +_L1_M4_END: + + + + +_L1_M2_BEGIN: + + tst I, #2 // I = I % 4 + ble _L1_M1_BEGIN + +_L1_M2_20: + + INIT2x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 1 double value + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #2 // number of values in AO +#else + add L , L , #1 // number of values in BO +#endif + str L , KKK +#endif + + asrs K1, L, #2 // L = L / 4 + ble _L1_M2_40 + +_L1_M2_22: + + KERNEL2x1 + KERNEL2x1 + KERNEL2x1 + KERNEL2x1 + + subs K1, K1, #1 + bgt _L1_M2_22 + + +_L1_M2_40: + + ands K1 , L, #3 // L = L % 4 + ble _L1_M2_100 + +_L1_M2_42: + + KERNEL2x1 + + subs K1, K1, #1 + bgt _L1_M2_42 + +_L1_M2_100: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #3 // 1 double value + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + +_L1_M2_END: + + + +_L1_M1_BEGIN: + + tst I, #1 // I = I % 4 + ble _L1_END + +_L1_M1_20: + + INIT1x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 1 double value + add BO , BO , r4 + lsls r4 , r3 , #3 // 1 double value + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #1 // number of values in AO +#else + add L , L , #1 // number of values in BO +#endif + str L , KKK +#endif + + asrs K1, L, #2 // L = L / 4 + ble _L1_M1_40 + +_L1_M1_22: + + KERNEL1x1 + KERNEL1x1 + KERNEL1x1 + KERNEL1x1 + + subs K1, K1, #1 + bgt _L1_M1_22 + + +_L1_M1_40: + + ands K1 , L, #3 // L = L % 4 + ble _L1_M1_100 + +_L1_M1_42: + + KERNEL1x1 + + subs K1, K1, #1 + bgt _L1_M1_42 + +_L1_M1_100: + + SAVE1x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #3 // 1 double value + add BO , BO , r4 + lsls r4 , r3 , #3 // 1 double value + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #1 // number of values in AO + str r3 , KK +#endif + + + +_L1_END: + + + +_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/gemv_n.c b/kernel/arm/gemv_n.c new file mode 100644 index 000000000..aedcca965 --- /dev/null +++ b/kernel/arm/gemv_n.c @@ -0,0 +1,67 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+/**************************************************************************************
+* 2013/09/14 Saar
+* BLASTEST float   : OK
+* BLASTEST double  : OK
+* CTEST            : OK
+* TEST             : OK
+*
+**************************************************************************************/
+
+
+#include "common.h"
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+{
+	BLASLONG i;
+	BLASLONG ix,iy;
+	BLASLONG j;
+	FLOAT *a_ptr;
+	FLOAT temp;
+
+	ix = 0;
+	a_ptr = a;
+
+	for (j=0; j<n; j++)
+	{
+		temp = alpha * x[ix];
+		iy = 0;
+		for (i=0; i<m; i++)
+		{
+			y[iy] += temp * a_ptr[i];
+			iy += inc_y;
+		}
+		a_ptr += lda;
+		ix    += inc_x;
+	}
+	return(0);
+}
+
+
diff --git a/kernel/arm/iamax.c b/kernel/arm/iamax.c
new file mode 100644
--- /dev/null
+++ b/kernel/arm/iamax.c
@@ -0,0 +1,75 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+* BLASTEST float   : NoTest
+* BLASTEST double  : NoTest
+* CTEST            : OK
+* TEST             : OK
+*
+**************************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+
+#else
+
+#define ABS fabsf
+
+#endif
+
+
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	BLASLONG i=0;
+	BLASLONG ix=0;
+	FLOAT maxf=0.0;
+	BLASLONG max=0;
+
+	if (n < 0 || inc_x < 1 ) return(max);
+
+	maxf=ABS(x[0]);
+
+	while(i < n)
+	{
+		if( ABS(x[ix]) > ABS(maxf) )
+		{
+			max = i;
+			maxf = ABS(x[ix]);
+		}
+		ix += inc_x;
+		i++;
+	}
+	return(max+1);
+}
+
+
diff --git a/kernel/arm/iamin.c b/kernel/arm/iamin.c
new file mode 100644
index 000000000..fdb5d7a10
--- /dev/null
+++ b/kernel/arm/iamin.c
@@ -0,0 +1,75 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : NoTest +* BLASTEST double : NoTest +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf=0.0; + BLASLONG min=0; + + if (n < 0 || inc_x < 1 ) return(min); + + minf=ABS(x[0]); + + while(i < n) + { + if( ABS(x[ix]) < ABS(minf) ) + { + min = i; + minf = ABS(x[ix]); + } + ix += inc_x; + i++; + } + return(min+1); +} + + diff --git a/kernel/arm/imax.c b/kernel/arm/imax.c new file mode 100644 index 000000000..e3e4b9a6c --- /dev/null +++ b/kernel/arm/imax.c @@ -0,0 +1,67 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+
+/**************************************************************************************
+* 2013/09/14 Saar
+* BLASTEST float   : NoTest
+* BLASTEST double  : NoTest
+* CTEST            : NoTest
+* TEST             : NoTest
+*
+**************************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	BLASLONG i=0;
+	BLASLONG ix=0;
+	FLOAT maxf=0.0;
+	BLASLONG max=0;
+
+	if (n < 0 || inc_x < 1 ) return(max);
+
+	maxf=x[0];
+
+	while(i < n)
+	{
+		if( x[ix] > maxf )
+		{
+			max = i;
+			maxf = x[ix];
+		}
+		ix += inc_x;
+		i++;
+	}
+	return(max+1);
+}
+
+
diff --git a/kernel/arm/imin.c b/kernel/arm/imin.c
new file mode 100644
index 000000000..fbcadc2fd
--- /dev/null
+++ b/kernel/arm/imin.c
@@ -0,0 +1,65 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+/**************************************************************************************
+* 2013/08/19 Saar
+* BLASTEST float
+* BLASTEST double
+*
+**************************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	BLASLONG i=0;
+	BLASLONG ix=0;
+	FLOAT minf=0.0;
+	BLASLONG min=0;
+
+	if (n < 0 || inc_x < 1 ) return(min);
+
+	minf=x[0];
+
+	while(i < n)
+	{
+		if( x[ix] < minf )
+		{
+			min = i;
+			minf = x[ix];
+		}
+		ix += inc_x;
+		i++;
+	}
+	return(min+1);
+}
+
+
diff --git a/kernel/arm/izamax.c b/kernel/arm/izamax.c
new file mode 100644
index 000000000..a6ba86388
--- /dev/null
+++ b/kernel/arm/izamax.c
@@ -0,0 +1,81 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+* BLASTEST float   : NoTest
+* BLASTEST double  : NoTest
+* CTEST            : OK
+* TEST             : OK
+*
+**************************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+
+#else
+
+#define ABS fabsf
+
+#endif
+
+#define CABS1(x,i)	ABS(x[i])+ABS(x[i+1])
+
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	BLASLONG i=0;
+	BLASLONG ix=0;
+	FLOAT maxf[2];
+	BLASLONG max=0;
+	BLASLONG inc_x2;
+
+	if (n < 0 || inc_x < 1 ) return(max);
+
+	inc_x2 = 2 * inc_x;
+
+	maxf[0] = ABS(x[ix]);
+	maxf[1] = ABS(x[ix+1]);
+
+	while(i < n)
+	{
+		if( CABS1(x,ix) > CABS1(maxf,0) )
+		{
+			max = i;
+			maxf[0] = ABS(x[ix]);
+			maxf[1] = ABS(x[ix+1]);
+		}
+		ix += inc_x2;
+		i++;
+	}
+	return(max+1);
+}
+
+
diff --git a/kernel/arm/izamin.c b/kernel/arm/izamin.c
new file mode 100644
index 000000000..45c2a7c9c
--- /dev/null
+++ b/kernel/arm/izamin.c
@@ -0,0 +1,81 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+* BLASTEST float   : NoTest
+* BLASTEST double  : NoTest
+* CTEST            : NoTest
+* TEST             : NoTest
+*
+**************************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+
+#else
+
+#define ABS fabsf
+
+#endif
+
+#define CABS1(x,i)	ABS(x[i])+ABS(x[i+1])
+
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	BLASLONG i=0;
+	BLASLONG ix=0;
+	FLOAT minf[2];
+	BLASLONG min=0;
+	BLASLONG inc_x2;
+
+	if (n < 0 || inc_x < 1 ) return(min);
+
+	inc_x2 = 2 * inc_x;
+
+	minf[0] = ABS(x[ix]);
+	minf[1] = ABS(x[ix+1]);
+
+	while(i < n)
+	{
+		if( CABS1(x,ix) < CABS1(minf,0) )
+		{
+			min = i;
+			minf[0] = ABS(x[ix]);
+			minf[1] = ABS(x[ix+1]);
+		}
+		ix += inc_x2;
+		i++;
+	}
+	return(min+1);
+}
+
+
diff --git a/kernel/arm/max.c b/kernel/arm/max.c
new file mode 100644
index 000000000..3239e3408
--- /dev/null
+++ b/kernel/arm/max.c
@@ -0,0 +1,63 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+* BLASTEST float   : NoTest
+* BLASTEST double  : NoTest
+* CTEST            : NoTest
+* TEST             : NoTest
+*
+**************************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	BLASLONG i=0;
+	BLASLONG ix=0;
+	FLOAT maxf=0.0;
+
+	if (n < 0 || inc_x < 1 ) return(maxf);
+
+	maxf=x[0];
+
+	while(i < n)
+	{
+		if( x[ix] > maxf )
+		{
+			maxf = x[ix];
+		}
+		ix += inc_x;
+		i++;
+	}
+	return(maxf);
}
+
+
diff --git a/kernel/arm/min.c b/kernel/arm/min.c
new file mode 100644
index 000000000..de4c4719a
--- /dev/null
+++ b/kernel/arm/min.c
@@ -0,0 +1,63 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+* BLASTEST float   : NoTest
+* BLASTEST double  : NoTest
+* CTEST            : NoTest
+* TEST             : NoTest
+*
+**************************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	BLASLONG i=0;
+	BLASLONG ix=0;
+	FLOAT minf=0.0;
+
+	if (n < 0 || inc_x < 1 ) return(minf);
+
+	minf=x[0];
+
+	while(i < n)
+	{
+		if( x[ix] < minf )
+		{
+			minf = x[ix];
+		}
+		ix += inc_x;
+		i++;
+	}
+	return(minf);
+}
+
+
diff --git a/kernel/arm/nrm2.c b/kernel/arm/nrm2.c
new file mode 100644
index 000000000..d65c5a410
--- /dev/null
+++ b/kernel/arm/nrm2.c
@@ -0,0 +1,88 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/13 Saar
+* BLASTEST float   : OK
+* BLASTEST double  : OK
+* CTEST            : OK
+* TEST             : OK
+*
+**************************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+
+#else
+
+#define ABS fabsf
+
+#endif
+
+
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	BLASLONG i=0;
+	FLOAT scale = 0.0;
+	FLOAT ssq   = 1.0;
+	FLOAT absxi = 0.0;
+
+
+	if (n < 0 || inc_x < 1 ) return(0.0);
+	if ( n == 1 ) return( ABS(x[0]) );
+
+	n *= inc_x;
+	while(i < n)
+	{
+
+		if ( x[i] != 0.0 )
+		{
+			absxi = ABS( x[i] );
+			if ( scale < absxi )
+			{
+				ssq = 1 + ssq * ( scale / absxi ) * ( scale / absxi );
+				scale = absxi ;
+			}
+			else
+			{
+				ssq += ( absxi/scale ) * ( absxi/scale );
+			}
+
+		}
+		i += inc_x;
+	}
+	scale = scale * sqrt( ssq );
+	return(scale);
+
+}
+
+
diff --git a/kernel/arm/rot.c b/kernel/arm/rot.c
new file mode 100644
index 000000000..aa60b4471
--- /dev/null
+++ b/kernel/arm/rot.c
@@ -0,0 +1,62 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp; + + if ( n <= 0 ) return(0); + + while(i < n) + { + temp = c*x[ix] + s*y[iy] ; + y[iy] = c*y[iy] - s*x[ix] ; + x[ix] = temp ; + + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/arm/scal.c b/kernel/arm/scal.c new file mode 100644 index 000000000..d385c46bc --- /dev/null +++ b/kernel/arm/scal.c @@ -0,0 +1,58 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+* BLASTEST float : OK
+* BLASTEST double : OK
+* CTEST : OK
+* TEST : OK
+*
+**************************************************************************************/
+
+#include "common.h"
+
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+{
+	BLASLONG i=0;
+
+	if ( n < 0 || inc_x < 1 ) return(0);
+	if ( da == 1.0 ) return(0);
+
+	n *= inc_x;
+	while(i < n)
+	{
+
+		x[i] = da * x[i] ;
+		i += inc_x ;
+
+	}
+	return(0);
+
+}
+
+
diff --git a/kernel/arm/swap.c b/kernel/arm/swap.c
new file mode 100644
index 000000000..1ca9e7607
--- /dev/null
+++ b/kernel/arm/swap.c
@@ -0,0 +1,62 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/08/20 Saar
+* BLASTEST float OK
+* BLASTEST double OK
+*
+**************************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+{
+	BLASLONG i=0;
+	BLASLONG ix=0,iy=0;
+	FLOAT temp;
+
+	if ( n < 0 ) return(0);
+
+	while(i < n)
+	{
+
+		temp = x[ix] ;
+		x[ix] = y[iy] ;
+		y[iy] = temp ;
+
+		ix += inc_x ;
+		iy += inc_y ;
+		i++ ;
+
+	}
+	return(0);
+
+}
+
+
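The scal.c kernel above pre-multiplies the loop bound by the stride, so a single index both walks the buffer and terminates the loop. A minimal sketch of that idiom, using a hypothetical scal_strided() helper that is not part of this patch:

#include <stdio.h>

/* Same shape as scal.c: the bound is pre-scaled by the stride, so one
   index both addresses the buffer and ends the loop. */
static void scal_strided(long n, double da, double *x, long inc_x)
{
	long i = 0;

	if (n <= 0 || inc_x < 1 || da == 1.0) return;
	n *= inc_x;
	while (i < n) {
		x[i] = da * x[i];
		i += inc_x;
	}
}

int main(void)
{
	double x[4] = { 1.0, 2.0, 3.0, 4.0 };

	scal_strided(2, 10.0, x, 2);          /* scales x[0] and x[2] only */
	printf("%g %g %g %g\n", x[0], x[1], x[2], x[3]); /* 10 2 30 4 */
	return 0;
}

diff --git a/kernel/arm/zamax.c b/kernel/arm/zamax.c
new file mode 100644
index 000000000..8c2a5c346
--- /dev/null
+++ b/kernel/arm/zamax.c
@@ -0,0 +1,81 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.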
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+* BLASTEST float : OK
+* BLASTEST double : OK
+* CTEST : NoTest
+* TEST : NoTest
+*
+**************************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+
+#else
+
+#define ABS fabsf
+
+#endif
+
+#define CABS1(x,i)	ABS(x[i])+ABS(x[i+1])
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	BLASLONG i=0;
+	BLASLONG ix=0;
+	FLOAT maxf[2];
+	BLASLONG max=0;
+	BLASLONG inc_x2;
+
+	if (n <= 0 || inc_x < 1 ) return(0.0);
+
+	inc_x2 = 2 * inc_x;
+
+	maxf[0] = ABS(x[ix]);
+	maxf[1] = ABS(x[ix+1]);
+
+	while(i < n)
+	{
+		if( CABS1(x,ix) > CABS1(maxf,0) )
+		{
+			max = i;
+			maxf[0] = ABS(x[ix]);
+			maxf[1] = ABS(x[ix+1]);
+		}
+		ix += inc_x2;
+		i++;
+	}
+	return(CABS1(maxf,0));
+}
+
+
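CABS1 here is |re| + |im|, the reference-BLAS shortcut for ranking complex magnitudes without the cost (or overflow risk) of a true modulus. The two measures can order elements differently, which matters when comparing results against other implementations; a quick standalone demonstration:

#include <math.h>
#include <stdio.h>

#define CABS1(re, im) (fabs(re) + fabs(im))

int main(void)
{
	/* a = 1 + 0.9i, b = 1.4 + 0i */
	double a_re = 1.0, a_im = 0.9, b_re = 1.4, b_im = 0.0;

	/* CABS1 ranks a above b ... */
	printf("cabs1: %g vs %g\n", CABS1(a_re, a_im), CABS1(b_re, b_im)); /* 1.9 vs 1.4 */
	/* ... while the true modulus ranks b above a. */
	printf("mod:   %g vs %g\n", hypot(a_re, a_im), hypot(b_re, b_im)); /* ~1.345 vs 1.4 */
	return 0;
}

diff --git a/kernel/arm/zamin.c b/kernel/arm/zamin.c
new file mode 100644
index 000000000..6956ced0e
--- /dev/null
+++ b/kernel/arm/zamin.c
@@ -0,0 +1,81 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.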
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+* BLASTEST float : OK
+* BLASTEST double : OK
+* CTEST : NoTest
+* TEST : NoTest
+*
+**************************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+
+#else
+
+#define ABS fabsf
+
+#endif
+
+#define CABS1(x,i)	ABS(x[i])+ABS(x[i+1])
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	BLASLONG i=0;
+	BLASLONG ix=0;
+	FLOAT minf[2];
+	BLASLONG min=0;
+	BLASLONG inc_x2;
+
+	if (n <= 0 || inc_x < 1 ) return(0.0);
+
+	inc_x2 = 2 * inc_x;
+
+	minf[0] = ABS(x[ix]);
+	minf[1] = ABS(x[ix+1]);
+
+	while(i < n)
+	{
+		if( CABS1(x,ix) < CABS1(minf,0) )
+		{
+			min = i;
+			minf[0] = ABS(x[ix]);
+			minf[1] = ABS(x[ix+1]);
+		}
+		ix += inc_x2;
+		i++;
+	}
+	return(CABS1(minf,0));
+}
+
+
diff --git a/kernel/arm/zasum.c b/kernel/arm/zasum.c
new file mode 100644
index 000000000..13acfc0f0
--- /dev/null
+++ b/kernel/arm/zasum.c
@@ -0,0 +1,71 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+* BLASTEST float : OK
+* BLASTEST double : OK
+* CTEST : OK
+* TEST : OK
+*
+**************************************************************************************/
+
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+
+#else
+
+#define ABS fabsf
+
+#endif
+
+#define CABS1(x,i)	ABS(x[i])+ABS(x[i+1])
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	BLASLONG i=0;
+	FLOAT sumf = 0.0;
+	BLASLONG inc_x2;
+
+	if (n < 0 || inc_x < 1 ) return(sumf);
+
+	inc_x2 = 2 * inc_x;
+
+	n *= inc_x2;
+	while(i < n)
+	{
+		sumf += CABS1(x,i);
+		i += inc_x2;
+	}
+	return(sumf);
+}
+
+
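zasum therefore returns the sum of |re| + |im| over the selected elements, not the sum of complex moduli. For x = {3+4i, -1-2i} with inc_x = 1 the expected value is 3+4+1+2 = 10; a quick standalone check:

#include <math.h>
#include <stdio.h>

int main(void)
{
	/* Interleaved (re, im) pairs, as the kernel above expects. */
	double x[4] = { 3.0, 4.0, -1.0, -2.0 };
	double sum = 0.0;

	for (int i = 0; i < 4; i += 2)
		sum += fabs(x[i]) + fabs(x[i + 1]);
	printf("%g\n", sum); /* prints 10 */
	return 0;
}

diff --git a/kernel/arm/zaxpy.c b/kernel/arm/zaxpy.c
new file mode 100644
index 000000000..28a4380fb
--- /dev/null
+++ b/kernel/arm/zaxpy.c
@@ -0,0 +1,72 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.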
+*****************************************************************************/ + +/************************************************************************************** +* 2013/09/15 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + + +#include "common.h" + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix,iy; + + if ( n < 0 ) return(0); + if ( da_r == 0.0 && da_i == 0.0 ) return(0); + + ix = 0; + iy = 0; + + BLASLONG inc_x2 = 2 * inc_x; + BLASLONG inc_y2 = 2 * inc_y; + + while(i < n) + { +#if !defined(CONJ) + y[iy] += ( da_r * x[ix] - da_i * x[ix+1] ) ; + y[iy+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; +#else + y[iy] += ( da_r * x[ix] + da_i * x[ix+1] ) ; + y[iy+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; +#endif + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/arm/zcopy.c b/kernel/arm/zcopy.c new file mode 100644 index 000000000..654711240 --- /dev/null +++ b/kernel/arm/zcopy.c @@ -0,0 +1,63 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + if ( n < 0 ) return(0); + + BLASLONG inc_x2 = 2 * inc_x; + BLASLONG inc_y2 = 2 * inc_y; + + while(i < n) + { + + y[iy] = x[ix] ; + y[iy+1] = x[ix+1] ; + ix += inc_x2; + iy += inc_y2; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/arm/zdot.c b/kernel/arm/zdot.c new file mode 100644 index 000000000..096ced9db --- /dev/null +++ b/kernel/arm/zdot.c @@ -0,0 +1,78 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+* BLASTEST float : FAIL
+* BLASTEST double : FAIL
+* CTEST : OK
+* TEST : OK
+*
+**************************************************************************************/
+
+#include "common.h"
+#include <complex.h>
+
+FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+{
+	BLASLONG i=0;
+	BLASLONG ix=0,iy=0;
+	FLOAT dot[2];
+	FLOAT _Complex result;
+
+	dot[0]=0.0;
+	dot[1]=0.0;
+
+	__real__ result = 0.0 ;
+	__imag__ result = 0.0 ;
+
+	if ( n < 1 ) return(result);
+
+	BLASLONG inc_x2 = 2 * inc_x ;
+	BLASLONG inc_y2 = 2 * inc_y ;
+
+	while(i < n)
+	{
+#if !defined(CONJ)
+		dot[0] += ( x[ix] * y[iy] - x[ix+1] * y[iy+1] ) ;
+		dot[1] += ( x[ix+1] * y[iy] + x[ix] * y[iy+1] ) ;
+#else
+		dot[0] += ( x[ix] * y[iy] + x[ix+1] * y[iy+1] ) ;
+		dot[1] -= ( x[ix+1] * y[iy] - x[ix] * y[iy+1] ) ;
+#endif
+		ix += inc_x2 ;
+		iy += inc_y2 ;
+		i++ ;
+
+	}
+	__real__ result = dot[0];
+	__imag__ result = dot[1];
+	return(result);
+
+}
+
+
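The CONJ branch selects between the unconjugated sum over x_k * y_k (the zdotu entry point) and the conjugated sum over conj(x_k) * y_k (zdotc). A sketch of the two products in C99 complex arithmetic, independent of the kernel's real-arithmetic code path:

#include <complex.h>
#include <stdio.h>

int main(void)
{
	double complex x = 1.0 + 2.0 * I;
	double complex y = 3.0 + 4.0 * I;

	/* zdotu-style: plain product, (1+2i)(3+4i) = -5 + 10i */
	double complex u = x * y;
	/* zdotc-style: conjugated first operand, (1-2i)(3+4i) = 11 - 2i */
	double complex c = conj(x) * y;

	printf("dotu = %g%+gi\n", creal(u), cimag(u));
	printf("dotc = %g%+gi\n", creal(c), cimag(c));
	return 0;
}

diff --git a/kernel/arm/zgemv_n.c b/kernel/arm/zgemv_n.c
new file mode 100644
index 000000000..5f00c34f6
--- /dev/null
+++ b/kernel/arm/zgemv_n.c
@@ -0,0 +1,125 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.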
+*****************************************************************************/ + +/************************************************************************************** + * * 2013/09/15 Saar + * * BLASTEST float : OK + * * BLASTEST double : OK + * CTEST : OK + * TEST : OK + * * + * **************************************************************************************/ + + +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i; + BLASLONG ix,iy; + BLASLONG j; + FLOAT *a_ptr; + FLOAT temp_r,temp_i; + BLASLONG inc_x2,inc_y2; + BLASLONG lda2; + BLASLONG i2; + + if( alpha_r == 0.0 && alpha_i == 0.0 ) return(0); + + lda2 = 2*lda; + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + ix = 0; + a_ptr = a; + +#if !defined(CONJ) + for (j=0; j + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT scale = 0.0; + FLOAT ssq = 1.0; + BLASLONG inc_x2; + FLOAT temp; + + if (n < 0 || inc_x < 1 ) return(0.0); + + inc_x2 = 2 * inc_x; + + n *= inc_x2; + while(i < n) + { + + if ( x[i] != 0.0 ) + { + temp = ABS( x[i] ); + if ( scale < temp ) + { + ssq = 1 + ssq * ( scale / temp ) * ( scale / temp ); + scale = temp ; + } + else + { + ssq += ( temp / scale ) * ( temp / scale ); + } + + } + + if ( x[i+1] != 0.0 ) + { + temp = ABS( x[i+1] ); + if ( scale < temp ) + { + ssq = 1 + ssq * ( scale / temp ) * ( scale / temp ); + scale = temp ; + } + else + { + ssq += ( temp / scale ) * ( temp / scale ); + } + + } + + + i += inc_x2; + } + scale = scale * sqrt( ssq ); + return(scale); + +} + + diff --git a/kernel/arm/zrot.c b/kernel/arm/zrot.c new file mode 100644 index 000000000..4a2f37f64 --- /dev/null +++ b/kernel/arm/zrot.c @@ -0,0 +1,68 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp[2]; + + if ( n <= 0 ) return(0); + + BLASLONG inc_x2 = 2 * inc_x ; + BLASLONG inc_y2 = 2 * inc_y ; + + while(i < n) + { + temp[0] = c*x[ix] + s*y[iy] ; + temp[1] = c*x[ix+1] + s*y[iy+1] ; + y[iy] = c*y[iy] - s*x[ix] ; + y[iy+1] = c*y[iy+1] - s*x[ix+1] ; + x[ix] = temp[0] ; + x[ix+1] = temp[1] ; + + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/arm/zscal.c b/kernel/arm/zscal.c new file mode 100644 index 000000000..833dc8c03 --- /dev/null +++ b/kernel/arm/zscal.c @@ -0,0 +1,64 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG inc_x2; + BLASLONG ip = 0; + FLOAT temp; + + if ( n < 0 || inc_x < 1 ) return(0); + + inc_x2 = 2 * inc_x; + for ( i=0; i + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp[2]; + + if ( n < 0 ) return(0); + + BLASLONG inc_x2 = 2 * inc_x; + BLASLONG inc_y2 = 2 * inc_y; + + while(i < n) + { + + temp[0] = x[ix] ; + temp[1] = x[ix+1] ; + x[ix] = y[iy] ; + x[ix+1] = y[iy+1] ; + y[iy] = temp[0] ; + y[iy+1] = temp[1] ; + + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + return(0); + +} + + From d13788d1b4027d9e545694b7cad71a877bcab3dd Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 28 Sep 2013 19:10:32 +0200 Subject: [PATCH 011/127] common files modified for ARM --- Makefile.rule | 30 ++++----- Makefile.system | 37 +++-------- common.h | 9 ++- common_arm.h | 163 ++++++++++++++++++++++++++++++++++++++++++++++++ ctest.c | 6 ++ getarch.c | 15 +++++ param.h | 43 +++++++++++++ 7 files changed, 255 insertions(+), 48 deletions(-) create mode 100644 common_arm.h diff --git a/Makefile.rule b/Makefile.rule index e357d5ccc..a7aa0873d 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -12,7 +12,7 @@ VERSION = 0.2.8 # You can specify the target architecture, otherwise it's # automatically detected. -# TARGET = PENRYN +TARGET = ARMV7 # If you want to support multiple architecture in one binary # DYNAMIC_ARCH = 1 @@ -25,20 +25,20 @@ VERSION = 0.2.8 # FC = gfortran # Even you can specify cross compiler. Meanwhile, please set HOSTCC. -# CC = x86_64-w64-mingw32-gcc -# FC = x86_64-w64-mingw32-gfortran +CC = arm-linux-gnueabihf-gcc +FC = arm-linux-gnueabihf-gfortran # If you use the cross compiler, please set this host compiler. -# HOSTCC = gcc +HOSTCC = gcc # If you need 32bit binary, define BINARY=32, otherwise define BINARY=64 -# BINARY=64 +#BINARY=32 # About threaded BLAS. It will be automatically detected if you don't # specify it. # For force setting for single threaded, specify USE_THREAD = 0 # For force setting for multi threaded, specify USE_THREAD = 1 -# USE_THREAD = 0 +USE_THREAD = 0 # If you're going to use this library with OpenMP, please comment it in. # USE_OPENMP = 1 @@ -46,7 +46,7 @@ VERSION = 0.2.8 # You can define maximum number of threads. Basically it should be # less than actual number of cores. If you don't specify one, it's # automatically detected by the the script. -# NUM_THREADS = 24 +NUM_THREADS = 4 # if you don't need generate the shared library, please comment it in. # NO_SHARED = 1 @@ -54,16 +54,12 @@ VERSION = 0.2.8 # If you don't need CBLAS interface, please comment it in. # NO_CBLAS = 1 -# If you only want CBLAS interface without installing Fortran compiler, -# please comment it in. -# ONLY_CBLAS = 1 - # If you don't need LAPACK, please comment it in. # If you set NO_LAPACK=1, the library automatically sets NO_LAPACKE=1. 
-# NO_LAPACK = 1 +#NO_LAPACK = 1 # If you don't need LAPACKE (C Interface to LAPACK), please comment it in. -# NO_LAPACKE = 1 +#NO_LAPACKE = 1 # If you want to use legacy threaded Level 3 implementation. # USE_SIMPLE_THREADED_LEVEL3 = 1 @@ -76,10 +72,10 @@ VERSION = 0.2.8 # Unfortunately most of kernel won't give us high quality buffer. # BLAS tries to find the best region before entering main function, # but it will consume time. If you don't like it, you can disable one. -# NO_WARMUP = 1 +NO_WARMUP = 1 # If you want to disable CPU/Memory affinity on Linux. -# NO_AFFINITY = 1 +NO_AFFINITY = 1 # Don't use AVX kernel on Sandy Bridge. It is compatible with old compilers # and OS. However, the performance is low. @@ -127,13 +123,13 @@ VERSION = 0.2.8 # Common Optimization Flag; # The default -O2 is enough. -# COMMON_OPT = -O2 +COMMON_OPT = -O0 -marm -mfpu=vfpv3 -fno-omit-frame-pointer # Profiling flags COMMON_PROF = -pg # Build Debug version -# DEBUG = 1 +DEBUG = 1 # # End of user configuration diff --git a/Makefile.system b/Makefile.system index 858160fc4..e5358f65b 100644 --- a/Makefile.system +++ b/Makefile.system @@ -82,19 +82,12 @@ ifeq ($(HOSTCC), loongcc) GETARCH_FLAGS += -static endif -#if don't use Fortran, it will only compile CBLAS. -ifeq ($(ONLY_CBLAS), 1) -NO_LAPACK = 1 -else -ONLY_CBLAS = 0 -endif - # This operation is expensive, so execution should be once. ifndef GOTOBLAS_MAKEFILE export GOTOBLAS_MAKEFILE = 1 # Generating Makefile.conf and config.h -DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" CFLAGS="$(GETARCH_FLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) all) +DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" CFLAGS="$(GETARCH_FLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) all) ifndef TARGET_CORE include $(TOPDIR)/Makefile.conf @@ -331,14 +324,16 @@ ifeq ($(ARCH), x86) DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO ifneq ($(NO_AVX), 1) -DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER +DYNAMIC_CORE += SANDYBRIDGE +#BULLDOZER PILEDRIVER endif endif ifeq ($(ARCH), x86_64) DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO ifneq ($(NO_AVX), 1) -DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER +DYNAMIC_CORE += SANDYBRIDGE +#BULLDOZER PILEDRIVER endif endif @@ -368,6 +363,10 @@ NO_BINARY_MODE = 1 BINARY_DEFINED = 1 endif +ifeq ($(ARCH), arm) +NO_BINARY_MODE = 1 +BINARY_DEFINED = 1 +endif # # C Compiler dependent settings # @@ -892,23 +891,6 @@ LIBZIPNAME = $(LIBNAME:.$(LIBSUFFIX)=.zip) LIBS = $(TOPDIR)/$(LIBNAME) LIBS_P = $(TOPDIR)/$(LIBNAME_P) - -LIB_COMPONENTS = BLAS -ifneq ($(NO_CBLAS), 1) -LIB_COMPONENTS += CBLAS -endif - -ifneq ($(NO_LAPACK), 1) -LIB_COMPONENTS += LAPACK -ifneq ($(NO_LAPACKE), 1) -LIB_COMPONENTS += LAPACKE -endif -endif - -ifeq ($(ONLY_CBLAS), 1) -LIB_COMPONENTS = CBLAS -endif - export OSNAME export ARCH export CORE @@ -935,7 +917,6 @@ export USE_OPENMP export CROSS export CROSS_SUFFIX export NOFORTRAN -export NO_FBLAS export EXTRALIB export CEXTRALIB export FEXTRALIB diff --git a/common.h b/common.h index 309f246e2..418ed25f5 100644 --- a/common.h +++ b/common.h @@ -363,6 +363,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246 #include "common_mips64.h" #endif +#ifdef ARCH_ARM +#include "common_arm.h" +#endif + 
#ifdef OS_LINUX #include "common_linux.h" #endif @@ -574,10 +578,9 @@ typedef struct { #include "common_level2.h" #include "common_level3.h" #include "common_lapack.h" - #ifdef CBLAS -# define OPENBLAS_CONST /* see comment in cblas.h */ -# include "cblas.h" +/* This header file is generated from "cblas.h" (see Makefile.prebuild). */ +#include "cblas_noconst.h" #endif #ifndef ASSEMBLER diff --git a/common_arm.h b/common_arm.h new file mode 100644 index 000000000..b61efd7c1 --- /dev/null +++ b/common_arm.h @@ -0,0 +1,163 @@ +/***************************************************************************** +Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the ISCAS nor the names of its contributors may + be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#ifndef COMMON_ARM +#define COMMON_ARM + +#define MB +#define WMB + +#define INLINE inline + +#define RETURN_BY_COMPLEX + +#ifndef ASSEMBLER + +static void INLINE blas_lock(volatile unsigned long *address){ + +// long int ret, val = 1; +/* + do { + while (*address) {YIELDING;}; + + __asm__ __volatile__( + "1: ll %0, %3\n" + " ori %2, %0, 1\n" + " sc %2, %1\n" + " beqz %2, 1b\n" + " andi %2, %0, 1\n" + " sync\n" + : "=&r" (val), "=m" (address), "=&r" (ret) + : "m" (address) + : "memory"); + + } while (ret); +*/ +} + +static inline unsigned int rpcc(void){ + unsigned long ret=0; + + return ret; +} + +static inline int blas_quickdivide(blasint x, blasint y){ + return x / y; +} + +#if defined(DOUBLE) +#define GET_IMAGE(res) __asm__ __volatile__("vstr.f64 d1, %0" : "=m"(res) : : "memory") +#else +#define GET_IMAGE(res) __asm__ __volatile__("vstr.f32 s1, %0" : "=m"(res) : : "memory") +#endif + +#define GET_IMAGE_CANCEL + +#endif + + +#ifndef F_INTERFACE +#define REALNAME ASMNAME +#else +#define REALNAME ASMFNAME +#endif + +#if defined(ASSEMBLER) && !defined(NEEDPARAM) + +#define PROLOGUE \ + .arm ;\ + .global REALNAME ;\ + .func REALNAME ;\ +REALNAME: + +#define EPILOGUE + +#define PROFCODE + +#endif + + +#define SEEK_ADDRESS + +#ifndef PAGESIZE +#define PAGESIZE ( 4 << 10) +#endif +#define HUGE_PAGESIZE ( 4 << 20) + +#define BUFFER_SIZE (16 << 20) + + +#define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER) + +#ifndef MAP_ANONYMOUS +#define MAP_ANONYMOUS MAP_ANON +#endif + +#endif diff --git a/ctest.c b/ctest.c index 413519274..184416339 100644 --- a/ctest.c +++ b/ctest.c @@ -124,3 +124,9 @@ ARCH_IA64 #if defined(__LP64) || defined(__LP64__) || defined(__ptr64) || defined(__x86_64__) || defined(__amd64__) || defined(__64BIT__) BINARY_64 #endif + +#if defined(__ARM_ARCH) || defined(__ARM_ARCH_7A__) +ARCH_ARM +#endif + + diff --git a/getarch.c b/getarch.c index 3ffda6244..3264a76f6 100644 --- a/getarch.c +++ b/getarch.c @@ -679,6 +679,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define CORENAME "generic" #endif +#ifdef FORCE_ARMV7 +#define FORCE +#define ARCHITECTURE "ARM" +#define SUBARCHITECTURE "ARMV7" +#define SUBDIRNAME "arm" +#define ARCHCONFIG "-DARMV7 " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " +#define LIBNAME "armv7" +#define CORENAME "ARMV7" +#else +#endif + + #ifndef FORCE #if defined(__powerpc__) || defined(__powerpc) || defined(powerpc) || \ diff --git a/param.h b/param.h index 0c3df6951..79c18f7e2 100644 --- a/param.h +++ b/param.h @@ -1793,6 +1793,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SYMV_P 16 #endif + +#ifdef ARMV7 +#define SNUMOPT 2 +#define DNUMOPT 2 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 2 +#define SGEMM_DEFAULT_UNROLL_N 2 + +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 2 + +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 2 + +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 + +#define SGEMM_DEFAULT_P 64 +#define DGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_P 24 +#define ZGEMM_DEFAULT_P 20 + +#define SGEMM_DEFAULT_Q 192 +#define DGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 128 +#define ZGEMM_DEFAULT_Q 64 + +#define SGEMM_DEFAULT_R 512 +#define DGEMM_DEFAULT_R 2048 +#define CGEMM_DEFAULT_R 512 +#define ZGEMM_DEFAULT_R 512 + + + +#define SYMV_P 16 +#endif + + + #ifdef GENERIC #define SNUMOPT 2 From 69ce737cc5942831ab877ca88c410039ab997e8e Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 28 Sep 2013 19:13:47 +0200 Subject: [PATCH 012/127] modified Makefile.L3 for ARM --- kernel/Makefile.L3 | 198 +++------------------------------------------ 1 file changed, 12 insertions(+), 186 deletions(-) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index f8152ac50..f543cd08d 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -14,6 +14,16 @@ ifeq ($(ARCH), MIPS) USE_GEMM3M = 1 endif +ifeq ($(ARCH), arm) +USE_TRMM = 1 +endif + +ifeq ($(TARGET), LOONGSON3B) +USE_TRMM = 1 +endif + + + SKERNELOBJS += \ sgemm_kernel$(TSUFFIX).$(SUFFIX) \ $(SGEMMINCOPYOBJ) $(SGEMMITCOPYOBJ) \ @@ -498,7 +508,8 @@ $(KDIR)xgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMD $(KDIR)xgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $@ -ifeq ($(TARGET), LOONGSON3B) + +ifdef USE_TRMM $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ @@ -582,24 +593,6 @@ $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ - -else - -ifdef STRMMKERNEL - -$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ - -$(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ - -$(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ - -$(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) - $(CC) $(CFLAGS) -c 
-DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ - - else $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ @@ -613,93 +606,17 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ -endif - -ifdef DTRMMKERNEL - -ifdef DTRMMKERNEL_LN -$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL_LN) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ -else -$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ -endif - -ifdef DTRMMKERNEL_LT -$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL_LT) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ -else -$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ -endif - -ifdef DTRMMKERNEL_RN -$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL_RN) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ -else -$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ -endif - -ifdef DTRMMKERNEL_RT -$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL_RT) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ -else -$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ -endif - -else - -ifdef DTRMMKERNEL_LN -$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL_LN) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ -else $(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ -endif -ifdef DTRMMKERNEL_LT -$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL_LT) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ -else $(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ -endif -ifdef DTRMMKERNEL_RN -$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL_RN) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ -else $(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ -endif -ifdef DTRMMKERNEL_RT -$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL_RT) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ -else $(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ -endif - -endif - -ifdef QTRMMKERNEL - -$(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ - -$(KDIR)qtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT 
-DTRANSA $< -o $@ - -$(KDIR)qtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ - -$(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ - -else $(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ @@ -713,36 +630,6 @@ $(KDIR)qtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ -endif - -ifdef CTRMMKERNEL - -$(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ - -$(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ - -$(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ - -$(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ - -else - $(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ @@ -767,37 +654,6 @@ $(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ -endif - -ifdef ZTRMMKERNEL - -$(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ - -$(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ - -$(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ - 
-$(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ - -$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ - - -else - $(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ @@ -821,37 +677,10 @@ $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ - endif -endif - -ifdef XTRMMKERNEL - -$(KDIR)xtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)xtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)xtrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ - -$(KDIR)xtrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ -$(KDIR)xtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)xtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)xtrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ -$(KDIR)xtrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ -else $(KDIR)xtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ @@ -877,9 +706,6 @@ $(KDIR)xtrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(KDIR)xtrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ -endif - - $(KDIR)cgemm3m_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM3MKERNEL) $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@ From 4a474ea7dc21f68267442b89e1fa88c04d9187a0 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 29 Sep 2013 17:46:23 +0200 Subject: [PATCH 013/127] changed dgemm_kernel to use fused multiply add --- kernel/arm/dgemm_kernel_8x2_vfpv3.S | 380 +++++++++------------------- 1 file changed, 115 insertions(+), 265 deletions(-) diff --git a/kernel/arm/dgemm_kernel_8x2_vfpv3.S b/kernel/arm/dgemm_kernel_8x2_vfpv3.S index 3c474a172..e4b256832 100644 --- a/kernel/arm/dgemm_kernel_8x2_vfpv3.S +++ b/kernel/arm/dgemm_kernel_8x2_vfpv3.S @@ -26,43 +26,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** -* 2013/09/28 Saar +* 2013/09/29 Saar * BLASTEST : OK * CTEST : OK * TEST : OK * -* 2013/09/22 Saar -* UNROLL_N 2 -* UNROLL_M 8 -* DGEMM_P 64 -* DGEMM_Q 64 -* DGEMM_R 512 -* A_PRE 128 -* B_PRE 128 -* -* Performance on Odroid U2: -* -* 1 Core: 0.92 GFLOPS ATLAS: 0.81 GFLOPS -* 2 Cores: 1.83 GFLOPS ATLAS: 1.51 GFLOPS -* 3 Cores: 2.67 GFLOPS ATLAS: 1.51 GFLOPS -* 4 Cores: 3.52 GFLOPS ATLAS: 1.51 GFLOPS * -* 2013/09/28 Saar +* 2013/09/29 Saar * UNROLL_N 2 * UNROLL_M 8 * DGEMM_P 128 * DGEMM_Q 128 * DGEMM_R 2048 -* A_PRE 128 -* B_PRE 128 -* C_PRE 32 +* A_PRE 192 +* B_PRE 32 +* C_PRE 64 * * Performance on Odroid U2: * -* 1 Core: 0.99 GFLOPS ATLAS: 0.82 GFLOPS -* 2 Cores: 1.97 GFLOPS ATLAS: 1.59 GFLOPS -* 3 Cores: 2.86 GFLOPS ATLAS: 1.59 GFLOPS -* 4 Cores: 3.79 GFLOPS ATLAS: 1.59 GFLOPS +* 1 Core: 1.48 GFLOPS ATLAS: 1.52 GFLOPS +* 2 Cores: 2.92 GFLOPS ATLAS: - GFLOPS +* 3 Cores: 4.08 GFLOPS ATLAS: - GFLOPS +* 4 Cores: 4.80 GFLOPS ATLAS: 3.80 GFLOPS **************************************************************************************/ #define ASSEMBLER @@ -108,10 +93,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define K1 r7 #define BC r12 -#define A_PRE 128 -#define A_PRE1 160 -#define B_PRE 128 -#define C_PRE 32 +#define A_PRE 192 +#define A_PRE1 224 +#define B_PRE 32 +#define C_PRE 64 /************************************************************************************** * Macro definitions @@ -138,258 +123,123 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm -.macro KERNEL8x2_START - - vldm BO!, { d24 , d25} - vldm AO!, { d0, d1 , d2, d3, d4 , d5 , d6 , d7 } - - vmul.f64 d26 , d0 , d24 - vmul.f64 d29 , d0 , d25 - vmul.f64 d27 , d1 , d24 - vmul.f64 d30 , d1 , d25 - vmul.f64 d28 , d2 , d24 - vmul.f64 d31 , d2 , d25 - - pld [AO , #A_PRE] - - vadd.f64 d8 , d8 , d26 - vadd.f64 d16 , d16, d29 - vadd.f64 d9 , d9 , d27 - vadd.f64 d17 , d17, d30 - vadd.f64 d10 , d10, d28 - vadd.f64 d18 , d18, d31 - - vmul.f64 d26 , d3 , d24 - vmul.f64 d27 , d4 , d24 - vmul.f64 d28 , d5 , d24 - vmul.f64 d29 , d3 , d25 - vmul.f64 d30 , d4 , d25 - vmul.f64 d31 , d5 , d25 - - pld [AO , #A_PRE1] - - vadd.f64 d11 , d11, d26 - vadd.f64 d12 , d12, d27 - vadd.f64 d13 , d13, d28 - vadd.f64 d19 , d19, d29 - vadd.f64 d20 , d20, d30 - vadd.f64 d21 , d21, d31 - - vmul.f64 d26 , d6 , d24 - vmul.f64 d27 , d7 , d24 - vmul.f64 d29 , d6 , d25 - vmul.f64 d30 , d7 , d25 - - vldm BO!, { d24 , d25} - vldm AO!, { d0, d1 , d2, d3, d4 , d5 , d6 , d7 } - - vadd.f64 d14 , d14, d26 - vadd.f64 d15 , d15, d27 - vadd.f64 d22 , d22, d29 - vadd.f64 d23 , d23, d30 - -.endm - -.macro KERNEL8x2_M - vmul.f64 d26 , d0 , d24 - vmul.f64 d29 , d0 , d25 - vmul.f64 d27 , d1 , d24 - vmul.f64 d30 , d1 , d25 - vmul.f64 d28 , d2 , d24 - vmul.f64 d31 , d2 , d25 +.macro KERNEL8x2 + fldmiad BO!, { d24 , d25} pld [AO , #A_PRE] + fldmiad AO!, { d0, d1 } - vadd.f64 d8 , d8 , d26 - vadd.f64 d16 , d16, d29 - vadd.f64 d9 , d9 , d27 - vadd.f64 d17 , d17, d30 - vadd.f64 d10 , d10, d28 - vadd.f64 d18 , d18, d31 - - vmul.f64 d26 , d3 , d24 - vmul.f64 d27 , d4 , d24 - vmul.f64 d28 , d5 , d24 - vmul.f64 d29 , d3 , d25 - vmul.f64 d30 , d4 , d25 - vmul.f64 d31 , d5 , d25 - - pld [AO , #A_PRE1] - - vadd.f64 d11 , d11, d26 - vadd.f64 d12 , d12, d27 - vadd.f64 d13 , d13, d28 - vadd.f64 d19 , d19, d29 - vadd.f64 d20 , d20, d30 - vadd.f64 d21 , d21, d31 - - 
vmul.f64 d26 , d6 , d24 - vmul.f64 d27 , d7 , d24 - vmul.f64 d29 , d6 , d25 - vmul.f64 d30 , d7 , d25 - - vldm BO!, { d24 , d25} - vldm AO!, { d0, d1 , d2, d3, d4 , d5 , d6 , d7 } - - vadd.f64 d14 , d14, d26 - vadd.f64 d15 , d15, d27 - vadd.f64 d22 , d22, d29 - vadd.f64 d23 , d23, d30 - -.endm - + fmacd d8 , d0, d24 + fldmiad AO!, { d2, d3 } + fmacd d9 , d1, d24 + fldmiad AO!, { d4, d5 } + fmacd d16 , d0, d25 + fldmiad AO!, { d6, d7 } + fmacd d17 , d1, d25 -.macro KERNEL8x2_END + fmacd d10 , d2, d24 + fmacd d11 , d3, d24 + fmacd d18 , d2, d25 + fmacd d19 , d3, d25 - vmul.f64 d26 , d0 , d24 - vmul.f64 d29 , d0 , d25 - vmul.f64 d27 , d1 , d24 - vmul.f64 d30 , d1 , d25 - vmul.f64 d28 , d2 , d24 - vmul.f64 d31 , d2 , d25 + pld [AO , #A_PRE-32] - pld [AO , #A_PRE] + fmacd d12 , d4, d24 + fmacd d13 , d5, d24 + fmacd d20 , d4, d25 + fmacd d21 , d5, d25 - vadd.f64 d8 , d8 , d26 - vadd.f64 d16 , d16, d29 - vadd.f64 d9 , d9 , d27 - vadd.f64 d17 , d17, d30 - vadd.f64 d10 , d10, d28 - vadd.f64 d18 , d18, d31 - - vmul.f64 d26 , d3 , d24 - vmul.f64 d27 , d4 , d24 - vmul.f64 d28 , d5 , d24 - vmul.f64 d29 , d3 , d25 - vmul.f64 d30 , d4 , d25 - vmul.f64 d31 , d5 , d25 - - pld [AO , #A_PRE1] - - vadd.f64 d11 , d11, d26 - vadd.f64 d12 , d12, d27 - vadd.f64 d13 , d13, d28 - vadd.f64 d19 , d19, d29 - vadd.f64 d20 , d20, d30 - vadd.f64 d21 , d21, d31 - - vmul.f64 d26 , d6 , d24 - vmul.f64 d27 , d7 , d24 - vmul.f64 d29 , d6 , d25 - vmul.f64 d30 , d7 , d25 - - vadd.f64 d14 , d14, d26 - vadd.f64 d15 , d15, d27 - vadd.f64 d22 , d22, d29 - vadd.f64 d23 , d23, d30 - -.endm - - - - -.macro KERNEL8x2 - - vldm BO!, { d24 , d25} - vldm AO!, { d0, d1 , d2, d3, d4 , d5 , d6 , d7 } - - vmul.f64 d26 , d0 , d24 - vmul.f64 d29 , d0 , d25 - vmul.f64 d27 , d1 , d24 - vmul.f64 d30 , d1 , d25 - vmul.f64 d28 , d2 , d24 - vmul.f64 d31 , d2 , d25 - - pld [AO , #A_PRE] - - vadd.f64 d8 , d8 , d26 - vadd.f64 d16 , d16, d29 - vadd.f64 d9 , d9 , d27 - vadd.f64 d17 , d17, d30 - vadd.f64 d10 , d10, d28 - vadd.f64 d18 , d18, d31 - - vmul.f64 d26 , d3 , d24 - vmul.f64 d27 , d4 , d24 - vmul.f64 d28 , d5 , d24 - vmul.f64 d29 , d3 , d25 - vmul.f64 d30 , d4 , d25 - vmul.f64 d31 , d5 , d25 - - pld [AO , #A_PRE1] - - vadd.f64 d11 , d11, d26 - vadd.f64 d12 , d12, d27 - vadd.f64 d13 , d13, d28 - vadd.f64 d19 , d19, d29 - vadd.f64 d20 , d20, d30 - vadd.f64 d21 , d21, d31 - - vmul.f64 d26 , d6 , d24 - vmul.f64 d27 , d7 , d24 - vmul.f64 d29 , d6 , d25 - vmul.f64 d30 , d7 , d25 - - vadd.f64 d14 , d14, d26 - vadd.f64 d15 , d15, d27 - vadd.f64 d22 , d22, d29 - vadd.f64 d23 , d23, d30 + fmacd d14 , d6, d24 + fmacd d15 , d7, d24 + fmacd d22 , d6, d25 + fmacd d23 , d7, d25 .endm .macro SAVE8x2 vldr d0, ALPHA - vldm CO2, { d24, d25, d26 , d27 , d28 , d29 , d30 , d31 } - - vmul.f64 d8 , d0 , d8 - vmul.f64 d9 , d0 , d9 - vmul.f64 d10, d0 , d10 - vmul.f64 d11, d0 , d11 - vmul.f64 d12, d0 , d12 - vmul.f64 d13, d0 , d13 - vmul.f64 d14, d0 , d14 - vmul.f64 d15, d0 , d15 + vldm CO1, { d24, d25, d26 , d27 , d28 , d29 , d30 , d31 } + + vmla.f64 d24, d0 , d8 + vmla.f64 d25, d0 , d9 + vmla.f64 d26, d0 , d10 + vmla.f64 d27, d0 , d11 + vmla.f64 d28, d0 , d12 + vmla.f64 d29, d0 , d13 + vmla.f64 d30, d0 , d14 + vmla.f64 d31, d0 , d15 + + vstm CO1!, { d24, d25, d26 , d27 , d28 , d29 , d30 , d31 } + vldm CO2, { d8 , d9 , d10 , d11 , d12, d13 , d14 , d15 } - vmul.f64 d16, d0 , d16 - vmul.f64 d17, d0 , d17 - vmul.f64 d18, d0 , d18 - vmul.f64 d19, d0 , d19 - vmul.f64 d20, d0 , d20 - vmul.f64 d21, d0 , d21 - vmul.f64 d22, d0 , d22 - vmul.f64 d23, d0 , d23 - - vldm CO1, { d0 , d1 , 
d2 , d3 , d4 , d5 , d6 , d7 } - - vadd.f64 d16, d16, d24 - vadd.f64 d17, d17, d25 - vadd.f64 d18, d18, d26 - vadd.f64 d19, d19, d27 - - vadd.f64 d20, d20, d28 - vadd.f64 d21, d21, d29 - vadd.f64 d22, d22, d30 - vadd.f64 d23, d23, d31 + vmla.f64 d8 , d0 , d16 + vmla.f64 d9 , d0 , d17 + vmla.f64 d10, d0 , d18 + vmla.f64 d11, d0 , d19 + vmla.f64 d12, d0 , d20 + vmla.f64 d13, d0 , d21 + vmla.f64 d14, d0 , d22 + vmla.f64 d15, d0 , d23 - vadd.f64 d8 , d8 , d0 - vadd.f64 d9 , d9 , d1 - vadd.f64 d10, d10, d2 - vadd.f64 d11, d11, d3 + vstm CO2!, { d8 , d9 , d10 , d11 , d12, d13 , d14 , d15 } - vadd.f64 d12, d12, d4 - vadd.f64 d13, d13, d5 - vadd.f64 d14, d14, d6 - vadd.f64 d15, d15, d7 +.endm - vstm CO2!, { d16, d17, d18 , d19 , d20 , d21 , d22 , d23 } - vstm CO1!, { d8 , d9 , d10 , d11 , d12, d13 , d14 , d15 } +.macro SAVE8x2_BAD + + vldr d0, ALPHA + vldm CO2, { d24, d25, d26 , d27 , d28 , d29 , d30 , d31 } + + vmul.f64 d8 , d0 , d8 + vmul.f64 d9 , d0 , d9 + vmul.f64 d10, d0 , d10 + vmul.f64 d11, d0 , d11 + vmul.f64 d12, d0 , d12 + vmul.f64 d13, d0 , d13 + vmul.f64 d14, d0 , d14 + vmul.f64 d15, d0 , d15 + + vmul.f64 d16, d0 , d16 + vmul.f64 d17, d0 , d17 + vmul.f64 d18, d0 , d18 + vmul.f64 d19, d0 , d19 + vmul.f64 d20, d0 , d20 + vmul.f64 d21, d0 , d21 + vmul.f64 d22, d0 , d22 + vmul.f64 d23, d0 , d23 + + vldm CO1, { d0 , d1 , d2 , d3 , d4 , d5 , d6 , d7 } + + vadd.f64 d16, d16, d24 + vadd.f64 d17, d17, d25 + vadd.f64 d18, d18, d26 + vadd.f64 d19, d19, d27 + + vadd.f64 d20, d20, d28 + vadd.f64 d21, d21, d29 + vadd.f64 d22, d22, d30 + vadd.f64 d23, d23, d31 + + vadd.f64 d8 , d8 , d0 + vadd.f64 d9 , d9 , d1 + vadd.f64 d10, d10, d2 + vadd.f64 d11, d11, d3 + + vadd.f64 d12, d12, d4 + vadd.f64 d13, d13, d5 + vadd.f64 d14, d14, d6 + vadd.f64 d15, d15, d7 + + vstm CO2!, { d16, d17, d18 , d19 , d20 , d21 , d22 , d23 } + vstm CO1!, { d8 , d9 , d10 , d11 , d12, d13 , d14 , d15 } .endm + /*************************************************************************************/ @@ -814,18 +664,18 @@ _L2_M8_20: _L2_M8_22: pld [BO , #B_PRE] - KERNEL8x2_START - KERNEL8x2_M + KERNEL8x2 + KERNEL8x2 pld [BO , #B_PRE] - KERNEL8x2_M - KERNEL8x2_M + KERNEL8x2 + KERNEL8x2 pld [BO , #B_PRE] - KERNEL8x2_M - KERNEL8x2_M + KERNEL8x2 + KERNEL8x2 pld [BO , #B_PRE] - KERNEL8x2_M - KERNEL8x2_END + KERNEL8x2 + KERNEL8x2 subs L, L, #1 bgt _L2_M8_22 From 9965d4800585467b512e2a972f326a0ff133835e Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 29 Sep 2013 18:55:21 +0200 Subject: [PATCH 014/127] added Makefile.arm --- Makefile.arm | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 Makefile.arm diff --git a/Makefile.arm b/Makefile.arm new file mode 100644 index 000000000..05ea9c679 --- /dev/null +++ b/Makefile.arm @@ -0,0 +1,3 @@ +ifdef BINARY64 +else +endif From 22a8fcc4b7d4c6feb433229f75316449b8e97fdb Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 29 Sep 2013 19:42:33 +0200 Subject: [PATCH 015/127] add modified c_check perl program --- c_check | 2 ++ 1 file changed, 2 insertions(+) diff --git a/c_check b/c_check index d5fe59f75..c1cdd59c4 100644 --- a/c_check +++ b/c_check @@ -63,6 +63,7 @@ $architecture = mips64 if ($data =~ /ARCH_MIPS64/); $architecture = alpha if ($data =~ /ARCH_ALPHA/); $architecture = sparc if ($data =~ /ARCH_SPARC/); $architecture = ia64 if ($data =~ /ARCH_IA64/); +$architecture = arm if ($data =~ /ARCH_ARM/); $defined = 0; @@ -149,6 +150,7 @@ $architecture = mips64 if ($data =~ /ARCH_MIPS64/); $architecture = alpha if ($data =~ /ARCH_ALPHA/); $architecture = sparc if ($data =~ /ARCH_SPARC/); 
$architecture = ia64 if ($data =~ /ARCH_IA64/); +$architecture = arm if ($data =~ /ARCH_ARM/); $binformat = bin32; $binformat = bin64 if ($data =~ /BINARY_64/); From 1c63180bb6820fce44ab81b62cc056dd9d35ca9d Mon Sep 17 00:00:00 2001 From: wernsaar Date: Mon, 30 Sep 2013 17:31:23 +0200 Subject: [PATCH 016/127] updated dgemm_kernel_8x2_vfpv3.S --- kernel/arm/dgemm_kernel_8x2_vfpv3.S | 127 ++++++++++++++++++---------- 1 file changed, 83 insertions(+), 44 deletions(-) diff --git a/kernel/arm/dgemm_kernel_8x2_vfpv3.S b/kernel/arm/dgemm_kernel_8x2_vfpv3.S index e4b256832..6c1b0f5fd 100644 --- a/kernel/arm/dgemm_kernel_8x2_vfpv3.S +++ b/kernel/arm/dgemm_kernel_8x2_vfpv3.S @@ -26,28 +26,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2013/09/29 Saar +* 2013/09/30 Saar * BLASTEST : OK * CTEST : OK * TEST : OK * * -* 2013/09/29 Saar +* 2013/09/30 Saar * UNROLL_N 2 * UNROLL_M 8 -* DGEMM_P 128 -* DGEMM_Q 128 -* DGEMM_R 2048 +* DGEMM_P 64 +* DGEMM_Q 64 +* DGEMM_R 512 * A_PRE 192 * B_PRE 32 * C_PRE 64 * * Performance on Odroid U2: * -* 1 Core: 1.48 GFLOPS ATLAS: 1.52 GFLOPS -* 2 Cores: 2.92 GFLOPS ATLAS: - GFLOPS -* 3 Cores: 4.08 GFLOPS ATLAS: - GFLOPS -* 4 Cores: 4.80 GFLOPS ATLAS: 3.80 GFLOPS +* 1 Core: 1.42 GFLOPS ATLAS: 1.58 GFLOPS +* 2 Cores: 2.81 GFLOPS ATLAS: - GFLOPS +* 3 Cores: 4.05 GFLOPS ATLAS: - GFLOPS +* 4 Cores: 5.40 GFLOPS ATLAS: 3.88 GFLOPS **************************************************************************************/ #define ASSEMBLER @@ -128,32 +128,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL8x2 fldmiad BO!, { d24 , d25} - pld [AO , #A_PRE] - fldmiad AO!, { d0, d1 } - + fldd d0, [ AO ] fmacd d8 , d0, d24 - fldmiad AO!, { d2, d3 } - fmacd d9 , d1, d24 - fldmiad AO!, { d4, d5 } + fldd d1, [ AO , #8 ] fmacd d16 , d0, d25 - fldmiad AO!, { d6, d7 } + fldd d2, [ AO , #16 ] + fmacd d9 , d1, d24 fmacd d17 , d1, d25 - + fldd d3, [ AO , #24 ] fmacd d10 , d2, d24 - fmacd d11 , d3, d24 fmacd d18 , d2, d25 + fldd d4, [ AO , #32 ] + fmacd d11 , d3, d24 + pld [AO , #A_PRE] fmacd d19 , d3, d25 - - pld [AO , #A_PRE-32] + fldd d5, [ AO , #40 ] fmacd d12 , d4, d24 - fmacd d13 , d5, d24 fmacd d20 , d4, d25 + fldd d6, [ AO , #48 ] + fmacd d13 , d5, d24 fmacd d21 , d5, d25 + fldd d7, [ AO , #56 ] fmacd d14 , d6, d24 - fmacd d15 , d7, d24 fmacd d22 , d6, d25 + pld [AO , #A_PRE+32] + fmacd d15 , d7, d24 + add AO, AO, #64 fmacd d23 , d7, d25 .endm @@ -161,30 +163,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro SAVE8x2 vldr d0, ALPHA - vldm CO1, { d24, d25, d26 , d27 , d28 , d29 , d30 , d31 } - - vmla.f64 d24, d0 , d8 - vmla.f64 d25, d0 , d9 - vmla.f64 d26, d0 , d10 - vmla.f64 d27, d0 , d11 - vmla.f64 d28, d0 , d12 - vmla.f64 d29, d0 , d13 - vmla.f64 d30, d0 , d14 - vmla.f64 d31, d0 , d15 - - vstm CO1!, { d24, d25, d26 , d27 , d28 , d29 , d30 , d31 } - vldm CO2, { d8 , d9 , d10 , d11 , d12, d13 , d14 , d15 } + + fldd d24, [CO1] + fldd d25, [CO1, #8 ] + + fmacd d24, d0 , d8 + fldd d8 , [CO2] + fldd d26, [CO1, #16] + fmacd d25, d0 , d9 + fldd d9 , [CO2, #8 ] + fldd d27, [CO1, #24] + fmacd d26, d0 , d10 + fldd d10 , [CO2, #16 ] + fldd d28, [CO1, #32] + fmacd d27, d0 , d11 + fldd d11 , [CO2, #24 ] + fldd d29, [CO1, #40] + fmacd d28, d0 , d12 + fldd d12 , [CO2, #32 ] + fldd d30, [CO1, #48] + fmacd d29, d0 , d13 + fldd d13 , [CO2, #40 ] + fldd d31, [CO1, #56] + fmacd d30, d0 , d14 + fldd d14 , [CO2, #48 ] + fmacd d31, d0 , d15 + fldd d15 , [CO2, #56 ] + - vmla.f64 d8 , d0 , d16 - vmla.f64 d9 , d0 , d17 - vmla.f64 d10, d0 , d18 - vmla.f64 d11, d0 , d19 - vmla.f64 d12, d0 , d20 - vmla.f64 d13, d0 , d21 - vmla.f64 d14, d0 , d22 - vmla.f64 d15, d0 , d23 + fmacd d8 , d0 , d16 + fstd d24, [CO1] + fmacd d9 , d0 , d17 + fstd d25, [CO1, #8 ] + fstd d8 , [CO2] + fmacd d10, d0 , d18 + fstd d26, [CO1, #16 ] + fstd d9 , [CO2, #8 ] + fmacd d11, d0 , d19 + fstd d27, [CO1, #24 ] + fstd d10, [CO2, #16 ] + fmacd d12, d0 , d20 + fstd d28, [CO1, #32 ] + fstd d11, [CO2, #24 ] + fmacd d13, d0 , d21 + fstd d29, [CO1, #40 ] + fstd d12, [CO2, #32 ] + fmacd d14, d0 , d22 + fstd d30, [CO1, #48 ] + fstd d13, [CO2, #40 ] + fmacd d15, d0 , d23 + fstd d31, [CO1, #56 ] + fstd d14, [CO2, #48 ] + + add CO1, CO1, #64 + fstd d15, [CO2, #56 ] + add CO2, CO2, #64 - vstm CO2!, { d8 , d9 , d10 , d11 , d12, d13 , d14 , d15 } .endm @@ -643,6 +677,9 @@ _L2_BEGIN: str r3 , C // store C ldr AO, A // AO = A + pld [AO , #A_PRE-96] + pld [AO , #A_PRE-64] + pld [AO , #A_PRE-32] _L2_M8_BEGIN: @@ -653,7 +690,9 @@ _L2_M8_BEGIN: _L2_M8_20: pld [CO1, #C_PRE] + pld [CO1, #C_PRE+32] pld [CO2, #C_PRE] + pld [CO2, #C_PRE+32] INIT8x2 mov BO, BC From 93f1074dd4100835962d10621a7fcd2fe63e5b0e Mon Sep 17 00:00:00 2001 From: wernsaar Date: Mon, 30 Sep 2013 18:03:56 +0200 Subject: [PATCH 017/127] changed some values for arm --- param.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/param.h b/param.h index 79c18f7e2..a2ed09d6f 100644 --- a/param.h +++ b/param.h @@ -1815,17 +1815,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define ZGEMM_DEFAULT_UNROLL_N 2 #define SGEMM_DEFAULT_P 64 -#define DGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 64 #define CGEMM_DEFAULT_P 24 #define ZGEMM_DEFAULT_P 20 #define SGEMM_DEFAULT_Q 192 -#define DGEMM_DEFAULT_Q 128 +#define DGEMM_DEFAULT_Q 64 #define CGEMM_DEFAULT_Q 128 #define ZGEMM_DEFAULT_Q 64 #define SGEMM_DEFAULT_R 512 -#define DGEMM_DEFAULT_R 2048 +#define DGEMM_DEFAULT_R 512 #define CGEMM_DEFAULT_R 512 #define ZGEMM_DEFAULT_R 512 From e0b968c3a73ab797211789def5873d03b08ee9a4 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 5 Oct 2013 12:59:44 +0200 Subject: [PATCH 018/127] Changed kernels for dgemm and dtrmm --- kernel/arm/KERNEL.ARMV7 | 16 +- kernel/arm/dgemm_kernel_4x4_vfpv3.S | 1589 ++++++++++++++++++++++ kernel/arm/dtrmm_kernel_4x4_vfpv3.S | 1953 +++++++++++++++++++++++++++ param.h | 8 +- 4 files changed, 3554 insertions(+), 12 deletions(-) create mode 100644 kernel/arm/dgemm_kernel_4x4_vfpv3.S create mode 100644 kernel/arm/dtrmm_kernel_4x4_vfpv3.S diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index a60701002..8c69ad5cf 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -81,7 +81,7 @@ CGEMVTKERNEL = zgemv_t.c ZGEMVTKERNEL = zgemv_t.c STRMMKERNEL = ../generic/trmmkernel_2x2.c -DTRMMKERNEL = dtrmm_kernel_8x2_vfpv3.S +DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c @@ -93,13 +93,13 @@ SGEMMOTCOPYOBJ = sgemm_otcopy.o #DGEMMKERNEL = ../generic/gemmkernel_2x2.c #DGEMMKERNEL = dgemm_kernel_4x2_vfpv2.S -DGEMMKERNEL = dgemm_kernel_8x2_vfpv3.S -DGEMMINCOPY = ../generic/gemm_ncopy_8.c -DGEMMITCOPY = ../generic/gemm_tcopy_8.c -DGEMMONCOPY = ../generic/gemm_ncopy_2.c -DGEMMOTCOPY = ../generic/gemm_tcopy_2.c -DGEMMINCOPYOBJ = dgemm_incopy.o -DGEMMITCOPYOBJ = dgemm_itcopy.o +DGEMMKERNEL = dgemm_kernel_4x4_vfpv3.S +DGEMMINCOPY = +DGEMMITCOPY = +DGEMMONCOPY = ../generic/gemm_ncopy_4.c +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMINCOPYOBJ = +DGEMMITCOPYOBJ = DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o diff --git a/kernel/arm/dgemm_kernel_4x4_vfpv3.S b/kernel/arm/dgemm_kernel_4x4_vfpv3.S new file mode 100644 index 000000000..4c30e108c --- /dev/null +++ b/kernel/arm/dgemm_kernel_4x4_vfpv3.S @@ -0,0 +1,1589 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/10/05 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* +* 2013/10/03 Saar +* UNROLL_N 4 +* UNROLL_M 4 +* DGEMM_P 128 +* DGEMM_Q 96 +* DGEMM_R 512 +* A_PRE 64 +* B_PRE 64 +* C_PRE 64 +* +* Performance on Odroid U2: +* +* 1 Core: 1.55 GFLOPS ATLAS: 1.59 GFLOPS +* 2 Cores: 3.10 GFLOPS ATLAS: - GFLOPS +* 3 Cores: 4.54 GFLOPS ATLAS: - GFLOPS +* 4 Cores: 5.67 GFLOPS ATLAS: 3.88 GFLOPS +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 252 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA d0 + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define C [fp, #-248 ] +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] +#define A [fp, #-268 ] + +#define ALPHA [fp, #-276 ] + +#define B [fp, #4 ] +#define OLD_C [fp, #8 ] +#define OLD_LDC [fp, #12 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 + +#define A_PRE 64 +#define B_PRE 64 +#define C_PRE 64 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT4x4 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d18, d16 + vmov.f64 d19, d16 + vmov.f64 d20, d16 + vmov.f64 d21, d16 + vmov.f64 d22, d16 + vmov.f64 d23, d16 + vmov.f64 d24, d16 + vmov.f64 d25, d16 + vmov.f64 d26, d16 + vmov.f64 d27, d16 + vmov.f64 d28, d16 + vmov.f64 d29, d16 + vmov.f64 d30, d16 + vmov.f64 d31, d16 + +.endm + +.macro KERNEL4x4_I + pld [ BO , #B_PRE ] + + fldd d8 , [ BO ] + + pld [ AO , #A_PRE ] + fldmiad AO!, { d0 - d1} + + fmuld d16 , d0, d8 + fldmiad AO!, { d2 - d3} + fmuld d17 , d1, d8 + fldd d9 , [ BO, #8 ] + fmuld d18 , d2, d8 + fldd d10, [ BO, #16 ] + fmuld d19 , d3, d8 + + fldd d11, [ BO, #24 ] + fmuld d20 , d0, d9 + fmuld d21 , d1, d9 + add BO , BO, #32 + fmuld d22 , d2, d9 + + fldd d12, [ BO ] + fmuld d23 , d3, d9 + + fmuld d24 , d0, d10 + fldmiad AO!, { d4 - d5 } + fmuld d25 , d1, d10 + fmuld d26 , d2, d10 + fldmiad AO!, { d6 - d7 } + fmuld d27 , d3, d10 + + fldd d13, [ BO, #8 ] + fmuld d28 , d0, d11 + fldd d14, [ BO, #16 ] + fmuld d29 , d1, d11 + fldd d15, [ BO, #24 ] + fmuld d30 , d2, d11 + fmuld d31 , d3, d11 + add BO , BO, #32 + +.endm + + + +.macro KERNEL4x4_S + pld [ BO , #B_PRE ] + + fldd d8 , [ BO ] + + pld [ AO , #A_PRE ] + fldmiad AO!, { d0 - d1} + + fmacd d16 , d0, d8 + fldmiad AO!, { d2 - d3} + fmacd d17 , d1, d8 + fldd d9 , [ BO, #8 ] + fmacd d18 , d2, d8 + 
fldd d10, [ BO, #16 ] + fmacd d19 , d3, d8 + + fldd d11, [ BO, #24 ] + fmacd d20 , d0, d9 + fmacd d21 , d1, d9 + add BO , BO, #32 + fmacd d22 , d2, d9 + + fldd d12, [ BO ] + fmacd d23 , d3, d9 + + fmacd d24 , d0, d10 + fldmiad AO!, { d4 - d5 } + fmacd d25 , d1, d10 + fmacd d26 , d2, d10 + fldmiad AO!, { d6 - d7 } + fmacd d27 , d3, d10 + + fldd d13, [ BO, #8 ] + fmacd d28 , d0, d11 + fldd d14, [ BO, #16 ] + fmacd d29 , d1, d11 + fldd d15, [ BO, #24 ] + fmacd d30 , d2, d11 + fmacd d31 , d3, d11 + add BO , BO, #32 + +.endm + + + +.macro KERNEL4x4_M1 + + fmacd d16 , d4, d12 + pld [ AO , #A_PRE ] + fmacd d17 , d5, d12 + fmacd d18 , d6, d12 + pld [ BO , #B_PRE ] + fmacd d19 , d7, d12 + + fmacd d20 , d4, d13 + fldd d8 , [ BO ] + fmacd d21 , d5, d13 + fmacd d22 , d6, d13 + fldmiad AO!, { d0 - d1 } + fmacd d23 , d7, d13 + + fmacd d24 , d4, d14 + fldmiad AO!, { d2 - d3 } + fmacd d25 , d5, d14 + fldd d9 , [ BO, #8 ] + fmacd d26 , d6, d14 + fldd d10, [ BO, #16 ] + fmacd d27 , d7, d14 + + fldd d11, [ BO, #24 ] + fmacd d28 , d4, d15 + fmacd d29 , d5, d15 + fmacd d30 , d6, d15 + add BO , BO, #32 + fmacd d31 , d7, d15 + +.endm + +.macro KERNEL4x4_M2 + + + fmacd d16 , d0, d8 + pld [ AO , #A_PRE ] + fmacd d17 , d1, d8 + pld [ BO , #B_PRE ] + fmacd d18 , d2, d8 + fldd d12, [ BO ] + fmacd d19 , d3, d8 + + fmacd d20 , d0, d9 + fldmiad AO!, { d4 - d5 } + fmacd d21 , d1, d9 + fmacd d22 , d2, d9 + fldmiad AO!, { d6 - d7 } + fmacd d23 , d3, d9 + + fmacd d24 , d0, d10 + fmacd d25 , d1, d10 + fmacd d26 , d2, d10 + fmacd d27 , d3, d10 + + fldd d13, [ BO, #8 ] + fmacd d28 , d0, d11 + fldd d14, [ BO, #16 ] + fmacd d29 , d1, d11 + fldd d15, [ BO, #24 ] + fmacd d30 , d2, d11 + fmacd d31 , d3, d11 + add BO , BO, #32 + +.endm + + +.macro KERNEL4x4_E + + fmacd d16 , d4, d12 + pld [ AO , #A_PRE ] + fmacd d17 , d5, d12 + fmacd d18 , d6, d12 + pld [ BO , #B_PRE ] + fmacd d19 , d7, d12 + + fmacd d20 , d4, d13 + fmacd d21 , d5, d13 + fmacd d22 , d6, d13 + fmacd d23 , d7, d13 + + fmacd d24 , d4, d14 + fmacd d25 , d5, d14 + fmacd d26 , d6, d14 + fmacd d27 , d7, d14 + + fmacd d28 , d4, d15 + fmacd d29 , d5, d15 + fmacd d30 , d6, d15 + fmacd d31 , d7, d15 + +.endm + + + + +.macro KERNEL4x4_SUB + pld [ BO , #B_PRE ] + pld [ AO , #A_PRE ] + + fldd d8 , [ BO ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fmacd d16 , d0, d8 + fldd d9 , [ BO, #8 ] + fmacd d17 , d1, d8 + fldd d10, [ BO, #16 ] + fmacd d18 , d2, d8 + fldd d11, [ BO, #24 ] + fmacd d19 , d3, d8 + + fmacd d20 , d0, d9 + fmacd d21 , d1, d9 + fmacd d22 , d2, d9 + fmacd d23 , d3, d9 + + fmacd d24 , d0, d10 + fmacd d25 , d1, d10 + fmacd d26 , d2, d10 + fmacd d27 , d3, d10 + + fmacd d28 , d0, d11 + fmacd d29 , d1, d11 + add AO , AO, #32 + fmacd d30 , d2, d11 + add BO , BO, #32 + fmacd d31 , d3, d11 + +.endm + +.macro SAVE4x4 + pld [ CO1 , #C_PRE ] + + ldr r3 , LDC + add CO2 , CO1, r3 + fldd d0, ALPHA + add r4 , CO2, r3 + pld [ CO2 , #C_PRE ] + + fldmiad CO1, { d8 - d11 } + pld [ r4 , #C_PRE ] + + fmacd d8 , d0 , d16 + fldd d12, [CO2] + fmacd d9 , d0 , d17 + fldd d13, [CO2, #8 ] + fmacd d10, d0 , d18 + fldd d14, [CO2, #16 ] + fmacd d11, d0 , d19 + fldd d15, [CO2, #24 ] + + fmacd d12, d0 , d20 + fstd d8 , [CO1] + fmacd d13, d0 , d21 + fstd d9 , [CO1, #8 ] + fmacd d14, d0 , d22 + fstd d10, [CO1, #16 ] + fmacd d15, d0 , d23 + fstd d11, [CO1, #24 ] + + fldmiad r4, { d8 - d11 } + + fmacd d8 , d0 , d24 + fstd d12, [CO2] + fmacd d9 , d0 , d25 + fstd d13, [CO2, #8 ] + fmacd d10, d0 , d26 + fstd d14, [CO2, #16 ] + fmacd d11, d0 , d27 + fstd 
d15, [CO2, #24 ] + + add CO2, r4 , r3 + + pld [ CO2 , #C_PRE ] + + fldmiad CO2, { d12 - d15 } + + fstd d8 , [r4 ] + fmacd d12, d0 , d28 + fstd d9 , [r4 , #8 ] + fmacd d13, d0 , d29 + fstd d10, [r4 , #16 ] + fmacd d14, d0 , d30 + fstd d11, [r4 , #24 ] + fmacd d15, d0 , d31 + + fstmiad CO2, { d12 - d15 } + + add CO1, CO1, #32 + +.endm + +/******************************************************************************/ + +.macro INIT2x4 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d20, d16 + vmov.f64 d21, d16 + vmov.f64 d24, d16 + vmov.f64 d25, d16 + vmov.f64 d28, d16 + vmov.f64 d29, d16 + +.endm + + + +.macro KERNEL2x4_SUB + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + fldd d10, [ BO, #16 ] + fldd d11, [ BO, #24 ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fmacd d16 , d0, d8 + fmacd d17 , d1, d8 + + fmacd d20 , d0, d9 + fmacd d21 , d1, d9 + + fmacd d24 , d0, d10 + fmacd d25 , d1, d10 + + fmacd d28 , d0, d11 + fmacd d29 , d1, d11 + add AO , AO, #16 + add BO , BO, #32 + +.endm + +.macro SAVE2x4 + + ldr r3 , LDC + add CO2 , CO1, r3 + add r4 , CO2, r3 + + fldd d0, ALPHA + + fldd d8 , [CO1] + fldd d9 , [CO1, #8 ] + + fmacd d8 , d0 , d16 + fmacd d9 , d0 , d17 + + fstd d8 , [CO1] + fstd d9 , [CO1, #8 ] + + fldd d12, [CO2] + fldd d13, [CO2, #8 ] + + fmacd d12, d0 , d20 + fmacd d13, d0 , d21 + + fstd d12, [CO2] + fstd d13, [CO2, #8 ] + + fldd d8 , [r4 ] + fldd d9 , [r4 , #8 ] + + fmacd d8 , d0 , d24 + fmacd d9 , d0 , d25 + + fstd d8 , [r4 ] + fstd d9 , [r4 , #8 ] + + add CO2, r4 , r3 + + fldd d12, [CO2] + fldd d13, [CO2, #8 ] + + fmacd d12, d0 , d28 + fmacd d13, d0 , d29 + + fstd d12, [CO2] + fstd d13, [CO2, #8 ] + + add CO1, CO1, #16 + +.endm + + +/******************************************************************************/ + +.macro INIT1x4 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d20, d16 + vmov.f64 d24, d16 + vmov.f64 d28, d16 + +.endm + + + +.macro KERNEL1x4_SUB + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + fldd d10, [ BO, #16 ] + fldd d11, [ BO, #24 ] + + fldd d0 , [ AO ] + + fmacd d16 , d0, d8 + fmacd d20 , d0, d9 + fmacd d24 , d0, d10 + fmacd d28 , d0, d11 + + add AO , AO, #8 + add BO , BO, #32 + +.endm + +.macro SAVE1x4 + + ldr r3 , LDC + add CO2 , CO1, r3 + add r4 , CO2, r3 + + fldd d0, ALPHA + + fldd d8 , [CO1] + fmacd d8 , d0 , d16 + fstd d8 , [CO1] + + fldd d12, [CO2] + fmacd d12, d0 , d20 + fstd d12, [CO2] + + fldd d8 , [r4 ] + fmacd d8 , d0 , d24 + fstd d8 , [r4 ] + + add CO2, r4 , r3 + + fldd d12, [CO2] + fmacd d12, d0 , d28 + fstd d12, [CO2] + + add CO1, CO1, #8 + +.endm + +/******************************************************************************/ +/******************************************************************************/ + +.macro INIT4x2 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d18, d16 + vmov.f64 d19, d16 + vmov.f64 d20, d16 + vmov.f64 d21, d16 + vmov.f64 d22, d16 + vmov.f64 d23, d16 + +.endm + + + +.macro KERNEL4x2_SUB + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fmacd d16 , d0, d8 + fmacd d17 , d1, d8 + fmacd d18 , d2, d8 + fmacd d19 , d3, d8 + + fmacd d20 , d0, d9 + fmacd d21 , d1, d9 + fmacd d22 , d2, d9 + fmacd d23 , d3, d9 + + add AO , AO, #32 + add BO , BO, #16 + +.endm + +.macro SAVE4x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + fldd d0, ALPHA + + fldd d8 , [CO1] + fldd d9 , [CO1, #8 ] + fldd d10, [CO1, #16 ] + fldd d11, [CO1, #24 ] + + fmacd d8 , d0 , d16 + fmacd d9 , d0 , d17 + fmacd d10, d0 , d18 + fmacd d11, d0 , d19 + + fstd 
d8 , [CO1] + fstd d9 , [CO1, #8 ] + fstd d10, [CO1, #16 ] + fstd d11, [CO1, #24 ] + + fldd d12, [CO2] + fldd d13, [CO2, #8 ] + fldd d14, [CO2, #16 ] + fldd d15, [CO2, #24 ] + + fmacd d12, d0 , d20 + fmacd d13, d0 , d21 + fmacd d14, d0 , d22 + fmacd d15, d0 , d23 + + fstd d12, [CO2] + fstd d13, [CO2, #8 ] + fstd d14, [CO2, #16 ] + fstd d15, [CO2, #24 ] + + add CO1, CO1, #32 + +.endm + + +/******************************************************************************/ + +.macro INIT2x2 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d20, d16 + vmov.f64 d21, d16 + +.endm + + + +.macro KERNEL2x2_SUB + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fmacd d16 , d0, d8 + fmacd d17 , d1, d8 + + fmacd d20 , d0, d9 + fmacd d21 , d1, d9 + + add AO , AO, #16 + add BO , BO, #16 + +.endm + +.macro SAVE2x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + fldd d0, ALPHA + + fldd d8 , [CO1] + fldd d9 , [CO1, #8 ] + + fmacd d8 , d0 , d16 + fmacd d9 , d0 , d17 + + fstd d8 , [CO1] + fstd d9 , [CO1, #8 ] + + fldd d12, [CO2] + fldd d13, [CO2, #8 ] + + fmacd d12, d0 , d20 + fmacd d13, d0 , d21 + + fstd d12, [CO2] + fstd d13, [CO2, #8 ] + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + +.macro INIT1x2 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d20, d16 + +.endm + + + +.macro KERNEL1x2_SUB + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fldd d0 , [ AO ] + fmacd d16 , d0, d8 + fmacd d20 , d0, d9 + + add AO , AO, #8 + add BO , BO, #16 + +.endm + +.macro SAVE1x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + fldd d0, ALPHA + + fldd d8 , [CO1] + fmacd d8 , d0 , d16 + fstd d8 , [CO1] + + fldd d12, [CO2] + fmacd d12, d0 , d20 + fstd d12, [CO2] + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ +/******************************************************************************/ + +.macro INIT4x1 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d18, d16 + vmov.f64 d19, d16 + +.endm + + + +.macro KERNEL4x1_SUB + + fldd d8 , [ BO ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fmacd d16 , d0, d8 + fmacd d17 , d1, d8 + fmacd d18 , d2, d8 + fmacd d19 , d3, d8 + + add AO , AO, #32 + add BO , BO, #8 + +.endm + +.macro SAVE4x1 + + + fldd d0, ALPHA + + fldd d8 , [CO1] + fldd d9 , [CO1, #8 ] + fldd d10, [CO1, #16 ] + fldd d11, [CO1, #24 ] + + fmacd d8 , d0 , d16 + fmacd d9 , d0 , d17 + fmacd d10, d0 , d18 + fmacd d11, d0 , d19 + + fstd d8 , [CO1] + fstd d9 , [CO1, #8 ] + fstd d10, [CO1, #16 ] + fstd d11, [CO1, #24 ] + + add CO1, CO1, #32 + +.endm + + + + +/******************************************************************************/ + +.macro INIT2x1 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + +.endm + + + +.macro KERNEL2x1_SUB + + fldd d8 , [ BO ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fmacd d16 , d0, d8 + fmacd d17 , d1, d8 + + add AO , AO, #16 + add BO , BO, #8 + +.endm + +.macro SAVE2x1 + + + fldd d0, ALPHA + + fldd d8 , [CO1] + fldd d9 , [CO1, #8 ] + + fmacd d8 , d0 , d16 + fmacd d9 , d0 , d17 + + fstd d8 , [CO1] + fstd d9 , [CO1, #8 ] + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + +.macro INIT1x1 + + vsub.f64 d16 , d16 , d16 + +.endm + + + +.macro KERNEL1x1_SUB + + fldd d8 , [ BO ] + + fldd d0 , [ AO ] + + fmacd d16 , d0, d8 + + add AO , AO, #8 + add BO , BO, #8 + +.endm + +.macro SAVE1x1 + + + fldd d0, ALPHA + + 
fldd d8 , [CO1] + fmacd d8 , d0 , d16 + fstd d8 , [CO1] + + add CO1, CO1, #8 + +.endm + + + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA, ALPHA + + sub r3, fp, #128 + vstm r3, { d8 - d15} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #3 // ldc = ldc * 8 + str r3, LDC + + ldr r3, OLD_C + str r3, C + + ldr K1, K + ldr BC, B + + ldr J, N + asrs J, J, #2 // J = J / 4 + ble _L2_BEGIN + +_L4_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #2 // LDC * 4 + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + pld [AO , #A_PRE-64] + pld [AO , #A_PRE-32] + + + +_L4_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble _L4_M2_BEGIN + +_L4_M4_20: + + + mov BO, BC + asrs L , K1, #5 // L = L / 32 + ble _L4_M4_40 + .align 5 + + + + KERNEL4x4_I + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_E + + subs L, L, #1 + ble _L4_M4_41 + +_L4_M4_22: + + KERNEL4x4_S + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_E + + subs L, L, #1 + ble _L4_M4_41 + + b _L4_M4_22 + + +_L4_M4_40: + + INIT4x4 + +_L4_M4_41: + + tst K1, #31 + ble _L4_M4_100 + + tst K1, #16 + ble _L4_M4_44 + + KERNEL4x4_S + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_E + +_L4_M4_44: + + ands L , K1, #15 // L = L % 16 + ble _L4_M4_100 + +_L4_M4_46: + + KERNEL4x4_SUB + + subs L, L, #1 + bgt _L4_M4_46 + +_L4_M4_100: + + SAVE4x4 + +_L4_M4_END: + + subs I, I, #1 + bgt _L4_M4_20 + + +_L4_M2_BEGIN: + + ldr I, M + tst I , #3 + ble _L4_END + + tst I, #2 // I = I / 2 + ble _L4_M1_BEGIN + +_L4_M2_20: + + INIT2x4 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble _L4_M2_40 + +_L4_M2_22: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + subs L, L, #1 + bgt _L4_M2_22 + + +_L4_M2_40: + + ands L , K1, #7 // L = L % 8 + ble _L4_M2_100 + +_L4_M2_42: + + KERNEL2x4_SUB + + subs L, L, #1 + bgt _L4_M2_42 + +_L4_M2_100: + + SAVE2x4 + +_L4_M2_END: + + +_L4_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble _L4_END + +_L4_M1_20: + + INIT1x4 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble _L4_M1_40 + +_L4_M1_22: + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB 
+ + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + subs L, L, #1 + bgt _L4_M1_22 + + +_L4_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L4_M1_100 + +_L4_M1_42: + + KERNEL1x4_SUB + + subs L, L, #1 + bgt _L4_M1_42 + +_L4_M1_100: + + SAVE1x4 + + +_L4_END: + + mov r3, BC + mov r4, K1 + lsl r4, r4, #5 // k * 4 * 8 + add r3, r3, r4 // B = B + K * 4 * 8 + mov BC, r3 + + subs J , #1 // j-- + bgt _L4_BEGIN + + + +/*********************************************************************************************/ + +_L2_BEGIN: + + ldr J , N + tst J , #3 + ble _L999 + + tst J , #2 + ble _L1_BEGIN + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + //pld [AO , #A_PRE-96] + //pld [AO , #A_PRE-64] + //pld [AO , #A_PRE-32] + + + +_L2_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble _L2_M2_BEGIN + +_L2_M4_20: + + INIT4x2 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble _L2_M4_40 + .align 5 + +_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs L, L, #1 + bgt _L2_M4_22 + + +_L2_M4_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M4_100 + +_L2_M4_42: + + KERNEL4x2_SUB + + subs L, L, #1 + bgt _L2_M4_42 + +_L2_M4_100: + + SAVE4x2 + +_L2_M4_END: + + subs I, I, #1 + bgt _L2_M4_20 + + +_L2_M2_BEGIN: + + ldr I, M + tst I , #3 + ble _L2_END + + tst I, #2 // I = I / 2 + ble _L2_M1_BEGIN + +_L2_M2_20: + + INIT2x2 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble _L2_M2_40 + +_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs L, L, #1 + bgt _L2_M2_22 + + +_L2_M2_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M2_100 + +_L2_M2_42: + + KERNEL2x2_SUB + + subs L, L, #1 + bgt _L2_M2_42 + +_L2_M2_100: + + SAVE2x2 + +_L2_M2_END: + + +_L2_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble _L2_END + +_L2_M1_20: + + INIT1x2 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble _L2_M1_40 + +_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_22 + + +_L2_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M1_100 + +_L2_M1_42: + + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_42 + +_L2_M1_100: + + SAVE1x2 + + +_L2_END: + + mov r3, BC + mov r4, K1 + lsl r4, r4, #4 // k * 2 * 8 + add r3, r3, r4 // B = B + K * 2 * 8 + mov BC, r3 + +/*********************************************************************************************/ + +_L1_BEGIN: + + ldr J , N + tst J , #1 + ble _L999 + + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + //pld [AO , #A_PRE-96] + //pld [AO , #A_PRE-64] + //pld [AO , #A_PRE-32] + + + +_L1_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble _L1_M2_BEGIN + +_L1_M4_20: + + INIT4x1 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble _L1_M4_40 + .align 5 + +_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs L, L, #1 + bgt _L1_M4_22 + + +_L1_M4_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M4_100 + +_L1_M4_42: + + KERNEL4x1_SUB + + subs L, L, #1 + bgt _L1_M4_42 + +_L1_M4_100: + + SAVE4x1 + +_L1_M4_END: + + subs I, I, #1 + bgt _L1_M4_20 + + +_L1_M2_BEGIN: + + ldr I, M + tst I , #3 + ble _L1_END + + tst I, #2 // I = I / 2 + ble _L1_M1_BEGIN + 
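+/**************************************************************************************
+* Tail of the single-column panel (entered when N is odd): after the 4-row tiles,
+* a 2-row and then a 1-row tile finish off M. A C sketch of the M decomposition
+* (illustration only; the kernel_*x1 names are placeholders for the INIT/KERNEL/
+* SAVE macro sequences defined above):
+*
+*   for (i = 0; i < m >> 2; i++) kernel_4x1();   // _L1_M4
+*   if (m & 2)                   kernel_2x1();   // _L1_M2
+*   if (m & 1)                   kernel_1x1();   // _L1_M1
+**************************************************************************************/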
+_L1_M2_20: + + INIT2x1 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble _L1_M2_40 + +_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs L, L, #1 + bgt _L1_M2_22 + + +_L1_M2_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M2_100 + +_L1_M2_42: + + KERNEL2x1_SUB + + subs L, L, #1 + bgt _L1_M2_42 + +_L1_M2_100: + + SAVE2x1 + +_L1_M2_END: + + +_L1_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble _L1_END + +_L1_M1_20: + + INIT1x1 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble _L1_M1_40 + +_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_22 + + +_L1_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M1_100 + +_L1_M1_42: + + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_42 + +_L1_M1_100: + + SAVE1x1 + + +_L1_END: + + +_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/dtrmm_kernel_4x4_vfpv3.S b/kernel/arm/dtrmm_kernel_4x4_vfpv3.S new file mode 100644 index 000000000..a80177f8b --- /dev/null +++ b/kernel/arm/dtrmm_kernel_4x4_vfpv3.S @@ -0,0 +1,1953 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/10/05 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 252 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA d0 + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define KK [fp, #-240 ] +#define KKK [fp, #-244] +#define C [fp, #-248 ] +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] +#define A [fp, #-268 ] + +#define ALPHA [fp, #-276 ] + +#define B [fp, #4 ] +#define OLD_C [fp, #8 ] +#define OLD_LDC [fp, #12 ] +#define OFFSET [fp, #16 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 + +#define A_PRE 64 +#define B_PRE 64 +#define C_PRE 64 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT4x4 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d18, d16 + vmov.f64 d19, d16 + vmov.f64 d20, d16 + vmov.f64 d21, d16 + vmov.f64 d22, d16 + vmov.f64 d23, d16 + vmov.f64 d24, d16 + vmov.f64 d25, d16 + vmov.f64 d26, d16 + vmov.f64 d27, d16 + vmov.f64 d28, d16 + vmov.f64 d29, d16 + vmov.f64 d30, d16 + vmov.f64 d31, d16 + +.endm + +.macro KERNEL4x4_I + pld [ BO , #B_PRE ] + + fldd d8 , [ BO ] + + pld [ AO , #A_PRE ] + fldmiad AO!, { d0 - d1} + + fmuld d16 , d0, d8 + fldmiad AO!, { d2 - d3} + fmuld d17 , d1, d8 + fldd d9 , [ BO, #8 ] + fmuld d18 , d2, d8 + fldd d10, [ BO, #16 ] + fmuld d19 , d3, d8 + + fldd d11, [ BO, #24 ] + fmuld d20 , d0, d9 + fmuld d21 , d1, d9 + add BO , BO, #32 + fmuld d22 , d2, d9 + + fldd d12, [ BO ] + fmuld d23 , d3, d9 + + fmuld d24 , d0, d10 + fldmiad AO!, { d4 - d5 } + fmuld d25 , d1, d10 + fmuld d26 , d2, d10 + fldmiad AO!, { d6 - d7 } + fmuld d27 , d3, d10 + + fldd d13, [ BO, #8 ] + fmuld d28 , d0, d11 + fldd d14, [ BO, #16 ] + fmuld d29 , d1, d11 + fldd d15, [ BO, #24 ] + fmuld d30 , d2, d11 + fmuld d31 , d3, d11 + add BO , BO, #32 + +.endm + + + +.macro KERNEL4x4_S + pld [ BO , #B_PRE ] + + fldd d8 , [ BO ] + + pld [ AO , #A_PRE ] + fldmiad AO!, { d0 - d1} + + fmacd d16 , d0, d8 + fldmiad AO!, { d2 - d3} + fmacd d17 , d1, d8 + fldd d9 , [ BO, #8 ] + fmacd d18 , d2, d8 + fldd d10, [ BO, #16 ] + fmacd d19 , d3, d8 + + fldd d11, [ BO, #24 ] + fmacd d20 , d0, d9 + fmacd d21 , d1, d9 + add BO , BO, #32 + fmacd d22 , d2, d9 + + fldd d12, [ BO ] + fmacd d23 , d3, d9 + + fmacd d24 , d0, d10 + fldmiad AO!, { d4 - d5 } + fmacd d25 , d1, d10 + fmacd d26 , d2, d10 + fldmiad AO!, { d6 - d7 } + fmacd d27 , d3, d10 + + fldd d13, [ BO, #8 ] + fmacd d28 , d0, d11 + fldd d14, [ BO, #16 ] + fmacd d29 , d1, d11 + fldd d15, [ BO, #24 ] + fmacd d30 , d2, d11 + fmacd d31 , d3, d11 + add BO , BO, #32 + +.endm + + + +.macro KERNEL4x4_M1 + + fmacd d16 , d4, d12 + pld [ AO , #A_PRE ] + fmacd d17 , d5, d12 + fmacd d18 , d6, d12 + pld [ BO , #B_PRE ] + fmacd d19 , d7, d12 + + fmacd d20 , d4, d13 + fldd d8 , [ BO ] + fmacd d21 , d5, d13 + fmacd d22 , d6, d13 
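+	// M1 computes on d4 - d7 / d12 - d15 while preloading d0 - d3 and d8 - d11 for KERNEL4x4_M2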
+ fldmiad AO!, { d0 - d1 } + fmacd d23 , d7, d13 + + fmacd d24 , d4, d14 + fldmiad AO!, { d2 - d3 } + fmacd d25 , d5, d14 + fldd d9 , [ BO, #8 ] + fmacd d26 , d6, d14 + fldd d10, [ BO, #16 ] + fmacd d27 , d7, d14 + + fldd d11, [ BO, #24 ] + fmacd d28 , d4, d15 + fmacd d29 , d5, d15 + fmacd d30 , d6, d15 + add BO , BO, #32 + fmacd d31 , d7, d15 + +.endm + +.macro KERNEL4x4_M2 + + + fmacd d16 , d0, d8 + pld [ AO , #A_PRE ] + fmacd d17 , d1, d8 + pld [ BO , #B_PRE ] + fmacd d18 , d2, d8 + fldd d12, [ BO ] + fmacd d19 , d3, d8 + + fmacd d20 , d0, d9 + fldmiad AO!, { d4 - d5 } + fmacd d21 , d1, d9 + fmacd d22 , d2, d9 + fldmiad AO!, { d6 - d7 } + fmacd d23 , d3, d9 + + fmacd d24 , d0, d10 + fmacd d25 , d1, d10 + fmacd d26 , d2, d10 + fmacd d27 , d3, d10 + + fldd d13, [ BO, #8 ] + fmacd d28 , d0, d11 + fldd d14, [ BO, #16 ] + fmacd d29 , d1, d11 + fldd d15, [ BO, #24 ] + fmacd d30 , d2, d11 + fmacd d31 , d3, d11 + add BO , BO, #32 + +.endm + + +.macro KERNEL4x4_E + + fmacd d16 , d4, d12 + pld [ AO , #A_PRE ] + fmacd d17 , d5, d12 + fmacd d18 , d6, d12 + pld [ BO , #B_PRE ] + fmacd d19 , d7, d12 + + fmacd d20 , d4, d13 + fmacd d21 , d5, d13 + fmacd d22 , d6, d13 + fmacd d23 , d7, d13 + + fmacd d24 , d4, d14 + fmacd d25 , d5, d14 + fmacd d26 , d6, d14 + fmacd d27 , d7, d14 + + fmacd d28 , d4, d15 + fmacd d29 , d5, d15 + fmacd d30 , d6, d15 + fmacd d31 , d7, d15 + +.endm + + + + +.macro KERNEL4x4_SUB + pld [ BO , #B_PRE ] + pld [ AO , #A_PRE ] + + fldd d8 , [ BO ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fmacd d16 , d0, d8 + fldd d9 , [ BO, #8 ] + fmacd d17 , d1, d8 + fldd d10, [ BO, #16 ] + fmacd d18 , d2, d8 + fldd d11, [ BO, #24 ] + fmacd d19 , d3, d8 + + fmacd d20 , d0, d9 + fmacd d21 , d1, d9 + fmacd d22 , d2, d9 + fmacd d23 , d3, d9 + + fmacd d24 , d0, d10 + fmacd d25 , d1, d10 + fmacd d26 , d2, d10 + fmacd d27 , d3, d10 + + fmacd d28 , d0, d11 + fmacd d29 , d1, d11 + add AO , AO, #32 + fmacd d30 , d2, d11 + add BO , BO, #32 + fmacd d31 , d3, d11 + +.endm + +.macro SAVE4x4 + + ldr r3 , LDC + add CO2 , CO1, r3 + fldd d0, ALPHA + add r4 , CO2, r3 + + + fmuld d8 , d0 , d16 + fmuld d9 , d0 , d17 + fmuld d10, d0 , d18 + fmuld d11, d0 , d19 + + fmuld d12, d0 , d20 + fstd d8 , [CO1] + fmuld d13, d0 , d21 + fstd d9 , [CO1, #8 ] + fmuld d14, d0 , d22 + fstd d10, [CO1, #16 ] + fmuld d15, d0 , d23 + fstd d11, [CO1, #24 ] + + + fmuld d8 , d0 , d24 + fstd d12, [CO2] + fmuld d9 , d0 , d25 + fstd d13, [CO2, #8 ] + fmuld d10, d0 , d26 + fstd d14, [CO2, #16 ] + fmuld d11, d0 , d27 + fstd d15, [CO2, #24 ] + + add CO2, r4 , r3 + + fstd d8 , [r4 ] + fmuld d12, d0 , d28 + fstd d9 , [r4 , #8 ] + fmuld d13, d0 , d29 + fstd d10, [r4 , #16 ] + fmuld d14, d0 , d30 + fstd d11, [r4 , #24 ] + fmuld d15, d0 , d31 + + fstmiad CO2, { d12 - d15 } + + add CO1, CO1, #32 + +.endm + +/******************************************************************************/ + +.macro INIT2x4 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d20, d16 + vmov.f64 d21, d16 + vmov.f64 d24, d16 + vmov.f64 d25, d16 + vmov.f64 d28, d16 + vmov.f64 d29, d16 + +.endm + + + +.macro KERNEL2x4_SUB + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + fldd d10, [ BO, #16 ] + fldd d11, [ BO, #24 ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fmacd d16 , d0, d8 + fmacd d17 , d1, d8 + + fmacd d20 , d0, d9 + fmacd d21 , d1, d9 + + fmacd d24 , d0, d10 + fmacd d25 , d1, d10 + + fmacd d28 , d0, d11 + fmacd d29 , d1, d11 + add AO , AO, #16 + add BO , BO, #32 + +.endm + +.macro SAVE2x4 + + ldr r3 , LDC + 
add CO2 , CO1, r3 + add r4 , CO2, r3 + + fldd d0, ALPHA + + fmuld d8 , d0 , d16 + fmuld d9 , d0 , d17 + + fstd d8 , [CO1] + fstd d9 , [CO1, #8 ] + + fmuld d12, d0 , d20 + fmuld d13, d0 , d21 + + fstd d12, [CO2] + fstd d13, [CO2, #8 ] + + fmuld d8 , d0 , d24 + fmuld d9 , d0 , d25 + + fstd d8 , [r4 ] + fstd d9 , [r4 , #8 ] + + add CO2, r4 , r3 + + fmuld d12, d0 , d28 + fmuld d13, d0 , d29 + + fstd d12, [CO2] + fstd d13, [CO2, #8 ] + + add CO1, CO1, #16 + +.endm + + +/******************************************************************************/ + +.macro INIT1x4 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d20, d16 + vmov.f64 d24, d16 + vmov.f64 d28, d16 + +.endm + + + +.macro KERNEL1x4_SUB + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + fldd d10, [ BO, #16 ] + fldd d11, [ BO, #24 ] + + fldd d0 , [ AO ] + + fmacd d16 , d0, d8 + fmacd d20 , d0, d9 + fmacd d24 , d0, d10 + fmacd d28 , d0, d11 + + add AO , AO, #8 + add BO , BO, #32 + +.endm + +.macro SAVE1x4 + + ldr r3 , LDC + add CO2 , CO1, r3 + add r4 , CO2, r3 + + fldd d0, ALPHA + + fmuld d8 , d0 , d16 + fstd d8 , [CO1] + + fmuld d12, d0 , d20 + fstd d12, [CO2] + + fmuld d8 , d0 , d24 + fstd d8 , [r4 ] + + add CO2, r4 , r3 + + fmuld d12, d0 , d28 + fstd d12, [CO2] + + add CO1, CO1, #8 + +.endm + +/******************************************************************************/ +/******************************************************************************/ + +.macro INIT4x2 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d18, d16 + vmov.f64 d19, d16 + vmov.f64 d20, d16 + vmov.f64 d21, d16 + vmov.f64 d22, d16 + vmov.f64 d23, d16 + +.endm + + + +.macro KERNEL4x2_SUB + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fmacd d16 , d0, d8 + fmacd d17 , d1, d8 + fmacd d18 , d2, d8 + fmacd d19 , d3, d8 + + fmacd d20 , d0, d9 + fmacd d21 , d1, d9 + fmacd d22 , d2, d9 + fmacd d23 , d3, d9 + + add AO , AO, #32 + add BO , BO, #16 + +.endm + +.macro SAVE4x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + fldd d0, ALPHA + + fmuld d8 , d0 , d16 + fmuld d9 , d0 , d17 + fmuld d10, d0 , d18 + fmuld d11, d0 , d19 + + fstd d8 , [CO1] + fstd d9 , [CO1, #8 ] + fstd d10, [CO1, #16 ] + fstd d11, [CO1, #24 ] + + fmuld d12, d0 , d20 + fmuld d13, d0 , d21 + fmuld d14, d0 , d22 + fmuld d15, d0 , d23 + + fstd d12, [CO2] + fstd d13, [CO2, #8 ] + fstd d14, [CO2, #16 ] + fstd d15, [CO2, #24 ] + + add CO1, CO1, #32 + +.endm + + +/******************************************************************************/ + +.macro INIT2x2 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d20, d16 + vmov.f64 d21, d16 + +.endm + + + +.macro KERNEL2x2_SUB + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fmacd d16 , d0, d8 + fmacd d17 , d1, d8 + + fmacd d20 , d0, d9 + fmacd d21 , d1, d9 + + add AO , AO, #16 + add BO , BO, #16 + +.endm + +.macro SAVE2x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + fldd d0, ALPHA + + fmuld d8 , d0 , d16 + fmuld d9 , d0 , d17 + + fstd d8 , [CO1] + fstd d9 , [CO1, #8 ] + + fmuld d12, d0 , d20 + fmuld d13, d0 , d21 + + fstd d12, [CO2] + fstd d13, [CO2, #8 ] + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + +.macro INIT1x2 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d20, d16 + +.endm + + + +.macro KERNEL1x2_SUB + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fldd d0 , [ AO ] + fmacd d16 , d0, d8 + fmacd d20 , d0, d9 + + add AO , AO, #8 + add BO , BO, #16 + +.endm + 
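+/**************************************************************************************
+* Note on the SAVE macros in this file: a TRMM kernel overwrites its result block,
+* so C is never loaded; every SAVE uses fmuld (c = alpha * acc), where the GEMM
+* kernel in dgemm_kernel_4x4_vfpv3.S loads C first and uses fmacd (c += alpha * acc).
+* As a C sketch (illustration only):
+*
+*   c[j * ldc + i] = alpha * acc[i][j];    // TRMM SAVE (this file)
+*   c[j * ldc + i] += alpha * acc[i][j];   // GEMM SAVE (dgemm_kernel_4x4_vfpv3.S)
+**************************************************************************************/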
+.macro SAVE1x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + fldd d0, ALPHA + + fmuld d8 , d0 , d16 + fstd d8 , [CO1] + + fmuld d12, d0 , d20 + fstd d12, [CO2] + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ +/******************************************************************************/ + +.macro INIT4x1 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d18, d16 + vmov.f64 d19, d16 + +.endm + + + +.macro KERNEL4x1_SUB + + fldd d8 , [ BO ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fmacd d16 , d0, d8 + fmacd d17 , d1, d8 + fmacd d18 , d2, d8 + fmacd d19 , d3, d8 + + add AO , AO, #32 + add BO , BO, #8 + +.endm + +.macro SAVE4x1 + + + fldd d0, ALPHA + + fmuld d8 , d0 , d16 + fmuld d9 , d0 , d17 + fmuld d10, d0 , d18 + fmuld d11, d0 , d19 + + fstd d8 , [CO1] + fstd d9 , [CO1, #8 ] + fstd d10, [CO1, #16 ] + fstd d11, [CO1, #24 ] + + add CO1, CO1, #32 + +.endm + + + + +/******************************************************************************/ + +.macro INIT2x1 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + +.endm + + + +.macro KERNEL2x1_SUB + + fldd d8 , [ BO ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fmacd d16 , d0, d8 + fmacd d17 , d1, d8 + + add AO , AO, #16 + add BO , BO, #8 + +.endm + +.macro SAVE2x1 + + + fldd d0, ALPHA + + fmuld d8 , d0 , d16 + fmuld d9 , d0 , d17 + + fstd d8 , [CO1] + fstd d9 , [CO1, #8 ] + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + +.macro INIT1x1 + + vsub.f64 d16 , d16 , d16 + +.endm + + + +.macro KERNEL1x1_SUB + + fldd d8 , [ BO ] + + fldd d0 , [ AO ] + + fmacd d16 , d0, d8 + + add AO , AO, #8 + add BO , BO, #8 + +.endm + +.macro SAVE1x1 + + + fldd d0, ALPHA + + fmuld d8 , d0 , d16 + fstd d8 , [CO1] + + add CO1, CO1, #8 + +.endm + + + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA, ALPHA + + sub r3, fp, #128 + vstm r3, { d8 - d15} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #3 // ldc = ldc * 8 + str r3, LDC + + ldr r3, OLD_C + str r3, C + + ldr BC, B + + ldr r3, OFFSET +#ifndef LEFT + neg r3 , r3 +#endif + str r3 , KK + + ldr J, N + asrs J, J, #2 // J = J / 4 + ble _L2_BEGIN + +_L4_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #2 // LDC * 4 + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + + ldr AO, A // AO = A + pld [AO , #A_PRE-64] + pld [AO , #A_PRE-32] + + + +_L4_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble _L4_M2_BEGIN + +_L4_M4_20: + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #5 // 4 double values + add BO , BO , r4 + lsls r4 , r3 , #5 // 4 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #4 // number of values in AO +#else + add L , L , #4 // number of values in BO +#endif + str L 
, KKK +#endif + + mov K1, L + asrs L , K1, #5 // L = L / 8 + ble _L4_M4_40 + .align 5 + + + + KERNEL4x4_I + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_E + + subs L, L, #1 + ble _L4_M4_41 + +_L4_M4_22: + + KERNEL4x4_S + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_E + + subs L, L, #1 + ble _L4_M4_41 + + b _L4_M4_22 + + +_L4_M4_40: + + INIT4x4 + +_L4_M4_41: + + ands L , K1, #31 // L = L % 8 + ble _L4_M4_100 + +_L4_M4_42: + + KERNEL4x4_SUB + + subs L, L, #1 + bgt _L4_M4_42 + +_L4_M4_100: + + SAVE4x4 + + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #5 // 4 double values + add BO , BO , r4 + lsls r4 , r3 , #5 // 4 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #4 // number of values in AO + str r3 , KK +#endif + + +_L4_M4_END: + + subs I, I, #1 + bgt _L4_M4_20 + + +_L4_M2_BEGIN: + + ldr I, M + tst I , #3 + ble _L4_END + + tst I, #2 // I = I / 2 + ble _L4_M1_BEGIN + +_L4_M2_20: + + INIT2x4 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #5 // 4 double values + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #2 // number of values in AO +#else + add L , L , #4 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L4_M2_40 + +_L4_M2_22: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + subs L, L, #1 + bgt _L4_M2_22 + + +_L4_M2_40: + + ands L , K1, #7 // L = L % 8 + ble _L4_M2_100 + +_L4_M2_42: + + KERNEL2x4_SUB + + subs L, L, #1 + bgt _L4_M2_42 + +_L4_M2_100: + + SAVE2x4 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #5 // 4 double values + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + +_L4_M2_END: + + +_L4_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble _L4_END + +_L4_M1_20: + + INIT1x4 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #5 // 4 double values + add BO , BO , r4 + 
lsls r4 , r3 , #3 // 1 double value + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #1 // number of values in AO +#else + add L , L , #4 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L4_M1_40 + +_L4_M1_22: + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + subs L, L, #1 + bgt _L4_M1_22 + + +_L4_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L4_M1_100 + +_L4_M1_42: + + KERNEL1x4_SUB + + subs L, L, #1 + bgt _L4_M1_42 + +_L4_M1_100: + + SAVE1x4 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #5 // 4 double values + add BO , BO , r4 + lsls r4 , r3 , #3 // 1 double value + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #1 // number of values in AO + str r3 , KK +#endif + + + +_L4_END: + + mov r3, BC + ldr r4, K + lsl r4, r4, #5 // k * 4 * 8 + add r3, r3, r4 // B = B + K * 4 * 8 + mov BC, r3 + +#if !defined(LEFT) + ldr r3 , KK + add r3 , r3 , #4 // number of values in BO + str r3 , KK +#endif + + subs J , #1 // j-- + bgt _L4_BEGIN + + + +/*********************************************************************************************/ + +_L2_BEGIN: + + ldr J , N + tst J , #3 + ble _L999 + + tst J , #2 + ble _L1_BEGIN + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + ldr AO, A // AO = A + //pld [AO , #A_PRE-96] + //pld [AO , #A_PRE-64] + //pld [AO , #A_PRE-32] + + + +_L2_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble _L2_M2_BEGIN + +_L2_M4_20: + + INIT4x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 2 double values + add BO , BO , r4 + lsls r4 , r3 , #5 // 4 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #4 // number of values in AO +#else + add L , L , #2 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L2_M4_40 + .align 5 + +_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs L, L, #1 + bgt _L2_M4_22 + + +_L2_M4_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M4_100 + +_L2_M4_42: + + KERNEL4x2_SUB + + subs L, L, #1 + bgt _L2_M4_42 + +_L2_M4_100: + + SAVE4x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 2 double values + add BO , BO , r4 + lsls r4 , r3 , #5 // 4 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #4 // number of values in AO + str r3 , KK +#endif + + + +_L2_M4_END: + + subs I, I, #1 + bgt _L2_M4_20 + + +_L2_M2_BEGIN: + + ldr I, M + tst I , #3 + ble _L2_END + + tst I, #2 // I = I / 2 + ble _L2_M1_BEGIN + +_L2_M2_20: + + INIT2x2 + +#if (defined(LEFT) && 
defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 2 double values + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #2 // number of values in AO +#else + add L , L , #2 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L2_M2_40 + +_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs L, L, #1 + bgt _L2_M2_22 + + +_L2_M2_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M2_100 + +_L2_M2_42: + + KERNEL2x2_SUB + + subs L, L, #1 + bgt _L2_M2_42 + +_L2_M2_100: + + SAVE2x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 2 double values + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + + +_L2_M2_END: + + +_L2_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble _L2_END + +_L2_M1_20: + + INIT1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 2 double values + add BO , BO , r4 + lsls r4 , r3 , #3 // 1 double value + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #1 // number of values in AO +#else + add L , L , #2 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L2_M1_40 + +_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_22 + + +_L2_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M1_100 + +_L2_M1_42: + + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_42 + +_L2_M1_100: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 2 double values + add BO , BO , r4 + lsls r4 , r3 , #3 // 1 double value + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #1 // number of values in AO + str r3 , KK +#endif + + + + +_L2_END: + + mov r3, BC + ldr r4, K + lsl r4, r4, #4 // k * 2 * 8 + add r3, r3, r4 // B = B + K * 2 * 8 + mov BC, r3 + +#if !defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in BO + str r3 , KK +#endif + + +/*********************************************************************************************/ + +_L1_BEGIN: + + ldr J , N + tst J , #1 + ble _L999 + + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + ldr AO, A // AO = A + //pld [AO , #A_PRE-96] + //pld [AO , #A_PRE-64] + //pld [AO , #A_PRE-32] + + + +_L1_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble _L1_M2_BEGIN + +_L1_M4_20: + + INIT4x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ 
+ (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 1 double value + add BO , BO , r4 + lsls r4 , r3 , #5 // 4 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #4 // number of values in AO +#else + add L , L , #1 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L1_M4_40 + .align 5 + +_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs L, L, #1 + bgt _L1_M4_22 + + +_L1_M4_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M4_100 + +_L1_M4_42: + + KERNEL4x1_SUB + + subs L, L, #1 + bgt _L1_M4_42 + +_L1_M4_100: + + SAVE4x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #3 // 1 double value + add BO , BO , r4 + lsls r4 , r3 , #5 // 4 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #4 // number of values in AO + str r3 , KK +#endif + + + +_L1_M4_END: + + subs I, I, #1 + bgt _L1_M4_20 + + +_L1_M2_BEGIN: + + ldr I, M + tst I , #3 + ble _L1_END + + tst I, #2 // I = I / 2 + ble _L1_M1_BEGIN + +_L1_M2_20: + + INIT2x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 1 double value + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #2 // number of values in AO +#else + add L , L , #1 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L1_M2_40 + +_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs L, L, #1 + bgt _L1_M2_22 + + +_L1_M2_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M2_100 + +_L1_M2_42: + + KERNEL2x1_SUB + + subs L, L, #1 + bgt _L1_M2_42 + +_L1_M2_100: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #3 // 1 double value + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + + +_L1_M2_END: + + +_L1_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble _L1_END + +_L1_M1_20: + + INIT1x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 1 double value + add BO , BO , r4 + lsls r4 , r3 , #3 // 1 double value + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #1 // number of values in AO +#else + add L , L , #1 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , 
K1, #3 // L = L / 8 + ble _L1_M1_40 + +_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_22 + + +_L1_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M1_100 + +_L1_M1_42: + + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_42 + +_L1_M1_100: + + SAVE1x1 + + +_L1_END: + + +_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/param.h b/param.h index a2ed09d6f..cf1665438 100644 --- a/param.h +++ b/param.h @@ -1805,8 +1805,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SGEMM_DEFAULT_UNROLL_M 2 #define SGEMM_DEFAULT_UNROLL_N 2 -#define DGEMM_DEFAULT_UNROLL_M 8 -#define DGEMM_DEFAULT_UNROLL_N 2 +#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_N 4 #define CGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_N 2 @@ -1815,12 +1815,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM_DEFAULT_UNROLL_N 2 #define SGEMM_DEFAULT_P 64 -#define DGEMM_DEFAULT_P 64 +#define DGEMM_DEFAULT_P 128 #define CGEMM_DEFAULT_P 24 #define ZGEMM_DEFAULT_P 20 #define SGEMM_DEFAULT_Q 192 -#define DGEMM_DEFAULT_Q 64 +#define DGEMM_DEFAULT_Q 96 #define CGEMM_DEFAULT_Q 128 #define ZGEMM_DEFAULT_Q 64 From 31f51e78bc4a0c69ab5a1f968eaa43f403288edd Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 12 Oct 2013 09:42:18 +0200 Subject: [PATCH 019/127] minor optimizations on dgemm_kernel --- kernel/arm/dgemm_kernel_4x4_vfpv3.S | 263 ++++++++++------------------ 1 file changed, 96 insertions(+), 167 deletions(-) diff --git a/kernel/arm/dgemm_kernel_4x4_vfpv3.S b/kernel/arm/dgemm_kernel_4x4_vfpv3.S index 4c30e108c..dfe3e3634 100644 --- a/kernel/arm/dgemm_kernel_4x4_vfpv3.S +++ b/kernel/arm/dgemm_kernel_4x4_vfpv3.S @@ -26,34 +26,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2013/10/05 Saar +* 2013/10/11 Saar * BLASTEST : OK * CTEST : OK * TEST : OK * * -* 2013/10/03 Saar +* 2013/10/11 Saar * UNROLL_N 4 * UNROLL_M 4 * DGEMM_P 128 * DGEMM_Q 96 * DGEMM_R 512 -* A_PRE 64 -* B_PRE 64 +* A_PRE 96 +* B_PRE 96 * C_PRE 64 * * Performance on Odroid U2: * -* 1 Core: 1.55 GFLOPS ATLAS: 1.59 GFLOPS -* 2 Cores: 3.10 GFLOPS ATLAS: - GFLOPS -* 3 Cores: 4.54 GFLOPS ATLAS: - GFLOPS -* 4 Cores: 5.67 GFLOPS ATLAS: 3.88 GFLOPS +* 1 Core: 1.57 GFLOPS ATLAS: 1.59 GFLOPS +* 2 Cores: 3.14 GFLOPS ATLAS: 3.16 GFLOPS +* 3 Cores: 4.56 GFLOPS ATLAS: 4.60 GFLOPS +* 4 Cores: 5.82 GFLOPS ATLAS: 5.41 GFLOPS **************************************************************************************/ #define ASSEMBLER #include "common.h" -#define STACKSIZE 252 +#define STACKSIZE 256 #define OLD_M r0 #define OLD_N r1 @@ -67,17 +67,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
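
[Annotation, not part of the committed patch: the hunk below reworks the stack frame rather than the arithmetic. C is no longer copied through a local slot (the prologue's OLD_C load is deleted further down; C is read directly from the caller's argument at [fp, #8]), ALPHA moves from fp-276 to fp-280, presumably so the double-precision scalar sits on an 8-byte boundary, and STACKSIZE is rounded up from 252 to 256 accordingly. The prefetch distances A_PRE/B_PRE grow from 64 to 96 bytes, matching the deeper load/FMA interleave of the rescheduled KERNEL4x4 macros that follow.]
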
* registers *******************************************************/ -#define C [fp, #-248 ] #define LDC [fp, #-252 ] #define M [fp, #-256 ] #define N [fp, #-260 ] #define K [fp, #-264 ] #define A [fp, #-268 ] -#define ALPHA [fp, #-276 ] +#define ALPHA [fp, #-280] #define B [fp, #4 ] -#define OLD_C [fp, #8 ] +#define C [fp, #8 ] #define OLD_LDC [fp, #12 ] #define I r0 @@ -93,8 +92,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define K1 r7 #define BC r12 -#define A_PRE 64 -#define B_PRE 64 +#define A_PRE 96 +#define B_PRE 96 #define C_PRE 64 /************************************************************************************** @@ -124,34 +123,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x4_I pld [ BO , #B_PRE ] - fldd d8 , [ BO ] - + fldd d0 , [ AO ] pld [ AO , #A_PRE ] - fldmiad AO!, { d0 - d1} + fldd d1 , [ AO, #8 ] fmuld d16 , d0, d8 - fldmiad AO!, { d2 - d3} + fldd d2 , [ AO, #16 ] fmuld d17 , d1, d8 - fldd d9 , [ BO, #8 ] + fldd d3 , [ AO, #24 ] fmuld d18 , d2, d8 - fldd d10, [ BO, #16 ] + fldd d9 , [ BO, #8 ] fmuld d19 , d3, d8 - fldd d11, [ BO, #24 ] + fldd d10, [ BO, #16 ] fmuld d20 , d0, d9 + fldd d11, [ BO, #24 ] fmuld d21 , d1, d9 add BO , BO, #32 + add AO , AO, #32 fmuld d22 , d2, d9 + pld [ BO , #B_PRE ] fldd d12, [ BO ] fmuld d23 , d3, d9 + pld [ AO , #A_PRE ] + fldd d4 , [ AO, #0 ] fmuld d24 , d0, d10 - fldmiad AO!, { d4 - d5 } + fldd d5 , [ AO, #8 ] fmuld d25 , d1, d10 + fldd d6 , [ AO, #16 ] fmuld d26 , d2, d10 - fldmiad AO!, { d6 - d7 } + fldd d7 , [ AO, #24 ] fmuld d27 , d3, d10 fldd d13, [ BO, #8 ] @@ -161,132 +165,91 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d15, [ BO, #24 ] fmuld d30 , d2, d11 fmuld d31 , d3, d11 - add BO , BO, #32 .endm -.macro KERNEL4x4_S - pld [ BO , #B_PRE ] - - fldd d8 , [ BO ] - - pld [ AO , #A_PRE ] - fldmiad AO!, { d0 - d1} - - fmacd d16 , d0, d8 - fldmiad AO!, { d2 - d3} - fmacd d17 , d1, d8 - fldd d9 , [ BO, #8 ] - fmacd d18 , d2, d8 - fldd d10, [ BO, #16 ] - fmacd d19 , d3, d8 - - fldd d11, [ BO, #24 ] - fmacd d20 , d0, d9 - fmacd d21 , d1, d9 - add BO , BO, #32 - fmacd d22 , d2, d9 - - fldd d12, [ BO ] - fmacd d23 , d3, d9 - - fmacd d24 , d0, d10 - fldmiad AO!, { d4 - d5 } - fmacd d25 , d1, d10 - fmacd d26 , d2, d10 - fldmiad AO!, { d6 - d7 } - fmacd d27 , d3, d10 - - fldd d13, [ BO, #8 ] - fmacd d28 , d0, d11 - fldd d14, [ BO, #16 ] - fmacd d29 , d1, d11 - fldd d15, [ BO, #24 ] - fmacd d30 , d2, d11 - fmacd d31 , d3, d11 - add BO , BO, #32 - -.endm - - - -.macro KERNEL4x4_M1 +.macro KERNEL4x4_M2 fmacd d16 , d4, d12 - pld [ AO , #A_PRE ] + pld [ AO , #A_PRE+32 ] fmacd d17 , d5, d12 + fldd d0 , [ AO , #32 ] fmacd d18 , d6, d12 - pld [ BO , #B_PRE ] + pld [ BO , #B_PRE+32 ] fmacd d19 , d7, d12 + fldd d8 , [ BO , #32 ] fmacd d20 , d4, d13 - fldd d8 , [ BO ] + fldd d1 , [ AO, #40 ] fmacd d21 , d5, d13 + fldd d2 , [ AO, #48 ] fmacd d22 , d6, d13 - fldmiad AO!, { d0 - d1 } + fldd d3 , [ AO, #56 ] fmacd d23 , d7, d13 fmacd d24 , d4, d14 - fldmiad AO!, { d2 - d3 } fmacd d25 , d5, d14 - fldd d9 , [ BO, #8 ] + fldd d9 , [ BO, #40 ] fmacd d26 , d6, d14 - fldd d10, [ BO, #16 ] + fldd d10, [ BO, #48 ] fmacd d27 , d7, d14 - fldd d11, [ BO, #24 ] + fldd d11, [ BO, #56 ] fmacd d28 , d4, d15 fmacd d29 , d5, d15 + add AO , AO, #64 fmacd d30 , d6, d15 - add BO , BO, #32 + add BO , BO, #64 fmacd d31 , d7, d15 .endm -.macro KERNEL4x4_M2 +.macro KERNEL4x4_M1 fmacd d16 , d0, d8 pld [ AO , #A_PRE ] fmacd d17 , d1, d8 - pld [ BO , #B_PRE ] + fldd 
d4 , [ AO ] fmacd d18 , d2, d8 - fldd d12, [ BO ] + pld [ BO , #B_PRE ] fmacd d19 , d3, d8 + fldd d12, [ BO ] fmacd d20 , d0, d9 - fldmiad AO!, { d4 - d5 } + fldd d5 , [ AO, #8 ] fmacd d21 , d1, d9 + fldd d6 , [ AO, #16 ] fmacd d22 , d2, d9 - fldmiad AO!, { d6 - d7 } + fldd d7 , [ AO, #24 ] fmacd d23 , d3, d9 fmacd d24 , d0, d10 fmacd d25 , d1, d10 + fldd d13, [ BO, #8 ] fmacd d26 , d2, d10 + fldd d14, [ BO, #16 ] fmacd d27 , d3, d10 - fldd d13, [ BO, #8 ] + fldd d15, [ BO, #24 ] fmacd d28 , d0, d11 - fldd d14, [ BO, #16 ] fmacd d29 , d1, d11 - fldd d15, [ BO, #24 ] fmacd d30 , d2, d11 fmacd d31 , d3, d11 - add BO , BO, #32 .endm + .macro KERNEL4x4_E fmacd d16 , d4, d12 - pld [ AO , #A_PRE ] fmacd d17 , d5, d12 + add BO , BO, #32 + add AO , AO, #32 fmacd d18 , d6, d12 - pld [ BO , #B_PRE ] fmacd d19 , d7, d12 fmacd d20 , d4, d13 @@ -310,25 +273,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x4_SUB - pld [ BO , #B_PRE ] - pld [ AO , #A_PRE ] fldd d8 , [ BO ] + pld [ BO , #B_PRE ] fldd d0 , [ AO ] + pld [ AO , #A_PRE ] fldd d1 , [ AO, #8 ] - fldd d2 , [ AO, #16 ] - fldd d3 , [ AO, #24 ] fmacd d16 , d0, d8 - fldd d9 , [ BO, #8 ] + fldd d2 , [ AO, #16 ] fmacd d17 , d1, d8 - fldd d10, [ BO, #16 ] + fldd d3 , [ AO, #24 ] fmacd d18 , d2, d8 - fldd d11, [ BO, #24 ] + fldd d9 , [ BO, #8 ] fmacd d19 , d3, d8 + fldd d10, [ BO, #16 ] fmacd d20 , d0, d9 + fldd d11, [ BO, #24 ] fmacd d21 , d1, d9 fmacd d22 , d2, d9 fmacd d23 , d3, d9 @@ -924,9 +887,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lsl r3, r3, #3 // ldc = ldc * 8 str r3, LDC - ldr r3, OLD_C - str r3, C - ldr K1, K ldr BC, B @@ -958,140 +918,109 @@ _L4_M4_20: mov BO, BC - asrs L , K1, #5 // L = L / 32 - ble _L4_M4_40 + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt _L4_M4_30 .align 5 KERNEL4x4_I - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 - KERNEL4x4_M1 + sub L, L, #2 + +_L4_M4_22: - KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 - KERNEL4x4_M2 + KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 - KERNEL4x4_M2 + + subs L, L, #1 + bgt _L4_M4_22 + KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 - KERNEL4x4_M2 + KERNEL4x4_M1 KERNEL4x4_M2 + KERNEL4x4_M1 KERNEL4x4_E - subs L, L, #1 - ble _L4_M4_41 + b _L4_M4_44 -_L4_M4_22: + +_L4_M4_30: + tst L, #3 + ble _L4_M4_40 - KERNEL4x4_S - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 + tst L, #2 + ble _L4_M4_32 + KERNEL4x4_I KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 - KERNEL4x4_M2 + KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 + KERNEL4x4_M2 - KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 + KERNEL4x4_E - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 + b _L4_M4_44 +_L4_M4_32: + + tst L, #1 + ble _L4_M4_40 + + KERNEL4x4_I KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 + KERNEL4x4_M1 KERNEL4x4_E - subs L, L, #1 - ble _L4_M4_41 + b _L4_M4_44 - b _L4_M4_22 - _L4_M4_40: INIT4x4 -_L4_M4_41: - - tst K1, #31 - ble _L4_M4_100 - - tst K1, #16 - ble _L4_M4_44 - - KERNEL4x4_S - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_E _L4_M4_44: - ands L , K1, #15 // L = L % 16 + ands L , K1, #7 // L = L % 8 ble 
_L4_M4_100 _L4_M4_46: @@ -1099,7 +1028,7 @@ _L4_M4_46: KERNEL4x4_SUB subs L, L, #1 - bgt _L4_M4_46 + bne _L4_M4_46 _L4_M4_100: @@ -1108,7 +1037,7 @@ _L4_M4_100: _L4_M4_END: subs I, I, #1 - bgt _L4_M4_20 + bne _L4_M4_20 _L4_M2_BEGIN: From 2a1515c9dd561fe8c3dca9dcae9efc22a0c56644 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 12 Oct 2013 16:48:29 +0200 Subject: [PATCH 020/127] added dgemm_ncopy_4_vfpv3.S --- kernel/arm/dgemm_ncopy_4_vfpv3.S | 344 +++++++++++++++++++++++++++++++ 1 file changed, 344 insertions(+) create mode 100644 kernel/arm/dgemm_ncopy_4_vfpv3.S diff --git a/kernel/arm/dgemm_ncopy_4_vfpv3.S b/kernel/arm/dgemm_ncopy_4_vfpv3.S new file mode 100644 index 000000000..bdb63bfdd --- /dev/null +++ b/kernel/arm/dgemm_ncopy_4_vfpv3.S @@ -0,0 +1,344 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
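
[Annotation, not part of the committed patch: a C model of the packing order the new routine below produces, assuming the usual OpenBLAS ncopy argument order (m, n, a, lda, b) with a stored column-major; the function name and the use of size_t are illustrative, not part of the file:]

    #include <stddef.h>

    /* Walk n in panels of 4, then 2, then 1 columns (COPY4x4/COPY1x4,
       COPY4x2/COPY1x2, COPY4x1/COPY1x1) and interleave each panel's
       columns element by element, so a GEMM kernel can stream the
       packed buffer with unit stride. */
    static void pack_ncopy4(size_t m, size_t n,
                            const double *a, size_t lda, double *b)
    {
        size_t j = 0;
        for (; j + 4 <= n; j += 4)
            for (size_t i = 0; i < m; i++) {
                *b++ = a[i + (j + 0) * lda];
                *b++ = a[i + (j + 1) * lda];
                *b++ = a[i + (j + 2) * lda];
                *b++ = a[i + (j + 3) * lda];
            }
        if (n & 2) {
            for (size_t i = 0; i < m; i++) {
                *b++ = a[i + (j + 0) * lda];
                *b++ = a[i + (j + 1) * lda];
            }
            j += 2;
        }
        if (n & 1)
            for (size_t i = 0; i < m; i++)
                *b++ = a[i + j * lda];
    }

[The COPY4x4 body additionally unrolls i by four and overlaps each column's fldd loads with the previous column's pointer update, but the element order written to b is the same.]
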
+*****************************************************************************/ + +/************************************************************************************** +* 2013/10/11 Saar +* BLASTEST : xOK +* CTEST : xOK +* TEST : xOK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_A r2 +#define OLD_LDA r3 + + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define LDA [fp, #-260 ] + +#define B [fp, #4 ] + +#define M r0 +#define N r1 +#define A r2 + +#define BO r5 + +#define AO1 r6 +#define AO2 r7 +#define AO3 r8 +#define AO4 r9 + +#define I r3 +#define J r12 + +#define A_PRE 96 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro COPY4x4 + + fldd d0 , [ AO1, #0 ] + fldd d1 , [ AO2, #0 ] + fldd d2 , [ AO3, #0 ] + fldd d3 , [ AO4, #0 ] + + fldd d4 , [ AO1, #8 ] + fldd d8 , [ AO1, #16 ] + fldd d12, [ AO1, #24 ] + + fldd d5 , [ AO2, #8 ] + add AO1, AO1, #32 + fldd d9 , [ AO2, #16 ] + fldd d13, [ AO2, #24 ] + + fldd d6 , [ AO3, #8 ] + add AO2, AO2, #32 + fldd d10, [ AO3, #16 ] + fldd d14, [ AO3, #24 ] + + fldd d7 , [ AO4, #8 ] + add AO3, AO3, #32 + fldd d11, [ AO4, #16 ] + fldd d15, [ AO4, #24 ] + + fstmiad BO!, { d0 - d3 } + add AO4, AO4, #32 + fstmiad BO!, { d4 - d7 } + fstmiad BO!, { d8 - d15 } + +.endm + +.macro COPY1x4 + + fldd d0 , [ AO1, #0 ] + fldd d1 , [ AO2, #0 ] + add AO1, AO1, #8 + fldd d2 , [ AO3, #0 ] + add AO2, AO2, #8 + fldd d3 , [ AO4, #0 ] + + add AO3, AO3, #8 + fstmiad BO!, { d0 - d3 } + add AO4, AO4, #8 + +.endm + +.macro COPY4x2 + + fldd d0 , [ AO1, #0 ] + fldd d2 , [ AO1, #8 ] + fldd d4 , [ AO1, #16 ] + fldd d6 , [ AO1, #24 ] + + fldd d1 , [ AO2, #0 ] + fldd d3 , [ AO2, #8 ] + add AO1, AO1, #32 + fldd d5 , [ AO2, #16 ] + fldd d7 , [ AO2, #24 ] + + fstmiad BO!, { d0 - d7 } + add AO2, AO2, #32 + +.endm + + +.macro COPY1x2 + + fldd d0 , [ AO1, #0 ] + fldd d1 , [ AO2, #0 ] + add AO1, AO1, #8 + + fstmiad BO!, { d0 - d1 } + add AO2, AO2, #8 + +.endm + +.macro COPY4x1 + + fldd d0 , [ AO1, #0 ] + fldd d1 , [ AO1, #8 ] + fldd d2 , [ AO1, #16 ] + fldd d3 , [ AO1, #24 ] + + fstmiad BO!, { d0 - d3 } + add AO1, AO1, #32 + +.endm + + +.macro COPY1x1 + + fldd d0 , [ AO1, #0 ] + + fstmiad BO!, { d0 } + add AO1, AO1, #8 + +.endm + + + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + + lsl r3, r3, #3 // lda = lda * 8 + str r3, LDA + + sub r4, fp, #128 + vstm r4, { d8 - d15} // store floating point registers + + ldr BO, B + +_L4_BEGIN: + + asrs J, N, #2 // J = N / 4 + ble _L2_BEGIN + +_L4_M4_BEGIN: + + mov AO1, A // AO1 = A + ldr r4 , LDA + add AO2, AO1, r4 + add AO3, AO2, r4 + add AO4, AO3, r4 + add A , AO4, r4 // A = A + 4 * LDA + + asrs I, M, #2 // I = M / 4 + ble _L4_M4_40 + +_L4_M4_20: + + COPY4x4 + + subs I , I , #1 + bne _L4_M4_20 + + +_L4_M4_40: + + ands I, M , #3 + ble _L4_M4_END + +_L4_M4_60: + + COPY1x4 + + subs I , I , #1 + bne 
_L4_M4_60 + + +_L4_M4_END: + + subs J , J, #1 // j-- + bne _L4_M4_BEGIN + + + +/*********************************************************************************************/ + +_L2_BEGIN: + + tst N, #3 + ble _L999 + + tst N, #2 + ble _L1_BEGIN + +_L2_M4_BEGIN: + + mov AO1, A // AO1 = A + ldr r4 , LDA + add AO2, AO1, r4 + add A , AO2, r4 // A = A + 2 * LDA + + asrs I, M, #2 // I = M / 4 + ble _L2_M4_40 + +_L2_M4_20: + + COPY4x2 + + subs I , I , #1 + bne _L2_M4_20 + + +_L2_M4_40: + + ands I, M , #3 + ble _L2_M4_END + +_L2_M4_60: + + COPY1x2 + + subs I , I , #1 + bne _L2_M4_60 + + +_L2_M4_END: + + +/*********************************************************************************************/ + +_L1_BEGIN: + + tst N, #1 + ble _L999 + + +_L1_M4_BEGIN: + + mov AO1, A // AO1 = A + ldr r4 , LDA + add A , AO1, r4 // A = A + 1 * LDA + + asrs I, M, #2 // I = M / 4 + ble _L1_M4_40 + +_L1_M4_20: + + COPY4x1 + + subs I , I , #1 + bne _L1_M4_20 + + +_L1_M4_40: + + ands I, M , #3 + ble _L1_M4_END + +_L1_M4_60: + + COPY1x1 + + subs I , I , #1 + bne _L1_M4_60 + + +_L1_M4_END: + + + +_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + From 3983011f0b00021f9663009192c89ce12dbac794 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Mon, 14 Oct 2013 08:22:27 +0200 Subject: [PATCH 021/127] added sgemm- and strmm_kernel --- kernel/arm/sgemm_kernel_4x4_vfpv3.S | 1491 +++++++++++++++++++++ kernel/arm/strmm_kernel_4x4_vfpv3.S | 1884 +++++++++++++++++++++++++++ 2 files changed, 3375 insertions(+) create mode 100644 kernel/arm/sgemm_kernel_4x4_vfpv3.S create mode 100644 kernel/arm/strmm_kernel_4x4_vfpv3.S diff --git a/kernel/arm/sgemm_kernel_4x4_vfpv3.S b/kernel/arm/sgemm_kernel_4x4_vfpv3.S new file mode 100644 index 000000000..4746a587e --- /dev/null +++ b/kernel/arm/sgemm_kernel_4x4_vfpv3.S @@ -0,0 +1,1491 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
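
[Annotation, not part of the committed patch: the kernel that follows keeps a full 4x4 accumulator block resident in s16-s31 and performs one rank-1 update per k. A C model with illustrative names, where acc[4*j + i] plays the role of s(16 + 4*j + i):]

    #include <stddef.h>

    static void kernel4x4(size_t K, const float *ao, const float *bo,
                          float acc[16])
    {
        for (size_t k = 0; k < K; k++) {
            const float *a = ao + 4 * k;    /* packed 4-element slice of A */
            const float *b = bo + 4 * k;    /* packed 4-element slice of B */
            for (int j = 0; j < 4; j++)     /* 16 fmacs per k step */
                for (int i = 0; i < 4; i++)
                    acc[4 * j + i] += a[i] * b[j];
        }
    }

[KERNEL4x4_M1 and KERNEL4x4_M2 are this same update on two disjoint operand register sets ({s0-s3, s8-s11} and {s4-s7, s12-s15}): each issues its sixteen fmacs while loading the other set's operands for the next k, KERNEL4x4_I primes the first set with fmuls (which is why the pipelined path skips INIT4x4), and KERNEL4x4_E drains the pipeline without loading further. SAVE4x4 then folds the block into memory as C += alpha * acc.]
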
+*****************************************************************************/ + +/************************************************************************************** +* 2013/10/13 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* +* 2013/10/13 Saar +* UNROLL_N 4 +* UNROLL_M 4 +* DGEMM_P 128 +* DGEMM_Q 240 +* DGEMM_R 4096 +* A_PRE 96 +* B_PRE 96 +* C_PRE 64 +* +* Performance on Odroid U2: +* +* 1 Core: 2.60 GFLOPS ATLAS: 2.67 GFLOPS +* 2 Cores: 5.17 GFLOPS ATLAS: 5.25 GFLOPS +* 3 Cores: 7.60 GFLOPS ATLAS: 7.82 GFLOPS +* 4 Cores: 9.98 GFLOPS ATLAS: 9.95 GFLOPS +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA s0 + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] +#define A [fp, #-268 ] + +#define ALPHA [fp, #-280] + +#define B [fp, #4 ] +#define C [fp, #8 ] +#define OLD_LDC [fp, #12 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 + +#define A_PRE 96 +#define B_PRE 96 +#define C_PRE 64 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT4x4 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s18, s16 + vmov.f32 s19, s16 + vmov.f32 s20, s16 + vmov.f32 s21, s16 + vmov.f32 s22, s16 + vmov.f32 s23, s16 + vmov.f32 s24, s16 + vmov.f32 s25, s16 + vmov.f32 s26, s16 + vmov.f32 s27, s16 + vmov.f32 s28, s16 + vmov.f32 s29, s16 + vmov.f32 s30, s16 + vmov.f32 s31, s16 + +.endm + +.macro KERNEL4x4_I + + fldmias AO!, { s0 - s1 } + pld [ AO , #A_PRE-8 ] + fldmias BO!, { s8 - s9 } + pld [ BO , #B_PRE-8 ] + + fmuls s16 , s0, s8 + fldmias AO!, { s2 - s3 } + fmuls s17 , s1, s8 + fmuls s18 , s2, s8 + fldmias BO!, { s10 - s11 } + fmuls s19 , s3, s8 + + fmuls s20 , s0, s9 + fldmias AO!, { s4 - s5 } + fmuls s21 , s1, s9 + fmuls s22 , s2, s9 + fldmias AO!, { s6 - s7 } + fmuls s23 , s3, s9 + + fmuls s24 , s0, s10 + fldmias BO!, { s12 - s13 } + fmuls s25 , s1, s10 + fmuls s26 , s2, s10 + fldmias BO!, { s14 - s15 } + fmuls s27 , s3, s10 + + fmuls s28 , s0, s11 + fmuls s29 , s1, s11 + fmuls s30 , s2, s11 + fmuls s31 , s3, s11 + +.endm + + +.macro KERNEL4x4_M2 + + pld [ AO , #A_PRE ] + fmacs s16 , s4, s12 + fmacs s17 , s5, s12 + fldmias AO!, { s0 - s1 } + fmacs s18 , s6, s12 + pld [ BO , #B_PRE ] + fmacs s19 , s7, s12 + + fmacs s20 , s4, s13 + fldmias AO!, { s2 - s3 } + fmacs s21 , s5, s13 + fmacs s22 , s6, s13 + fldmias BO!, { s8 - s9 } + fmacs s23 , s7, s13 + + fmacs s24 , s4, s14 + fldmias BO!, { s10 - s11 } + fmacs s25 , s5, s14 + fmacs s26 , s6, s14 + fmacs s27 , s7, s14 + + fmacs s28 , s4, s15 + fmacs s29 , s5, s15 + fmacs s30 , s6, s15 + fmacs s31 , s7, s15 + +.endm + + +.macro KERNEL4x4_M1 + + fmacs s16 , s0, s8 + fldmias AO!, { s4 - s5 } + fmacs s17 , s1, s8 + fmacs s18 , s2, s8 + fldmias AO!, { s6 - s7 } + fmacs s19 , s3, s8 + + fmacs s20 , s0, s9 + fldmias BO!, { s12 - s13 } + fmacs s21 , s1, s9 + fmacs s22 , s2, s9 + fldmias BO!, { s14 - s15 } + fmacs s23 , s3, s9 + + fmacs s24 , s0, 
s10 + fmacs s25 , s1, s10 + fmacs s26 , s2, s10 + fmacs s27 , s3, s10 + + fmacs s28 , s0, s11 + fmacs s29 , s1, s11 + fmacs s30 , s2, s11 + fmacs s31 , s3, s11 + +.endm + + + +.macro KERNEL4x4_E + + fmacs s16 , s4, s12 + fmacs s17 , s5, s12 + fmacs s18 , s6, s12 + fmacs s19 , s7, s12 + + fmacs s20 , s4, s13 + fmacs s21 , s5, s13 + fmacs s22 , s6, s13 + fmacs s23 , s7, s13 + + fmacs s24 , s4, s14 + fmacs s25 , s5, s14 + fmacs s26 , s6, s14 + fmacs s27 , s7, s14 + + fmacs s28 , s4, s15 + fmacs s29 , s5, s15 + fmacs s30 , s6, s15 + fmacs s31 , s7, s15 + +.endm + + + + +.macro KERNEL4x4_SUB + + flds s8 , [ BO ] + pld [ BO , #B_PRE ] + + flds s0 , [ AO ] + pld [ AO , #A_PRE ] + flds s1 , [ AO, #4 ] + + fmacs s16 , s0, s8 + flds s2 , [ AO, #8 ] + fmacs s17 , s1, s8 + flds s3 , [ AO, #12 ] + fmacs s18 , s2, s8 + flds s9 , [ BO, #4 ] + fmacs s19 , s3, s8 + + flds s10, [ BO, #8 ] + fmacs s20 , s0, s9 + flds s11, [ BO, #12 ] + fmacs s21 , s1, s9 + fmacs s22 , s2, s9 + fmacs s23 , s3, s9 + + fmacs s24 , s0, s10 + fmacs s25 , s1, s10 + fmacs s26 , s2, s10 + fmacs s27 , s3, s10 + + fmacs s28 , s0, s11 + fmacs s29 , s1, s11 + add AO , AO, #16 + fmacs s30 , s2, s11 + add BO , BO, #16 + fmacs s31 , s3, s11 + +.endm + +.macro SAVE4x4 + pld [ CO1 , #C_PRE ] + + ldr r3 , LDC + add CO2 , CO1, r3 + flds s0, ALPHA + add r4 , CO2, r3 + pld [ CO2 , #C_PRE ] + + fldmias CO1, { s8 - s11 } + pld [ r4 , #C_PRE ] + + fmacs s8 , s0 , s16 + flds s12, [CO2] + fmacs s9 , s0 , s17 + flds s13, [CO2, #4 ] + fmacs s10, s0 , s18 + flds s14, [CO2, #8 ] + fmacs s11, s0 , s19 + flds s15, [CO2, #12 ] + + fmacs s12, s0 , s20 + fsts s8 , [CO1] + fmacs s13, s0 , s21 + fsts s9 , [CO1, #4 ] + fmacs s14, s0 , s22 + fsts s10, [CO1, #8 ] + fmacs s15, s0 , s23 + fsts s11, [CO1, #12 ] + + fldmias r4, { s8 - s11 } + + fmacs s8 , s0 , s24 + fsts s12, [CO2] + fmacs s9 , s0 , s25 + fsts s13, [CO2, #4 ] + fmacs s10, s0 , s26 + fsts s14, [CO2, #8 ] + fmacs s11, s0 , s27 + fsts s15, [CO2, #12 ] + + add CO2, r4 , r3 + + pld [ CO2 , #C_PRE ] + + fldmias CO2, { s12 - s15 } + + fsts s8 , [r4 ] + fmacs s12, s0 , s28 + fsts s9 , [r4 , #4 ] + fmacs s13, s0 , s29 + fsts s10, [r4 , #8 ] + fmacs s14, s0 , s30 + fsts s11, [r4 , #12 ] + fmacs s15, s0 , s31 + + fstmias CO2, { s12 - s15 } + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + +.macro INIT2x4 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s20, s16 + vmov.f32 s21, s16 + vmov.f32 s24, s16 + vmov.f32 s25, s16 + vmov.f32 s28, s16 + vmov.f32 s29, s16 + +.endm + + + +.macro KERNEL2x4_SUB + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + flds s10, [ BO, #8 ] + flds s11, [ BO, #12 ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + fmacs s16 , s0, s8 + fmacs s17 , s1, s8 + + fmacs s20 , s0, s9 + fmacs s21 , s1, s9 + + fmacs s24 , s0, s10 + fmacs s25 , s1, s10 + + fmacs s28 , s0, s11 + fmacs s29 , s1, s11 + add AO , AO, #8 + add BO , BO, #16 + +.endm + +.macro SAVE2x4 + + ldr r3 , LDC + add CO2 , CO1, r3 + add r4 , CO2, r3 + + flds s0, ALPHA + + flds s8 , [CO1] + flds s9 , [CO1, #4 ] + + fmacs s8 , s0 , s16 + fmacs s9 , s0 , s17 + + fsts s8 , [CO1] + fsts s9 , [CO1, #4 ] + + flds s12, [CO2] + flds s13, [CO2, #4 ] + + fmacs s12, s0 , s20 + fmacs s13, s0 , s21 + + fsts s12, [CO2] + fsts s13, [CO2, #4 ] + + flds s8 , [r4 ] + flds s9 , [r4 , #4 ] + + fmacs s8 , s0 , s24 + fmacs s9 , s0 , s25 + + fsts s8 , [r4 ] + fsts s9 , [r4 , #4 ] + + add CO2, r4 , r3 + + flds s12, [CO2] + flds s13, [CO2, #4 ] + + fmacs s12, s0 , s28 + fmacs s13, s0 , 
s29 + + fsts s12, [CO2] + fsts s13, [CO2, #4 ] + + add CO1, CO1, #8 + +.endm + + +/******************************************************************************/ + +.macro INIT1x4 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s20, s16 + vmov.f32 s24, s16 + vmov.f32 s28, s16 + +.endm + + + +.macro KERNEL1x4_SUB + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + flds s10, [ BO, #8 ] + flds s11, [ BO, #12 ] + + flds s0 , [ AO ] + + fmacs s16 , s0, s8 + fmacs s20 , s0, s9 + fmacs s24 , s0, s10 + fmacs s28 , s0, s11 + + add AO , AO, #4 + add BO , BO, #16 + +.endm + +.macro SAVE1x4 + + ldr r3 , LDC + add CO2 , CO1, r3 + add r4 , CO2, r3 + + flds s0, ALPHA + + flds s8 , [CO1] + fmacs s8 , s0 , s16 + fsts s8 , [CO1] + + flds s12, [CO2] + fmacs s12, s0 , s20 + fsts s12, [CO2] + + flds s8 , [r4 ] + fmacs s8 , s0 , s24 + fsts s8 , [r4 ] + + add CO2, r4 , r3 + + flds s12, [CO2] + fmacs s12, s0 , s28 + fsts s12, [CO2] + + add CO1, CO1, #4 + +.endm + +/******************************************************************************/ +/******************************************************************************/ + +.macro INIT4x2 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s18, s16 + vmov.f32 s19, s16 + vmov.f32 s20, s16 + vmov.f32 s21, s16 + vmov.f32 s22, s16 + vmov.f32 s23, s16 + +.endm + + + +.macro KERNEL4x2_SUB + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + fmacs s16 , s0, s8 + fmacs s17 , s1, s8 + fmacs s18 , s2, s8 + fmacs s19 , s3, s8 + + fmacs s20 , s0, s9 + fmacs s21 , s1, s9 + fmacs s22 , s2, s9 + fmacs s23 , s3, s9 + + add AO , AO, #16 + add BO , BO, #8 + +.endm + +.macro SAVE4x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + flds s0, ALPHA + + flds s8 , [CO1] + flds s9 , [CO1, #4 ] + flds s10, [CO1, #8 ] + flds s11, [CO1, #12 ] + + fmacs s8 , s0 , s16 + fmacs s9 , s0 , s17 + fmacs s10, s0 , s18 + fmacs s11, s0 , s19 + + fsts s8 , [CO1] + fsts s9 , [CO1, #4 ] + fsts s10, [CO1, #8 ] + fsts s11, [CO1, #12 ] + + flds s12, [CO2] + flds s13, [CO2, #4 ] + flds s14, [CO2, #8 ] + flds s15, [CO2, #12 ] + + fmacs s12, s0 , s20 + fmacs s13, s0 , s21 + fmacs s14, s0 , s22 + fmacs s15, s0 , s23 + + fsts s12, [CO2] + fsts s13, [CO2, #4 ] + fsts s14, [CO2, #8 ] + fsts s15, [CO2, #12 ] + + add CO1, CO1, #16 + +.endm + + +/******************************************************************************/ + +.macro INIT2x2 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s20, s16 + vmov.f32 s21, s16 + +.endm + + + +.macro KERNEL2x2_SUB + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + fmacs s16 , s0, s8 + fmacs s17 , s1, s8 + + fmacs s20 , s0, s9 + fmacs s21 , s1, s9 + + add AO , AO, #8 + add BO , BO, #8 + +.endm + +.macro SAVE2x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + flds s0, ALPHA + + flds s8 , [CO1] + flds s9 , [CO1, #4 ] + + fmacs s8 , s0 , s16 + fmacs s9 , s0 , s17 + + fsts s8 , [CO1] + fsts s9 , [CO1, #4 ] + + flds s12, [CO2] + flds s13, [CO2, #4 ] + + fmacs s12, s0 , s20 + fmacs s13, s0 , s21 + + fsts s12, [CO2] + fsts s13, [CO2, #4 ] + + add CO1, CO1, #8 + +.endm + +/******************************************************************************/ + +.macro INIT1x2 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s20, s16 + +.endm + + + +.macro KERNEL1x2_SUB + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + flds s0 , [ AO ] + fmacs s16 , s0, s8 + fmacs s20 , s0, s9 + + add AO , AO, #4 + add BO , BO, #8 + +.endm + +.macro SAVE1x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + 
flds s0, ALPHA + + flds s8 , [CO1] + fmacs s8 , s0 , s16 + fsts s8 , [CO1] + + flds s12, [CO2] + fmacs s12, s0 , s20 + fsts s12, [CO2] + + add CO1, CO1, #8 + +.endm + +/******************************************************************************/ +/******************************************************************************/ + +.macro INIT4x1 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s18, s16 + vmov.f32 s19, s16 + +.endm + + + +.macro KERNEL4x1_SUB + + flds s8 , [ BO ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + fmacs s16 , s0, s8 + fmacs s17 , s1, s8 + fmacs s18 , s2, s8 + fmacs s19 , s3, s8 + + add AO , AO, #16 + add BO , BO, #4 + +.endm + +.macro SAVE4x1 + + + flds s0, ALPHA + + flds s8 , [CO1] + flds s9 , [CO1, #4 ] + flds s10, [CO1, #8 ] + flds s11, [CO1, #12 ] + + fmacs s8 , s0 , s16 + fmacs s9 , s0 , s17 + fmacs s10, s0 , s18 + fmacs s11, s0 , s19 + + fsts s8 , [CO1] + fsts s9 , [CO1, #4 ] + fsts s10, [CO1, #8 ] + fsts s11, [CO1, #12 ] + + add CO1, CO1, #16 + +.endm + + + + +/******************************************************************************/ + +.macro INIT2x1 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + +.endm + + + +.macro KERNEL2x1_SUB + + flds s8 , [ BO ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + fmacs s16 , s0, s8 + fmacs s17 , s1, s8 + + add AO , AO, #8 + add BO , BO, #4 + +.endm + +.macro SAVE2x1 + + + flds s0, ALPHA + + flds s8 , [CO1] + flds s9 , [CO1, #4 ] + + fmacs s8 , s0 , s16 + fmacs s9 , s0 , s17 + + fsts s8 , [CO1] + fsts s9 , [CO1, #4 ] + + add CO1, CO1, #8 + +.endm + +/******************************************************************************/ + +.macro INIT1x1 + + vsub.f32 s16 , s16 , s16 + +.endm + + + +.macro KERNEL1x1_SUB + + flds s8 , [ BO ] + + flds s0 , [ AO ] + + fmacs s16 , s0, s8 + + add AO , AO, #4 + add BO , BO, #4 + +.endm + +.macro SAVE1x1 + + + flds s0, ALPHA + + flds s8 , [CO1] + fmacs s8 , s0 , s16 + fsts s8 , [CO1] + + add CO1, CO1, #4 + +.endm + + + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA, ALPHA + + sub r3, fp, #128 + vstm r3, { s8 - s31} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #2 // ldc = ldc * 4 + str r3, LDC + + ldr K1, K + ldr BC, B + + ldr J, N + asrs J, J, #2 // J = J / 4 + ble _L2_BEGIN + +_L4_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #2 // LDC * 4 + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + pld [AO , #A_PRE-64] + pld [AO , #A_PRE-32] + + + +_L4_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble _L4_M2_BEGIN + +_L4_M4_20: + + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt _L4_M4_30 + .align 5 + + + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + sub L, L, #2 + +_L4_M4_22: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + subs L, L, #1 + bgt _L4_M4_22 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + b _L4_M4_44 + + +_L4_M4_30: + tst L, #3 + ble _L4_M4_40 + + tst L, 
#2 + ble _L4_M4_32 + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + b _L4_M4_44 + +_L4_M4_32: + + tst L, #1 + ble _L4_M4_40 + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + b _L4_M4_44 + + +_L4_M4_40: + + INIT4x4 + + +_L4_M4_44: + + ands L , K1, #7 // L = L % 8 + ble _L4_M4_100 + +_L4_M4_46: + + KERNEL4x4_SUB + + subs L, L, #1 + bne _L4_M4_46 + +_L4_M4_100: + + SAVE4x4 + +_L4_M4_END: + + subs I, I, #1 + bne _L4_M4_20 + + +_L4_M2_BEGIN: + + ldr I, M + tst I , #3 + ble _L4_END + + tst I, #2 // I = I / 2 + ble _L4_M1_BEGIN + +_L4_M2_20: + + INIT2x4 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble _L4_M2_40 + +_L4_M2_22: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + subs L, L, #1 + bgt _L4_M2_22 + + +_L4_M2_40: + + ands L , K1, #7 // L = L % 8 + ble _L4_M2_100 + +_L4_M2_42: + + KERNEL2x4_SUB + + subs L, L, #1 + bgt _L4_M2_42 + +_L4_M2_100: + + SAVE2x4 + +_L4_M2_END: + + +_L4_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble _L4_END + +_L4_M1_20: + + INIT1x4 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble _L4_M1_40 + +_L4_M1_22: + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + subs L, L, #1 + bgt _L4_M1_22 + + +_L4_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L4_M1_100 + +_L4_M1_42: + + KERNEL1x4_SUB + + subs L, L, #1 + bgt _L4_M1_42 + +_L4_M1_100: + + SAVE1x4 + + +_L4_END: + + mov r3, BC + mov r4, K1 + lsl r4, r4, #4 // k * 4 * 4 + add r3, r3, r4 // B = B + K * 4 * 4 + mov BC, r3 + + subs J , #1 // j-- + bgt _L4_BEGIN + + + +/*********************************************************************************************/ + +_L2_BEGIN: + + ldr J , N + tst J , #3 + ble _L999 + + tst J , #2 + ble _L1_BEGIN + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + //pld [AO , #A_PRE-96] + //pld [AO , #A_PRE-64] + //pld [AO , #A_PRE-32] + + + +_L2_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble _L2_M2_BEGIN + +_L2_M4_20: + + INIT4x2 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble _L2_M4_40 + .align 5 + +_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs L, L, #1 + bgt _L2_M4_22 + + +_L2_M4_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M4_100 + +_L2_M4_42: + + KERNEL4x2_SUB + + subs L, L, #1 + bgt _L2_M4_42 + +_L2_M4_100: + + SAVE4x2 + +_L2_M4_END: + + subs I, I, #1 + bgt _L2_M4_20 + + +_L2_M2_BEGIN: + + ldr I, M + tst I , #3 + ble _L2_END + + tst I, #2 // I = I / 2 + ble _L2_M1_BEGIN + +_L2_M2_20: + + INIT2x2 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble _L2_M2_40 + +_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs L, L, #1 + bgt _L2_M2_22 + + +_L2_M2_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M2_100 + +_L2_M2_42: + + KERNEL2x2_SUB + + subs L, L, #1 + bgt _L2_M2_42 + +_L2_M2_100: + + SAVE2x2 + +_L2_M2_END: + + +_L2_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble _L2_END + +_L2_M1_20: + + INIT1x2 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble 
_L2_M1_40 + +_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_22 + + +_L2_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M1_100 + +_L2_M1_42: + + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_42 + +_L2_M1_100: + + SAVE1x2 + + +_L2_END: + + mov r3, BC + mov r4, K1 + lsl r4, r4, #3 // k * 2 * 4 + add r3, r3, r4 // B = B + K * 2 * 4 + mov BC, r3 + +/*********************************************************************************************/ + +_L1_BEGIN: + + ldr J , N + tst J , #1 + ble _L999 + + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + //pld [AO , #A_PRE-96] + //pld [AO , #A_PRE-64] + //pld [AO , #A_PRE-32] + + + +_L1_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble _L1_M2_BEGIN + +_L1_M4_20: + + INIT4x1 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble _L1_M4_40 + .align 5 + +_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs L, L, #1 + bgt _L1_M4_22 + + +_L1_M4_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M4_100 + +_L1_M4_42: + + KERNEL4x1_SUB + + subs L, L, #1 + bgt _L1_M4_42 + +_L1_M4_100: + + SAVE4x1 + +_L1_M4_END: + + subs I, I, #1 + bgt _L1_M4_20 + + +_L1_M2_BEGIN: + + ldr I, M + tst I , #3 + ble _L1_END + + tst I, #2 // I = I / 2 + ble _L1_M1_BEGIN + +_L1_M2_20: + + INIT2x1 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble _L1_M2_40 + +_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs L, L, #1 + bgt _L1_M2_22 + + +_L1_M2_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M2_100 + +_L1_M2_42: + + KERNEL2x1_SUB + + subs L, L, #1 + bgt _L1_M2_42 + +_L1_M2_100: + + SAVE2x1 + +_L1_M2_END: + + +_L1_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble _L1_END + +_L1_M1_20: + + INIT1x1 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble _L1_M1_40 + +_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_22 + + +_L1_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M1_100 + +_L1_M1_42: + + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_42 + +_L1_M1_100: + + SAVE1x1 + + +_L1_END: + + +_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s31} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/strmm_kernel_4x4_vfpv3.S b/kernel/arm/strmm_kernel_4x4_vfpv3.S new file mode 100644 index 000000000..15c866856 --- /dev/null +++ b/kernel/arm/strmm_kernel_4x4_vfpv3.S @@ -0,0 +1,1884 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/10/14 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA s0 + +/****************************************************** +* [fp, #-128] - [fp, #-32] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define KK [fp, #-244 ] +#define KKK [fp, #-248] +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] +#define A [fp, #-268 ] + +#define ALPHA [fp, #-280] + +#define B [fp, #4 ] +#define C [fp, #8 ] +#define OLD_LDC [fp, #12 ] +#define OFFSET [fp, #16 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 + +#define A_PRE 96 +#define B_PRE 96 +#define C_PRE 64 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT4x4 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s18, s16 + vmov.f32 s19, s16 + vmov.f32 s20, s16 + vmov.f32 s21, s16 + vmov.f32 s22, s16 + vmov.f32 s23, s16 + vmov.f32 s24, s16 + vmov.f32 s25, s16 + vmov.f32 s26, s16 + vmov.f32 s27, s16 + vmov.f32 s28, s16 + vmov.f32 s29, s16 + vmov.f32 s30, s16 + vmov.f32 s31, s16 + +.endm + +.macro KERNEL4x4_I + + fldmias AO!, { s0 - s1 } + pld [ AO , #A_PRE-8 ] + fldmias BO!, { s8 - s9 } + pld [ BO , #B_PRE-8 ] + + fmuls s16 , s0, s8 + fldmias AO!, { s2 - s3 } + fmuls s17 , s1, s8 + fmuls s18 , s2, s8 + fldmias BO!, { s10 - s11 } + fmuls s19 , s3, s8 + + fmuls s20 , s0, s9 + fldmias AO!, { s4 - s5 } + fmuls s21 , s1, s9 + fmuls s22 , s2, s9 + fldmias AO!, { s6 - s7 } + fmuls s23 , s3, s9 + + fmuls s24 , s0, s10 + fldmias BO!, { s12 - s13 } + fmuls s25 , s1, s10 + fmuls s26 , s2, s10 + fldmias BO!, { s14 - s15 } + fmuls s27 , s3, s10 + + fmuls s28 , s0, s11 + fmuls s29 , s1, s11 + fmuls s30 , s2, s11 + fmuls s31 , s3, s11 + +.endm + + +.macro KERNEL4x4_M2 + + pld [ AO , #A_PRE ] + fmacs s16 , s4, s12 + fmacs s17 , s5, s12 + fldmias AO!, { s0 - s1 } + fmacs s18 , s6, s12 + pld [ BO , #B_PRE ] + fmacs s19 , s7, 
s12 + + fmacs s20 , s4, s13 + fldmias AO!, { s2 - s3 } + fmacs s21 , s5, s13 + fmacs s22 , s6, s13 + fldmias BO!, { s8 - s9 } + fmacs s23 , s7, s13 + + fmacs s24 , s4, s14 + fldmias BO!, { s10 - s11 } + fmacs s25 , s5, s14 + fmacs s26 , s6, s14 + fmacs s27 , s7, s14 + + fmacs s28 , s4, s15 + fmacs s29 , s5, s15 + fmacs s30 , s6, s15 + fmacs s31 , s7, s15 + +.endm + + +.macro KERNEL4x4_M1 + + fmacs s16 , s0, s8 + fldmias AO!, { s4 - s5 } + fmacs s17 , s1, s8 + fmacs s18 , s2, s8 + fldmias AO!, { s6 - s7 } + fmacs s19 , s3, s8 + + fmacs s20 , s0, s9 + fldmias BO!, { s12 - s13 } + fmacs s21 , s1, s9 + fmacs s22 , s2, s9 + fldmias BO!, { s14 - s15 } + fmacs s23 , s3, s9 + + fmacs s24 , s0, s10 + fmacs s25 , s1, s10 + fmacs s26 , s2, s10 + fmacs s27 , s3, s10 + + fmacs s28 , s0, s11 + fmacs s29 , s1, s11 + fmacs s30 , s2, s11 + fmacs s31 , s3, s11 + +.endm + + + +.macro KERNEL4x4_E + + fmacs s16 , s4, s12 + fmacs s17 , s5, s12 + fmacs s18 , s6, s12 + fmacs s19 , s7, s12 + + fmacs s20 , s4, s13 + fmacs s21 , s5, s13 + fmacs s22 , s6, s13 + fmacs s23 , s7, s13 + + fmacs s24 , s4, s14 + fmacs s25 , s5, s14 + fmacs s26 , s6, s14 + fmacs s27 , s7, s14 + + fmacs s28 , s4, s15 + fmacs s29 , s5, s15 + fmacs s30 , s6, s15 + fmacs s31 , s7, s15 + +.endm + + + + +.macro KERNEL4x4_SUB + + flds s8 , [ BO ] + pld [ BO , #B_PRE ] + + flds s0 , [ AO ] + pld [ AO , #A_PRE ] + flds s1 , [ AO, #4 ] + + fmacs s16 , s0, s8 + flds s2 , [ AO, #8 ] + fmacs s17 , s1, s8 + flds s3 , [ AO, #12 ] + fmacs s18 , s2, s8 + flds s9 , [ BO, #4 ] + fmacs s19 , s3, s8 + + flds s10, [ BO, #8 ] + fmacs s20 , s0, s9 + flds s11, [ BO, #12 ] + fmacs s21 , s1, s9 + fmacs s22 , s2, s9 + fmacs s23 , s3, s9 + + fmacs s24 , s0, s10 + fmacs s25 , s1, s10 + fmacs s26 , s2, s10 + fmacs s27 , s3, s10 + + fmacs s28 , s0, s11 + fmacs s29 , s1, s11 + add AO , AO, #16 + fmacs s30 , s2, s11 + add BO , BO, #16 + fmacs s31 , s3, s11 + +.endm + +.macro SAVE4x4 + + ldr r3 , LDC + add CO2 , CO1, r3 + flds s0, ALPHA + add r4 , CO2, r3 + + + fmuls s8 , s0 , s16 + fmuls s9 , s0 , s17 + fmuls s10, s0 , s18 + fmuls s11, s0 , s19 + + fmuls s12, s0 , s20 + fsts s8 , [CO1] + fmuls s13, s0 , s21 + fsts s9 , [CO1, #4 ] + fmuls s14, s0 , s22 + fsts s10, [CO1, #8 ] + fmuls s15, s0 , s23 + fsts s11, [CO1, #12 ] + + + fmuls s8 , s0 , s24 + fsts s12, [CO2] + fmuls s9 , s0 , s25 + fsts s13, [CO2, #4 ] + fmuls s10, s0 , s26 + fsts s14, [CO2, #8 ] + fmuls s11, s0 , s27 + fsts s15, [CO2, #12 ] + + add CO2, r4 , r3 + + fsts s8 , [r4 ] + fmuls s12, s0 , s28 + fsts s9 , [r4 , #4 ] + fmuls s13, s0 , s29 + fsts s10, [r4 , #8 ] + fmuls s14, s0 , s30 + fsts s11, [r4 , #12 ] + fmuls s15, s0 , s31 + + fstmias CO2, { s12 - s15 } + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + +.macro INIT2x4 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s20, s16 + vmov.f32 s21, s16 + vmov.f32 s24, s16 + vmov.f32 s25, s16 + vmov.f32 s28, s16 + vmov.f32 s29, s16 + +.endm + + + +.macro KERNEL2x4_SUB + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + flds s10, [ BO, #8 ] + flds s11, [ BO, #12 ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + fmacs s16 , s0, s8 + fmacs s17 , s1, s8 + + fmacs s20 , s0, s9 + fmacs s21 , s1, s9 + + fmacs s24 , s0, s10 + fmacs s25 , s1, s10 + + fmacs s28 , s0, s11 + fmacs s29 , s1, s11 + add AO , AO, #8 + add BO , BO, #16 + +.endm + +.macro SAVE2x4 + + ldr r3 , LDC + add CO2 , CO1, r3 + add r4 , CO2, r3 + + flds s0, ALPHA + + fmuls s8 , s0 , s16 + fmuls s9 , s0 , s17 + + fsts s8 , [CO1] + fsts s9 
, [CO1, #4 ] + + fmuls s12, s0 , s20 + fmuls s13, s0 , s21 + + fsts s12, [CO2] + fsts s13, [CO2, #4 ] + + + fmuls s8 , s0 , s24 + fmuls s9 , s0 , s25 + + fsts s8 , [r4 ] + fsts s9 , [r4 , #4 ] + + add CO2, r4 , r3 + + fmuls s12, s0 , s28 + fmuls s13, s0 , s29 + + fsts s12, [CO2] + fsts s13, [CO2, #4 ] + + add CO1, CO1, #8 + +.endm + + +/******************************************************************************/ + +.macro INIT1x4 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s20, s16 + vmov.f32 s24, s16 + vmov.f32 s28, s16 + +.endm + + + +.macro KERNEL1x4_SUB + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + flds s10, [ BO, #8 ] + flds s11, [ BO, #12 ] + + flds s0 , [ AO ] + + fmacs s16 , s0, s8 + fmacs s20 , s0, s9 + fmacs s24 , s0, s10 + fmacs s28 , s0, s11 + + add AO , AO, #4 + add BO , BO, #16 + +.endm + +.macro SAVE1x4 + + ldr r3 , LDC + add CO2 , CO1, r3 + add r4 , CO2, r3 + + flds s0, ALPHA + + fmuls s8 , s0 , s16 + fsts s8 , [CO1] + + fmuls s12, s0 , s20 + fsts s12, [CO2] + + fmuls s8 , s0 , s24 + fsts s8 , [r4 ] + + add CO2, r4 , r3 + + fmuls s12, s0 , s28 + fsts s12, [CO2] + + add CO1, CO1, #4 + +.endm + +/******************************************************************************/ +/******************************************************************************/ + +.macro INIT4x2 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s18, s16 + vmov.f32 s19, s16 + vmov.f32 s20, s16 + vmov.f32 s21, s16 + vmov.f32 s22, s16 + vmov.f32 s23, s16 + +.endm + + + +.macro KERNEL4x2_SUB + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + fmacs s16 , s0, s8 + fmacs s17 , s1, s8 + fmacs s18 , s2, s8 + fmacs s19 , s3, s8 + + fmacs s20 , s0, s9 + fmacs s21 , s1, s9 + fmacs s22 , s2, s9 + fmacs s23 , s3, s9 + + add AO , AO, #16 + add BO , BO, #8 + +.endm + +.macro SAVE4x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + flds s0, ALPHA + + fmuls s8 , s0 , s16 + fmuls s9 , s0 , s17 + fmuls s10, s0 , s18 + fmuls s11, s0 , s19 + + fsts s8 , [CO1] + fsts s9 , [CO1, #4 ] + fsts s10, [CO1, #8 ] + fsts s11, [CO1, #12 ] + + fmuls s12, s0 , s20 + fmuls s13, s0 , s21 + fmuls s14, s0 , s22 + fmuls s15, s0 , s23 + + fsts s12, [CO2] + fsts s13, [CO2, #4 ] + fsts s14, [CO2, #8 ] + fsts s15, [CO2, #12 ] + + add CO1, CO1, #16 + +.endm + + +/******************************************************************************/ + +.macro INIT2x2 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s20, s16 + vmov.f32 s21, s16 + +.endm + + + +.macro KERNEL2x2_SUB + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + fmacs s16 , s0, s8 + fmacs s17 , s1, s8 + + fmacs s20 , s0, s9 + fmacs s21 , s1, s9 + + add AO , AO, #8 + add BO , BO, #8 + +.endm + +.macro SAVE2x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + flds s0, ALPHA + + fmuls s8 , s0 , s16 + fmuls s9 , s0 , s17 + + fsts s8 , [CO1] + fsts s9 , [CO1, #4 ] + + fmuls s12, s0 , s20 + fmuls s13, s0 , s21 + + fsts s12, [CO2] + fsts s13, [CO2, #4 ] + + add CO1, CO1, #8 + +.endm + +/******************************************************************************/ + +.macro INIT1x2 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s20, s16 + +.endm + + + +.macro KERNEL1x2_SUB + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + flds s0 , [ AO ] + fmacs s16 , s0, s8 + fmacs s20 , s0, s9 + + add AO , AO, #4 + add BO , BO, #8 + +.endm + +.macro SAVE1x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + flds s0, ALPHA + + fmuls s8 , s0 , s16 + fsts s8 , [CO1] + + fmuls s12, s0 , s20 + 
fsts s12, [CO2] + + add CO1, CO1, #8 + +.endm + +/******************************************************************************/ +/******************************************************************************/ + +.macro INIT4x1 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s18, s16 + vmov.f32 s19, s16 + +.endm + + + +.macro KERNEL4x1_SUB + + flds s8 , [ BO ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + fmacs s16 , s0, s8 + fmacs s17 , s1, s8 + fmacs s18 , s2, s8 + fmacs s19 , s3, s8 + + add AO , AO, #16 + add BO , BO, #4 + +.endm + +.macro SAVE4x1 + + + flds s0, ALPHA + + fmuls s8 , s0 , s16 + fmuls s9 , s0 , s17 + fmuls s10, s0 , s18 + fmuls s11, s0 , s19 + + fsts s8 , [CO1] + fsts s9 , [CO1, #4 ] + fsts s10, [CO1, #8 ] + fsts s11, [CO1, #12 ] + + add CO1, CO1, #16 + +.endm + + + + +/******************************************************************************/ + +.macro INIT2x1 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + +.endm + + + +.macro KERNEL2x1_SUB + + flds s8 , [ BO ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + fmacs s16 , s0, s8 + fmacs s17 , s1, s8 + + add AO , AO, #8 + add BO , BO, #4 + +.endm + +.macro SAVE2x1 + + + flds s0, ALPHA + + fmuls s8 , s0 , s16 + fmuls s9 , s0 , s17 + + fsts s8 , [CO1] + fsts s9 , [CO1, #4 ] + + add CO1, CO1, #8 + +.endm + +/******************************************************************************/ + +.macro INIT1x1 + + vsub.f32 s16 , s16 , s16 + +.endm + + + +.macro KERNEL1x1_SUB + + flds s8 , [ BO ] + + flds s0 , [ AO ] + + fmacs s16 , s0, s8 + + add AO , AO, #4 + add BO , BO, #4 + +.endm + +.macro SAVE1x1 + + + flds s0, ALPHA + + fmuls s8 , s0 , s16 + fsts s8 , [CO1] + + add CO1, CO1, #4 + +.endm + + + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA, ALPHA + + sub r3, fp, #128 + vstm r3, { s8 - s31} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #2 // ldc = ldc * 4 + str r3, LDC + + ldr r3, OFFSET +#ifndef LEFT + neg r3 , r3 +#endif + str r3 , KK + + ldr BC, B + + ldr J, N + asrs J, J, #2 // J = J / 4 + ble _L2_BEGIN + +_L4_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #2 // LDC * 4 + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + ldr AO, A // AO = A + pld [AO , #A_PRE-64] + pld [AO , #A_PRE-32] + + + +_L4_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble _L4_M2_BEGIN + +_L4_M4_20: + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 4 float values + add BO , BO , r4 + lsls r4 , r3 , #4 // 4 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #4 // number of values in AO +#else + add K1, K1, #4 // number of values in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt _L4_M4_30 + .align 5 + + + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + 
KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + sub L, L, #2 + +_L4_M4_22: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + subs L, L, #1 + bgt _L4_M4_22 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + b _L4_M4_44 + + +_L4_M4_30: + tst L, #3 + ble _L4_M4_40 + + tst L, #2 + ble _L4_M4_32 + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + b _L4_M4_44 + +_L4_M4_32: + + tst L, #1 + ble _L4_M4_40 + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + b _L4_M4_44 + + +_L4_M4_40: + + INIT4x4 + + +_L4_M4_44: + + ands L , K1, #7 // L = L % 8 + ble _L4_M4_100 + +_L4_M4_46: + + KERNEL4x4_SUB + + subs L, L, #1 + bne _L4_M4_46 + +_L4_M4_100: + + SAVE4x4 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 4 float values + add BO , BO , r4 + lsls r4 , r3 , #4 // 4 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #4 // number of values in AO + str r3 , KK +#endif + + + + +_L4_M4_END: + + subs I, I, #1 + bne _L4_M4_20 + + +_L4_M2_BEGIN: + + ldr I, M + tst I , #3 + ble _L4_END + + tst I, #2 // I = I / 2 + ble _L4_M1_BEGIN + +_L4_M2_20: + + INIT2x4 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 4 float values + add BO , BO , r4 + lsls r4 , r3 , #3 // 2 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #2 // number of values in AO +#else + add K1, K1, #4 // number of values in BO +#endif + str K1, KKK +#endif + + + asrs L , K1, #3 // L = L / 8 + ble _L4_M2_40 + +_L4_M2_22: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + subs L, L, #1 + bgt _L4_M2_22 + + +_L4_M2_40: + + ands L , K1, #7 // L = L % 8 + ble _L4_M2_100 + +_L4_M2_42: + + KERNEL2x4_SUB + + subs L, L, #1 + bgt _L4_M2_42 + +_L4_M2_100: + + SAVE2x4 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 4 float values + add BO , BO , r4 + lsls r4 , r3 , #3 // 2 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + + +_L4_M2_END: + + +_L4_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble _L4_END + +_L4_M1_20: + + INIT1x4 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 4 float values + add BO , BO , r4 + lsls r4 , r3 , #2 // 1 float value + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #1 // number of 
values in AO +#else + add K1, K1, #4 // number of values in BO +#endif + str K1, KKK +#endif + + + asrs L , K1, #3 // L = L / 8 + ble _L4_M1_40 + +_L4_M1_22: + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + subs L, L, #1 + bgt _L4_M1_22 + + +_L4_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L4_M1_100 + +_L4_M1_42: + + KERNEL1x4_SUB + + subs L, L, #1 + bgt _L4_M1_42 + +_L4_M1_100: + + SAVE1x4 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 4 float values + add BO , BO , r4 + lsls r4 , r3 , #2 // 1 float value + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #1 // number of values in AO + str r3 , KK +#endif + + + +_L4_END: + + mov r3, BC + ldr r4, K + lsl r4, r4, #4 // k * 4 * 4 + add r3, r3, r4 // B = B + K * 4 * 4 + mov BC, r3 + +#if !defined(LEFT) + ldr r3 , KK + add r3 , r3 , #4 // number of values in BO + str r3 , KK +#endif + + subs J , #1 // j-- + bgt _L4_BEGIN + + + +/*********************************************************************************************/ + +_L2_BEGIN: + + ldr J , N + tst J , #3 + ble _L999 + + tst J , #2 + ble _L1_BEGIN + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + ldr AO, A // AO = A + //pld [AO , #A_PRE-96] + //pld [AO , #A_PRE-64] + //pld [AO , #A_PRE-32] + + + +_L2_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble _L2_M2_BEGIN + +_L2_M4_20: + + INIT4x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 2 float values + add BO , BO , r4 + lsls r4 , r3 , #4 // 4 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #4 // number of values in AO +#else + add K1, K1, #2 // number of values in BO +#endif + str K1, KKK +#endif + + + asrs L , K1, #3 // L = L / 8 + ble _L2_M4_40 + .align 5 + +_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs L, L, #1 + bgt _L2_M4_22 + + +_L2_M4_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M4_100 + +_L2_M4_42: + + KERNEL4x2_SUB + + subs L, L, #1 + bgt _L2_M4_42 + +_L2_M4_100: + + SAVE4x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #3 // 2 float values + add BO , BO , r4 + lsls r4 , r3 , #4 // 4 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #4 // number of values in AO + str r3 , KK +#endif + + + +_L2_M4_END: + + subs I, I, #1 + bgt _L2_M4_20 + + +_L2_M2_BEGIN: + + ldr I, M + tst I , #3 + ble _L2_END + + tst I, #2 // I = I / 2 + ble _L2_M1_BEGIN + +_L2_M2_20: + + INIT2x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 2 float values + add BO , BO , r4 + lsls r4 , r3 , #3 // 2 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) 
&& defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #2 // number of values in AO +#else + add K1, K1, #2 // number of values in BO +#endif + str K1, KKK +#endif + + + asrs L , K1, #3 // L = L / 8 + ble _L2_M2_40 + +_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs L, L, #1 + bgt _L2_M2_22 + + +_L2_M2_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M2_100 + +_L2_M2_42: + + KERNEL2x2_SUB + + subs L, L, #1 + bgt _L2_M2_42 + +_L2_M2_100: + + SAVE2x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #3 // 2 float values + add BO , BO , r4 + lsls r4 , r3 , #3 // 2 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + + +_L2_M2_END: + + +_L2_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble _L2_END + +_L2_M1_20: + + INIT1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 2 float values + add BO , BO , r4 + lsls r4 , r3 , #2 // 1 float value + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #1 // number of values in AO +#else + add K1, K1, #2 // number of values in BO +#endif + str K1, KKK +#endif + + + asrs L , K1, #3 // L = L / 8 + ble _L2_M1_40 + +_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_22 + + +_L2_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M1_100 + +_L2_M1_42: + + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_42 + +_L2_M1_100: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #3 // 2 float values + add BO , BO , r4 + lsls r4 , r3 , #2 // 1 float value + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #1 // number of values in AO + str r3 , KK +#endif + + + +_L2_END: + + mov r3, BC + ldr r4, K + lsl r4, r4, #3 // k * 2 * 4 + add r3, r3, r4 // B = B + K * 2 * 4 + mov BC, r3 + +#if !defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in BO + str r3 , KK +#endif + + +/*********************************************************************************************/ + +_L1_BEGIN: + + ldr J , N + tst J , #1 + ble _L999 + + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + ldr AO, A // AO = A + //pld [AO , #A_PRE-96] + //pld [AO , #A_PRE-64] + //pld [AO , #A_PRE-32] + + + +_L1_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble _L1_M2_BEGIN + +_L1_M4_20: + + INIT4x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #2 // 1 float value + add BO , BO , r4 + lsls r4 , r3 , #4 // 4 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub 
K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #4 // number of values in AO +#else + add K1, K1, #1 // number of values in BO +#endif + str K1, KKK +#endif + + + asrs L , K1, #3 // L = L / 8 + ble _L1_M4_40 + .align 5 + +_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs L, L, #1 + bgt _L1_M4_22 + + +_L1_M4_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M4_100 + +_L1_M4_42: + + KERNEL4x1_SUB + + subs L, L, #1 + bgt _L1_M4_42 + +_L1_M4_100: + + SAVE4x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #2 // 1 float value + add BO , BO , r4 + lsls r4 , r3 , #4 // 4 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #4 // number of values in AO + str r3 , KK +#endif + + + + +_L1_M4_END: + + subs I, I, #1 + bgt _L1_M4_20 + + +_L1_M2_BEGIN: + + ldr I, M + tst I , #3 + ble _L1_END + + tst I, #2 // I = I / 2 + ble _L1_M1_BEGIN + +_L1_M2_20: + + INIT2x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #2 // 1 float value + add BO , BO , r4 + lsls r4 , r3 , #3 // 2 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #2 // number of values in AO +#else + add K1, K1, #1 // number of values in BO +#endif + str K1, KKK +#endif + + + asrs L , K1, #3 // L = L / 8 + ble _L1_M2_40 + +_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs L, L, #1 + bgt _L1_M2_22 + + +_L1_M2_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M2_100 + +_L1_M2_42: + + KERNEL2x1_SUB + + subs L, L, #1 + bgt _L1_M2_42 + +_L1_M2_100: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #2 // 1 float value + add BO , BO , r4 + lsls r4 , r3 , #3 // 2 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + + +_L1_M2_END: + + +_L1_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble _L1_END + +_L1_M1_20: + + INIT1x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #2 // 1 float value + add BO , BO , r4 + lsls r4 , r3 , #2 // 1 float value + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #1 // number of values in AO +#else + add K1, K1, #1 // number of values in BO +#endif + str K1, KKK +#endif + + + asrs L , K1, #3 // L = L / 8 + ble _L1_M1_40 + +_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_22 + + +_L1_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M1_100 + +_L1_M1_42: + + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_42 + +_L1_M1_100: + + SAVE1x1 + + +_L1_END: + + 
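+/* The per-tile TRMM bookkeeping repeated above follows one pattern.
+   A minimal C sketch of what each M-block does for an m_tile x n_tile
+   tile -- illustrative only, names are placeholders, and pointer
+   arithmetic is in elements where the assembly scales by 4-byte
+   floats (the lsls #2/#3/#4 above):
+
+   static void trmm_tile(float **ao, float **bo, int k, int *kk,
+                         int m_tile, int n_tile, int left, int transa)
+   {
+       int k1;
+       if (!((left && transa) || (!left && !transa))) {
+           *bo += *kk * n_tile;            // skip the zero triangle
+           *ao += *kk * m_tile;
+       }
+       if ((left && !transa) || (!left && transa))
+           k1 = k - *kk;                   // stored as KKK
+       else
+           k1 = *kk + (left ? m_tile : n_tile);
+
+       // the inner loops (k1/8 unrolled passes plus a k1%8 tail)
+       // leave AO and BO advanced past the k1 consumed columns
+       *ao += k1 * m_tile;
+       *bo += k1 * n_tile;
+
+       if ((left && transa) || (!left && !transa)) {
+           *bo += (k - k1) * n_tile;       // step over the unused tail
+           *ao += (k - k1) * m_tile;
+       }
+       if (left)
+           *kk += m_tile;                  // next M-block starts deeper
+   }
+*/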
+_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s31} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + From 85484a42df31257592161ebac7cda80f25133547 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Wed, 16 Oct 2013 18:00:41 +0200 Subject: [PATCH 022/127] added kernels for cgemm, ctrmm, zgemm and ztrmm --- kernel/arm/KERNEL.ARMV7 | 37 +- kernel/arm/cgemm_kernel_2x2_vfpv3.S | 1293 ++++++++++++++++++++++ kernel/arm/ctrmm_kernel_2x2_vfpv3.S | 1476 +++++++++++++++++++++++++ kernel/arm/zgemm_kernel_2x2_vfpv3.S | 1329 +++++++++++++++++++++++ kernel/arm/ztrmm_kernel_2x2_vfpv3.S | 1538 +++++++++++++++++++++++++++ 5 files changed, 5657 insertions(+), 16 deletions(-) create mode 100644 kernel/arm/cgemm_kernel_2x2_vfpv3.S create mode 100644 kernel/arm/ctrmm_kernel_2x2_vfpv3.S create mode 100644 kernel/arm/zgemm_kernel_2x2_vfpv3.S create mode 100644 kernel/arm/ztrmm_kernel_2x2_vfpv3.S diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index 8c69ad5cf..43153798e 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -80,36 +80,41 @@ DGEMVTKERNEL = gemv_t.c CGEMVTKERNEL = zgemv_t.c ZGEMVTKERNEL = zgemv_t.c -STRMMKERNEL = ../generic/trmmkernel_2x2.c -DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S -CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c -ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c - -SGEMMKERNEL = ../generic/gemmkernel_2x2.c -SGEMMONCOPY = ../generic/gemm_ncopy_2.c -SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +STRMMKERNEL = strmm_kernel_4x4_vfpv3.S +DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S +CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S +ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S + +#SGEMMKERNEL = ../generic/gemmkernel_2x2.c +SGEMMKERNEL = sgemm_kernel_4x4_vfpv3.S +SGEMMINCOPY = +SGEMMITCOPY = +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMINCOPYOBJ = +SGEMMITCOPYOBJ = SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o #DGEMMKERNEL = ../generic/gemmkernel_2x2.c #DGEMMKERNEL = dgemm_kernel_4x2_vfpv2.S DGEMMKERNEL = dgemm_kernel_4x4_vfpv3.S -DGEMMINCOPY = -DGEMMITCOPY = -DGEMMONCOPY = ../generic/gemm_ncopy_4.c -DGEMMOTCOPY = ../generic/gemm_tcopy_4.c -DGEMMINCOPYOBJ = -DGEMMITCOPYOBJ = +DGEMMINCOPY = dgemm_ncopy_4_vfpv3.S +DGEMMITCOPY = ../generic/gemm_tcopy_4.c +DGEMMONCOPY = dgemm_ncopy_4_vfpv3.S +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMINCOPYOBJ = dgemm_incopy.o +DGEMMITCOPYOBJ = dgemm_itcopy.o DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o -CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMKERNEL = cgemm_kernel_2x2_vfpv3.S CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c CGEMMONCOPYOBJ = cgemm_oncopy.o CGEMMOTCOPYOBJ = cgemm_otcopy.o -ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +ZGEMMKERNEL = zgemm_kernel_2x2_vfpv3.S ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMONCOPYOBJ = zgemm_oncopy.o diff --git a/kernel/arm/cgemm_kernel_2x2_vfpv3.S b/kernel/arm/cgemm_kernel_2x2_vfpv3.S new file mode 100644 index 000000000..abbbac831 --- /dev/null +++ b/kernel/arm/cgemm_kernel_2x2_vfpv3.S @@ -0,0 +1,1293 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/10/16 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA_R s0 +#define OLD_ALPHA_I s1 + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define A [fp, #-248 ] +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] + +#define ALPHA_I [fp, #-272] +#define ALPHA_R [fp, #-280] + +#define B [fp, #4 ] +#define C [fp, #8 ] +#define OLD_LDC [fp, #12 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 + +#define A_PRE 96 +#define B_PRE 96 +#define C_PRE 64 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + + #define FADD_R fsubs + #define FADD_I fadds + + #define FMAC_R1 fnmacs + #define FMAC_R2 fnmacs + #define FMAC_I1 fmacs + #define FMAC_I2 fnmacs + +#elif defined(CN) || defined(CT) + + #define FADD_R fadds + #define FADD_I fsubs + + #define FMAC_R1 fmacs + #define FMAC_R2 fmacs + #define FMAC_I1 fnmacs + #define FMAC_I2 fmacs + +#elif defined(NC) || defined(TC) + + #define FADD_R fadds + #define FADD_I fsubs + + #define FMAC_R1 fmacs + #define FMAC_R2 fnmacs + #define FMAC_I1 fmacs + #define FMAC_I2 fmacs + +#else + + #define FADD_R fsubs + #define FADD_I fadds + + #define FMAC_R1 fnmacs + #define FMAC_R2 fmacs + #define FMAC_I1 fnmacs + #define FMAC_I2 fnmacs + +#endif + + + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT2x2 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s18, s16 + vmov.f32 s19, s16 + vmov.f32 s20, 
s16 + vmov.f32 s21, s16 + vmov.f32 s22, s16 + vmov.f32 s23, s16 + vmov.f32 s24, s16 + vmov.f32 s25, s16 + vmov.f32 s26, s16 + vmov.f32 s27, s16 + vmov.f32 s28, s16 + vmov.f32 s29, s16 + vmov.f32 s30, s16 + vmov.f32 s31, s16 + +.endm + +.macro KERNEL2x2_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldmias AO!, { s0 - s1 } + fldmias BO!, { s8 - s9 } + + fmuls s16 , s0, s8 + fmuls s24 , s1, s9 + fldmias AO!, { s2 - s3 } + fmuls s17 , s0, s9 + fmuls s25 , s1, s8 + + fldmias BO!, { s10 - s11 } + fmuls s18 , s2, s8 + fmuls s26 , s3, s9 + fldmias AO!, { s4 - s5 } + fmuls s19 , s2, s9 + fmuls s27 , s3, s8 + + fldmias BO!, { s12 - s13 } + fmuls s20 , s0, s10 + fmuls s28 , s1, s11 + fldmias AO!, { s6 - s7 } + fmuls s21 , s0, s11 + fmuls s29 , s1, s10 + + fldmias BO!, { s14 - s15 } + fmuls s22 , s2, s10 + fmuls s30 , s3, s11 + fmuls s23 , s2, s11 + fmuls s31 , s3, s10 + +.endm + + + +.macro KERNEL2x2_M1 + + fmacs s16 , s0, s8 + fldmias AO!, { s4 - s5 } + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fldmias BO!, { s12 - s13 } + fmacs s25 , s1, s8 + + fmacs s18 , s2, s8 + fldmias AO!, { s6 - s7 } + fmacs s26 , s3, s9 + fmacs s19 , s2, s9 + fldmias BO!, { s14 - s15 } + fmacs s27 , s3, s8 + + fmacs s20 , s0, s10 + fmacs s28 , s1, s11 + fmacs s21 , s0, s11 + fmacs s29 , s1, s10 + + fmacs s22 , s2, s10 + fmacs s30 , s3, s11 + fmacs s23 , s2, s11 + fmacs s31 , s3, s10 + +.endm + +.macro KERNEL2x2_M2 + pld [ AO , #A_PRE ] + + fmacs s16 , s4, s12 + pld [ BO , #B_PRE ] + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fldmias AO!, { s0 - s1 } + fmacs s25 , s5, s12 + + fmacs s18 , s6, s12 + fmacs s26 , s7, s13 + fldmias BO!, { s8 - s9 } + fmacs s19 , s6, s13 + fmacs s27 , s7, s12 + + fldmias AO!, { s2 - s3 } + fmacs s20 , s4, s14 + fmacs s28 , s5, s15 + fldmias BO!, { s10 - s11 } + fmacs s21 , s4, s15 + fmacs s29 , s5, s14 + + fmacs s22 , s6, s14 + fmacs s30 , s7, s15 + fmacs s23 , s6, s15 + fmacs s31 , s7, s14 + +.endm + + +.macro KERNEL2x2_E + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + + fmacs s18 , s6, s12 + fmacs s26 , s7, s13 + fmacs s19 , s6, s13 + fmacs s27 , s7, s12 + + fmacs s20 , s4, s14 + fmacs s28 , s5, s15 + fmacs s21 , s4, s15 + fmacs s29 , s5, s14 + + fmacs s22 , s6, s14 + fmacs s30 , s7, s15 + fmacs s23 , s6, s15 + fmacs s31 , s7, s14 + +.endm + +.macro KERNEL2x2_SUB + + fldmias AO!, { s0 - s1 } + fldmias BO!, { s8 - s9 } + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fldmias AO!, { s2 - s3 } + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + fldmias BO!, { s10 - s11 } + fmacs s18 , s2, s8 + fmacs s26 , s3, s9 + fmacs s19 , s2, s9 + fmacs s27 , s3, s8 + + fmacs s20 , s0, s10 + fmacs s28 , s1, s11 + fmacs s21 , s0, s11 + fmacs s29 , s1, s10 + + fmacs s22 , s2, s10 + fmacs s30 , s3, s11 + fmacs s23 , s2, s11 + fmacs s31 , s3, s10 + +.endm + + + + +.macro SAVE2x2 + pld [ CO1 , #C_PRE ] + + ldr r3 , LDC + add CO2 , CO1, r3 + flds s0, ALPHA_R + flds s1, ALPHA_I + + fldmias CO1, { s4 - s7 } + fldmias CO2, { s8 - s11 } + + FADD_R s16, s24 , s16 + FADD_I s17, s25 , s17 + FADD_R s18, s26 , s18 + FADD_I s19, s27 , s19 + FADD_R s20, s28 , s20 + FADD_I s21, s29 , s21 + FADD_R s22, s30 , s22 + FADD_I s23, s31 , s23 + + FMAC_R1 s4 , s0 , s16 + FMAC_I1 s5 , s0 , s17 + FMAC_R2 s4 , s1 , s17 + FMAC_I2 s5 , s1 , s16 + + FMAC_R1 s6 , s0 , s18 + FMAC_I1 s7 , s0 , s19 + FMAC_R2 s6 , s1 , s19 + FMAC_I2 s7 , s1 , s18 + + FMAC_R1 s8 , s0 , s20 + FMAC_I1 s9 , s0 , s21 + FMAC_R2 s8 , s1 , s21 + FMAC_I2 s9 , s1 , s20 + + FMAC_R1 s10, s0 , s22 + FMAC_I1 s11, s0 , s23 + FMAC_R2 s10, s1 , 
s23 + FMAC_I2 s11, s1 , s22 + + fstmias CO1, { s4 - s7 } + fstmias CO2, { s8 - s11 } + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + +.macro INIT1x2 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s20, s16 + vmov.f32 s21, s16 + vmov.f32 s24, s16 + vmov.f32 s25, s16 + vmov.f32 s28, s16 + vmov.f32 s29, s16 + +.endm + +.macro KERNEL1x2_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + flds s10, [ BO, #8 ] + flds s11, [ BO, #12 ] + + fmuls s16 , s0, s8 + fmuls s24 , s1, s9 + fmuls s17 , s0, s9 + fmuls s25 , s1, s8 + + fmuls s20 , s0, s10 + fmuls s28 , s1, s11 + fmuls s21 , s0, s11 + fmuls s29 , s1, s10 + + add BO , BO, #16 + add AO , AO, #8 + + pld [ BO , #B_PRE ] + + flds s4 , [ AO, #0 ] + flds s5 , [ AO, #4 ] + + flds s12, [ BO ] + flds s13, [ BO, #4 ] + flds s14, [ BO, #8 ] + flds s15, [ BO, #12 ] + + add BO , BO, #16 + add AO , AO, #8 +.endm + + + +.macro KERNEL1x2_M1 + pld [ BO , #B_PRE ] + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + fmacs s20 , s0, s10 + fmacs s28 , s1, s11 + fmacs s21 , s0, s11 + fmacs s29 , s1, s10 + + flds s4 , [ AO, #0 ] + flds s5 , [ AO, #4 ] + + flds s12, [ BO ] + flds s13, [ BO, #4 ] + flds s14, [ BO, #8 ] + flds s15, [ BO, #12 ] + + add BO , BO, #16 + add AO , AO, #8 +.endm + +.macro KERNEL1x2_M2 + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + + fmacs s20 , s4, s14 + fmacs s28 , s5, s15 + fmacs s21 , s4, s15 + fmacs s29 , s5, s14 + + flds s0 , [ AO, #0 ] + flds s1 , [ AO, #4 ] + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + flds s10, [ BO, #8 ] + flds s11, [ BO, #12 ] + + add BO , BO, #16 + add AO , AO, #8 +.endm + + +.macro KERNEL1x2_E + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + + fmacs s20 , s4, s14 + fmacs s28 , s5, s15 + fmacs s21 , s4, s15 + fmacs s29 , s5, s14 + +.endm + +.macro KERNEL1x2_SUB + + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + flds s10, [ BO, #8 ] + flds s11, [ BO, #12 ] + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + fmacs s20 , s0, s10 + fmacs s28 , s1, s11 + fmacs s21 , s0, s11 + fmacs s29 , s1, s10 + + add BO , BO, #16 + add AO , AO, #8 + +.endm + + + + +.macro SAVE1x2 + pld [ CO1 , #C_PRE ] + + ldr r3 , LDC + add CO2 , CO1, r3 + flds s0, ALPHA_R + flds s1, ALPHA_I + + fldmias CO1, { s4 - s5 } + fldmias CO2, { s8 - s9 } + + FADD_R s16, s24 , s16 + FADD_I s17, s25 , s17 + FADD_R s20, s28 , s20 + FADD_I s21, s29 , s21 + + FMAC_R1 s4 , s0 , s16 + FMAC_I1 s5 , s0 , s17 + FMAC_R2 s4 , s1 , s17 + FMAC_I2 s5 , s1 , s16 + + FMAC_R1 s8 , s0 , s20 + FMAC_I1 s9 , s0 , s21 + FMAC_R2 s8 , s1 , s21 + FMAC_I2 s9 , s1 , s20 + + fstmias CO1, { s4 - s5 } + fstmias CO2, { s8 - s9 } + + add CO1, CO1, #8 + +.endm + +/******************************************************************************/ + +.macro INIT2x1 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s18, s16 + vmov.f32 s19, s16 + vmov.f32 s24, s16 + vmov.f32 s25, s16 + vmov.f32 s26, s16 + vmov.f32 s27, s16 + +.endm + +.macro KERNEL2x1_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + fmuls s16 , s0, s8 
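+	// the product is kept in four partial accumulators per complex
+	// element: s16 re*re, s24 im*im, s17 re*im, s25 im*re (s18/s26/
+	// s19/s27 for the second element); SAVE2x1 combines them with
+	// FADD_R/FADD_I, so one kernel body serves all conjugation variants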
+ fmuls s24 , s1, s9 + fmuls s17 , s0, s9 + fmuls s25 , s1, s8 + + fmuls s18 , s2, s8 + fmuls s26 , s3, s9 + fmuls s19 , s2, s9 + fmuls s27 , s3, s8 + + add BO , BO, #8 + add AO , AO, #16 + + pld [ BO , #B_PRE ] + pld [ AO , #A_PRE ] + + flds s4 , [ AO, #0 ] + flds s5 , [ AO, #4 ] + flds s6 , [ AO, #8 ] + flds s7 , [ AO, #12 ] + + flds s12, [ BO ] + flds s13, [ BO, #4 ] + + add BO , BO, #8 + add AO , AO, #16 +.endm + + + +.macro KERNEL2x1_M1 + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + fmacs s18 , s2, s8 + fmacs s26 , s3, s9 + fmacs s19 , s2, s9 + fmacs s27 , s3, s8 + + flds s4 , [ AO, #0 ] + flds s5 , [ AO, #4 ] + flds s6 , [ AO, #8 ] + flds s7 , [ AO, #12 ] + + flds s12, [ BO ] + flds s13, [ BO, #4 ] + + add BO , BO, #8 + add AO , AO, #16 +.endm + +.macro KERNEL2x1_M2 + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + + fmacs s18 , s6, s12 + fmacs s26 , s7, s13 + fmacs s19 , s6, s13 + fmacs s27 , s7, s12 + + flds s0 , [ AO, #0 ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + add BO , BO, #8 + add AO , AO, #16 +.endm + + +.macro KERNEL2x1_E + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + + fmacs s18 , s6, s12 + fmacs s26 , s7, s13 + fmacs s19 , s6, s13 + fmacs s27 , s7, s12 + +.endm + +.macro KERNEL2x1_SUB + + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + fmacs s18 , s2, s8 + fmacs s26 , s3, s9 + fmacs s19 , s2, s9 + fmacs s27 , s3, s8 + + add BO , BO, #8 + add AO , AO, #16 + +.endm + + + + +.macro SAVE2x1 + pld [ CO1 , #C_PRE ] + + flds s0, ALPHA_R + flds s1, ALPHA_I + + fldmias CO1, { s4 - s7 } + + FADD_R s16, s24 , s16 + FADD_I s17, s25 , s17 + FADD_R s18, s26 , s18 + FADD_I s19, s27 , s19 + + FMAC_R1 s4 , s0 , s16 + FMAC_I1 s5 , s0 , s17 + FMAC_R2 s4 , s1 , s17 + FMAC_I2 s5 , s1 , s16 + + FMAC_R1 s6 , s0 , s18 + FMAC_I1 s7 , s0 , s19 + FMAC_R2 s6 , s1 , s19 + FMAC_I2 s7 , s1 , s18 + + fstmias CO1, { s4 - s7 } + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + +.macro INIT1x1 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s24, s16 + vmov.f32 s25, s16 + +.endm + +.macro KERNEL1x1_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + fmuls s16 , s0, s8 + fmuls s24 , s1, s9 + fmuls s17 , s0, s9 + fmuls s25 , s1, s8 + + add BO , BO, #8 + add AO , AO, #8 + + pld [ BO , #B_PRE ] + pld [ AO , #A_PRE ] + + flds s4 , [ AO, #0 ] + flds s5 , [ AO, #4 ] + + flds s12, [ BO ] + flds s13, [ BO, #4 ] + + add BO , BO, #8 + add AO , AO, #8 +.endm + + + +.macro KERNEL1x1_M1 + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + flds s4 , [ AO, #0 ] + flds s5 , [ AO, #4 ] + + flds s12, [ BO ] + flds s13, [ BO, #4 ] + + add BO , BO, #8 + add AO , AO, #8 +.endm + +.macro KERNEL1x1_M2 + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + + flds s0 , [ AO, #0 ] + flds s1 , [ AO, #4 ] + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + add BO , BO, #8 + add AO , AO, #8 +.endm + + +.macro 
KERNEL1x1_E + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + +.endm + +.macro KERNEL1x1_SUB + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + add BO , BO, #8 + add AO , AO, #8 + +.endm + + + + +.macro SAVE1x1 + pld [ CO1 , #C_PRE ] + + flds s0, ALPHA_R + flds s1, ALPHA_I + + fldmias CO1, { s4 - s5 } + + FADD_R s16, s24 , s16 + FADD_I s17, s25 , s17 + + FMAC_R1 s4 , s0 , s16 + FMAC_I1 s5 , s0 , s17 + FMAC_R2 s4 , s1 , s17 + FMAC_I2 s5 , s1 , s16 + + fstmias CO1, { s4 - s5 } + + add CO1, CO1, #8 + +.endm + +/******************************************************************************/ + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA_R, ALPHA_R + vstr OLD_ALPHA_I, ALPHA_I + + sub r3, fp, #128 + vstm r3, { s8 - s31} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #3 // ldc = ldc * 4 * 2 + str r3, LDC + + ldr K1, K + ldr BC, B + + ldr J, N + asrs J, J, #1 // J = J / 2 + ble _L1_BEGIN + +_L2_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + pld [AO , #A_PRE-64] + pld [AO , #A_PRE-32] + + + +_L2_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble _L2_M1_BEGIN + +_L2_M2_20: + + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt _L2_M2_30 + .align 5 + + + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + sub L, L, #2 + +_L2_M2_22: + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + subs L, L, #1 + bgt _L2_M2_22 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + + +_L2_M2_30: + tst L, #3 + ble _L2_M2_40 + + tst L, #2 + ble _L2_M2_32 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + +_L2_M2_32: + + tst L, #1 + ble _L2_M2_40 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + + +_L2_M2_40: + + INIT2x2 + + +_L2_M2_44: + + ands L , K1, #7 // L = L % 8 + ble _L2_M2_100 + +_L2_M2_46: + + KERNEL2x2_SUB + + subs L, L, #1 + bne _L2_M2_46 + +_L2_M2_100: + + SAVE2x2 + +_L2_M2_END: + + subs I, I, #1 + bne _L2_M2_20 + + +_L2_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble _L2_END + +_L2_M1_20: + + INIT1x2 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble _L2_M1_40 + +_L2_M1_22: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_22 + + +_L2_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M1_100 + +_L2_M1_42: + + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_42 + +_L2_M1_100: + + SAVE1x2 + + +_L2_END: + + mov r3, BC + 
mov r4, K1 + lsl r4, r4, #4 // k * 2 * 4 * 2 + add r3, r3, r4 // B = B + K * 2 * 8 + mov BC, r3 + + subs J , #1 // j-- + bgt _L2_BEGIN + + + +/*********************************************************************************************/ + +_L1_BEGIN: + + ldr J , N + tst J , #1 + ble _L999 + + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + +_L1_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble _L1_M1_BEGIN + +_L1_M2_20: + + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt _L1_M2_30 + .align 5 + + + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + sub L, L, #2 + +_L1_M2_22: + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + subs L, L, #1 + bgt _L1_M2_22 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + + +_L1_M2_30: + tst L, #3 + ble _L1_M2_40 + + tst L, #2 + ble _L1_M2_32 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + +_L1_M2_32: + + tst L, #1 + ble _L1_M2_40 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + + +_L1_M2_40: + + INIT2x1 + + +_L1_M2_44: + + ands L , K1, #7 // L = L % 8 + ble _L1_M2_100 + +_L1_M2_46: + + KERNEL2x1_SUB + + subs L, L, #1 + bne _L1_M2_46 + +_L1_M2_100: + + SAVE2x1 + +_L1_M2_END: + + subs I, I, #1 + bne _L1_M2_20 + + +_L1_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble _L1_END + +_L1_M1_20: + + INIT1x1 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble _L1_M1_40 + +_L1_M1_22: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_22 + + +_L1_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M1_100 + +_L1_M1_42: + + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_42 + +_L1_M1_100: + + SAVE1x1 + + +_L1_END: + + + +_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s31} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/ctrmm_kernel_2x2_vfpv3.S b/kernel/arm/ctrmm_kernel_2x2_vfpv3.S new file mode 100644 index 000000000..28e555caa --- /dev/null +++ b/kernel/arm/ctrmm_kernel_2x2_vfpv3.S @@ -0,0 +1,1476 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/10/16 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA_R s0 +#define OLD_ALPHA_I s1 + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define KKK [fp, #-240] +#define KK [fp, #-244 ] +#define A [fp, #-248 ] +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] + +#define ALPHA_I [fp, #-272] +#define ALPHA_R [fp, #-280] + +#define B [fp, #4 ] +#define C [fp, #8 ] +#define OLD_LDC [fp, #12 ] +#define OFFSET [fp, #16 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 + +#define A_PRE 96 +#define B_PRE 96 +#define C_PRE 64 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + + #define FADD_R fsubs + #define FADD_I fadds + + #define FMAC_R1 fnmuls + #define FMAC_R2 fnmacs + #define FMAC_I1 fmuls + #define FMAC_I2 fnmacs + +#elif defined(CN) || defined(CT) + + #define FADD_R fadds + #define FADD_I fsubs + + #define FMAC_R1 fmuls + #define FMAC_R2 fmacs + #define FMAC_I1 fnmuls + #define FMAC_I2 fmacs + +#elif defined(NC) || defined(TC) + + #define FADD_R fadds + #define FADD_I fsubs + + #define FMAC_R1 fmuls + #define FMAC_R2 fnmacs + #define FMAC_I1 fmuls + #define FMAC_I2 fmacs + +#else + + #define FADD_R fsubs + #define FADD_I fadds + + #define FMAC_R1 fnmuls + #define FMAC_R2 fmacs + #define FMAC_I1 fnmuls + #define FMAC_I2 fnmacs + +#endif + + + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT2x2 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s18, s16 + vmov.f32 s19, s16 + vmov.f32 s20, s16 + vmov.f32 s21, s16 + vmov.f32 s22, s16 + vmov.f32 s23, s16 + vmov.f32 s24, s16 + vmov.f32 s25, s16 + vmov.f32 s26, s16 + vmov.f32 s27, s16 + vmov.f32 s28, s16 + vmov.f32 s29, s16 + vmov.f32 s30, s16 + vmov.f32 s31, s16 + +.endm + +.macro KERNEL2x2_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldmias AO!, { s0 - s1 } + fldmias BO!, { s8 - s9 } + + fmuls s16 , s0, s8 + fmuls s24 , s1, s9 + fldmias AO!, { s2 - s3 } + fmuls s17 , s0, s9 + fmuls 
s25 , s1, s8 + + fldmias BO!, { s10 - s11 } + fmuls s18 , s2, s8 + fmuls s26 , s3, s9 + fldmias AO!, { s4 - s5 } + fmuls s19 , s2, s9 + fmuls s27 , s3, s8 + + fldmias BO!, { s12 - s13 } + fmuls s20 , s0, s10 + fmuls s28 , s1, s11 + fldmias AO!, { s6 - s7 } + fmuls s21 , s0, s11 + fmuls s29 , s1, s10 + + fldmias BO!, { s14 - s15 } + fmuls s22 , s2, s10 + fmuls s30 , s3, s11 + fmuls s23 , s2, s11 + fmuls s31 , s3, s10 + +.endm + + + +.macro KERNEL2x2_M1 + + fmacs s16 , s0, s8 + fldmias AO!, { s4 - s5 } + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fldmias BO!, { s12 - s13 } + fmacs s25 , s1, s8 + + fmacs s18 , s2, s8 + fldmias AO!, { s6 - s7 } + fmacs s26 , s3, s9 + fmacs s19 , s2, s9 + fldmias BO!, { s14 - s15 } + fmacs s27 , s3, s8 + + fmacs s20 , s0, s10 + fmacs s28 , s1, s11 + fmacs s21 , s0, s11 + fmacs s29 , s1, s10 + + fmacs s22 , s2, s10 + fmacs s30 , s3, s11 + fmacs s23 , s2, s11 + fmacs s31 , s3, s10 + +.endm + +.macro KERNEL2x2_M2 + pld [ AO , #A_PRE ] + + fmacs s16 , s4, s12 + pld [ BO , #B_PRE ] + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fldmias AO!, { s0 - s1 } + fmacs s25 , s5, s12 + + fmacs s18 , s6, s12 + fmacs s26 , s7, s13 + fldmias BO!, { s8 - s9 } + fmacs s19 , s6, s13 + fmacs s27 , s7, s12 + + fldmias AO!, { s2 - s3 } + fmacs s20 , s4, s14 + fmacs s28 , s5, s15 + fldmias BO!, { s10 - s11 } + fmacs s21 , s4, s15 + fmacs s29 , s5, s14 + + fmacs s22 , s6, s14 + fmacs s30 , s7, s15 + fmacs s23 , s6, s15 + fmacs s31 , s7, s14 + +.endm + + +.macro KERNEL2x2_E + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + + fmacs s18 , s6, s12 + fmacs s26 , s7, s13 + fmacs s19 , s6, s13 + fmacs s27 , s7, s12 + + fmacs s20 , s4, s14 + fmacs s28 , s5, s15 + fmacs s21 , s4, s15 + fmacs s29 , s5, s14 + + fmacs s22 , s6, s14 + fmacs s30 , s7, s15 + fmacs s23 , s6, s15 + fmacs s31 , s7, s14 + +.endm + +.macro KERNEL2x2_SUB + + fldmias AO!, { s0 - s1 } + fldmias BO!, { s8 - s9 } + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fldmias AO!, { s2 - s3 } + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + fldmias BO!, { s10 - s11 } + fmacs s18 , s2, s8 + fmacs s26 , s3, s9 + fmacs s19 , s2, s9 + fmacs s27 , s3, s8 + + fmacs s20 , s0, s10 + fmacs s28 , s1, s11 + fmacs s21 , s0, s11 + fmacs s29 , s1, s10 + + fmacs s22 , s2, s10 + fmacs s30 , s3, s11 + fmacs s23 , s2, s11 + fmacs s31 , s3, s10 + +.endm + + + + +.macro SAVE2x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + flds s0, ALPHA_R + flds s1, ALPHA_I + + FADD_R s16, s24 , s16 + FADD_I s17, s25 , s17 + FADD_R s18, s26 , s18 + FADD_I s19, s27 , s19 + FADD_R s20, s28 , s20 + FADD_I s21, s29 , s21 + FADD_R s22, s30 , s22 + FADD_I s23, s31 , s23 + + FMAC_R1 s4 , s0 , s16 + FMAC_I1 s5 , s0 , s17 + FMAC_R2 s4 , s1 , s17 + FMAC_I2 s5 , s1 , s16 + + FMAC_R1 s6 , s0 , s18 + FMAC_I1 s7 , s0 , s19 + FMAC_R2 s6 , s1 , s19 + FMAC_I2 s7 , s1 , s18 + + FMAC_R1 s8 , s0 , s20 + FMAC_I1 s9 , s0 , s21 + FMAC_R2 s8 , s1 , s21 + FMAC_I2 s9 , s1 , s20 + + FMAC_R1 s10, s0 , s22 + FMAC_I1 s11, s0 , s23 + FMAC_R2 s10, s1 , s23 + FMAC_I2 s11, s1 , s22 + + fstmias CO1, { s4 - s7 } + fstmias CO2, { s8 - s11 } + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + +.macro INIT1x2 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s20, s16 + vmov.f32 s21, s16 + vmov.f32 s24, s16 + vmov.f32 s25, s16 + vmov.f32 s28, s16 + vmov.f32 s29, s16 + +.endm + +.macro KERNEL1x2_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s8 , [ BO ] + flds s9 
, [ BO, #4 ] + flds s10, [ BO, #8 ] + flds s11, [ BO, #12 ] + + fmuls s16 , s0, s8 + fmuls s24 , s1, s9 + fmuls s17 , s0, s9 + fmuls s25 , s1, s8 + + fmuls s20 , s0, s10 + fmuls s28 , s1, s11 + fmuls s21 , s0, s11 + fmuls s29 , s1, s10 + + add BO , BO, #16 + add AO , AO, #8 + + pld [ BO , #B_PRE ] + + flds s4 , [ AO, #0 ] + flds s5 , [ AO, #4 ] + + flds s12, [ BO ] + flds s13, [ BO, #4 ] + flds s14, [ BO, #8 ] + flds s15, [ BO, #12 ] + + add BO , BO, #16 + add AO , AO, #8 +.endm + + + +.macro KERNEL1x2_M1 + pld [ BO , #B_PRE ] + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + fmacs s20 , s0, s10 + fmacs s28 , s1, s11 + fmacs s21 , s0, s11 + fmacs s29 , s1, s10 + + flds s4 , [ AO, #0 ] + flds s5 , [ AO, #4 ] + + flds s12, [ BO ] + flds s13, [ BO, #4 ] + flds s14, [ BO, #8 ] + flds s15, [ BO, #12 ] + + add BO , BO, #16 + add AO , AO, #8 +.endm + +.macro KERNEL1x2_M2 + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + + fmacs s20 , s4, s14 + fmacs s28 , s5, s15 + fmacs s21 , s4, s15 + fmacs s29 , s5, s14 + + flds s0 , [ AO, #0 ] + flds s1 , [ AO, #4 ] + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + flds s10, [ BO, #8 ] + flds s11, [ BO, #12 ] + + add BO , BO, #16 + add AO , AO, #8 +.endm + + +.macro KERNEL1x2_E + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + + fmacs s20 , s4, s14 + fmacs s28 , s5, s15 + fmacs s21 , s4, s15 + fmacs s29 , s5, s14 + +.endm + +.macro KERNEL1x2_SUB + + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + flds s10, [ BO, #8 ] + flds s11, [ BO, #12 ] + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + fmacs s20 , s0, s10 + fmacs s28 , s1, s11 + fmacs s21 , s0, s11 + fmacs s29 , s1, s10 + + add BO , BO, #16 + add AO , AO, #8 + +.endm + + + + +.macro SAVE1x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + flds s0, ALPHA_R + flds s1, ALPHA_I + + FADD_R s16, s24 , s16 + FADD_I s17, s25 , s17 + FADD_R s20, s28 , s20 + FADD_I s21, s29 , s21 + + FMAC_R1 s4 , s0 , s16 + FMAC_I1 s5 , s0 , s17 + FMAC_R2 s4 , s1 , s17 + FMAC_I2 s5 , s1 , s16 + + FMAC_R1 s8 , s0 , s20 + FMAC_I1 s9 , s0 , s21 + FMAC_R2 s8 , s1 , s21 + FMAC_I2 s9 , s1 , s20 + + fstmias CO1, { s4 - s5 } + fstmias CO2, { s8 - s9 } + + add CO1, CO1, #8 + +.endm + +/******************************************************************************/ + +.macro INIT2x1 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s18, s16 + vmov.f32 s19, s16 + vmov.f32 s24, s16 + vmov.f32 s25, s16 + vmov.f32 s26, s16 + vmov.f32 s27, s16 + +.endm + +.macro KERNEL2x1_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + fmuls s16 , s0, s8 + fmuls s24 , s1, s9 + fmuls s17 , s0, s9 + fmuls s25 , s1, s8 + + fmuls s18 , s2, s8 + fmuls s26 , s3, s9 + fmuls s19 , s2, s9 + fmuls s27 , s3, s8 + + add BO , BO, #8 + add AO , AO, #16 + + pld [ BO , #B_PRE ] + pld [ AO , #A_PRE ] + + flds s4 , [ AO, #0 ] + flds s5 , [ AO, #4 ] + flds s6 , [ AO, #8 ] + flds s7 , [ AO, #12 ] + + flds s12, [ BO ] + flds s13, [ BO, #4 ] + + add BO , BO, #8 + add AO , AO, #16 +.endm + + + +.macro KERNEL2x1_M1 + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + fmacs s18 , s2, s8 + fmacs s26 
, s3, s9 + fmacs s19 , s2, s9 + fmacs s27 , s3, s8 + + flds s4 , [ AO, #0 ] + flds s5 , [ AO, #4 ] + flds s6 , [ AO, #8 ] + flds s7 , [ AO, #12 ] + + flds s12, [ BO ] + flds s13, [ BO, #4 ] + + add BO , BO, #8 + add AO , AO, #16 +.endm + +.macro KERNEL2x1_M2 + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + + fmacs s18 , s6, s12 + fmacs s26 , s7, s13 + fmacs s19 , s6, s13 + fmacs s27 , s7, s12 + + flds s0 , [ AO, #0 ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + add BO , BO, #8 + add AO , AO, #16 +.endm + + +.macro KERNEL2x1_E + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + + fmacs s18 , s6, s12 + fmacs s26 , s7, s13 + fmacs s19 , s6, s13 + fmacs s27 , s7, s12 + +.endm + +.macro KERNEL2x1_SUB + + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + fmacs s18 , s2, s8 + fmacs s26 , s3, s9 + fmacs s19 , s2, s9 + fmacs s27 , s3, s8 + + add BO , BO, #8 + add AO , AO, #16 + +.endm + + + + +.macro SAVE2x1 + + flds s0, ALPHA_R + flds s1, ALPHA_I + + FADD_R s16, s24 , s16 + FADD_I s17, s25 , s17 + FADD_R s18, s26 , s18 + FADD_I s19, s27 , s19 + + FMAC_R1 s4 , s0 , s16 + FMAC_I1 s5 , s0 , s17 + FMAC_R2 s4 , s1 , s17 + FMAC_I2 s5 , s1 , s16 + + FMAC_R1 s6 , s0 , s18 + FMAC_I1 s7 , s0 , s19 + FMAC_R2 s6 , s1 , s19 + FMAC_I2 s7 , s1 , s18 + + fstmias CO1, { s4 - s7 } + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + +.macro INIT1x1 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s24, s16 + vmov.f32 s25, s16 + +.endm + +.macro KERNEL1x1_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + fmuls s16 , s0, s8 + fmuls s24 , s1, s9 + fmuls s17 , s0, s9 + fmuls s25 , s1, s8 + + add BO , BO, #8 + add AO , AO, #8 + + pld [ BO , #B_PRE ] + pld [ AO , #A_PRE ] + + flds s4 , [ AO, #0 ] + flds s5 , [ AO, #4 ] + + flds s12, [ BO ] + flds s13, [ BO, #4 ] + + add BO , BO, #8 + add AO , AO, #8 +.endm + + + +.macro KERNEL1x1_M1 + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + flds s4 , [ AO, #0 ] + flds s5 , [ AO, #4 ] + + flds s12, [ BO ] + flds s13, [ BO, #4 ] + + add BO , BO, #8 + add AO , AO, #8 +.endm + +.macro KERNEL1x1_M2 + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + + flds s0 , [ AO, #0 ] + flds s1 , [ AO, #4 ] + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + add BO , BO, #8 + add AO , AO, #8 +.endm + + +.macro KERNEL1x1_E + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + +.endm + +.macro KERNEL1x1_SUB + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + add BO , BO, #8 + add AO , AO, #8 + +.endm + + + + +.macro SAVE1x1 + + flds s0, ALPHA_R + flds s1, ALPHA_I + + FADD_R s16, s24 , s16 + FADD_I s17, s25 , s17 + + FMAC_R1 s4 , s0 , s16 + FMAC_I1 s5 , s0 , s17 + FMAC_R2 s4 , s1 , s17 + FMAC_I2 s5 , s1 , s16 + + fstmias CO1, { s4 - s5 } + + add CO1, CO1, #8 + +.endm + 
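+/**************************************************************************************
+* Sketch of the accumulation scheme used by the macros above:
+*
+*    (a_r + i*a_i) * (b_r + i*b_i) = (a_r*b_r - a_i*b_i) + i*(a_r*b_i + a_i*b_r)
+*
+* Each kernel step keeps the four partial products in separate registers, e.g.
+* for the first element: s16 += a_r*b_r, s24 += a_i*b_i, s17 += a_r*b_i and
+* s25 += a_i*b_r. Only the SAVE macros combine them, via FADD_R / FADD_I, and
+* scale by alpha via FMAC_R1/FMAC_R2/FMAC_I1/FMAC_I2; those macros are defined
+* per conjugation case (NN/NT/TN/TT, CN/CT, NC/TC, CC), so the kernel bodies
+* themselves stay free of sign handling.
+**************************************************************************************/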
+/******************************************************************************/ + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA_R, ALPHA_R + vstr OLD_ALPHA_I, ALPHA_I + + sub r3, fp, #128 + vstm r3, { s8 - s31} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #3 // ldc = ldc * 4 * 2 + str r3, LDC + + ldr r3, OFFSET +#ifndef LEFT + neg r3 , r3 +#endif + str r3 , KK + + ldr BC, B + + ldr J, N + asrs J, J, #1 // J = J / 2 + ble _L1_BEGIN + +_L2_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + ldr AO, A // AO = A + pld [AO , #A_PRE-64] + pld [AO , #A_PRE-32] + + + +_L2_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble _L2_M1_BEGIN + +_L2_M2_20: + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #2 // number of values in AO +#else + add K1, K1, #2 // number of values in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt _L2_M2_30 + .align 5 + + + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + sub L, L, #2 + +_L2_M2_22: + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + subs L, L, #1 + bgt _L2_M2_22 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + + +_L2_M2_30: + tst L, #3 + ble _L2_M2_40 + + tst L, #2 + ble _L2_M2_32 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + +_L2_M2_32: + + tst L, #1 + ble _L2_M2_40 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + + +_L2_M2_40: + + INIT2x2 + + +_L2_M2_44: + + ands L , K1, #7 // L = L % 8 + ble _L2_M2_100 + +_L2_M2_46: + + KERNEL2x2_SUB + + subs L, L, #1 + bne _L2_M2_46 + +_L2_M2_100: + + SAVE2x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + +_L2_M2_END: + + subs I, I, #1 + bne _L2_M2_20 + + +_L2_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble _L2_END + +_L2_M1_20: + + INIT1x2 + +#if (defined(LEFT) && 
defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add BO , BO , r4 + lsls r4 , r3 , #3 // 1 * 4 * 2 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #1 // number of values in AO +#else + add K1, K1, #2 // number of values in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + ble _L2_M1_40 + +_L2_M1_22: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_22 + + +_L2_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M1_100 + +_L2_M1_42: + + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_42 + +_L2_M1_100: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add BO , BO , r4 + lsls r4 , r3 , #3 // 1 * 4 * 2 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #1 // number of values in AO + str r3 , KK +#endif + + + +_L2_END: + + mov r3, BC + ldr r4, K + lsl r4, r4, #4 // k * 2 * 4 * 2 + add r3, r3, r4 // B = B + K * 2 * 8 + mov BC, r3 + +#if !defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in BO + str r3 , KK +#endif + + subs J , #1 // j-- + bgt _L2_BEGIN + + + +/*********************************************************************************************/ + +_L1_BEGIN: + + ldr J , N + tst J , #1 + ble _L999 + + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + ldr AO, A // AO = A + +_L1_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble _L1_M1_BEGIN + +_L1_M2_20: + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 1 * 4 * 2 float values + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #2 // number of values in AO +#else + add K1, K1, #1 // number of values in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt _L1_M2_30 + .align 5 + + + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + sub L, L, #2 + +_L1_M2_22: + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + subs L, L, #1 + bgt _L1_M2_22 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + + +_L1_M2_30: + tst L, #3 + ble _L1_M2_40 + + tst L, #2 + ble _L1_M2_32 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + +_L1_M2_32: + + tst L, #1 + ble _L1_M2_40 + + 
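+	// L == 1: a single software-pipelined pass of eight k-iterations (I, M2/M1 pairs, E)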
KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + + +_L1_M2_40: + + INIT2x1 + + +_L1_M2_44: + + ands L , K1, #7 // L = L % 8 + ble _L1_M2_100 + +_L1_M2_46: + + KERNEL2x1_SUB + + subs L, L, #1 + bne _L1_M2_46 + +_L1_M2_100: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #3 // 1 * 4 * 2 float values + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + + +_L1_M2_END: + + subs I, I, #1 + bne _L1_M2_20 + + +_L1_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble _L1_END + +_L1_M1_20: + + INIT1x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 1 * 4 * 2 float values + add BO , BO , r4 + lsls r4 , r3 , #3 // 1 * 4 * 2 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #1 // number of values in AO +#else + add K1, K1, #1 // number of values in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + ble _L1_M1_40 + +_L1_M1_22: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_22 + + +_L1_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M1_100 + +_L1_M1_42: + + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_42 + +_L1_M1_100: + + SAVE1x1 + + +_L1_END: + + + +_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s31} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/zgemm_kernel_2x2_vfpv3.S b/kernel/arm/zgemm_kernel_2x2_vfpv3.S new file mode 100644 index 000000000..4b01f0429 --- /dev/null +++ b/kernel/arm/zgemm_kernel_2x2_vfpv3.S @@ -0,0 +1,1329 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/10/16 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA_R d0 +#define OLD_ALPHA_I d1 + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define A [fp, #-248 ] +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] + +#define ALPHA_I [fp, #-272] +#define ALPHA_R [fp, #-280] + +#define B [fp, #4 ] +#define C [fp, #8 ] +#define OLD_LDC [fp, #12 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 + +#define A_PRE 96 +#define B_PRE 96 +#define C_PRE 64 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + + #define FADD_R fsubd + #define FADD_I faddd + + #define FMAC_R1 fnmacd + #define FMAC_R2 fnmacd + #define FMAC_I1 fmacd + #define FMAC_I2 fnmacd + +#elif defined(CN) || defined(CT) + + #define FADD_R faddd + #define FADD_I fsubd + + #define FMAC_R1 fmacd + #define FMAC_R2 fmacd + #define FMAC_I1 fnmacd + #define FMAC_I2 fmacd + +#elif defined(NC) || defined(TC) + + #define FADD_R faddd + #define FADD_I fsubd + + #define FMAC_R1 fmacd + #define FMAC_R2 fnmacd + #define FMAC_I1 fmacd + #define FMAC_I2 fmacd + +#else + + #define FADD_R fsubd + #define FADD_I faddd + + #define FMAC_R1 fnmacd + #define FMAC_R2 fmacd + #define FMAC_I1 fnmacd + #define FMAC_I2 fnmacd + +#endif + + + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT2x2 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d18, d16 + vmov.f64 d19, d16 + vmov.f64 d20, d16 + vmov.f64 d21, d16 + vmov.f64 d22, d16 + vmov.f64 d23, d16 + vmov.f64 d24, d16 + vmov.f64 d25, d16 + vmov.f64 d26, d16 + vmov.f64 d27, d16 + vmov.f64 d28, d16 + vmov.f64 d29, d16 + vmov.f64 d30, d16 + vmov.f64 d31, d16 + +.endm + +.macro KERNEL2x2_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fmuld d16 , d0, d8 + fldd d2 , [ AO, #16 ] + fmuld d24 , d1, d9 + fldd d3 , [ AO, #24 ] + fmuld d17 , d0, d9 + fldd d10, [ BO, #16 ] + fmuld d25 , d1, d8 + + fldd d11, [ BO, #24 ] + fmuld d18 , d2, d8 + add BO , BO, #32 + fmuld d26 , d3, d9 + add AO , AO, #32 + fmuld d19 , d2, d9 + pld [ BO , #B_PRE ] + fmuld d27 , d3, d8 + + pld [ AO , #A_PRE ] + fmuld d20 , d0, d10 + fldd d4 , [ AO, 
#0 ] + fmuld d28 , d1, d11 + fldd d5 , [ AO, #8 ] + fmuld d21 , d0, d11 + fldd d12, [ BO ] + fmuld d29 , d1, d10 + + fldd d13, [ BO, #8 ] + fmuld d22 , d2, d10 + fldd d6 , [ AO, #16 ] + fmuld d30 , d3, d11 + fldd d7 , [ AO, #24 ] + fmuld d23 , d2, d11 + fldd d14, [ BO, #16 ] + fmuld d31 , d3, d10 + fldd d15, [ BO, #24 ] + + add BO , BO, #32 + add AO , AO, #32 +.endm + + + +.macro KERNEL2x2_M1 + pld [ AO , #A_PRE ] + + fmacd d16 , d0, d8 + pld [ BO , #B_PRE ] + fmacd d24 , d1, d9 + fldd d4 , [ AO, #0 ] + fmacd d17 , d0, d9 + fldd d5 , [ AO, #8 ] + fmacd d25 , d1, d8 + + fldd d12, [ BO ] + fmacd d18 , d2, d8 + fldd d13, [ BO, #8 ] + fmacd d26 , d3, d9 + fldd d6 , [ AO, #16 ] + fmacd d19 , d2, d9 + fldd d7 , [ AO, #24 ] + fmacd d27 , d3, d8 + + fmacd d20 , d0, d10 + fldd d14, [ BO, #16 ] + fmacd d28 , d1, d11 + fmacd d21 , d0, d11 + fldd d15, [ BO, #24 ] + fmacd d29 , d1, d10 + + fmacd d22 , d2, d10 + add BO , BO, #32 + fmacd d30 , d3, d11 + fmacd d23 , d2, d11 + add AO , AO, #32 + fmacd d31 , d3, d10 + +.endm + +.macro KERNEL2x2_M2 + pld [ AO , #A_PRE ] + + fmacd d16 , d4, d12 + pld [ BO , #B_PRE ] + fmacd d24 , d5, d13 + fldd d0 , [ AO, #0 ] + fmacd d17 , d4, d13 + fldd d1 , [ AO, #8 ] + fmacd d25 , d5, d12 + + fmacd d18 , d6, d12 + fldd d8 , [ BO ] + fmacd d26 , d7, d13 + fldd d9 , [ BO, #8 ] + fmacd d19 , d6, d13 + fmacd d27 , d7, d12 + + fldd d2 , [ AO, #16 ] + fmacd d20 , d4, d14 + fldd d3 , [ AO, #24 ] + fmacd d28 , d5, d15 + fmacd d21 , d4, d15 + fldd d10, [ BO, #16 ] + fmacd d29 , d5, d14 + + fldd d11, [ BO, #24 ] + fmacd d22 , d6, d14 + fmacd d30 , d7, d15 + add BO , BO, #32 + fmacd d23 , d6, d15 + add AO , AO, #32 + fmacd d31 , d7, d14 + +.endm + + +.macro KERNEL2x2_E + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + + fmacd d18 , d6, d12 + fmacd d26 , d7, d13 + fmacd d19 , d6, d13 + fmacd d27 , d7, d12 + + fmacd d20 , d4, d14 + fmacd d28 , d5, d15 + fmacd d21 , d4, d15 + fmacd d29 , d5, d14 + + fmacd d22 , d6, d14 + fmacd d30 , d7, d15 + fmacd d23 , d6, d15 + fmacd d31 , d7, d14 + +.endm + +.macro KERNEL2x2_SUB + + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fmacd d16 , d0, d8 + fldd d2 , [ AO, #16 ] + fmacd d24 , d1, d9 + fldd d3 , [ AO, #24 ] + fmacd d17 , d0, d9 + fldd d10, [ BO, #16 ] + fmacd d25 , d1, d8 + + fldd d11, [ BO, #24 ] + fmacd d18 , d2, d8 + fmacd d26 , d3, d9 + fmacd d19 , d2, d9 + fmacd d27 , d3, d8 + + fmacd d20 , d0, d10 + fmacd d28 , d1, d11 + fmacd d21 , d0, d11 + fmacd d29 , d1, d10 + + fmacd d22 , d2, d10 + add BO , BO, #32 + fmacd d30 , d3, d11 + fmacd d23 , d2, d11 + add AO , AO, #32 + fmacd d31 , d3, d10 + +.endm + + + + +.macro SAVE2x2 + pld [ CO1 , #C_PRE ] + + ldr r3 , LDC + add CO2 , CO1, r3 + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + fldmiad CO1, { d4 - d7 } + fldmiad CO2, { d8 - d11 } + + FADD_R d16, d24 , d16 + FADD_I d17, d25 , d17 + FADD_R d18, d26 , d18 + FADD_I d19, d27 , d19 + FADD_R d20, d28 , d20 + FADD_I d21, d29 , d21 + FADD_R d22, d30 , d22 + FADD_I d23, d31 , d23 + + FMAC_R1 d4 , d0 , d16 + FMAC_I1 d5 , d0 , d17 + FMAC_R2 d4 , d1 , d17 + FMAC_I2 d5 , d1 , d16 + + FMAC_R1 d6 , d0 , d18 + FMAC_I1 d7 , d0 , d19 + FMAC_R2 d6 , d1 , d19 + FMAC_I2 d7 , d1 , d18 + + FMAC_R1 d8 , d0 , d20 + FMAC_I1 d9 , d0 , d21 + FMAC_R2 d8 , d1 , d21 + FMAC_I2 d9 , d1 , d20 + + FMAC_R1 d10, d0 , d22 + FMAC_I1 d11, d0 , d23 + FMAC_R2 d10, d1 , d23 + FMAC_I2 d11, d1 , d22 + + fstmiad CO1, { d4 - d7 } + fstmiad CO2, { d8 - d11 } + + add 
CO1, CO1, #32 + +.endm + +/******************************************************************************/ + +.macro INIT1x2 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d20, d16 + vmov.f64 d21, d16 + vmov.f64 d24, d16 + vmov.f64 d25, d16 + vmov.f64 d28, d16 + vmov.f64 d29, d16 + +.endm + +.macro KERNEL1x2_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + fldd d10, [ BO, #16 ] + fldd d11, [ BO, #24 ] + + fmuld d16 , d0, d8 + fmuld d24 , d1, d9 + fmuld d17 , d0, d9 + fmuld d25 , d1, d8 + + fmuld d20 , d0, d10 + fmuld d28 , d1, d11 + fmuld d21 , d0, d11 + fmuld d29 , d1, d10 + + add BO , BO, #32 + add AO , AO, #16 + + pld [ BO , #B_PRE ] + + fldd d4 , [ AO, #0 ] + fldd d5 , [ AO, #8 ] + + fldd d12, [ BO ] + fldd d13, [ BO, #8 ] + fldd d14, [ BO, #16 ] + fldd d15, [ BO, #24 ] + + add BO , BO, #32 + add AO , AO, #16 +.endm + + + +.macro KERNEL1x2_M1 + pld [ BO , #B_PRE ] + + fmacd d16 , d0, d8 + fmacd d24 , d1, d9 + fmacd d17 , d0, d9 + fmacd d25 , d1, d8 + + fmacd d20 , d0, d10 + fmacd d28 , d1, d11 + fmacd d21 , d0, d11 + fmacd d29 , d1, d10 + + fldd d4 , [ AO, #0 ] + fldd d5 , [ AO, #8 ] + + fldd d12, [ BO ] + fldd d13, [ BO, #8 ] + fldd d14, [ BO, #16 ] + fldd d15, [ BO, #24 ] + + add BO , BO, #32 + add AO , AO, #16 +.endm + +.macro KERNEL1x2_M2 + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + + fmacd d20 , d4, d14 + fmacd d28 , d5, d15 + fmacd d21 , d4, d15 + fmacd d29 , d5, d14 + + fldd d0 , [ AO, #0 ] + fldd d1 , [ AO, #8 ] + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + fldd d10, [ BO, #16 ] + fldd d11, [ BO, #24 ] + + add BO , BO, #32 + add AO , AO, #16 +.endm + + +.macro KERNEL1x2_E + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + + fmacd d20 , d4, d14 + fmacd d28 , d5, d15 + fmacd d21 , d4, d15 + fmacd d29 , d5, d14 + +.endm + +.macro KERNEL1x2_SUB + + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + fldd d10, [ BO, #16 ] + fldd d11, [ BO, #24 ] + + fmacd d16 , d0, d8 + fmacd d24 , d1, d9 + fmacd d17 , d0, d9 + fmacd d25 , d1, d8 + + fmacd d20 , d0, d10 + fmacd d28 , d1, d11 + fmacd d21 , d0, d11 + fmacd d29 , d1, d10 + + add BO , BO, #32 + add AO , AO, #16 + +.endm + + + + +.macro SAVE1x2 + pld [ CO1 , #C_PRE ] + + ldr r3 , LDC + add CO2 , CO1, r3 + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + fldmiad CO1, { d4 - d5 } + fldmiad CO2, { d8 - d9 } + + FADD_R d16, d24 , d16 + FADD_I d17, d25 , d17 + FADD_R d20, d28 , d20 + FADD_I d21, d29 , d21 + + FMAC_R1 d4 , d0 , d16 + FMAC_I1 d5 , d0 , d17 + FMAC_R2 d4 , d1 , d17 + FMAC_I2 d5 , d1 , d16 + + FMAC_R1 d8 , d0 , d20 + FMAC_I1 d9 , d0 , d21 + FMAC_R2 d8 , d1 , d21 + FMAC_I2 d9 , d1 , d20 + + fstmiad CO1, { d4 - d5 } + fstmiad CO2, { d8 - d9 } + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + +.macro INIT2x1 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d18, d16 + vmov.f64 d19, d16 + vmov.f64 d24, d16 + vmov.f64 d25, d16 + vmov.f64 d26, d16 + vmov.f64 d27, d16 + +.endm + +.macro KERNEL2x1_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fmuld d16 , d0, d8 + fmuld d24 , d1, d9 + fmuld d17 , d0, d9 + fmuld d25 , d1, d8 + + fmuld d18 , 
d2, d8 + fmuld d26 , d3, d9 + fmuld d19 , d2, d9 + fmuld d27 , d3, d8 + + add BO , BO, #16 + add AO , AO, #32 + + pld [ BO , #B_PRE ] + pld [ AO , #A_PRE ] + + fldd d4 , [ AO, #0 ] + fldd d5 , [ AO, #8 ] + fldd d6 , [ AO, #16 ] + fldd d7 , [ AO, #24 ] + + fldd d12, [ BO ] + fldd d13, [ BO, #8 ] + + add BO , BO, #16 + add AO , AO, #32 +.endm + + + +.macro KERNEL2x1_M1 + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + + fmacd d16 , d0, d8 + fmacd d24 , d1, d9 + fmacd d17 , d0, d9 + fmacd d25 , d1, d8 + + fmacd d18 , d2, d8 + fmacd d26 , d3, d9 + fmacd d19 , d2, d9 + fmacd d27 , d3, d8 + + fldd d4 , [ AO, #0 ] + fldd d5 , [ AO, #8 ] + fldd d6 , [ AO, #16 ] + fldd d7 , [ AO, #24 ] + + fldd d12, [ BO ] + fldd d13, [ BO, #8 ] + + add BO , BO, #16 + add AO , AO, #32 +.endm + +.macro KERNEL2x1_M2 + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + + fmacd d18 , d6, d12 + fmacd d26 , d7, d13 + fmacd d19 , d6, d13 + fmacd d27 , d7, d12 + + fldd d0 , [ AO, #0 ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + add BO , BO, #16 + add AO , AO, #32 +.endm + + +.macro KERNEL2x1_E + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + + fmacd d18 , d6, d12 + fmacd d26 , d7, d13 + fmacd d19 , d6, d13 + fmacd d27 , d7, d12 + +.endm + +.macro KERNEL2x1_SUB + + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fmacd d16 , d0, d8 + fmacd d24 , d1, d9 + fmacd d17 , d0, d9 + fmacd d25 , d1, d8 + + fmacd d18 , d2, d8 + fmacd d26 , d3, d9 + fmacd d19 , d2, d9 + fmacd d27 , d3, d8 + + add BO , BO, #16 + add AO , AO, #32 + +.endm + + + + +.macro SAVE2x1 + pld [ CO1 , #C_PRE ] + + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + fldmiad CO1, { d4 - d7 } + + FADD_R d16, d24 , d16 + FADD_I d17, d25 , d17 + FADD_R d18, d26 , d18 + FADD_I d19, d27 , d19 + + FMAC_R1 d4 , d0 , d16 + FMAC_I1 d5 , d0 , d17 + FMAC_R2 d4 , d1 , d17 + FMAC_I2 d5 , d1 , d16 + + FMAC_R1 d6 , d0 , d18 + FMAC_I1 d7 , d0 , d19 + FMAC_R2 d6 , d1 , d19 + FMAC_I2 d7 , d1 , d18 + + fstmiad CO1, { d4 - d7 } + + add CO1, CO1, #32 + +.endm + +/******************************************************************************/ + +.macro INIT1x1 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d24, d16 + vmov.f64 d25, d16 + +.endm + +.macro KERNEL1x1_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fmuld d16 , d0, d8 + fmuld d24 , d1, d9 + fmuld d17 , d0, d9 + fmuld d25 , d1, d8 + + add BO , BO, #16 + add AO , AO, #16 + + pld [ BO , #B_PRE ] + pld [ AO , #A_PRE ] + + fldd d4 , [ AO, #0 ] + fldd d5 , [ AO, #8 ] + + fldd d12, [ BO ] + fldd d13, [ BO, #8 ] + + add BO , BO, #16 + add AO , AO, #16 +.endm + + + +.macro KERNEL1x1_M1 + + fmacd d16 , d0, d8 + fmacd d24 , d1, d9 + fmacd d17 , d0, d9 + fmacd d25 , d1, d8 + + fldd d4 , [ AO, #0 ] + fldd d5 , [ AO, #8 ] + + fldd d12, [ BO ] + fldd d13, [ BO, #8 ] + + add BO , BO, #16 + add AO , AO, #16 +.endm + +.macro KERNEL1x1_M2 + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + + fldd d0 , [ AO, #0 ] + fldd d1 , [ AO, #8 ] + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + add BO , BO, #16 + add AO , AO, #16 +.endm + + +.macro KERNEL1x1_E + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + 
fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + +.endm + +.macro KERNEL1x1_SUB + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fmacd d16 , d0, d8 + fmacd d24 , d1, d9 + fmacd d17 , d0, d9 + fmacd d25 , d1, d8 + + add BO , BO, #16 + add AO , AO, #16 + +.endm + + + + +.macro SAVE1x1 + pld [ CO1 , #C_PRE ] + + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + fldmiad CO1, { d4 - d5 } + + FADD_R d16, d24 , d16 + FADD_I d17, d25 , d17 + + FMAC_R1 d4 , d0 , d16 + FMAC_I1 d5 , d0 , d17 + FMAC_R2 d4 , d1 , d17 + FMAC_I2 d5 , d1 , d16 + + fstmiad CO1, { d4 - d5 } + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA_R, ALPHA_R + vstr OLD_ALPHA_I, ALPHA_I + + sub r3, fp, #128 + vstm r3, { d8 - d15} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #4 // ldc = ldc * 8 * 2 + str r3, LDC + + ldr K1, K + ldr BC, B + + ldr J, N + asrs J, J, #1 // J = J / 2 + ble _L1_BEGIN + +_L2_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + pld [AO , #A_PRE-64] + pld [AO , #A_PRE-32] + + + +_L2_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble _L2_M1_BEGIN + +_L2_M2_20: + + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt _L2_M2_30 + .align 5 + + + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + sub L, L, #2 + +_L2_M2_22: + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + subs L, L, #1 + bgt _L2_M2_22 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + + +_L2_M2_30: + tst L, #3 + ble _L2_M2_40 + + tst L, #2 + ble _L2_M2_32 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + +_L2_M2_32: + + tst L, #1 + ble _L2_M2_40 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + + +_L2_M2_40: + + INIT2x2 + + +_L2_M2_44: + + ands L , K1, #7 // L = L % 8 + ble _L2_M2_100 + +_L2_M2_46: + + KERNEL2x2_SUB + + subs L, L, #1 + bne _L2_M2_46 + +_L2_M2_100: + + SAVE2x2 + +_L2_M2_END: + + subs I, I, #1 + bne _L2_M2_20 + + +_L2_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble _L2_END + +_L2_M1_20: + + INIT1x2 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble _L2_M1_40 + +_L2_M1_22: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_22 + + +_L2_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M1_100 + +_L2_M1_42: + + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_42 + +_L2_M1_100: + + SAVE1x2 + + +_L2_END: + + mov r3, BC + mov r4, K1 + lsl r4, r4, #5 // k * 2 * 8 * 2 + add r3, 
r3, r4 // B = B + K * 4 * 8 + mov BC, r3 + + subs J , #1 // j-- + bgt _L2_BEGIN + + + +/*********************************************************************************************/ + +_L1_BEGIN: + + ldr J , N + tst J , #1 + ble _L999 + + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + +_L1_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble _L1_M1_BEGIN + +_L1_M2_20: + + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt _L1_M2_30 + .align 5 + + + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + sub L, L, #2 + +_L1_M2_22: + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + subs L, L, #1 + bgt _L1_M2_22 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + + +_L1_M2_30: + tst L, #3 + ble _L1_M2_40 + + tst L, #2 + ble _L1_M2_32 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + +_L1_M2_32: + + tst L, #1 + ble _L1_M2_40 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + + +_L1_M2_40: + + INIT2x1 + + +_L1_M2_44: + + ands L , K1, #7 // L = L % 8 + ble _L1_M2_100 + +_L1_M2_46: + + KERNEL2x1_SUB + + subs L, L, #1 + bne _L1_M2_46 + +_L1_M2_100: + + SAVE2x1 + +_L1_M2_END: + + subs I, I, #1 + bne _L1_M2_20 + + +_L1_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble _L1_END + +_L1_M1_20: + + INIT1x1 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble _L1_M1_40 + +_L1_M1_22: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_22 + + +_L1_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M1_100 + +_L1_M1_42: + + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_42 + +_L1_M1_100: + + SAVE1x1 + + +_L1_END: + + + +_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/ztrmm_kernel_2x2_vfpv3.S b/kernel/arm/ztrmm_kernel_2x2_vfpv3.S new file mode 100644 index 000000000..917ce610f --- /dev/null +++ b/kernel/arm/ztrmm_kernel_2x2_vfpv3.S @@ -0,0 +1,1538 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/10/16 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA_R d0 +#define OLD_ALPHA_I d1 + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define KKK [fp, #-240] +#define KK [fp, #-244 ] +#define A [fp, #-248 ] +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] + +#define ALPHA_I [fp, #-272] +#define ALPHA_R [fp, #-280] + +#define B [fp, #4 ] +#define C [fp, #8 ] +#define OLD_LDC [fp, #12 ] +#define OFFSET [fp, #16 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 + +#define A_PRE 96 +#define B_PRE 96 +#define C_PRE 64 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + + #define FADD_R fsubd + #define FADD_I faddd + + #define FMAC_R1 fnmuld + #define FMAC_R2 fnmacd + #define FMAC_I1 fmuld + #define FMAC_I2 fnmacd + +#elif defined(CN) || defined(CT) + + #define FADD_R faddd + #define FADD_I fsubd + + #define FMAC_R1 fmuld + #define FMAC_R2 fmacd + #define FMAC_I1 fnmuld + #define FMAC_I2 fmacd + +#elif defined(NC) || defined(TC) + + #define FADD_R faddd + #define FADD_I fsubd + + #define FMAC_R1 fmuld + #define FMAC_R2 fnmacd + #define FMAC_I1 fmuld + #define FMAC_I2 fmacd + +#else + + #define FADD_R fsubd + #define FADD_I faddd + + #define FMAC_R1 fnmuld + #define FMAC_R2 fmacd + #define FMAC_I1 fnmuld + #define FMAC_I2 fnmacd + +#endif + + + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT2x2 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d18, d16 + vmov.f64 d19, d16 + vmov.f64 d20, d16 + vmov.f64 d21, d16 + vmov.f64 d22, d16 + vmov.f64 d23, d16 + vmov.f64 d24, d16 + vmov.f64 d25, d16 + vmov.f64 d26, d16 + vmov.f64 d27, d16 + vmov.f64 d28, d16 + vmov.f64 d29, d16 + vmov.f64 d30, d16 + vmov.f64 d31, d16 + +.endm + +.macro KERNEL2x2_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fmuld d16 , d0, d8 + fldd d2 , [ AO, #16 ] + fmuld d24 , d1, d9 + 
fldd d3 , [ AO, #24 ] + fmuld d17 , d0, d9 + fldd d10, [ BO, #16 ] + fmuld d25 , d1, d8 + + fldd d11, [ BO, #24 ] + fmuld d18 , d2, d8 + add BO , BO, #32 + fmuld d26 , d3, d9 + add AO , AO, #32 + fmuld d19 , d2, d9 + pld [ BO , #B_PRE ] + fmuld d27 , d3, d8 + + pld [ AO , #A_PRE ] + fmuld d20 , d0, d10 + fldd d4 , [ AO, #0 ] + fmuld d28 , d1, d11 + fldd d5 , [ AO, #8 ] + fmuld d21 , d0, d11 + fldd d12, [ BO ] + fmuld d29 , d1, d10 + + fldd d13, [ BO, #8 ] + fmuld d22 , d2, d10 + fldd d6 , [ AO, #16 ] + fmuld d30 , d3, d11 + fldd d7 , [ AO, #24 ] + fmuld d23 , d2, d11 + fldd d14, [ BO, #16 ] + fmuld d31 , d3, d10 + fldd d15, [ BO, #24 ] + + add BO , BO, #32 + add AO , AO, #32 +.endm + + + +.macro KERNEL2x2_M1 + pld [ AO , #A_PRE ] + + fmacd d16 , d0, d8 + pld [ BO , #B_PRE ] + fmacd d24 , d1, d9 + fldd d4 , [ AO, #0 ] + fmacd d17 , d0, d9 + fldd d5 , [ AO, #8 ] + fmacd d25 , d1, d8 + + fldd d12, [ BO ] + fmacd d18 , d2, d8 + fldd d13, [ BO, #8 ] + fmacd d26 , d3, d9 + fldd d6 , [ AO, #16 ] + fmacd d19 , d2, d9 + fldd d7 , [ AO, #24 ] + fmacd d27 , d3, d8 + + fmacd d20 , d0, d10 + fldd d14, [ BO, #16 ] + fmacd d28 , d1, d11 + fmacd d21 , d0, d11 + fldd d15, [ BO, #24 ] + fmacd d29 , d1, d10 + + fmacd d22 , d2, d10 + add BO , BO, #32 + fmacd d30 , d3, d11 + fmacd d23 , d2, d11 + add AO , AO, #32 + fmacd d31 , d3, d10 + +.endm + +.macro KERNEL2x2_M2 + pld [ AO , #A_PRE ] + + fmacd d16 , d4, d12 + pld [ BO , #B_PRE ] + fmacd d24 , d5, d13 + fldd d0 , [ AO, #0 ] + fmacd d17 , d4, d13 + fldd d1 , [ AO, #8 ] + fmacd d25 , d5, d12 + + fmacd d18 , d6, d12 + fldd d8 , [ BO ] + fmacd d26 , d7, d13 + fldd d9 , [ BO, #8 ] + fmacd d19 , d6, d13 + fmacd d27 , d7, d12 + + fldd d2 , [ AO, #16 ] + fmacd d20 , d4, d14 + fldd d3 , [ AO, #24 ] + fmacd d28 , d5, d15 + fmacd d21 , d4, d15 + fldd d10, [ BO, #16 ] + fmacd d29 , d5, d14 + + fldd d11, [ BO, #24 ] + fmacd d22 , d6, d14 + fmacd d30 , d7, d15 + add BO , BO, #32 + fmacd d23 , d6, d15 + add AO , AO, #32 + fmacd d31 , d7, d14 + +.endm + + +.macro KERNEL2x2_E + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + + fmacd d18 , d6, d12 + fmacd d26 , d7, d13 + fmacd d19 , d6, d13 + fmacd d27 , d7, d12 + + fmacd d20 , d4, d14 + fmacd d28 , d5, d15 + fmacd d21 , d4, d15 + fmacd d29 , d5, d14 + + fmacd d22 , d6, d14 + fmacd d30 , d7, d15 + fmacd d23 , d6, d15 + fmacd d31 , d7, d14 + +.endm + +.macro KERNEL2x2_SUB + + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fmacd d16 , d0, d8 + fldd d2 , [ AO, #16 ] + fmacd d24 , d1, d9 + fldd d3 , [ AO, #24 ] + fmacd d17 , d0, d9 + fldd d10, [ BO, #16 ] + fmacd d25 , d1, d8 + + fldd d11, [ BO, #24 ] + fmacd d18 , d2, d8 + fmacd d26 , d3, d9 + fmacd d19 , d2, d9 + fmacd d27 , d3, d8 + + fmacd d20 , d0, d10 + fmacd d28 , d1, d11 + fmacd d21 , d0, d11 + fmacd d29 , d1, d10 + + fmacd d22 , d2, d10 + add BO , BO, #32 + fmacd d30 , d3, d11 + fmacd d23 , d2, d11 + add AO , AO, #32 + fmacd d31 , d3, d10 + +.endm + + + + +.macro SAVE2x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + + FADD_R d16, d24 , d16 + FADD_I d17, d25 , d17 + FADD_R d18, d26 , d18 + FADD_I d19, d27 , d19 + FADD_R d20, d28 , d20 + FADD_I d21, d29 , d21 + FADD_R d22, d30 , d22 + FADD_I d23, d31 , d23 + + FMAC_R1 d4 , d0 , d16 + FMAC_I1 d5 , d0 , d17 + FMAC_R2 d4 , d1 , d17 + FMAC_I2 d5 , d1 , d16 + + FMAC_R1 d6 , d0 , d18 + FMAC_I1 d7 , d0 , d19 + FMAC_R2 d6 , d1 , d19 + FMAC_I2 d7 , d1 , d18 + + FMAC_R1 d8 , 
d0 , d20 + FMAC_I1 d9 , d0 , d21 + FMAC_R2 d8 , d1 , d21 + FMAC_I2 d9 , d1 , d20 + + FMAC_R1 d10, d0 , d22 + FMAC_I1 d11, d0 , d23 + FMAC_R2 d10, d1 , d23 + FMAC_I2 d11, d1 , d22 + + fstmiad CO1, { d4 - d7 } + fstmiad CO2, { d8 - d11 } + + add CO1, CO1, #32 + +.endm + +/******************************************************************************/ + +.macro INIT1x2 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d20, d16 + vmov.f64 d21, d16 + vmov.f64 d24, d16 + vmov.f64 d25, d16 + vmov.f64 d28, d16 + vmov.f64 d29, d16 + +.endm + +.macro KERNEL1x2_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + fldd d10, [ BO, #16 ] + fldd d11, [ BO, #24 ] + + fmuld d16 , d0, d8 + fmuld d24 , d1, d9 + fmuld d17 , d0, d9 + fmuld d25 , d1, d8 + + fmuld d20 , d0, d10 + fmuld d28 , d1, d11 + fmuld d21 , d0, d11 + fmuld d29 , d1, d10 + + add BO , BO, #32 + add AO , AO, #16 + + pld [ BO , #B_PRE ] + + fldd d4 , [ AO, #0 ] + fldd d5 , [ AO, #8 ] + + fldd d12, [ BO ] + fldd d13, [ BO, #8 ] + fldd d14, [ BO, #16 ] + fldd d15, [ BO, #24 ] + + add BO , BO, #32 + add AO , AO, #16 +.endm + + + +.macro KERNEL1x2_M1 + pld [ BO , #B_PRE ] + + fmacd d16 , d0, d8 + fmacd d24 , d1, d9 + fmacd d17 , d0, d9 + fmacd d25 , d1, d8 + + fmacd d20 , d0, d10 + fmacd d28 , d1, d11 + fmacd d21 , d0, d11 + fmacd d29 , d1, d10 + + fldd d4 , [ AO, #0 ] + fldd d5 , [ AO, #8 ] + + fldd d12, [ BO ] + fldd d13, [ BO, #8 ] + fldd d14, [ BO, #16 ] + fldd d15, [ BO, #24 ] + + add BO , BO, #32 + add AO , AO, #16 +.endm + +.macro KERNEL1x2_M2 + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + + fmacd d20 , d4, d14 + fmacd d28 , d5, d15 + fmacd d21 , d4, d15 + fmacd d29 , d5, d14 + + fldd d0 , [ AO, #0 ] + fldd d1 , [ AO, #8 ] + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + fldd d10, [ BO, #16 ] + fldd d11, [ BO, #24 ] + + add BO , BO, #32 + add AO , AO, #16 +.endm + + +.macro KERNEL1x2_E + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + + fmacd d20 , d4, d14 + fmacd d28 , d5, d15 + fmacd d21 , d4, d15 + fmacd d29 , d5, d14 + +.endm + +.macro KERNEL1x2_SUB + + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + fldd d10, [ BO, #16 ] + fldd d11, [ BO, #24 ] + + fmacd d16 , d0, d8 + fmacd d24 , d1, d9 + fmacd d17 , d0, d9 + fmacd d25 , d1, d8 + + fmacd d20 , d0, d10 + fmacd d28 , d1, d11 + fmacd d21 , d0, d11 + fmacd d29 , d1, d10 + + add BO , BO, #32 + add AO , AO, #16 + +.endm + + + + +.macro SAVE1x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + FADD_R d16, d24 , d16 + FADD_I d17, d25 , d17 + FADD_R d20, d28 , d20 + FADD_I d21, d29 , d21 + + FMAC_R1 d4 , d0 , d16 + FMAC_I1 d5 , d0 , d17 + FMAC_R2 d4 , d1 , d17 + FMAC_I2 d5 , d1 , d16 + + FMAC_R1 d8 , d0 , d20 + FMAC_I1 d9 , d0 , d21 + FMAC_R2 d8 , d1 , d21 + FMAC_I2 d9 , d1 , d20 + + fstmiad CO1, { d4 - d5 } + fstmiad CO2, { d8 - d9 } + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + +.macro INIT2x1 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d18, d16 + vmov.f64 d19, d16 + vmov.f64 d24, d16 + vmov.f64 d25, d16 + vmov.f64 d26, d16 + vmov.f64 d27, d16 + +.endm + +.macro KERNEL2x1_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + 
fldd d3 , [ AO, #24 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fmuld d16 , d0, d8 + fmuld d24 , d1, d9 + fmuld d17 , d0, d9 + fmuld d25 , d1, d8 + + fmuld d18 , d2, d8 + fmuld d26 , d3, d9 + fmuld d19 , d2, d9 + fmuld d27 , d3, d8 + + add BO , BO, #16 + add AO , AO, #32 + + pld [ BO , #B_PRE ] + pld [ AO , #A_PRE ] + + fldd d4 , [ AO, #0 ] + fldd d5 , [ AO, #8 ] + fldd d6 , [ AO, #16 ] + fldd d7 , [ AO, #24 ] + + fldd d12, [ BO ] + fldd d13, [ BO, #8 ] + + add BO , BO, #16 + add AO , AO, #32 +.endm + + + +.macro KERNEL2x1_M1 + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + + fmacd d16 , d0, d8 + fmacd d24 , d1, d9 + fmacd d17 , d0, d9 + fmacd d25 , d1, d8 + + fmacd d18 , d2, d8 + fmacd d26 , d3, d9 + fmacd d19 , d2, d9 + fmacd d27 , d3, d8 + + fldd d4 , [ AO, #0 ] + fldd d5 , [ AO, #8 ] + fldd d6 , [ AO, #16 ] + fldd d7 , [ AO, #24 ] + + fldd d12, [ BO ] + fldd d13, [ BO, #8 ] + + add BO , BO, #16 + add AO , AO, #32 +.endm + +.macro KERNEL2x1_M2 + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + + fmacd d18 , d6, d12 + fmacd d26 , d7, d13 + fmacd d19 , d6, d13 + fmacd d27 , d7, d12 + + fldd d0 , [ AO, #0 ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + add BO , BO, #16 + add AO , AO, #32 +.endm + + +.macro KERNEL2x1_E + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + + fmacd d18 , d6, d12 + fmacd d26 , d7, d13 + fmacd d19 , d6, d13 + fmacd d27 , d7, d12 + +.endm + +.macro KERNEL2x1_SUB + + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fmacd d16 , d0, d8 + fmacd d24 , d1, d9 + fmacd d17 , d0, d9 + fmacd d25 , d1, d8 + + fmacd d18 , d2, d8 + fmacd d26 , d3, d9 + fmacd d19 , d2, d9 + fmacd d27 , d3, d8 + + add BO , BO, #16 + add AO , AO, #32 + +.endm + + + + +.macro SAVE2x1 + + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + FADD_R d16, d24 , d16 + FADD_I d17, d25 , d17 + FADD_R d18, d26 , d18 + FADD_I d19, d27 , d19 + + FMAC_R1 d4 , d0 , d16 + FMAC_I1 d5 , d0 , d17 + FMAC_R2 d4 , d1 , d17 + FMAC_I2 d5 , d1 , d16 + + FMAC_R1 d6 , d0 , d18 + FMAC_I1 d7 , d0 , d19 + FMAC_R2 d6 , d1 , d19 + FMAC_I2 d7 , d1 , d18 + + fstmiad CO1, { d4 - d7 } + + add CO1, CO1, #32 + +.endm + +/******************************************************************************/ + +.macro INIT1x1 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d24, d16 + vmov.f64 d25, d16 + +.endm + +.macro KERNEL1x1_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fmuld d16 , d0, d8 + fmuld d24 , d1, d9 + fmuld d17 , d0, d9 + fmuld d25 , d1, d8 + + add BO , BO, #16 + add AO , AO, #16 + + pld [ BO , #B_PRE ] + pld [ AO , #A_PRE ] + + fldd d4 , [ AO, #0 ] + fldd d5 , [ AO, #8 ] + + fldd d12, [ BO ] + fldd d13, [ BO, #8 ] + + add BO , BO, #16 + add AO , AO, #16 +.endm + + + +.macro KERNEL1x1_M1 + + fmacd d16 , d0, d8 + fmacd d24 , d1, d9 + fmacd d17 , d0, d9 + fmacd d25 , d1, d8 + + fldd d4 , [ AO, #0 ] + fldd d5 , [ AO, #8 ] + + fldd d12, [ BO ] + fldd d13, [ BO, #8 ] + + add BO , BO, #16 + add AO , AO, #16 +.endm + +.macro KERNEL1x1_M2 + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + + fldd d0 , [ AO, #0 ] + fldd d1 , [ AO, #8 ] + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + add 
BO , BO, #16 + add AO , AO, #16 +.endm + + +.macro KERNEL1x1_E + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + +.endm + +.macro KERNEL1x1_SUB + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fmacd d16 , d0, d8 + fmacd d24 , d1, d9 + fmacd d17 , d0, d9 + fmacd d25 , d1, d8 + + add BO , BO, #16 + add AO , AO, #16 + +.endm + + + + +.macro SAVE1x1 + + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + FADD_R d16, d24 , d16 + FADD_I d17, d25 , d17 + + FMAC_R1 d4 , d0 , d16 + FMAC_I1 d5 , d0 , d17 + FMAC_R2 d4 , d1 , d17 + FMAC_I2 d5 , d1 , d16 + + fstmiad CO1, { d4 - d5 } + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA_R, ALPHA_R + vstr OLD_ALPHA_I, ALPHA_I + + sub r3, fp, #128 + vstm r3, { d8 - d15} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #4 // ldc = ldc * 8 * 2 + str r3, LDC + + ldr r3, OFFSET +#ifndef LEFT + neg r3 , r3 +#endif + str r3 , KK + + ldr BC, B + + ldr J, N + asrs J, J, #1 // J = J / 2 + ble _L1_BEGIN + +_L2_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + + ldr AO, A // AO = A + pld [AO , #A_PRE-64] + pld [AO , #A_PRE-32] + + + +_L2_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble _L2_M1_BEGIN + +_L2_M2_20: + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #2 // number of values in AO +#else + add K1, K1, #2 // number of values in BO +#endif + str K1, KKK +#endif + + + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt _L2_M2_30 + .align 5 + + + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + sub L, L, #2 + +_L2_M2_22: + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + subs L, L, #1 + bgt _L2_M2_22 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + + +_L2_M2_30: + tst L, #3 + ble _L2_M2_40 + + tst L, #2 + ble _L2_M2_32 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + +_L2_M2_32: + + tst L, #1 + ble _L2_M2_40 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + + +_L2_M2_40: + + INIT2x2 + + +_L2_M2_44: + + 
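+	// tail: process the remaining K1 % 8 k-iterations, one step per KERNEL2x2_SUB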
ands L , K1, #7 // L = L % 8 + ble _L2_M2_100 + +_L2_M2_46: + + KERNEL2x2_SUB + + subs L, L, #1 + bne _L2_M2_46 + +_L2_M2_100: + + SAVE2x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + + + +_L2_M2_END: + + subs I, I, #1 + bne _L2_M2_20 + + +_L2_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble _L2_END + +_L2_M1_20: + + INIT1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #1 // number of values in AO +#else + add K1, K1, #2 // number of values in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + ble _L2_M1_40 + +_L2_M1_22: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_22 + + +_L2_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M1_100 + +_L2_M1_42: + + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_42 + +_L2_M1_100: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #1 // number of values in AO + str r3 , KK +#endif + + + +_L2_END: + + mov r3, BC + ldr r4, K + lsl r4, r4, #5 // k * 2 * 8 * 2 + add r3, r3, r4 // B = B + K * 4 * 8 + mov BC, r3 + +#if !defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in BO + str r3 , KK +#endif + + + subs J , #1 // j-- + bgt _L2_BEGIN + + + +/*********************************************************************************************/ + +_L1_BEGIN: + + ldr J , N + tst J , #1 + ble _L999 + + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + + ldr AO, A // AO = A + +_L1_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble _L1_M1_BEGIN + +_L1_M2_20: + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #2 // number of values in AO +#else + add K1, K1, #1 // number of values in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt _L1_M2_30 + .align 5 + + + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + sub L, L, #2 + 
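+	// main unrolled loop: each pass performs eight k-iterations (four M1/M2 pairs)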
+_L1_M2_22: + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + subs L, L, #1 + bgt _L1_M2_22 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + + +_L1_M2_30: + tst L, #3 + ble _L1_M2_40 + + tst L, #2 + ble _L1_M2_32 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + +_L1_M2_32: + + tst L, #1 + ble _L1_M2_40 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + + +_L1_M2_40: + + INIT2x1 + + +_L1_M2_44: + + ands L , K1, #7 // L = L % 8 + ble _L1_M2_100 + +_L1_M2_46: + + KERNEL2x1_SUB + + subs L, L, #1 + bne _L1_M2_46 + +_L1_M2_100: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + + +_L1_M2_END: + + subs I, I, #1 + bne _L1_M2_20 + + +_L1_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble _L1_END + +_L1_M1_20: + + INIT1x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #1 // number of values in AO +#else + add K1, K1, #1 // number of values in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + ble _L1_M1_40 + +_L1_M1_22: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_22 + + +_L1_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M1_100 + +_L1_M1_42: + + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_42 + +_L1_M1_100: + + SAVE1x1 + + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #1 // number of values in AO + str r3 , KK +#endif + + + +_L1_END: + + + +_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + From 04391e6d9c99fec8d6314ef24550d0f2f9f836c1 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Wed, 16 Oct 2013 18:04:34 +0200 Subject: [PATCH 023/127] optimized param.h --- param.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/param.h b/param.h index cf1665438..8d9d0fc47 100644 --- a/param.h +++ b/param.h @@ -1802,8 +1802,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x03fffUL -#define SGEMM_DEFAULT_UNROLL_M 2 -#define SGEMM_DEFAULT_UNROLL_N 2 +#define SGEMM_DEFAULT_UNROLL_M 4 +#define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_M 4 #define DGEMM_DEFAULT_UNROLL_N 4 @@ -1814,17 +1814,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 2 -#define SGEMM_DEFAULT_P 64 +#define SGEMM_DEFAULT_P 128 #define DGEMM_DEFAULT_P 128 #define CGEMM_DEFAULT_P 24 #define ZGEMM_DEFAULT_P 20 -#define SGEMM_DEFAULT_Q 192 +#define SGEMM_DEFAULT_Q 240 #define DGEMM_DEFAULT_Q 96 #define CGEMM_DEFAULT_Q 128 #define ZGEMM_DEFAULT_Q 64 -#define SGEMM_DEFAULT_R 512 +#define SGEMM_DEFAULT_R 4096 #define DGEMM_DEFAULT_R 512 #define CGEMM_DEFAULT_R 512 #define ZGEMM_DEFAULT_R 512 From 2d49db2f5bebbd727cb860cff023705ecd4bfda3 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Wed, 16 Oct 2013 19:04:42 +0200 Subject: [PATCH 024/127] moved compiler flags from Makefile.rule to Makefile.arm --- Makefile.arm | 8 ++++++-- Makefile.rule | 14 +++++++------- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/Makefile.arm b/Makefile.arm index 05ea9c679..6cdeb2f75 100644 --- a/Makefile.arm +++ b/Makefile.arm @@ -1,3 +1,7 @@ -ifdef BINARY64 -else + +ifeq ($(CORE), ARMV7) +CCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard +FCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard endif + + diff --git a/Makefile.rule b/Makefile.rule index a7aa0873d..534f4d1a2 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -19,14 +19,14 @@ TARGET = ARMV7 # C compiler including binary type(32bit / 64bit). Default is gcc. # Don't use Intel Compiler or PGI, it won't generate right codes as I expect. -# CC = gcc +CC = gcc # Fortran compiler. Default is g77. -# FC = gfortran +FC = gfortran # Even you can specify cross compiler. Meanwhile, please set HOSTCC. -CC = arm-linux-gnueabihf-gcc -FC = arm-linux-gnueabihf-gfortran +#CC = arm-linux-gnueabihf-gcc +#FC = arm-linux-gnueabihf-gfortran # If you use the cross compiler, please set this host compiler. HOSTCC = gcc @@ -38,7 +38,7 @@ HOSTCC = gcc # specify it. # For force setting for single threaded, specify USE_THREAD = 0 # For force setting for multi threaded, specify USE_THREAD = 1 -USE_THREAD = 0 +#USE_THREAD = 0 # If you're going to use this library with OpenMP, please comment it in. # USE_OPENMP = 1 @@ -46,7 +46,7 @@ USE_THREAD = 0 # You can define maximum number of threads. Basically it should be # less than actual number of cores. If you don't specify one, it's # automatically detected by the the script. -NUM_THREADS = 4 +NUM_THREADS = 16 # if you don't need generate the shared library, please comment it in. # NO_SHARED = 1 @@ -123,7 +123,7 @@ NO_AFFINITY = 1 # Common Optimization Flag; # The default -O2 is enough. 
-COMMON_OPT = -O0 -marm -mfpu=vfpv3 -fno-omit-frame-pointer
+#COMMON_OPT = -O3 -marm -mfpu=vfpv3 -mfloat-abi=hard
 
 # Profiling flags
 COMMON_PROF = -pg

From 2840d56aebd7ba028839cdfd8bbc2405181987de Mon Sep 17 00:00:00 2001
From: wernsaar
Date: Sat, 19 Oct 2013 09:47:15 +0200
Subject: [PATCH 025/127] added dgemm_kernel for Piledriver

---
 common.h                                    |    7 +
 driver/level3/level3.c                      |    2 +-
 driver/level3/level3_thread.c               |    2 +-
 kernel/x86_64/KERNEL.PILEDRIVER             |   10 +-
 kernel/x86_64/dgemm_kernel_8x2_piledriver.S | 4599 +++++++++++++++++++++
 param.h                                     |   10 +-
 6 files changed, 4618 insertions(+), 12 deletions(-)
 create mode 100644 kernel/x86_64/dgemm_kernel_8x2_piledriver.S

diff --git a/common.h b/common.h
index fa4c1d745..5c97fbec1 100644
--- a/common.h
+++ b/common.h
@@ -310,6 +310,13 @@ typedef int blasint;
 #define YIELDING SwitchToThread()
 #endif
 
+/***************************************************
+Some no-operations are enough
+***************************************************/
+#ifdef PILEDRIVER
+#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n");
+#endif
+
 #ifndef YIELDING
 #define YIELDING sched_yield()
 #endif
diff --git a/driver/level3/level3.c b/driver/level3/level3.c
index 2fe889527..edba5359e 100644
--- a/driver/level3/level3.c
+++ b/driver/level3/level3.c
@@ -333,7 +333,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
     for(jjs = js; jjs < js + min_j; jjs += min_jj){
       min_jj = min_j + js - jjs;
 
-#if defined(BULLDOZER) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX)
+#if ( defined(BULLDOZER) || defined(PILEDRIVER) ) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX)
       if (min_jj >= 12*GEMM_UNROLL_N) min_jj = 12*GEMM_UNROLL_N;
       else
         if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c
index 3242790fa..850580504 100644
--- a/driver/level3/level3_thread.c
+++ b/driver/level3/level3_thread.c
@@ -367,7 +367,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
     for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
       min_jj = MIN(n_to, xxx + div_n) - jjs;
 
-#if defined(BULLDOZER) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX)
+#if ( defined(BULLDOZER) || defined(PILEDRIVER) ) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX)
       if (min_jj >= 12*GEMM_UNROLL_N) min_jj = 12*GEMM_UNROLL_N;
       else
         if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
diff --git a/kernel/x86_64/KERNEL.PILEDRIVER b/kernel/x86_64/KERNEL.PILEDRIVER
index bbec23ccf..c2ed50ba1 100644
--- a/kernel/x86_64/KERNEL.PILEDRIVER
+++ b/kernel/x86_64/KERNEL.PILEDRIVER
@@ -17,11 +17,11 @@ SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
 SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
 SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
 
-DGEMMKERNEL = dgemm_kernel_6x4_piledriver.S
-DGEMMINCOPY = ../generic/gemm_ncopy_6.c
-DGEMMITCOPY = ../generic/gemm_tcopy_6.c
-DGEMMONCOPY = ../generic/gemm_ncopy_4.c
-DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
+DGEMMKERNEL = dgemm_kernel_8x2_piledriver.S
+DGEMMINCOPY = dgemm_ncopy_8_bulldozer.S
+DGEMMITCOPY = dgemm_tcopy_8_bulldozer.S
+DGEMMONCOPY = gemm_ncopy_2_bulldozer.S
+DGEMMOTCOPY = gemm_tcopy_2_bulldozer.S
 DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
 DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
 DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
diff --git a/kernel/x86_64/dgemm_kernel_8x2_piledriver.S b/kernel/x86_64/dgemm_kernel_8x2_piledriver.S
new file mode 100644
index
000000000..c719e96fc
--- /dev/null
+++ b/kernel/x86_64/dgemm_kernel_8x2_piledriver.S
@@ -0,0 +1,4599 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+/*********************************************************************
+*
+* 2013/10/18 Saar
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+*
+*
+* 2013/10/18 Saar
+*
+* Parameter:
+* UNROLL_M 8
+* UNROLL_N 2
+* DGEMM_P 384
+* DGEMM_Q 168
+* DGEMM_R 12288
+* A_PR1 256
+*
+* Performance at m x n on AMD 8320 (ACML-Version: 5.3.1):
+*
+* 6144x6912 84.1 GFLOPS with 8 threads on 4 modules (ACML: 81.4 GFLOPS)
+* 6144x6912 81.2 GFLOPS with 4 threads on 4 modules (ACML: 81.3 GFLOPS)
+* 6144x6912 40.9 GFLOPS with 2 threads on 2 modules (ACML: 41.8 GFLOPS)
+* 6144x6912 20.5 GFLOPS with 1 thread on 1 module (ACML: 21.0 GFLOPS)
+*
+* Performance at m x n on AMD 6380 (ACML-Version: 5.3.1):
+*
+* 12288x13824 244.5 GFLOPS with 32 threads on 16 modules (ACML: 120.3 GFLOPS) !strange thermal behavior
+* 12288x13824 233.9 GFLOPS with 16 threads on 16 modules (ACML: 129.5 GFLOPS) !strange thermal behavior
+* 12288x13824 138.1 GFLOPS with 8 threads on 8 modules (ACML: 106.5 GFLOPS)
+* 6144x6912 73.6 GFLOPS with 4 threads on 4 modules (ACML: 59.4 GFLOPS)
+* 6144x6912 36.8 GFLOPS with 2 threads on 2 modules (ACML: 34.9 GFLOPS)
+* 6144x6912 18.7 GFLOPS with 1 thread on 1 module (ACML: 18.7 GFLOPS)
+*
+*********************************************************************/
+
+
+#define ASSEMBLER
+#include "common.h"
+
+#define OLD_M %rdi
+#define OLD_N %rsi
+#define M %r13
+#define J %r14
+#define OLD_K %rdx
+
+#define A %rcx
+#define B %r8
+#define C %r9
+#define LDC %r10
+
+#define I %r11
+#define AO %rdi
+#define BO %rsi
+#define CO1 %r15
+#define K %r12
+#define BI %rbp
+#define SP %rbx
+
+#define BO1 %rdi
+#define BO2 %r15
+
+#ifndef WINDOWS_ABI
+
+#define STACKSIZE 96
+
+#else
+
+#define STACKSIZE 256
+
+#define OLD_A 40 + STACKSIZE(%rsp)
+#define OLD_B 48 +
STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 512*8*4 +#define LB2_OFFSET 512*8*2 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) +#define BUFFER1 128(%rsp) +#define BUFFER2 LB2_OFFSET+128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + +#if defined(BULLDOZER1) + +.macro VFMADD231PD_ y1,y2,y0 + vfmaddpd \y0,\y1,\y2,\y0 +.endm + +.macro VFMADD231SD_ x1,x2,x0 + vfmaddsd \x0,\x1,\x2,\x0 +.endm + +#else + +.macro VFMADD231PD_ y1,y2,y0 + vfmadd231pd \y2,\y1,\y0 +.endm + +.macro VFMADD231SD_ x1,x2,x0 + vfmadd231sd \x2,\x1,\x0 +.endm + +#endif + + + + +#define A_PR1 256 +#define B_PR1 256 +#define C_PR1 256 + +.macro INIT8x3 + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + vxorpd %xmm8 , %xmm8 , %xmm8 + vxorpd %xmm9 , %xmm9 , %xmm9 + vxorpd %xmm10, %xmm10, %xmm10 + vxorpd %xmm11, %xmm11, %xmm11 + vxorpd %xmm12, %xmm12, %xmm12 + vxorpd %xmm13, %xmm13, %xmm13 + vxorpd %xmm14, %xmm14, %xmm14 + vxorpd %xmm15, %xmm15, %xmm15 +.endm + +.macro KERNEL8x3_INIT + vmovddup -12 * SIZE(BO), %xmm1 + vmovups -16 * SIZE(AO), %xmm0 + prefetcht0 A_PR1(AO) + vmulpd %xmm1,%xmm0,%xmm4 + vmovddup -11 * SIZE(BO), %xmm2 + vmulpd %xmm2,%xmm0,%xmm5 + vmovddup -10 * SIZE(BO), %xmm3 + vmulpd %xmm3,%xmm0,%xmm6 + vmovups -14 * SIZE(AO), %xmm0 + vmulpd %xmm1,%xmm0,%xmm7 + vmulpd %xmm2,%xmm0,%xmm8 + vmulpd %xmm3,%xmm0,%xmm9 + vmovups -12 * SIZE(AO), %xmm0 + vmulpd %xmm1,%xmm0,%xmm10 + vmulpd %xmm2,%xmm0,%xmm11 + addq $3*SIZE, BO + vmulpd %xmm3,%xmm0,%xmm12 + vmovups -10 * SIZE(AO), %xmm0 + vmulpd %xmm1,%xmm0,%xmm13 + vmovddup -12 * SIZE(BO), %xmm1 + vmulpd %xmm2,%xmm0,%xmm14 + vmovddup -11 * SIZE(BO), %xmm2 + vmulpd %xmm3,%xmm0,%xmm15 +.endm + + +.macro KERNEL8x3_M1 + vmovups -16 * SIZE(AO), %xmm0 + prefetcht0 A_PR1(AO) + VFMADD231PD_ %xmm1,%xmm0,%xmm4 + VFMADD231PD_ %xmm2,%xmm0,%xmm5 + VFMADD231PD_ %xmm3,%xmm0,%xmm6 + vmovups -14 * SIZE(AO), %xmm0 + VFMADD231PD_ %xmm1,%xmm0,%xmm7 + VFMADD231PD_ %xmm2,%xmm0,%xmm8 + VFMADD231PD_ %xmm3,%xmm0,%xmm9 + vmovups -12 * SIZE(AO), %xmm0 + VFMADD231PD_ %xmm1,%xmm0,%xmm10 + VFMADD231PD_ %xmm2,%xmm0,%xmm11 + VFMADD231PD_ %xmm3,%xmm0,%xmm12 + vmovups -10 * SIZE(AO), %xmm0 + VFMADD231PD_ %xmm1,%xmm0,%xmm13 + vmovddup -12 * SIZE(BO), %xmm1 + VFMADD231PD_ %xmm2,%xmm0,%xmm14 + vmovddup -11 * SIZE(BO), %xmm2 + VFMADD231PD_ %xmm3,%xmm0,%xmm15 +.endm + +.macro KERNEL8x3_M2 + vmovups -8 * SIZE(AO), %xmm0 + prefetcht0 A_PR1+64(AO) + vmovddup -10 * SIZE(BO), %xmm3 + VFMADD231PD_ %xmm1,%xmm0,%xmm4 + VFMADD231PD_ %xmm2,%xmm0,%xmm5 + VFMADD231PD_ %xmm3,%xmm0,%xmm6 + vmovups -6 * SIZE(AO), %xmm0 + VFMADD231PD_ %xmm1,%xmm0,%xmm7 + VFMADD231PD_ %xmm2,%xmm0,%xmm8 + VFMADD231PD_ %xmm3,%xmm0,%xmm9 + vmovups -4 * SIZE(AO), %xmm0 + VFMADD231PD_ %xmm1,%xmm0,%xmm10 + VFMADD231PD_ 
%xmm2,%xmm0,%xmm11 + VFMADD231PD_ %xmm3,%xmm0,%xmm12 + vmovups -2 * SIZE(AO), %xmm0 + VFMADD231PD_ %xmm1,%xmm0,%xmm13 + vmovddup -9 * SIZE(BO), %xmm1 + VFMADD231PD_ %xmm2,%xmm0,%xmm14 + vmovddup -8 * SIZE(BO), %xmm2 + VFMADD231PD_ %xmm3,%xmm0,%xmm15 +.endm + + +.macro KERNEL8x3_M3 + vmovups 0 * SIZE(AO), %xmm0 + prefetcht0 A_PR1+128(AO) + vmovddup -7 * SIZE(BO), %xmm3 + VFMADD231PD_ %xmm1,%xmm0,%xmm4 + VFMADD231PD_ %xmm2,%xmm0,%xmm5 + VFMADD231PD_ %xmm3,%xmm0,%xmm6 + vmovups 2 * SIZE(AO), %xmm0 + VFMADD231PD_ %xmm1,%xmm0,%xmm7 + VFMADD231PD_ %xmm2,%xmm0,%xmm8 + VFMADD231PD_ %xmm3,%xmm0,%xmm9 + vmovups 4 * SIZE(AO), %xmm0 + VFMADD231PD_ %xmm1,%xmm0,%xmm10 + VFMADD231PD_ %xmm2,%xmm0,%xmm11 + VFMADD231PD_ %xmm3,%xmm0,%xmm12 + vmovups 6 * SIZE(AO), %xmm0 + VFMADD231PD_ %xmm1,%xmm0,%xmm13 + vmovddup -6 * SIZE(BO), %xmm1 + VFMADD231PD_ %xmm2,%xmm0,%xmm14 + vmovddup -5 * SIZE(BO), %xmm2 + VFMADD231PD_ %xmm3,%xmm0,%xmm15 +.endm + +.macro KERNEL8x3_M4 + vmovups 8 * SIZE(AO), %xmm0 + prefetcht0 A_PR1+192(AO) + vmovddup -4 * SIZE(BO), %xmm3 + VFMADD231PD_ %xmm1,%xmm0,%xmm4 + VFMADD231PD_ %xmm2,%xmm0,%xmm5 + VFMADD231PD_ %xmm3,%xmm0,%xmm6 + vmovups 10 * SIZE(AO), %xmm0 + VFMADD231PD_ %xmm1,%xmm0,%xmm7 + VFMADD231PD_ %xmm2,%xmm0,%xmm8 + VFMADD231PD_ %xmm3,%xmm0,%xmm9 + vmovups 12 * SIZE(AO), %xmm0 + VFMADD231PD_ %xmm1,%xmm0,%xmm10 + VFMADD231PD_ %xmm2,%xmm0,%xmm11 + VFMADD231PD_ %xmm3,%xmm0,%xmm12 + vmovups 14 * SIZE(AO), %xmm0 + VFMADD231PD_ %xmm1,%xmm0,%xmm13 + vmovddup -3 * SIZE(BO), %xmm1 + addq $32 * SIZE, AO + VFMADD231PD_ %xmm2,%xmm0,%xmm14 + vmovddup -2 * SIZE(BO), %xmm2 + VFMADD231PD_ %xmm3,%xmm0,%xmm15 +.endm + +.macro KERNEL8x3_M5 + vmovups -16 * SIZE(AO), %xmm0 + prefetcht0 A_PR1(AO) + vmovddup -1 * SIZE(BO), %xmm3 + VFMADD231PD_ %xmm1,%xmm0,%xmm4 + VFMADD231PD_ %xmm2,%xmm0,%xmm5 + VFMADD231PD_ %xmm3,%xmm0,%xmm6 + vmovups -14 * SIZE(AO), %xmm0 + VFMADD231PD_ %xmm1,%xmm0,%xmm7 + VFMADD231PD_ %xmm2,%xmm0,%xmm8 + VFMADD231PD_ %xmm3,%xmm0,%xmm9 + vmovups -12 * SIZE(AO), %xmm0 + VFMADD231PD_ %xmm1,%xmm0,%xmm10 + VFMADD231PD_ %xmm2,%xmm0,%xmm11 + VFMADD231PD_ %xmm3,%xmm0,%xmm12 + vmovups -10 * SIZE(AO), %xmm0 + VFMADD231PD_ %xmm1,%xmm0,%xmm13 + vmovddup 0 * SIZE(BO), %xmm1 + VFMADD231PD_ %xmm2,%xmm0,%xmm14 + vmovddup 1 * SIZE(BO), %xmm2 + VFMADD231PD_ %xmm3,%xmm0,%xmm15 +.endm + +.macro KERNEL8x3_M6 + vmovups -8 * SIZE(AO), %xmm0 + prefetcht0 A_PR1+64(AO) + vmovddup 2 * SIZE(BO), %xmm3 + VFMADD231PD_ %xmm1,%xmm0,%xmm4 + VFMADD231PD_ %xmm2,%xmm0,%xmm5 + VFMADD231PD_ %xmm3,%xmm0,%xmm6 + vmovups -6 * SIZE(AO), %xmm0 + VFMADD231PD_ %xmm1,%xmm0,%xmm7 + VFMADD231PD_ %xmm2,%xmm0,%xmm8 + VFMADD231PD_ %xmm3,%xmm0,%xmm9 + vmovups -4 * SIZE(AO), %xmm0 + VFMADD231PD_ %xmm1,%xmm0,%xmm10 + VFMADD231PD_ %xmm2,%xmm0,%xmm11 + VFMADD231PD_ %xmm3,%xmm0,%xmm12 + vmovups -2 * SIZE(AO), %xmm0 + VFMADD231PD_ %xmm1,%xmm0,%xmm13 + vmovddup 3 * SIZE(BO), %xmm1 + VFMADD231PD_ %xmm2,%xmm0,%xmm14 + vmovddup 4 * SIZE(BO), %xmm2 + VFMADD231PD_ %xmm3,%xmm0,%xmm15 +.endm + + +.macro KERNEL8x3_M7 + vmovups 0 * SIZE(AO), %xmm0 + prefetcht0 A_PR1+128(AO) + vmovddup 5 * SIZE(BO), %xmm3 + VFMADD231PD_ %xmm1,%xmm0,%xmm4 + VFMADD231PD_ %xmm2,%xmm0,%xmm5 + VFMADD231PD_ %xmm3,%xmm0,%xmm6 + vmovups 2 * SIZE(AO), %xmm0 + VFMADD231PD_ %xmm1,%xmm0,%xmm7 + VFMADD231PD_ %xmm2,%xmm0,%xmm8 + VFMADD231PD_ %xmm3,%xmm0,%xmm9 + vmovups 4 * SIZE(AO), %xmm0 + VFMADD231PD_ %xmm1,%xmm0,%xmm10 + VFMADD231PD_ %xmm2,%xmm0,%xmm11 + VFMADD231PD_ %xmm3,%xmm0,%xmm12 + vmovups 6 * SIZE(AO), %xmm0 + VFMADD231PD_ %xmm1,%xmm0,%xmm13 + vmovddup 6 * SIZE(BO), %xmm1 
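+	// xmm1 (and xmm2 two instructions below) are reloaded here with the
+	// B values of the next pipeline step, KERNEL8x3_M8, so the broadcast
+	// latency hides behind the remaining FMAs of this step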
+ VFMADD231PD_ %xmm2,%xmm0,%xmm14 + vmovddup 7 * SIZE(BO), %xmm2 + VFMADD231PD_ %xmm3,%xmm0,%xmm15 +.endm + +.macro KERNEL8x3_M8 + vmovups 8 * SIZE(AO), %xmm0 + prefetcht0 A_PR1+192(AO) + vmovddup 8 * SIZE(BO), %xmm3 + VFMADD231PD_ %xmm1,%xmm0,%xmm4 + VFMADD231PD_ %xmm2,%xmm0,%xmm5 + VFMADD231PD_ %xmm3,%xmm0,%xmm6 + vmovups 10 * SIZE(AO), %xmm0 + VFMADD231PD_ %xmm1,%xmm0,%xmm7 + VFMADD231PD_ %xmm2,%xmm0,%xmm8 + VFMADD231PD_ %xmm3,%xmm0,%xmm9 + vmovups 12 * SIZE(AO), %xmm0 + VFMADD231PD_ %xmm1,%xmm0,%xmm10 + VFMADD231PD_ %xmm2,%xmm0,%xmm11 + VFMADD231PD_ %xmm3,%xmm0,%xmm12 + vmovups 14 * SIZE(AO), %xmm0 + VFMADD231PD_ %xmm1,%xmm0,%xmm13 + vmovddup 9 * SIZE(BO), %xmm1 + VFMADD231PD_ %xmm2,%xmm0,%xmm14 + vmovddup 10 * SIZE(BO), %xmm2 + VFMADD231PD_ %xmm3,%xmm0,%xmm15 + vmovddup 11 * SIZE(BO), %xmm3 + addq $32 * SIZE, AO + addq $24 * SIZE, BO +.endm + + +.macro KERNEL8x3_E + vmovups 8 * SIZE(AO), %xmm0 + prefetcht0 A_PR1+192(AO) + vmovddup 8 * SIZE(BO), %xmm3 + VFMADD231PD_ %xmm1,%xmm0,%xmm4 + VFMADD231PD_ %xmm2,%xmm0,%xmm5 + VFMADD231PD_ %xmm3,%xmm0,%xmm6 + vmovups 10 * SIZE(AO), %xmm0 + VFMADD231PD_ %xmm1,%xmm0,%xmm7 + VFMADD231PD_ %xmm2,%xmm0,%xmm8 + VFMADD231PD_ %xmm3,%xmm0,%xmm9 + vmovups 12 * SIZE(AO), %xmm0 + VFMADD231PD_ %xmm1,%xmm0,%xmm10 + VFMADD231PD_ %xmm2,%xmm0,%xmm11 + VFMADD231PD_ %xmm3,%xmm0,%xmm12 + vmovups 14 * SIZE(AO), %xmm0 + VFMADD231PD_ %xmm1,%xmm0,%xmm13 + addq $32*SIZE, AO + VFMADD231PD_ %xmm2,%xmm0,%xmm14 + addq $21*SIZE, BO + VFMADD231PD_ %xmm3,%xmm0,%xmm15 +.endm + +.macro KERNEL8x3_SUBN + vmovddup -12 * SIZE(BO), %xmm1 + vmovups -16 * SIZE(AO), %xmm0 + VFMADD231PD_ %xmm1,%xmm0,%xmm4 + vmovddup -11 * SIZE(BO), %xmm2 + VFMADD231PD_ %xmm2,%xmm0,%xmm5 + vmovddup -10 * SIZE(BO), %xmm3 + VFMADD231PD_ %xmm3,%xmm0,%xmm6 + vmovups -14 * SIZE(AO), %xmm0 + VFMADD231PD_ %xmm1,%xmm0,%xmm7 + VFMADD231PD_ %xmm2,%xmm0,%xmm8 + VFMADD231PD_ %xmm3,%xmm0,%xmm9 + vmovups -12 * SIZE(AO), %xmm0 + VFMADD231PD_ %xmm1,%xmm0,%xmm10 + VFMADD231PD_ %xmm2,%xmm0,%xmm11 + VFMADD231PD_ %xmm3,%xmm0,%xmm12 + vmovups -10 * SIZE(AO), %xmm0 + VFMADD231PD_ %xmm1,%xmm0,%xmm13 + addq $3*SIZE, BO + VFMADD231PD_ %xmm2,%xmm0,%xmm14 + addq $8*SIZE, AO + VFMADD231PD_ %xmm3,%xmm0,%xmm15 +.endm + +.macro SAVE8x3 + vmovddup ALPHA, %xmm0 + + prefetcht0 C_PR1(CO1) + prefetcht0 C_PR1(CO1,LDC) + prefetcht0 C_PR1(CO1,LDC,2) + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + vfmaddpd 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 + vfmaddpd 6 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + vmovups %xmm10, 4 * SIZE(CO1) + vmovups %xmm13, 6 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + vmovups %xmm11, 4 * SIZE(CO1, LDC) + vmovups %xmm14, 6 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) + vmovups %xmm12, 4 * SIZE(CO1, LDC, 2) + vmovups %xmm15, 6 * SIZE(CO1, LDC, 2) + + addq $8 * SIZE, CO1 # coffset += 8 +.endm + + +/*******************************************************************************************/ + +#define KERNEL4x3_1(xx) \ + vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ + 
vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + +#define KERNEL4x3_2(xx) \ + vmovddup -3 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -2 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -1 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + +#define KERNEL4x3_3(xx) \ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup 2 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + +#define KERNEL4x3_4(xx) \ + vmovddup 3 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 4 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup 5 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + addq $12, BI ;\ + addq $16, %rax ;\ + +#define KERNEL4x3_SUB(xx) \ + vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + + + + + +/*******************************************************************************************/ + +#define KERNEL2x3_1(xx) \ + vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL2x3_2(xx) \ + vmovddup -3 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -2 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -1 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL2x3_3(xx) \ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup 2 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL2x3_4(xx) \ + vmovddup 3 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 4 * SIZE(BO, BI, 8), %xmm2 ;\ + 
vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup 5 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + addq $12, BI ;\ + addq $8, %rax ;\ + +#define KERNEL2x3_SUB(xx) \ + vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +/*******************************************************************************************/ + +#define KERNEL1x3_1(xx) \ + vmovsd -6 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovsd -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL1x3_2(xx) \ + vmovsd -3 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -15 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd -2 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovsd -1 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL1x3_3(xx) \ + vmovsd 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovsd 2 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL1x3_4(xx) \ + vmovsd 3 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -13 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd 4 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovsd 5 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + addq $12, BI ;\ + addq $4, %rax ;\ + +#define KERNEL1x3_SUB(xx) \ + vmovsd -6 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovsd -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + + + +/******************************************************************************************* +* 2 lines of N +*******************************************************************************************/ + +#define KERNEL8x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,8) ;\ + vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + +#define KERNEL8x2_2(xx) \ + prefetcht0 A_PR1+64(AO,%rax,8) ;\ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd 
%xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + +#define KERNEL8x2_3(xx) \ + prefetcht0 A_PR1+128(AO,%rax,8) ;\ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups 0 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups 2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups 4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups 6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + +#define KERNEL8x2_4(xx) \ + prefetcht0 A_PR1+192(AO,%rax,8) ;\ + vmovddup 2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups 8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups 10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups 12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups 14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + addq $8, BI ;\ + addq $32, %rax ;\ + +#define KERNEL8x2_SUB(xx) \ + vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + + +/*******************************************************************************************/ + +#define KERNEL4x2_1(xx) \ + vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + +#define KERNEL4x2_2(xx) \ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + +#define KERNEL4x2_3(xx) \ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + +#define KERNEL4x2_4(xx) \ + vmovddup 2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + addq $8, BI ;\ + addq $16, %rax ;\ + +#define KERNEL4x2_SUB(xx) \ + vmovddup -4 * 
SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + + +/*******************************************************************************************/ + +#define KERNEL2x2_1(xx) \ + vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL2x2_2(xx) \ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL2x2_3(xx) \ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL2x2_4(xx) \ + vmovddup 2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + addq $8, BI ;\ + addq $8, %rax ;\ + +#define KERNEL2x2_SUB(xx) \ + vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +/*******************************************************************************************/ + +#define KERNEL1x2_1(xx) \ + vmovsd -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL1x2_2(xx) \ + vmovsd -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -15 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd -1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL1x2_3(xx) \ + vmovsd 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL1x2_4(xx) \ + vmovsd 2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -13 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd 3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + addq $8, BI ;\ + addq $4, %rax ;\ + +#define KERNEL1x2_SUB(xx) \ + vmovsd -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + + + +/******************************************************************************************* +* 1 line of N +*******************************************************************************************/ + +#define KERNEL8x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,8) ;\ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + +#define KERNEL8x1_2(xx) \ + 
prefetcht0 A_PR1+64(AO,%rax,8) ;\ + vmovddup -1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + +#define KERNEL8x1_3(xx) \ + prefetcht0 A_PR1+128(AO,%rax,8) ;\ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups 0 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups 2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups 4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups 6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + +#define KERNEL8x1_4(xx) \ + prefetcht0 A_PR1+192(AO,%rax,8) ;\ + vmovddup 1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups 8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups 10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups 12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups 14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + addq $4, BI ;\ + addq $32, %rax ;\ + +#define KERNEL8x1_SUB(xx) \ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + + +/*******************************************************************************************/ + +#define KERNEL4x1_1(xx) \ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + +#define KERNEL4x1_2(xx) \ + vmovddup -1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + +#define KERNEL4x1_3(xx) \ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + +#define KERNEL4x1_4(xx) \ + vmovddup 1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + addq $4, BI ;\ + addq $16, %rax ;\ + +#define KERNEL4x1_SUB(xx) \ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + + +/*******************************************************************************************/ + +#define KERNEL2x1_1(xx) \ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL2x1_2(xx) \ + vmovddup -1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL2x1_3(xx) \ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd 
%xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL2x1_4(xx) \ + vmovddup 1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + addq $4, BI ;\ + addq $8, %rax ;\ + +#define KERNEL2x1_SUB(xx) \ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +/*******************************************************************************************/ + +#define KERNEL1x1_1(xx) \ + vmovsd -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL1x1_2(xx) \ + vmovsd -1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -15 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL1x1_3(xx) \ + vmovsd 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL1x1_4(xx) \ + vmovsd 1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -13 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + addq $4, BI ;\ + addq $4, %rax ;\ + +#define KERNEL1x1_SUB(xx) \ + vmovsd -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + + +/*******************************************************************************************/ + +#if !defined(TRMMKERNEL) + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $6, %rdi + divq %rdi // N / 6 + movq %rax, Ndiv6 // N / 6 + movq %rdx, Nmod6 // N % 6 + + movq Ndiv6, J + cmpq $0, J + je .L2_0 + ALIGN_4 + +.L6_01: + // copy to sub buffer + movq K, %rax + salq $1,%rax // K * 2 + movq B, BO1 + leaq (B,%rax,8), BO2 // next offset to BO2 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L6_02a + ALIGN_4 + +.L6_02: + prefetcht0 B_PR1(BO1) + prefetcht0 B_PR1(BO2) + prefetchw B_PR1(BO) + vmovups (BO1), %xmm0 + vmovups 2*SIZE(BO1), %xmm2 + vmovups 4*SIZE(BO1), %xmm4 + vmovups 6*SIZE(BO1), %xmm6 + vmovsd (BO2), %xmm1 + vmovsd 2*SIZE(BO2), %xmm3 + vmovsd 4*SIZE(BO2), %xmm5 + vmovsd 6*SIZE(BO2), %xmm7 + vmovups %xmm0, (BO) + vmovsd %xmm1, 2*SIZE(BO) + vmovups %xmm2, 3*SIZE(BO) + vmovsd %xmm3, 5*SIZE(BO) + vmovups %xmm4, 6*SIZE(BO) + vmovsd %xmm5, 8*SIZE(BO) + vmovups %xmm6, 9*SIZE(BO) + vmovsd %xmm7,11*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + decq %rax + jnz .L6_02 + +.L6_02a: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L6_02c + ALIGN_4 + +.L6_02b: + + vmovups (BO1), %xmm0 + vmovsd 
(BO2), %xmm1 + vmovups %xmm0, (BO) + vmovsd %xmm1, 2*SIZE(BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO2 + addq $3*SIZE,BO + decq %rax + jnz .L6_02b + +.L6_02c: + + movq K, %rax + salq $1,%rax // K * 2 + leaq (B,%rax,8), BO1 // next offset to BO1 + leaq (BO1,%rax,8), BO2 // next offset to BO1 + leaq BUFFER2, BO // second buffer to BO + movq K, %rax + sarq $2, %rax // k / 4 + jz .L6_03a + ALIGN_4 + + +.L6_03: + + prefetcht0 B_PR1(BO2) + prefetchw B_PR1(BO) + vmovups (BO2), %xmm0 + vmovups 2*SIZE(BO2), %xmm2 + vmovups 4*SIZE(BO2), %xmm4 + vmovups 6*SIZE(BO2), %xmm6 + vmovsd 1*SIZE(BO1), %xmm1 + vmovsd 3*SIZE(BO1), %xmm3 + vmovsd 5*SIZE(BO1), %xmm5 + vmovsd 7*SIZE(BO1), %xmm7 + vmovsd %xmm1, 0*SIZE(BO) + vmovups %xmm0, 1*SIZE(BO) + vmovsd %xmm3, 3*SIZE(BO) + vmovups %xmm2, 4*SIZE(BO) + vmovsd %xmm5, 6*SIZE(BO) + vmovups %xmm4, 7*SIZE(BO) + vmovsd %xmm7, 9*SIZE(BO) + vmovups %xmm6,10*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + decq %rax + jnz .L6_03 + +.L6_03a: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L6_03c + ALIGN_4 + + +.L6_03b: + + vmovsd 1*SIZE(BO1), %xmm0 + vmovups (BO2), %xmm1 + vmovsd %xmm0, (BO) + vmovups %xmm1, 1*SIZE(BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO2 + addq $3*SIZE,BO + decq %rax + jnz .L6_03b + + +.L6_03c: + + movq BO2, B // next offset of B + +.L6_10: + movq C, CO1 + leaq (C, LDC, 2), C + leaq (C, LDC, 1), C // c += 3 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L6_20 + + ALIGN_4 + +.L6_11: + + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + movq K, %rax + sarq $3, %rax // K / 8 + cmpq $3, %rax + jl .L6_13 + + KERNEL8x3_INIT + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_M8 + + subq $2, %rax + + ALIGN_5 + +.L6_12: + + KERNEL8x3_M1 + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_M8 + + dec %rax + //je .L6_12_E + + jne .L6_12 + +.L6_12_E: + + KERNEL8x3_M1 + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_E + + jmp .L6_16 + +.L6_13: + + test $2, %rax + jz .L6_14 + + KERNEL8x3_INIT + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_M8 + + KERNEL8x3_M1 + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_E + + jmp .L6_16 + + +.L6_14: + + test $1, %rax + jz .L6_15 + + KERNEL8x3_INIT + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_E + + + jmp .L6_16 + +.L6_15: + + INIT8x3 + +.L6_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_19 + + ALIGN_4 + +.L6_17: + + KERNEL8x3_SUBN + dec %rax + jne .L6_17 + ALIGN_4 + + +.L6_19: + + SAVE8x3 + + decq I # i -- + jg .L6_11 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L6_20: + // Test rest of M + + testq $7, M + jz .L7_10 // to next 3 lines of N + + testq $4, M + jz .L6_30 + + ALIGN_4 + +.L6_21: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_26 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_22: + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL4x3_1(xxx) + 
KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + //prefetcht0 B_PR1+64(BO,BI,8) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + //prefetcht0 B_PR1+32(BO,BI,8) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L6_26 + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + //prefetcht0 B_PR1+64(BO,BI,8) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + //prefetcht0 B_PR1+32(BO,BI,8) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L6_26 + + jmp .L6_22 + ALIGN_4 + +.L6_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_27: + + KERNEL4x3_SUB(xxx) + addq $3, BI + addq $4, %rax + jl .L6_27 + ALIGN_4 + + +.L6_29: + + vmovddup ALPHA, %xmm0 + + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) + + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L6_30: + testq $2, M + jz .L6_40 + + ALIGN_4 + +.L6_31: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_36 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_32: + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + //prefetcht0 B_PR1+64(BO,BI,8) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + //prefetcht0 B_PR1+32(BO,BI,8) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L6_36 + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + //prefetcht0 B_PR1+64(BO,BI,8) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + //prefetcht0 B_PR1+32(BO,BI,8) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L6_36 + + jmp .L6_32 + ALIGN_4 + +.L6_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_37: + + KERNEL2x3_SUB(xxx) + addq $3, BI + addq $2, %rax + jl .L6_37 + ALIGN_4 + + +.L6_39: + + vmovddup ALPHA, %xmm0 + + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (CO1, LDC, 2) + + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L6_40: + testq $1, M + jz .L7_10 // to next 3 lines of N + + ALIGN_4 + +.L6_41: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_46 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_42: + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL1x3_1(xxx) + 
KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + //prefetcht0 B_PR1+64(BO,BI,8) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + //prefetcht0 B_PR1+32(BO,BI,8) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L6_46 + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + //prefetcht0 B_PR1+64(BO,BI,8) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + //prefetcht0 B_PR1+32(BO,BI,8) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L6_46 + + jmp .L6_42 + ALIGN_4 + +.L6_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_47: + + KERNEL1x3_SUB(xxx) + addq $3, BI + addq $1, %rax + jl .L6_47 + ALIGN_4 + + +.L6_49: + + vmovddup ALPHA, %xmm0 + + + vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddsd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm6 , (CO1, LDC, 2) + + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + +/***************************************************************************************************************/ + +.L7_10: + movq C, CO1 + leaq (C, LDC, 2), C + leaq (C, LDC, 1), C // c += 3 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L7_20 + ALIGN_4 + +.L7_11: + + leaq BUFFER2, BO // first buffer to BO + addq $12 * SIZE, BO + movq K, %rax + sarq $3, %rax // K / 8 + cmpq $3, %rax + jl .L7_13 + + KERNEL8x3_INIT + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_M8 + + subq $2, %rax + + ALIGN_5 + +.L7_12: + + KERNEL8x3_M1 + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_M8 + + dec %rax + //je .L7_12_E + + jne .L7_12 + +.L7_12_E: + + KERNEL8x3_M1 + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_E + + jmp .L7_16 + + + +.L7_13: + + test $2, %rax + jz .L7_14 + + KERNEL8x3_INIT + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_M8 + + KERNEL8x3_M1 + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_E + + jmp .L7_16 + + +.L7_14: + + test $1, %rax + jz .L7_15 + + KERNEL8x3_INIT + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_E + + jmp .L7_16 + + + +.L7_15: + + INIT8x3 + +.L7_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_19 + + + ALIGN_4 + +.L7_17: + + KERNEL8x3_SUBN + dec %rax + jne .L7_17 + ALIGN_4 + + +.L7_19: + + SAVE8x3 + + decq I # i -- + jg .L7_11 + ALIGN_4 + +.L7_20: + // Test rest of M + + testq $7, M + jz .L7_60 // to next 6 lines of N + + testq $4, M + jz .L7_30 + + ALIGN_4 + +.L7_21: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_26 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_22: + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + //prefetcht0 B_PR1+64(BO,BI,8) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + //prefetcht0 B_PR1+32(BO,BI,8) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + 
je .L7_26 + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + //prefetcht0 B_PR1+64(BO,BI,8) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + //prefetcht0 B_PR1+32(BO,BI,8) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L7_26 + + jmp .L7_22 + ALIGN_4 + +.L7_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_27: + + KERNEL4x3_SUB(xxx) + addq $3, BI + addq $4, %rax + jl .L7_27 + ALIGN_4 + + +.L7_29: + + vmovddup ALPHA, %xmm0 + + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) + + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L7_30: + testq $2, M + jz .L7_40 + + ALIGN_4 + +.L7_31: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_36 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_32: + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + //prefetcht0 B_PR1+64(BO,BI,8) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + //prefetcht0 B_PR1+32(BO,BI,8) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L7_36 + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + //prefetcht0 B_PR1+64(BO,BI,8) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + //prefetcht0 B_PR1+32(BO,BI,8) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L7_36 + + jmp .L7_32 + ALIGN_4 + +.L7_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_37: + + KERNEL2x3_SUB(xxx) + addq $3, BI + addq $2, %rax + jl .L7_37 + ALIGN_4 + + +.L7_39: + + vmovddup ALPHA, %xmm0 + + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (CO1, LDC, 2) + + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + + + + +.L7_40: + testq $1, M + jz .L7_60 // to next 6 lines of N + + ALIGN_4 + +.L7_41: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + + andq $-8, %rax + je .L7_46 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_42: + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + //prefetcht0 B_PR1+64(BO,BI,8) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + //prefetcht0 B_PR1+32(BO,BI,8) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + 
je .L7_46 + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + //prefetcht0 B_PR1+64(BO,BI,8) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + //prefetcht0 B_PR1+32(BO,BI,8) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L7_46 + + jmp .L7_42 + ALIGN_4 + +.L7_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_47: + + KERNEL1x3_SUB(xxx) + addq $3, BI + addq $1, %rax + jl .L7_47 + ALIGN_4 + + +.L7_49: + + vmovddup ALPHA, %xmm0 + + + vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddsd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm6 , (CO1, LDC, 2) + + + addq $1 * SIZE, CO1 # coffset += 1 + +.L7_60: + + decq J // j -- + jg .L6_01 + + +.L2_0: + cmpq $0, Nmod6 // N % 6 == 0 + je .L999 + +/************************************************************************************************ +* Loop for Nmod6 / 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + sarq $1, J // j = j / 2 + je .L1_0 + ALIGN_4 + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L2_20 + + ALIGN_4 + +.L2_11: + + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_16 + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL8x2_SUB(xxx) + addq $2, BI + addq $8, %rax + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + vmovups %xmm10, 4 * SIZE(CO1) + vmovups %xmm13, 6 * 
SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + vmovups %xmm11, 4 * SIZE(CO1, LDC) + vmovups %xmm14, 6 * SIZE(CO1, LDC) + + + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $7, M + jz .L2_60 // to next 2 lines of N + + testq $4, M + jz .L2_30 + + ALIGN_4 + +.L2_21: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB(xxx) + addq $2, BI + addq $4, %rax + jl .L2_27 + ALIGN_4 + + +.L2_29: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB(xxx) + addq $2, BI + addq $2, %rax + jl .L2_37 + ALIGN_4 + + +.L2_39: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + + vmovups %xmm4 , 
(CO1) + vmovups %xmm5 , (CO1, LDC) + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + addq $2, BI + addq $1, %rax + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vmovddup ALPHA, %xmm0 + + vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + +.L2_60: + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L1_20 + + ALIGN_4 + +.L1_11: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_16 + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL8x1_SUB(xxx) + addq $1, BI + addq $8, %rax + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddpd 4 
* SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + vmovups %xmm10, 4 * SIZE(CO1) + vmovups %xmm13, 6 * SIZE(CO1) + + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $7, M + jz .L999 + + testq $4, M + jz .L1_30 + + ALIGN_4 + +.L1_21: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB(xxx) + addq $1, BI + addq $4, %rax + jl .L1_27 + ALIGN_4 + + +.L1_29: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB(xxx) + addq $1, BI + addq $2, %rax + jl .L1_37 + ALIGN_4 + + +.L1_39: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + + vmovups %xmm4 , (CO1) + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + 
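/*********************************************************************
* Addressing note: before each inner loop AO and BO are advanced one
* block past the data (leaq (AO, %rax, 8), AO) and the counters BI and
* %rax are negated, so the operands (AO, %rax, 8) and (BO, BI, 8) walk
* forward through A and B while the counters rise toward zero. In the
* tail loops the addq/jl pair is then the entire loop overhead.
* Roughly, in C:
*
*    ao += n;                    // point one block past the data
*    for (i = -n; i < 0; i++)    // counter rises to zero
*        use(ao[i]);
*********************************************************************/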
KERNEL1x1_4(xxx) + + je .L1_46 + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + addq $1, BI + addq $1, %rax + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vmovddup ALPHA, %xmm0 + + vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 + + vmovsd %xmm4 , (CO1) + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + +#else +/************************************************************************************* +* TRMM Kernel +*************************************************************************************/ + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $2, %rdi + divq %rdi // N / 2 + movq %rax, Ndiv6 // N / 2 + movq %rdx, Nmod6 // N % 2 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + movq Ndiv6, J + cmpq $0, J + je .L1_0 + ALIGN_4 + +.L2_0: + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L2_20 + + ALIGN_4 + +.L2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) 
&& !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_16 + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL8x2_SUB(xxx) + addq $2, BI + addq $8, %rax + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + +#else + vmulpd %xmm0, %xmm4,%xmm4 + vmulpd %xmm0, %xmm7,%xmm7 + vmulpd %xmm0, %xmm10,%xmm10 + vmulpd %xmm0, %xmm13,%xmm13 + + vmulpd %xmm0, %xmm5,%xmm5 + vmulpd %xmm0, %xmm8,%xmm8 + vmulpd %xmm0, %xmm11,%xmm11 + vmulpd %xmm0, %xmm14,%xmm14 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + vmovups %xmm10, 4 * SIZE(CO1) + vmovups %xmm13, 6 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + vmovups %xmm11, 4 * SIZE(CO1, LDC) + vmovups %xmm14, 6 * SIZE(CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + 
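/*********************************************************************
* TRMM bookkeeping, in brief: KK carries the running offset of the
* triangular boundary and KKK the effective inner-product length of
* the current tile, which is shorter than K because part of the panel
* is known to be zero. Depending on LEFT/TRANSA, either AO and BO are
* pre-advanced past the skipped region (the #else branches above) or
* the tile starts at the panel base with K clipped through KKK; after
* the write-back the pointers are stepped past the unused tail.
*********************************************************************/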
// Test rest of M + + testq $7, M + jz .L2_60 // to next 2 lines of N + + testq $4, M + jz .L2_30 + + ALIGN_4 + +.L2_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB(xxx) + addq $2, BI + addq $4, %rax + jl .L2_27 + ALIGN_4 + + +.L2_29: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + +#else + vmulpd %xmm0, %xmm4,%xmm4 + vmulpd %xmm0, %xmm7,%xmm7 + + vmulpd %xmm0, %xmm5,%xmm5 + vmulpd %xmm0, %xmm8,%xmm8 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; 
number of values + leaq (BO, BI, 8), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, 8), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB(xxx) + addq $2, BI + addq $2, %rax + jl .L2_37 + ALIGN_4 + + +.L2_39: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + +#else + vmulpd %xmm0, %xmm4,%xmm4 + vmulpd %xmm0, %xmm5,%xmm5 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + leaq (AO, %rax, 8), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + 
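/*********************************************************************
* The M remainder is peeled in descending powers of two: after the
* 8-row blocks, testq $4, $2 and $1 against M select the 4x2, 2x2 and
* 1x2 tiles in turn, so any row count is covered by at most one pass
* through each tile size.
*********************************************************************/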
KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + addq $2, BI + addq $1, %rax + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 + +#else + vmulsd %xmm0, %xmm4,%xmm4 + vmulsd %xmm0, %xmm5,%xmm5 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + leaq (AO, %rax, 8), AO +#endif + + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L1_20 + + ALIGN_4 + +.L1_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1(xxx) + 
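/*********************************************************************
* Single-column tail of N (N % 2 == 1): .L1_02b above copies B one
* double at a time into BUFFER1 so the x1 kernels read it with unit
* stride; the same 8/4/2/1-row tiling as in the two-column case is
* then replayed with the KERNELnx1 macros.
*********************************************************************/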
KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_16 + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL8x1_SUB(xxx) + addq $1, BI + addq $8, %rax + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + +#else + vmulpd %xmm0, %xmm4,%xmm4 + vmulpd %xmm0, %xmm7,%xmm7 + vmulpd %xmm0, %xmm10,%xmm10 + vmulpd %xmm0, %xmm13,%xmm13 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + vmovups %xmm10, 4 * SIZE(CO1) + vmovups %xmm13, 6 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $7, M + jz .L999 + + testq $4, M + jz .L1_30 + + ALIGN_4 + +.L1_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, 
%rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB(xxx) + addq $1, BI + addq $4, %rax + jl .L1_27 + ALIGN_4 + + +.L1_29: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + +#else + vmulpd %xmm0, %xmm4,%xmm4 + vmulpd %xmm0, %xmm7,%xmm7 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, 8), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB(xxx) + addq $1, BI + addq $2, %rax + jl .L1_37 + ALIGN_4 + + +.L1_39: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + +#else + vmulpd %xmm0, %xmm4,%xmm4 + +#endif + + vmovups %xmm4 , (CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq 
BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + leaq (AO, %rax, 8), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + addq $1, BI + addq $1, %rax + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 + +#else + vmulsd %xmm0, %xmm4,%xmm4 + +#endif + + vmovsd %xmm4 , (CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + + +#endif diff --git a/param.h b/param.h index c7a9635a6..adfe867f8 100644 --- a/param.h +++ b/param.h @@ -330,9 +330,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define XGEMM_DEFAULT_UNROLL_M 1 #else #define SGEMM_DEFAULT_UNROLL_N 2 -#define DGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 2 #define SGEMM_DEFAULT_UNROLL_M 16 -#define DGEMM_DEFAULT_UNROLL_M 6 +#define DGEMM_DEFAULT_UNROLL_M 8 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 4 #define ZGEMM_DEFAULT_UNROLL_M 2 @@ -347,7 +347,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(ARCH_X86_64) #define SGEMM_DEFAULT_P 768 -#define DGEMM_DEFAULT_P 480 +#define DGEMM_DEFAULT_P 384 #else #define SGEMM_DEFAULT_P 448 #define DGEMM_DEFAULT_P 480 @@ -359,7 +359,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
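/*
 * P, Q and R are the GEMM cache-blocking sizes: Q is the depth (K) of
 * the packed panels, P the height (M) of the packed A block -- sized so
 * a P x Q block of A stays cache-resident while a Q-deep B panel
 * streams through it -- and R bounds the N-direction block. A rough
 * sketch of the blocked loop nest these defaults feed (identifier
 * names here are illustrative, not the driver's actual ones):
 *
 *   for (jj = 0; jj < N; jj += R)
 *     for (kk = 0; kk < K; kk += Q)      // pack B(kk..kk+Q, jj..jj+R)
 *       for (ii = 0; ii < M; ii += P)    // pack A(ii..ii+P, kk..kk+Q)
 *         dgemm_kernel(P, R, Q, alpha, packedA, packedB, c_tile, ldc);
 */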
#if defined(ARCH_X86_64) #define SGEMM_DEFAULT_Q 168 -#define DGEMM_DEFAULT_Q 128 +#define DGEMM_DEFAULT_Q 168 #else #define SGEMM_DEFAULT_Q 224 #define DGEMM_DEFAULT_Q 224 @@ -371,7 +371,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SGEMM_DEFAULT_R sgemm_r #define QGEMM_DEFAULT_R qgemm_r -#define DGEMM_DEFAULT_R dgemm_r +#define DGEMM_DEFAULT_R 12288 #define CGEMM_DEFAULT_R cgemm_r #define ZGEMM_DEFAULT_R zgemm_r #define XGEMM_DEFAULT_R xgemm_r From f6b50057e259761f8b2662e64272861cabb66607 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 19 Oct 2013 10:52:20 +0200 Subject: [PATCH 026/127] corrected and testet FMA3 Code --- kernel/x86_64/cgemm_kernel_8x2_haswell.S | 23 +++++++++------ kernel/x86_64/dgemm_kernel_16x2_haswell.S | 29 +++++++++++++++++-- kernel/x86_64/sgemm_kernel_16x4_haswell.S | 35 ++++++++++++++++++----- kernel/x86_64/zgemm_kernel_4x2_haswell.S | 24 +++++++++------- 4 files changed, 83 insertions(+), 28 deletions(-) diff --git a/kernel/x86_64/cgemm_kernel_8x2_haswell.S b/kernel/x86_64/cgemm_kernel_8x2_haswell.S index 0561c0f72..bac773969 100644 --- a/kernel/x86_64/cgemm_kernel_8x2_haswell.S +++ b/kernel/x86_64/cgemm_kernel_8x2_haswell.S @@ -37,6 +37,11 @@ /*********************************************************************/ /********************************************************************* +* 2013/10/19 Saar +* BLASTEST : +* CTEST : OK +* TEST : OK +* * 2013/08/16 Saar * Parameter: * CGEMM_DEFAULT_UNROLL_N 2 @@ -139,7 +144,7 @@ #endif -#if defined(BULLDOZER) || defined(PILEDRIVER) +#if defined(BULLDOZER) #if defined(NN) || defined(NT) || defined(TN) || defined(TT) @@ -188,41 +193,41 @@ #if defined(NN) || defined(NT) || defined(TN) || defined(TT) .macro VFMADDPS_R y0,y1,y2 - vfmadd231ps \y0,\y1,\y2 + vfmadd231ps \y1,\y2,\y0 .endm .macro VFMADDPS_I y0,y1,y2 - vfmadd231ps \y0,\y1,\y2 + vfmadd231ps \y1,\y2,\y0 .endm #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) .macro VFMADDPS_R y0,y1,y2 - vfnmadd231ps \y0,\y1,\y2 + vfnmadd231ps \y1,\y2,\y0 .endm .macro VFMADDPS_I y0,y1,y2 - vfmadd231ps \y0,\y1,\y2 + vfmadd231ps \y1,\y2,\y0 .endm #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) .macro VFMADDPS_R y0,y1,y2 - vfmadd231ps \y0,\y1,\y2 + vfmadd231ps \y1,\y2,\y0 .endm .macro VFMADDPS_I y0,y1,y2 - vfnmadd231ps \y0,\y1,\y2 + vfnmadd231ps \y1,\y2,\y0 .endm #else .macro VFMADDPS_R y0,y1,y2 - vfnmadd231ps \y0,\y1,\y2 + vfnmadd231ps \y1,\y2,\y0 .endm .macro VFMADDPS_I y0,y1,y2 - vfnmadd231ps \y0,\y1,\y2 + vfnmadd231ps \y1,\y2,\y0 .endm #endif diff --git a/kernel/x86_64/dgemm_kernel_16x2_haswell.S b/kernel/x86_64/dgemm_kernel_16x2_haswell.S index 67a7ed3f0..e015bbdcc 100644 --- a/kernel/x86_64/dgemm_kernel_16x2_haswell.S +++ b/kernel/x86_64/dgemm_kernel_16x2_haswell.S @@ -36,6 +36,31 @@ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ +/********************************************************************* +* 2013/10/19 Saar +* BLASTEST : +* CTEST : OK +* TEST : OK + +* +* +* 2013/08/15 Saar +* Parameter: +* SGEMM_DEFAULT_UNROLL_N 2 +* SGEMM_DEFAULT_UNROLL_M 16 +* SGEMM_DEFAULT_P 384 +* SGEMM_DEFAULT_Q 168 +* +* BLASTEST: OK +* +* Performance: +* 1 thread: 2.31 times faster than sandybridge +* 4 threads: 2.26 times faster than sandybridge +* +* Compile for FMA3: OK +* +*********************************************************************/ + #define ASSEMBLER #include "common.h" @@ -130,11 +155,11 @@ #else .macro VFMADD231PD_ y0,y1,y2 - vfmadd231pd \y0,\y1,\y2 + vfmadd231pd \y2,\y1,\y0 .endm .macro VFMADD231SD_ x0,x1,x2 - vfmadd231sd \x0,\x1,\x2 + vfmadd231sd \x2,\x1,\x0 .endm #endif diff --git a/kernel/x86_64/sgemm_kernel_16x4_haswell.S b/kernel/x86_64/sgemm_kernel_16x4_haswell.S index 922096115..c6489277d 100644 --- a/kernel/x86_64/sgemm_kernel_16x4_haswell.S +++ b/kernel/x86_64/sgemm_kernel_16x4_haswell.S @@ -36,6 +36,28 @@ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ +/********************************************************************* +* 2013/10/19 Saar +* BLASTEST : +* CTEST : OK +* TEST : OK +* +* 2013/08/15 Saar +* Parameter: +* SGEMM_DEFAULT_UNROLL_N 4 +* SGEMM_DEFAULT_UNROLL_M 16 +* SGEMM_DEFAULT_P 768 +* SGEMM_DEFAULT_Q 168 +* +* BLASTEST: OK +* +* Performance: +* 1 thread: 2.22 times faster than sandybridge +* 4 threads: 2.26 times faster than sandybridge +* +* Compile for FMA3: OK +* +*********************************************************************/ #define ASSEMBLER #include "common.h" @@ -60,7 +82,6 @@ #define SP %rbx #define BO1 %rdi -#define BO2 %r15 #define CO2 %rdx #ifndef WINDOWS_ABI @@ -131,11 +152,11 @@ #else .macro VFMADD231PS_ y0,y1,y2 - vfmadd231ps \y0,\y1,\y2 + vfmadd231ps \y1,\y2,\y0 .endm .macro VFMADD231SS_ x0,x1,x2 - vfmadd231ss \x0,\x1,\x2 + vfmadd231ss \x1,\x2,\x0 .endm #endif @@ -791,7 +812,7 @@ movq OLD_C, C movq OLD_LDC, LDC #ifdef TRMMKERNEL - movsd OLD_OFFSET, %xmm12 + vmovsd OLD_OFFSET, %xmm12 #endif vmovaps %xmm3, %xmm0 @@ -836,8 +857,8 @@ #ifdef TRMMKERNEL - vmovss %xmm12, OFFSET - vmovss %xmm12, KK + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK #ifndef LEFT negq KK #endif @@ -1629,7 +1650,7 @@ .L4_60: #if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK + addq $4, KK #endif decq J // j -- diff --git a/kernel/x86_64/zgemm_kernel_4x2_haswell.S b/kernel/x86_64/zgemm_kernel_4x2_haswell.S index d189b517b..f4b8142ce 100644 --- a/kernel/x86_64/zgemm_kernel_4x2_haswell.S +++ b/kernel/x86_64/zgemm_kernel_4x2_haswell.S @@ -37,6 +37,11 @@ /*********************************************************************/ /********************************************************************* +* 2013/10/19 Saar +* BLASTEST : +* CTEST : OK +* TEST : OK +* * 2013/08/16 Saar * Parameter: * ZGEMM_DEFAULT_UNROLL_N 2 @@ -44,7 +49,6 @@ * ZGEMM_DEFAULT_P 112 * ZGEMM_DEFAULT_Q 224 * -* BLASTEST: OK * * Performance: * 1 thread: 1.80 times faster than sandybridge @@ -138,7 +142,7 @@ #endif -#if defined(BULLDOZER) || defined(PILEDRIVER) +#if defined(BULLDOZER) #if defined(NN) || defined(NT) || defined(TN) || defined(TT) @@ -187,41 +191,41 @@ #if defined(NN) || defined(NT) || defined(TN) || defined(TT) .macro VFMADDPD_R y0,y1,y2 - vfmadd231pd \y0,\y1,\y2 + vfmadd231pd \y1,\y2,\y0 .endm .macro VFMADDPD_I y0,y1,y2 - vfmadd231pd \y0,\y1,\y2 + vfmadd231pd 
\y1,\y2,\y0 .endm #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) .macro VFMADDPD_R y0,y1,y2 - vfnmadd231pd \y0,\y1,\y2 + vfnmadd231pd \y1,\y2,\y0 .endm .macro VFMADDPD_I y0,y1,y2 - vfmadd231pd \y0,\y1,\y2 + vfmadd231pd \y1,\y2,\y0 .endm #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) .macro VFMADDPD_R y0,y1,y2 - vfmadd231pd \y0,\y1,\y2 + vfmadd231pd \y1,\y2,\y0 .endm .macro VFMADDPD_I y0,y1,y2 - vfnmadd231pd \y0,\y1,\y2 + vfnmadd231pd \y1,\y2,\y0 .endm #else .macro VFMADDPD_R y0,y1,y2 - vfnmadd231pd \y0,\y1,\y2 + vfnmadd231pd \y1,\y2,\y0 .endm .macro VFMADDPD_I y0,y1,y2 - vfnmadd231pd \y0,\y1,\y2 + vfnmadd231pd \y1,\y2,\y0 .endm #endif From fe8c5666f9f0f12321a918edae789384cb1ac6de Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 20 Oct 2013 16:52:26 +0200 Subject: [PATCH 027/127] optimized dgemm_kernel for HASWELL --- driver/level3/level3.c | 2 +- driver/level3/level3_thread.c | 2 +- kernel/x86_64/dgemm_kernel_16x2_haswell.S | 778 ++++++++-------------- param.h | 59 +- 4 files changed, 335 insertions(+), 506 deletions(-) diff --git a/driver/level3/level3.c b/driver/level3/level3.c index 2fe889527..959c7f1cc 100644 --- a/driver/level3/level3.c +++ b/driver/level3/level3.c @@ -333,7 +333,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; -#if defined(BULLDOZER) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX) +#if defined(HASWELL) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX) if (min_jj >= 12*GEMM_UNROLL_N) min_jj = 12*GEMM_UNROLL_N; else if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 3242790fa..cd99172d3 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -367,7 +367,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){ min_jj = MIN(n_to, xxx + div_n) - jjs; -#if defined(BULLDOZER) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX) +#if defined(HASWELL) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX) if (min_jj >= 12*GEMM_UNROLL_N) min_jj = 12*GEMM_UNROLL_N; else if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; diff --git a/kernel/x86_64/dgemm_kernel_16x2_haswell.S b/kernel/x86_64/dgemm_kernel_16x2_haswell.S index e015bbdcc..2907a6871 100644 --- a/kernel/x86_64/dgemm_kernel_16x2_haswell.S +++ b/kernel/x86_64/dgemm_kernel_16x2_haswell.S @@ -37,28 +37,26 @@ /*********************************************************************/ /********************************************************************* -* 2013/10/19 Saar -* BLASTEST : +* 2013/10/20 Saar +* BLASTEST : OK * CTEST : OK * TEST : OK * * -* 2013/08/15 Saar +* 2013/10/20 Saar * Parameter: -* SGEMM_DEFAULT_UNROLL_N 2 -* SGEMM_DEFAULT_UNROLL_M 16 -* SGEMM_DEFAULT_P 384 -* SGEMM_DEFAULT_Q 168 +* DGEMM_DEFAULT_UNROLL_N 2 +* DGEMM_DEFAULT_UNROLL_M 16 +* DGEMM_DEFAULT_P 192 +* DGEMM_DEFAULT_Q 128 +* A_PR1 512 * -* BLASTEST: OK -* -* Performance: -* 1 thread: 2.31 times faster than sandybridge -* 4 threads: 2.26 times faster than sandybridge -* -* Compile for FMA3: OK * +* Performance without prefetch of B: +* 1 thread: 45.8 GFLOPS (MKL: 45) +* 2 threads: 80.0 GFLOPS (MKL: 91) +* 4 threads: 135.0 GFLOPS (MKL: 135) *********************************************************************/ @@ -165,13 +163,106 @@ #endif -#define A_PR1 384 
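/*
 * What the operand swaps above fix: in AT&T syntax,
 * vfmadd231pd %ymm2, %ymm1, %ymm0 computes ymm0 += ymm1 * ymm2, with
 * the accumulator as the last operand. The old FMA3 macro bodies left
 * the macro's first argument (the accumulator) in the first operand
 * slot, so the product was folded into the wrong register; the
 * corrected bodies pass \y2,\y1,\y0. A rough intrinsics picture of one
 * packed update (immintrin.h names):
 *
 *   __m256d a = _mm256_loadu_pd(A);          // vmovups
 *   __m256d b = _mm256_broadcast_sd(B);      // vbroadcastsd
 *   acc       = _mm256_fmadd_pd(a, b, acc);  // vfmadd231pd
 */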
-#define B_PR1 192 +#define A_PR1 512 +#define B_PR1 256 /******************************************************************************************* * 3 lines of N *******************************************************************************************/ +.macro KERNEL16x3_SUBN + prefetcht0 A_PR1(AO) + vbroadcastsd -12 * SIZE(BO), %ymm1 + vmovaps -16 * SIZE(AO), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -11 * SIZE(BO), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -10 * SIZE(BO), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovaps -12 * SIZE(AO), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 A_PR1+64(AO) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + vmovaps -8 * SIZE(AO), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + VFMADD231PD_ %ymm12,%ymm3,%ymm0 + vmovaps -4 * SIZE(AO), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 + VFMADD231PD_ %ymm15,%ymm3,%ymm0 + addq $3*SIZE , BO + addq $16*SIZE, AO +.endm + + +.macro KERNEL8x3_SUBN + //prefetcht0 A_PR1(AO) + vbroadcastsd -12 * SIZE(BO), %ymm1 + vmovaps -16 * SIZE(AO), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -11 * SIZE(BO), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -10 * SIZE(BO), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovaps -12 * SIZE(AO), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + //prefetcht0 A_PR1+64(AO) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + prefetcht0 B_PR1(BO) + addq $3*SIZE , BO + addq $8*SIZE, AO +.endm + +.macro KERNEL4x3_SUBN + vbroadcastsd -12 * SIZE(BO), %ymm1 + vmovaps -16 * SIZE(AO), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -11 * SIZE(BO), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -10 * SIZE(BO), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + addq $3*SIZE , BO + addq $4*SIZE, AO +.endm + +.macro KERNEL2x3_SUBN + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -16 * SIZE(AO), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -11 * SIZE(BO), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -10 * SIZE(BO), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + vmovsd -15 * SIZE(AO), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + VFMADD231SD_ %xmm12,%xmm3,%xmm0 + addq $3*SIZE , BO + addq $2*SIZE, AO +.endm + +.macro KERNEL1x3_SUBN + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -16 * SIZE(AO), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -11 * SIZE(BO), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -10 * SIZE(BO), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + addq $3*SIZE , BO + addq $1*SIZE, AO +.endm + + + + + + +/******************************************************************************************/ + .macro KERNEL16x3_1 prefetcht0 A_PR1(AO, %rax, SIZE) vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 @@ -1800,7 +1891,7 @@ movq A, AO // aoffset = a - addq $32 * SIZE, AO + addq $16 * SIZE, AO movq M, I sarq $4, I // i = (m >> 4) @@ -1810,80 +1901,58 @@ .L6_11: leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO + addq $12 * SIZE, BO + + prefetcht0 (CO1) + prefetcht0 (CO1,LDC,1) + prefetcht0 (CO1,LDC,2) + prefetcht0 64(CO1) + prefetcht0 64(CO1,LDC,1) + prefetcht0 64(CO1,LDC,2) vzeroall movq K, %rax - andq $-8, %rax // K = K - ( K % 8 ) + sarq $1, %rax // K / 8 je .L6_16 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 + ALIGN_5 .L6_12: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL16x3_1 - 
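/*
 * A_PR1 and B_PR1 are software-prefetch distances in bytes: each
 * prefetcht0 in the kernels touches the line that many bytes ahead of
 * the current A (or B) position, far enough to hide memory latency
 * across one unrolled pass. They are raised here (384 -> 512 and
 * 192 -> 256) for the reworked *_SUBN inner loop.
 */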
KERNEL16x3_2 - KERNEL16x3_3 - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL16x3_4 - - KERNEL16x3_1 - KERNEL16x3_2 - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL16x3_3 - KERNEL16x3_4 - - je .L6_16 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL16x3_1 - KERNEL16x3_2 - KERNEL16x3_3 - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL16x3_4 - - KERNEL16x3_1 - KERNEL16x3_2 - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL16x3_3 - KERNEL16x3_4 - - je .L6_16 - - jmp .L6_12 - ALIGN_4 +/* + prefetcht0 B_PR1(BO) + prefetcht0 B_PR1+64(BO) + prefetcht0 B_PR1+128(BO) +*/ + KERNEL16x3_SUBN + KERNEL16x3_SUBN +/* + KERNEL16x3_SUBN + KERNEL16x3_SUBN + + KERNEL16x3_SUBN + KERNEL16x3_SUBN + KERNEL16x3_SUBN + KERNEL16x3_SUBN +*/ + dec %rax + jne .L6_12 .L6_16: movq K, %rax - andq $7, %rax # if (k & 1) + andq $1, %rax # if (k & 1) je .L6_19 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax ALIGN_4 .L6_17: - KERNEL16x3_SUB + KERNEL16x3_SUBN - jl .L6_17 + dec %rax + jne .L6_17 ALIGN_4 @@ -1913,57 +1982,30 @@ .L6_20_1: leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO + addq $12 * SIZE, BO vzeroall movq K, %rax - andq $-8, %rax + sarq $3, %rax je .L6_20_6 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax ALIGN_4 .L6_20_2: - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x3_1 - KERNEL8x3_2 - KERNEL8x3_3 - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL8x3_4 - - KERNEL8x3_1 - KERNEL8x3_2 - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL8x3_3 - KERNEL8x3_4 - - je .L6_20_6 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x3_1 - KERNEL8x3_2 - KERNEL8x3_3 - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL8x3_4 - - KERNEL8x3_1 - KERNEL8x3_2 - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL8x3_3 - KERNEL8x3_4 - - je .L6_20_6 + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN - jmp .L6_20_2 + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + dec %rax + jne .L6_20_2 ALIGN_4 .L6_20_6: @@ -1972,21 +2014,15 @@ andq $7, %rax # if (k & 1) je .L6_20_9 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax ALIGN_4 .L6_20_7: - KERNEL8x3_SUB + KERNEL8x3_SUBN - jl .L6_20_7 + dec %rax + jne .L6_20_7 ALIGN_4 @@ -2009,57 +2045,30 @@ .L6_21: leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO + addq $12 * SIZE, BO vzeroall movq K, %rax - andq $-8, %rax + sarq $3, %rax je .L6_26 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax ALIGN_4 .L6_22: - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x3_1 - KERNEL4x3_2 - KERNEL4x3_3 - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL4x3_4 - - KERNEL4x3_1 - KERNEL4x3_2 - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL4x3_3 - KERNEL4x3_4 - - je .L6_26 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x3_1 - KERNEL4x3_2 - KERNEL4x3_3 - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL4x3_4 - - KERNEL4x3_1 - KERNEL4x3_2 - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL4x3_3 - KERNEL4x3_4 + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN - je .L6_26 - - jmp .L6_22 + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + dec %rax + jne .L6_22 
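/*
 * The restructuring visible above: the new *_SUBN macros bump AO and
 * BO directly after each k-step (addq $16*SIZE, AO / addq $3*SIZE, BO),
 * so the biased BI/%rax indexing and its negq setup disappear and each
 * loop collapses to a plain dec %rax / jne pair.
 */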
ALIGN_4 .L6_26: @@ -2068,21 +2077,14 @@ andq $7, %rax # if (k & 1) je .L6_29 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax ALIGN_4 .L6_27: - KERNEL4x3_SUB + KERNEL4x3_SUBN - jl .L6_27 + dec %rax + jne .L6_27 ALIGN_4 @@ -2102,51 +2104,29 @@ .L6_31: leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO + addq $12 * SIZE, BO vzeroall movq K, %rax - andq $-8, %rax + sarq $3, %rax je .L6_36 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax ALIGN_4 .L6_32: - KERNEL2x3_1 - KERNEL2x3_2 - KERNEL2x3_3 - KERNEL2x3_4 - - KERNEL2x3_1 - KERNEL2x3_2 - KERNEL2x3_3 - KERNEL2x3_4 - - je .L6_36 - - KERNEL2x3_1 - KERNEL2x3_2 - KERNEL2x3_3 - KERNEL2x3_4 - - KERNEL2x3_1 - KERNEL2x3_2 - KERNEL2x3_3 - KERNEL2x3_4 - - je .L6_36 + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN - jmp .L6_32 + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + dec %rax + jne .L6_32 ALIGN_4 .L6_36: @@ -2155,21 +2135,14 @@ andq $7, %rax # if (k & 1) je .L6_39 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax ALIGN_4 .L6_37: - KERNEL2x3_SUB + KERNEL2x3_SUBN - jl .L6_37 + dec %rax + jne .L6_37 ALIGN_4 @@ -2188,50 +2161,31 @@ .L6_41: leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO + addq $12 * SIZE, BO vzeroall movq K, %rax - andq $-8, %rax + sarq $3,%rax je .L6_46 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax ALIGN_4 .L6_42: - KERNEL1x3_1 - KERNEL1x3_2 - KERNEL1x3_3 - KERNEL1x3_4 - - KERNEL1x3_1 - KERNEL1x3_2 - KERNEL1x3_3 - KERNEL1x3_4 - - je .L6_46 - - KERNEL1x3_1 - KERNEL1x3_2 - KERNEL1x3_3 - KERNEL1x3_4 + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN - KERNEL1x3_1 - KERNEL1x3_2 - KERNEL1x3_3 - KERNEL1x3_4 + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN - je .L6_46 - - jmp .L6_42 + dec %rax + jne .L6_42 ALIGN_4 .L6_46: @@ -2240,20 +2194,14 @@ andq $7, %rax # if (k & 1) je .L6_49 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax ALIGN_4 .L6_47: - KERNEL1x3_SUB + KERNEL1x3_SUBN - jl .L6_47 + dec %rax + jne .L6_47 ALIGN_4 @@ -2276,7 +2224,7 @@ movq A, AO // aoffset = a - addq $32 * SIZE, AO + addq $16 * SIZE, AO movq M, I sarq $4, I // i = (m >> 4) @@ -2286,57 +2234,40 @@ .L7_11: leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO + addq $12 * SIZE, BO + + prefetcht0 (CO1) + prefetcht0 (CO1,LDC,1) + prefetcht0 (CO1,LDC,2) + prefetcht0 64(CO1) + prefetcht0 64(CO1,LDC,1) + prefetcht0 64(CO1,LDC,2) vzeroall movq K, %rax - andq $-8, %rax // K = K - ( K % 8 ) + sarq $3, %rax // K / 8 je .L7_16 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 + ALIGN_5 .L7_12: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL16x3_1 - KERNEL16x3_2 - KERNEL16x3_3 - prefetcht0 
B_PR1+64(BO,BI,8) - KERNEL16x3_4 - - KERNEL16x3_1 - KERNEL16x3_2 - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL16x3_3 - KERNEL16x3_4 - - je .L7_16 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL16x3_1 - KERNEL16x3_2 - KERNEL16x3_3 - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL16x3_4 - - KERNEL16x3_1 - KERNEL16x3_2 - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL16x3_3 - KERNEL16x3_4 - - je .L7_16 - - jmp .L7_12 +/* + prefetcht0 B_PR1(BO) + prefetcht0 B_PR1+64(BO) + prefetcht0 B_PR1+128(BO) +*/ + KERNEL16x3_SUBN + KERNEL16x3_SUBN + KERNEL16x3_SUBN + KERNEL16x3_SUBN + + KERNEL16x3_SUBN + KERNEL16x3_SUBN + KERNEL16x3_SUBN + KERNEL16x3_SUBN + dec %rax + jne .L7_12 ALIGN_4 .L7_16: @@ -2345,22 +2276,14 @@ andq $7, %rax # if (k & 1) je .L7_19 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 + ALIGN_5 .L7_17: - KERNEL16x3_SUB + KERNEL16x3_SUBN - jl .L7_17 - ALIGN_4 + dec %rax + jne .L7_17 .L7_19: @@ -2389,57 +2312,31 @@ .L7_20_1: leaq BUFFER2, BO // first buffer to BO - addq $6 * SIZE, BO + addq $12 * SIZE, BO vzeroall movq K, %rax - andq $-8, %rax + sarq $3, %rax je .L7_20_6 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax ALIGN_4 .L7_20_2: - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x3_1 - KERNEL8x3_2 - KERNEL8x3_3 - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL8x3_4 - - KERNEL8x3_1 - KERNEL8x3_2 - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL8x3_3 - KERNEL8x3_4 - - je .L7_20_6 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x3_1 - KERNEL8x3_2 - KERNEL8x3_3 - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL8x3_4 - - KERNEL8x3_1 - KERNEL8x3_2 - prefetcht0 B_PR1+128(BO,BI,8) - KERNEL8x3_3 - KERNEL8x3_4 + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN - je .L7_20_6 + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN - jmp .L7_20_2 + dec %rax + jne .L7_20_2 ALIGN_4 .L7_20_6: @@ -2448,21 +2345,14 @@ andq $7, %rax # if (k & 1) je .L7_20_9 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax ALIGN_4 .L7_20_7: - KERNEL8x3_SUB + KERNEL8x3_SUBN - jl .L7_20_7 + dec %rax + jne .L7_20_7 ALIGN_4 .L7_20_9: @@ -2484,57 +2374,31 @@ .L7_21: leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO + addq $12 * SIZE, BO vzeroall movq K, %rax - andq $-8, %rax + sarq $3, %rax je .L7_26 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax ALIGN_4 .L7_22: - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x3_1 - KERNEL4x3_2 - KERNEL4x3_3 - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL4x3_4 - - KERNEL4x3_1 - KERNEL4x3_2 - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL4x3_3 - KERNEL4x3_4 + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN - je .L7_26 + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x3_1 - KERNEL4x3_2 - KERNEL4x3_3 - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL4x3_4 - - KERNEL4x3_1 - KERNEL4x3_2 - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL4x3_3 - KERNEL4x3_4 - - je .L7_26 - - jmp .L7_22 + dec %rax + jne .L7_22 ALIGN_4 .L7_26: @@ -2543,21 +2407,14 @@ andq 
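/**********************************************************************
* With the *_SUBN macros the BI side-index ("BI = BI * 3 ; number of
* values") disappears entirely: each macro addresses A and B through
* fixed negative displacements from biased base pointers (AO is
* pre-advanced by 16*SIZE, BO now by 12*SIZE instead of 6*SIZE) and
* then advances the pointers itself. The bias presumably keeps every
* displacement within the signed one-byte range for shorter
* encodings; a sketch of the addressing, with a hypothetical name:

// with BO biased by +12 doubles, a 12-value row of packed B is read
// at byte offsets -96..-8, all of which fit a disp8 encoding
static double load_b(const double *bo_biased, int j)   // j in [0,12)
{
    return bo_biased[j - 12];
}

**********************************************************************/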
$7, %rax # if (k & 1) je .L7_29 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax ALIGN_4 .L7_27: - KERNEL4x3_SUB + KERNEL4x3_SUBN - jl .L7_27 + dec %rax + jne .L7_27 ALIGN_4 @@ -2577,51 +2434,31 @@ .L7_31: leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO + addq $12 * SIZE, BO vzeroall movq K, %rax - andq $-8, %rax + sarq $3, %rax je .L7_36 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax ALIGN_4 .L7_32: - KERNEL2x3_1 - KERNEL2x3_2 - KERNEL2x3_3 - KERNEL2x3_4 - - KERNEL2x3_1 - KERNEL2x3_2 - KERNEL2x3_3 - KERNEL2x3_4 - - je .L7_36 - - KERNEL2x3_1 - KERNEL2x3_2 - KERNEL2x3_3 - KERNEL2x3_4 + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN - KERNEL2x3_1 - KERNEL2x3_2 - KERNEL2x3_3 - KERNEL2x3_4 + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN - je .L7_36 - - jmp .L7_32 + dec %rax + jne .L7_32 ALIGN_4 .L7_36: @@ -2630,21 +2467,14 @@ andq $7, %rax # if (k & 1) je .L7_39 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax ALIGN_4 .L7_37: - KERNEL2x3_SUB + KERNEL2x3_SUBN - jl .L7_37 + dec %rax + jne .L7_37 ALIGN_4 @@ -2663,50 +2493,30 @@ .L7_41: leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO + addq $12 * SIZE, BO vzeroall movq K, %rax - andq $-8, %rax + sarq $3, %rax je .L7_46 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax ALIGN_4 .L7_42: + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN - KERNEL1x3_1 - KERNEL1x3_2 - KERNEL1x3_3 - KERNEL1x3_4 - - KERNEL1x3_1 - KERNEL1x3_2 - KERNEL1x3_3 - KERNEL1x3_4 - - je .L7_46 - - KERNEL1x3_1 - KERNEL1x3_2 - KERNEL1x3_3 - KERNEL1x3_4 - - KERNEL1x3_1 - KERNEL1x3_2 - KERNEL1x3_3 - KERNEL1x3_4 + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN - je .L7_46 - - jmp .L7_42 + dec %rax + jne .L7_42 ALIGN_4 .L7_46: @@ -2715,20 +2525,14 @@ andq $7, %rax # if (k & 1) je .L7_49 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax ALIGN_4 .L7_47: - KERNEL1x3_SUB + KERNEL1x3_SUBN - jl .L7_47 + dec %rax + jne .L7_47 ALIGN_4 diff --git a/param.h b/param.h index e4b3871b1..38ac15cf5 100644 --- a/param.h +++ b/param.h @@ -1164,6 +1164,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SWITCH_RATIO 4 #ifdef ARCH_X86 + #define SGEMM_DEFAULT_UNROLL_M 4 #define DGEMM_DEFAULT_UNROLL_M 2 #define QGEMM_DEFAULT_UNROLL_M 2 @@ -1177,44 +1178,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
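/*
 * The unroll defaults in this hunk are the register-tile shape seen by
 * the generic driver: DGEMM 16x2 and SGEMM 16x4 match the two new
 * Haswell kernels. Internally the dgemm kernel walks N in stripes of
 * six (Ndiv6, BUFFER1/BUFFER2), i.e. two 3-column passes whose 16x3
 * tile fills ymm4..ymm15 with accumulators; a quick budget check:

// register-budget arithmetic for the 16x3 dgemm micro-tile (sketch)
enum {
    DOUBLES_PER_YMM = 4,
    TILE_M = 16,                // DGEMM_DEFAULT_UNROLL_M
    TILE_N = 3,                 // columns per inner pass (2 passes = 6)
    ACCS = (TILE_M / DOUBLES_PER_YMM) * TILE_N  // = 12 -> ymm4..ymm15
};

 */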
#define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_N 2 #define XGEMM_DEFAULT_UNROLL_N 1 + #else -#define SGEMM_DEFAULT_UNROLL_M 8 -#define DGEMM_DEFAULT_UNROLL_M 8 + +#define SGEMM_DEFAULT_UNROLL_M 16 +#define DGEMM_DEFAULT_UNROLL_M 16 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 8 #define ZGEMM_DEFAULT_UNROLL_M 4 #define XGEMM_DEFAULT_UNROLL_M 1 -#define SGEMM_DEFAULT_UNROLL_N 8 -#define DGEMM_DEFAULT_UNROLL_N 4 +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 2 #define QGEMM_DEFAULT_UNROLL_N 2 -#define CGEMM_DEFAULT_UNROLL_N 4 -#define ZGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 #define XGEMM_DEFAULT_UNROLL_N 1 + #endif +#ifdef ARCH_X86 + #define SGEMM_DEFAULT_P 512 #define SGEMM_DEFAULT_R sgemm_r -//#define SGEMM_DEFAULT_R 1024 - #define DGEMM_DEFAULT_P 512 #define DGEMM_DEFAULT_R dgemm_r -//#define DGEMM_DEFAULT_R 1024 - #define QGEMM_DEFAULT_P 504 #define QGEMM_DEFAULT_R qgemm_r - #define CGEMM_DEFAULT_P 128 -//#define CGEMM_DEFAULT_R cgemm_r #define CGEMM_DEFAULT_R 1024 - #define ZGEMM_DEFAULT_P 512 #define ZGEMM_DEFAULT_R zgemm_r -//#define ZGEMM_DEFAULT_R 1024 - #define XGEMM_DEFAULT_P 252 #define XGEMM_DEFAULT_R xgemm_r - #define SGEMM_DEFAULT_Q 256 #define DGEMM_DEFAULT_Q 256 #define QGEMM_DEFAULT_Q 128 @@ -1222,7 +1218,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM_DEFAULT_Q 192 #define XGEMM_DEFAULT_Q 128 -#define GETRF_FACTOR 0.72 +#else + +#define SGEMM_DEFAULT_P 768 +#define DGEMM_DEFAULT_P 192 +#define CGEMM_DEFAULT_P 384 +#define ZGEMM_DEFAULT_P 192 + +#define SGEMM_DEFAULT_Q 168 +#define DGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 168 +#define ZGEMM_DEFAULT_Q 168 + +#define SGEMM_DEFAULT_R sgemm_r +#define DGEMM_DEFAULT_R dgemm_r +#define CGEMM_DEFAULT_R cgemm_r +#define ZGEMM_DEFAULT_R zgemm_r + +#define QGEMM_DEFAULT_Q 128 +#define QGEMM_DEFAULT_P 504 +#define QGEMM_DEFAULT_R qgemm_r +#define XGEMM_DEFAULT_P 252 +#define XGEMM_DEFAULT_R xgemm_r +#define XGEMM_DEFAULT_Q 128 + +#define CGEMM3M_DEFAULT_UNROLL_N 4 +#define CGEMM3M_DEFAULT_UNROLL_M 8 +#define ZGEMM3M_DEFAULT_UNROLL_N 2 +#define ZGEMM3M_DEFAULT_UNROLL_M 8 +#endif + #endif From a77c71eaf5c149ce7aee812df2e07e583ca26846 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Mon, 28 Oct 2013 10:23:47 +0100 Subject: [PATCH 028/127] added highly optimized dgemm_kernel for HASWELL --- driver/level3/level3.c | 3 +- driver/level3/level3_thread.c | 2 - kernel/x86_64/KERNEL.HASWELL | 77 +- kernel/x86_64/dgemm_kernel_4x4_haswell.S | 3479 ++++++++++++++++++++++ param.h | 11 +- 5 files changed, 3514 insertions(+), 58 deletions(-) create mode 100644 kernel/x86_64/dgemm_kernel_4x4_haswell.S diff --git a/driver/level3/level3.c b/driver/level3/level3.c index 959c7f1cc..a44022398 100644 --- a/driver/level3/level3.c +++ b/driver/level3/level3.c @@ -334,13 +334,12 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, min_jj = min_j + js - jjs; #if defined(HASWELL) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX) - if (min_jj >= 12*GEMM_UNROLL_N) min_jj = 12*GEMM_UNROLL_N; - else if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; else if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + #else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index cd99172d3..673afcf97 100644 --- a/driver/level3/level3_thread.c +++ 
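/*
 * Patch 028 narrows the Haswell-specific clamp on the packed-B stripe
 * width in level3.c and level3_thread.c: the 12x and 6x GEMM_UNROLL_N
 * steps are dropped, leaving 3*GEMM_UNROLL_N as the largest stripe.
 * With the new kernel's UNROLL_N of 4 that is 12 columns, exactly one
 * pass of the KERNEL4x12 path. A sketch of the surviving logic
 * (assumed final form, since the leading branch of the chain is
 * removed here):

static long clamp_min_jj(long min_jj, long unroll_n)
{
    if (min_jj >= 3 * unroll_n) min_jj = 3 * unroll_n;  // 12 cols on Haswell
    else if (min_jj > unroll_n) min_jj = unroll_n;
    return min_jj;
}

 */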
b/driver/level3/level3_thread.c @@ -368,8 +368,6 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, min_jj = MIN(n_to, xxx + div_n) - jjs; #if defined(HASWELL) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX) - if (min_jj >= 12*GEMM_UNROLL_N) min_jj = 12*GEMM_UNROLL_N; - else if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; else if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index c321be752..447481019 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -1,63 +1,43 @@ -SGEMMKERNEL = sgemm_kernel_8x8_sandy.S -SGEMMINCOPY = -SGEMMITCOPY = -SGEMMONCOPY = ../generic/gemm_ncopy_8.c -SGEMMOTCOPY = ../generic/gemm_tcopy_8.c -SGEMMINCOPYOBJ = -SGEMMITCOPYOBJ = +SGEMMKERNEL = sgemm_kernel_16x4_haswell.S +SGEMMINCOPY = ../generic/gemm_ncopy_16.c +SGEMMITCOPY = ../generic/gemm_tcopy_16.c +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) -DGEMMKERNEL = dgemm_kernel_4x8_sandy.S -DGEMMINCOPY = ../generic/gemm_ncopy_8.c -DGEMMITCOPY = ../generic/gemm_tcopy_8.c -#DGEMMONCOPY = gemm_ncopy_4.S + +DGEMMKERNEL = dgemm_kernel_4x4_haswell.S +DGEMMINCOPY = +DGEMMITCOPY = DGEMMONCOPY = ../generic/gemm_ncopy_4.c DGEMMOTCOPY = ../generic/gemm_tcopy_4.c -#DGEMMOTCOPY = gemm_tcopy_4.S -DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) -DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMINCOPYOBJ = +DGEMMITCOPYOBJ = DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) -#CGEMMKERNEL = zgemm_kernel_2x4_nehalem.S -CGEMMKERNEL = cgemm_kernel_4x8_sandy.S -CGEMMINCOPY = ../generic/zgemm_ncopy_8_sandy.c -CGEMMITCOPY = ../generic/zgemm_tcopy_8_sandy.c -CGEMMONCOPY = ../generic/zgemm_ncopy_4_sandy.c -CGEMMOTCOPY = ../generic/zgemm_tcopy_4_sandy.c + +CGEMMKERNEL = cgemm_kernel_8x2_haswell.S +CGEMMINCOPY = ../generic/zgemm_ncopy_8.c +CGEMMITCOPY = ../generic/zgemm_tcopy_8.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) -#ZGEMMKERNEL = zgemm_kernel_1x4_nehalem.S -ZGEMMKERNEL = zgemm_kernel_4x4_sandy.S -ZGEMMINCOPY = -ZGEMMITCOPY = -ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c -ZGEMMINCOPYOBJ = -ZGEMMITCOPYOBJ = + +ZGEMMKERNEL = zgemm_kernel_4x2_haswell.S +ZGEMMINCOPY = ../generic/zgemm_ncopy_4.c +ZGEMMITCOPY = ../generic/zgemm_tcopy_4.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) -#STRSMKERNEL_LN = trsm_kernel_LN_4x8_nehalem.S -#STRSMKERNEL_LT = trsm_kernel_LT_4x8_nehalem.S -#STRSMKERNEL_RN = trsm_kernel_LT_4x8_nehalem.S -#STRSMKERNEL_RT = trsm_kernel_RT_4x8_nehalem.S - -#DTRSMKERNEL_LN = trsm_kernel_LN_2x8_nehalem.S -#DTRSMKERNEL_LT = trsm_kernel_LT_2x8_nehalem.S -#DTRSMKERNEL_RN = trsm_kernel_LT_2x8_nehalem.S -#DTRSMKERNEL_RT = trsm_kernel_RT_2x8_nehalem.S - -#CTRSMKERNEL_LN 
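# Note on the pairings in this file: each GEMM kernel is matched with
# packing ("copy") routines of its own unroll widths. The sgemm 16x4
# kernel takes the generic 16-wide A copies and 4-wide B copies, while
# the dgemm 4x4 kernel leaves DGEMMINCOPY/DGEMMITCOPY empty, apparently
# because a square 4x4 tile can reuse the same 4-wide ncopy/tcopy for
# both operands.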
= ztrsm_kernel_LN_2x4_nehalem.S -#CTRSMKERNEL_LT = ztrsm_kernel_LT_2x4_nehalem.S -#CTRSMKERNEL_RN = ztrsm_kernel_LT_2x4_nehalem.S -#CTRSMKERNEL_RT = ztrsm_kernel_RT_2x4_nehalem.S - -#ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x4_nehalem.S -#ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x4_nehalem.S -#ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x4_nehalem.S -#ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x4_nehalem.S STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c @@ -78,7 +58,6 @@ ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - - CGEMM3MKERNEL = zgemm3m_kernel_4x8_nehalem.S ZGEMM3MKERNEL = zgemm3m_kernel_2x8_nehalem.S + diff --git a/kernel/x86_64/dgemm_kernel_4x4_haswell.S b/kernel/x86_64/dgemm_kernel_4x4_haswell.S new file mode 100644 index 000000000..1bfb71572 --- /dev/null +++ b/kernel/x86_64/dgemm_kernel_4x4_haswell.S @@ -0,0 +1,3479 @@ +/********************************************************************************* +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************************/ + + +/********************************************************************* +* 2013/10/28 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK + +* +* +* 2013/10/27 Saar +* Parameter: +* DGEMM_DEFAULT_UNROLL_N 4 +* DGEMM_DEFAULT_UNROLL_M 4 +* DGEMM_DEFAULT_P 512 +* DGEMM_DEFAULT_Q 256 +* A_PR1 512 +* B_PR1 512 +* +* +* Performance at 9216x9216x9216: +* 1 thread: 53.3 GFLOPS (MKL: 54) +* 2 threads: 100.0 GFLOPS (MKL: 97) +* 3 threads: 147.0 GFLOPS (MKL: 133) +* 4 threads: 184.0 GFLOPS (MKL: 170) +*********************************************************************/ + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 +#define BO3 %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 512*8*12 + +#define Ndiv12 24(%rsp) +#define Nmod12 32(%rsp) +#define N 40(%rsp) +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) +#define BUFFER1 128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + +#define A_PR1 512 +#define B_PR1 512 + +/******************************************************************************************* +* Macro definitions +*******************************************************************************************/ + +.macro INIT4x12 + + vxorpd %ymm4 , %ymm4 , %ymm4 + vxorpd %ymm5 , %ymm5 , %ymm5 + vxorpd %ymm6 , %ymm6 , %ymm6 + vxorpd %ymm7 , %ymm7 , %ymm7 + vxorpd %ymm8 , %ymm8 , %ymm8 + vxorpd %ymm9 , %ymm9 , %ymm9 + vxorpd %ymm10, %ymm10, %ymm10 + vxorpd %ymm11, %ymm11, %ymm11 + vxorpd %ymm12, %ymm12, %ymm12 + vxorpd %ymm13, %ymm13, %ymm13 + vxorpd %ymm14, %ymm14, %ymm14 + vxorpd %ymm15, %ymm15, %ymm15 + +.endm + +.macro KERNEL4x12_I + prefetcht0 A_PR1(AO) + vmovups -12 * SIZE(BO), %ymm1 + prefetcht0 B_PR1(BO) + vmovups -16 * SIZE(AO), %ymm0 + prefetcht0 B_PR1+64(BO) + vmovups -8 * SIZE(BO), %ymm2 + prefetcht0 B_PR1+128(BO) + vmovups -4 * SIZE(BO), %ymm3 + vmulpd %ymm0 ,%ymm1 , %ymm4 + prefetcht0 B_PR1+192(BO) + vmulpd %ymm0 ,%ymm2 , %ymm8 + vmulpd %ymm0 ,%ymm3 , %ymm12 + prefetcht0 B_PR1+256(BO) + vpermpd $0xb1, %ymm0 , %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm5 + vmulpd %ymm0 ,%ymm2 , %ymm9 + vmulpd %ymm0 ,%ymm3 , %ymm13 + vpermpd $0x1b, %ymm0 , %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm6 + vmulpd %ymm0 ,%ymm2 , %ymm10 + + addq $12*SIZE, BO + vmulpd %ymm0 ,%ymm3 , %ymm14 + vpermpd $0xb1, %ymm0 , %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm7 + vmovups -12 * SIZE(BO), %ymm1 + vmulpd %ymm0 ,%ymm2 , %ymm11 + vmovups 
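/**********************************************************************
* KERNEL4x12_* avoid per-element broadcasts of A: one 4-double vector
* of A is rotated through four vpermpd patterns so that every a_i
* eventually lines up with every lane of the three B vectors. The
* accumulators therefore hold each 4x4 sub-tile in diagonal order,
* which SAVE4x12 unscrambles later. A sketch of one k-step for a
* single 4x4 sub-tile, using AVX2/FMA intrinsics (illustration only):

#include <immintrin.h>

static void step4x4(__m256d a, __m256d b, __m256d acc[4])
{
    acc[0] = _mm256_fmadd_pd(a, b, acc[0]);   // a = [0 1 2 3]
    a = _mm256_permute4x64_pd(a, 0xb1);       // a = [1 0 3 2]
    acc[1] = _mm256_fmadd_pd(a, b, acc[1]);
    a = _mm256_permute4x64_pd(a, 0x1b);       // a = [2 3 0 1]
    acc[2] = _mm256_fmadd_pd(a, b, acc[2]);
    a = _mm256_permute4x64_pd(a, 0xb1);       // a = [3 2 1 0]
    acc[3] = _mm256_fmadd_pd(a, b, acc[3]);
}

**********************************************************************/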
-8 * SIZE(BO), %ymm2 + vmulpd %ymm0 ,%ymm3 , %ymm15 + vmovups -4 * SIZE(BO), %ymm3 + +.endm + +.macro KERNEL4x12_M1 + prefetcht0 A_PR1(AO) + vmovups -16 * SIZE(AO), %ymm0 + prefetcht0 B_PR1(BO) + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + prefetcht0 B_PR1+64(BO) + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 + prefetcht0 B_PR1+128(BO) + vfmadd231pd %ymm0 ,%ymm3 , %ymm12 + vpermpd $0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + vfmadd231pd %ymm0 ,%ymm3 , %ymm13 + vpermpd $0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + + vfmadd231pd %ymm0 ,%ymm3 , %ymm14 + vpermpd $0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vmovups -12 * SIZE(BO), %ymm1 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + vmovups -8 * SIZE(BO), %ymm2 + vfmadd231pd %ymm0 ,%ymm3 , %ymm15 + vmovups -4 * SIZE(BO), %ymm3 + +.endm + +.macro KERNEL4x12_M2 + vmovups -12 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 + vfmadd231pd %ymm0 ,%ymm3 , %ymm12 + vpermpd $0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + vfmadd231pd %ymm0 ,%ymm3 , %ymm13 + vpermpd $0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + + addq $8*SIZE, AO + vfmadd231pd %ymm0 ,%ymm3 , %ymm14 + vpermpd $0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vmovups 0 * SIZE(BO), %ymm1 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + vmovups 4 * SIZE(BO), %ymm2 + vfmadd231pd %ymm0 ,%ymm3 , %ymm15 + vmovups 8 * SIZE(BO), %ymm3 + addq $24*SIZE, BO +.endm + + +.macro KERNEL4x12_E + vmovups -12 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 + vfmadd231pd %ymm0 ,%ymm3 , %ymm12 + vpermpd $0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + vfmadd231pd %ymm0 ,%ymm3 , %ymm13 + vpermpd $0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + + addq $8*SIZE, AO + vfmadd231pd %ymm0 ,%ymm3 , %ymm14 + vpermpd $0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + vfmadd231pd %ymm0 ,%ymm3 , %ymm15 + addq $12*SIZE, BO +.endm + +.macro KERNEL4x12_SUB + vmovups -12 * SIZE(BO), %ymm1 + vmovups -16 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vmovups -8 * SIZE(BO), %ymm2 + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 + vmovups -4 * SIZE(BO), %ymm3 + vfmadd231pd %ymm0 ,%ymm3 , %ymm12 + vpermpd $0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + addq $12*SIZE, BO + vfmadd231pd %ymm0 ,%ymm3 , %ymm13 + vpermpd $0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + addq $4*SIZE, AO + vfmadd231pd %ymm0 ,%ymm3 , %ymm14 + vpermpd $0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + vfmadd231pd %ymm0 ,%ymm3 , %ymm15 + +.endm + + +.macro SAVE4x12 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm6 , %ymm6 + vmulpd %ymm0 , %ymm7 , %ymm7 + + vmulpd %ymm0 , %ymm8 , %ymm8 + vmulpd %ymm0 , %ymm9 , %ymm9 + vmulpd %ymm0 , %ymm10, %ymm10 + vmulpd %ymm0 , %ymm11, %ymm11 + + vmulpd %ymm0 , %ymm12, %ymm12 + vmulpd %ymm0 , %ymm13, %ymm13 + vmulpd %ymm0 , %ymm14, %ymm14 + vmulpd %ymm0 , %ymm15, %ymm15 + + vpermpd $0xb1 , %ymm5, %ymm5 + vpermpd $0xb1 , %ymm7, %ymm7 + + vblendpd $0x0a, %ymm5, %ymm4, %ymm0 + vblendpd $0x05, %ymm5, %ymm4, %ymm1 + vblendpd $0x0a, %ymm7, %ymm6, %ymm2 + vblendpd 
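/**********************************************************************
* Because of the rotations above, accumulator row r holds one diagonal
* of each 4x4 sub-tile: lane i of row r is the partial product for
* C[p_r(i)][i], with p_0..p_3 = identity, [1 0 3 2], [2 3 0 1],
* [3 2 1 0]. The vpermpd/vblendpd pairs in SAVE4x12 regroup those
* diagonals back into columns before the alpha-scaled add. A plain C
* reference of the end result (sketch, hypothetical names):

static const int p[4][4] = {
    {0,1,2,3}, {1,0,3,2}, {2,3,0,1}, {3,2,1,0}
};

static void save4x4_ref(double *c, long ldc, double alpha,
                        const double acc[4][4])
{
    for (int r = 0; r < 4; r++)
        for (int i = 0; i < 4; i++)                 // column-major C
            c[p[r][i] + i * ldc] += alpha * acc[r][i];
}

**********************************************************************/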
$0x05, %ymm7, %ymm6, %ymm3 + + vpermpd $0x1b , %ymm2, %ymm2 + vpermpd $0x1b , %ymm3, %ymm3 + vpermpd $0xb1 , %ymm2, %ymm2 + vpermpd $0xb1 , %ymm3, %ymm3 + + vblendpd $0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $0x03, %ymm2, %ymm0 , %ymm6 + vblendpd $0x03, %ymm3, %ymm1 , %ymm7 + + leaq (CO1, LDC, 2), %rax + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4, %ymm4 + vaddpd (CO1, LDC), %ymm5, %ymm5 + vaddpd (%rax), %ymm6, %ymm6 + vaddpd (%rax, LDC), %ymm7, %ymm7 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm6 , (%rax) + vmovups %ymm7 , (%rax, LDC) + + prefetcht0 32(CO1) + prefetcht0 32(CO1,LDC) + prefetcht0 32(%rax) + prefetcht0 32(%rax,LDC) + + vpermpd $0xb1 , %ymm9 , %ymm9 + vpermpd $0xb1 , %ymm11, %ymm11 + + vblendpd $0x0a, %ymm9 , %ymm8 , %ymm0 + vblendpd $0x05, %ymm9 , %ymm8 , %ymm1 + vblendpd $0x0a, %ymm11, %ymm10, %ymm2 + vblendpd $0x05, %ymm11, %ymm10, %ymm3 + + vpermpd $0x1b , %ymm2, %ymm2 + vpermpd $0x1b , %ymm3, %ymm3 + vpermpd $0xb1 , %ymm2, %ymm2 + vpermpd $0xb1 , %ymm3, %ymm3 + + vblendpd $0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $0x03, %ymm2, %ymm0 , %ymm6 + vblendpd $0x03, %ymm3, %ymm1 , %ymm7 + + + leaq (%rax, LDC, 2), %rax + leaq (%rax, LDC, 2), %rbp + +#if !defined(TRMMKERNEL) + + vaddpd (%rax), %ymm4, %ymm4 + vaddpd (%rax, LDC), %ymm5, %ymm5 + vaddpd (%rbp), %ymm6, %ymm6 + vaddpd (%rbp, LDC), %ymm7, %ymm7 + +#endif + + vmovups %ymm4 , (%rax) + vmovups %ymm5 , (%rax, LDC) + vmovups %ymm6 , (%rbp) + vmovups %ymm7 , (%rbp, LDC) + + prefetcht0 32(%rax) + prefetcht0 32(%rax,LDC) + prefetcht0 32(%rbp) + prefetcht0 32(%rbp,LDC) + + vpermpd $0xb1 , %ymm13, %ymm13 + vpermpd $0xb1 , %ymm15, %ymm15 + + vblendpd $0x0a, %ymm13, %ymm12, %ymm0 + vblendpd $0x05, %ymm13, %ymm12, %ymm1 + vblendpd $0x0a, %ymm15, %ymm14, %ymm2 + vblendpd $0x05, %ymm15, %ymm14, %ymm3 + + vpermpd $0x1b , %ymm2, %ymm2 + vpermpd $0x1b , %ymm3, %ymm3 + vpermpd $0xb1 , %ymm2, %ymm2 + vpermpd $0xb1 , %ymm3, %ymm3 + + vblendpd $0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $0x03, %ymm2, %ymm0 , %ymm6 + vblendpd $0x03, %ymm3, %ymm1 , %ymm7 + + + leaq (%rax, LDC, 4), %rax + leaq (%rbp, LDC, 4), %rbp + +#if !defined(TRMMKERNEL) + + vaddpd (%rax), %ymm4, %ymm4 + vaddpd (%rax, LDC), %ymm5, %ymm5 + vaddpd (%rbp), %ymm6, %ymm6 + vaddpd (%rbp, LDC), %ymm7, %ymm7 + +#endif + + vmovups %ymm4 , (%rax) + vmovups %ymm5 , (%rax, LDC) + vmovups %ymm6 , (%rbp) + vmovups %ymm7 , (%rbp, LDC) + + prefetcht0 32(%rax) + prefetcht0 32(%rax,LDC) + prefetcht0 32(%rbp) + prefetcht0 32(%rbp,LDC) + + addq $4*SIZE, CO1 +.endm + +/******************************************************************************************/ + +.macro INIT2x12 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + vxorpd %xmm8 , %xmm8 , %xmm8 + vxorpd %xmm9 , %xmm9 , %xmm9 + vxorpd %xmm10, %xmm10, %xmm10 + vxorpd %xmm11, %xmm11, %xmm11 + vxorpd %xmm12, %xmm12, %xmm12 + vxorpd %xmm13, %xmm13, %xmm13 + vxorpd %xmm14, %xmm14, %xmm14 + vxorpd %xmm15, %xmm15, %xmm15 + +.endm + +.macro KERNEL2x12_SUB + vmovups -16 * SIZE(AO), %xmm0 + vmovddup -12 * SIZE(BO), %xmm1 + vmovddup -11 * SIZE(BO), %xmm2 + vmovddup -10 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm1 , %xmm4 + vmovddup -9 * SIZE(BO), %xmm1 + vfmadd231pd %xmm0 ,%xmm2 , %xmm5 + vmovddup -8 * SIZE(BO), %xmm2 + vfmadd231pd %xmm0 ,%xmm3 , %xmm6 + vmovddup -7 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm1 , %xmm7 + 
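/**********************************************************************
* The narrow-M paths fall back to 128-bit arithmetic: KERNEL2x12_SUB
* keeps one pair of A values in an xmm register and broadcasts each of
* the twelve B values with vmovddup, one FMA per column. A scalar
* reference of the k-step (sketch):

static void kernel2x12_ref(const double *a, const double *b,
                           double acc[12][2])
{
    for (int j = 0; j < 12; j++) {   // vmovddup broadcasts b[j] twice
        acc[j][0] += a[0] * b[j];
        acc[j][1] += a[1] * b[j];
    }
}

**********************************************************************/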
vmovddup -6 * SIZE(BO), %xmm1 + vfmadd231pd %xmm0 ,%xmm2 , %xmm8 + vmovddup -5 * SIZE(BO), %xmm2 + vfmadd231pd %xmm0 ,%xmm3 , %xmm9 + vmovddup -4 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm1 , %xmm10 + vmovddup -3 * SIZE(BO), %xmm1 + vfmadd231pd %xmm0 ,%xmm2 , %xmm11 + vmovddup -2 * SIZE(BO), %xmm2 + vfmadd231pd %xmm0 ,%xmm3 , %xmm12 + vmovddup -1 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm1 , %xmm13 + addq $12*SIZE, BO + vfmadd231pd %xmm0 ,%xmm2 , %xmm14 + addq $2*SIZE, AO + vfmadd231pd %xmm0 ,%xmm3 , %xmm15 + +.endm + +.macro SAVE2x12 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + vmulpd %xmm0 , %xmm5 , %xmm5 + vmulpd %xmm0 , %xmm6 , %xmm6 + vmulpd %xmm0 , %xmm7 , %xmm7 + + vmulpd %xmm0 , %xmm8 , %xmm8 + vmulpd %xmm0 , %xmm9 , %xmm9 + vmulpd %xmm0 , %xmm10, %xmm10 + vmulpd %xmm0 , %xmm11, %xmm11 + + vmulpd %xmm0 , %xmm12, %xmm12 + vmulpd %xmm0 , %xmm13, %xmm13 + vmulpd %xmm0 , %xmm14, %xmm14 + vmulpd %xmm0 , %xmm15, %xmm15 + + + leaq (CO1, LDC, 2), %rax + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %xmm4, %xmm4 + vaddpd (CO1, LDC), %xmm5, %xmm5 + vaddpd (%rax), %xmm6, %xmm6 + vaddpd (%rax, LDC), %xmm7, %xmm7 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (%rax) + vmovups %xmm7 , (%rax, LDC) + + + leaq (%rax, LDC, 2), %rax + leaq (%rax, LDC, 2), %rbp + +#if !defined(TRMMKERNEL) + + vaddpd (%rax), %xmm8 , %xmm4 + vaddpd (%rax, LDC), %xmm9 , %xmm5 + vaddpd (%rbp), %xmm10, %xmm6 + vaddpd (%rbp, LDC), %xmm11, %xmm7 + +#endif + + vmovups %xmm4 , (%rax) + vmovups %xmm5 , (%rax, LDC) + vmovups %xmm6 , (%rbp) + vmovups %xmm7 , (%rbp, LDC) + + + leaq (%rax, LDC, 4), %rax + leaq (%rbp, LDC, 4), %rbp + +#if !defined(TRMMKERNEL) + + vaddpd (%rax), %xmm12, %xmm4 + vaddpd (%rax, LDC), %xmm13, %xmm5 + vaddpd (%rbp), %xmm14, %xmm6 + vaddpd (%rbp, LDC), %xmm15, %xmm7 + +#endif + + vmovups %xmm4 , (%rax) + vmovups %xmm5 , (%rax, LDC) + vmovups %xmm6 , (%rbp) + vmovups %xmm7 , (%rbp, LDC) + + addq $2*SIZE, CO1 +.endm + + +/******************************************************************************************/ + +.macro INIT1x12 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + vxorpd %xmm8 , %xmm8 , %xmm8 + vxorpd %xmm9 , %xmm9 , %xmm9 + vxorpd %xmm10, %xmm10, %xmm10 + vxorpd %xmm11, %xmm11, %xmm11 + vxorpd %xmm12, %xmm12, %xmm12 + vxorpd %xmm13, %xmm13, %xmm13 + vxorpd %xmm14, %xmm14, %xmm14 + vxorpd %xmm15, %xmm15, %xmm15 + +.endm + +.macro KERNEL1x12_SUB + vmovsd -16 * SIZE(AO), %xmm0 + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -11 * SIZE(BO), %xmm2 + vmovsd -10 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm1 , %xmm4 + vmovsd -9 * SIZE(BO), %xmm1 + vfmadd231sd %xmm0 ,%xmm2 , %xmm5 + vmovsd -8 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm3 , %xmm6 + vmovsd -7 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm1 , %xmm7 + vmovsd -6 * SIZE(BO), %xmm1 + vfmadd231sd %xmm0 ,%xmm2 , %xmm8 + vmovsd -5 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm3 , %xmm9 + vmovsd -4 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm1 , %xmm10 + vmovsd -3 * SIZE(BO), %xmm1 + vfmadd231sd %xmm0 ,%xmm2 , %xmm11 + vmovsd -2 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm3 , %xmm12 + vmovsd -1 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm1 , %xmm13 + addq $12*SIZE, BO + vfmadd231sd %xmm0 ,%xmm2 , %xmm14 + addq $1*SIZE, AO + vfmadd231sd %xmm0 ,%xmm3 , %xmm15 + +.endm + +.macro SAVE1x12 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm5 , %xmm5 + vmulsd %xmm0 , %xmm6 , %xmm6 + vmulsd %xmm0 , %xmm7 , %xmm7 + 
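/**********************************************************************
* All SAVE macros follow the same pattern: scale the accumulators by
* ALPHA, then add the existing C values unless the file is built as
* the TRMM kernel; the guard suggests beta handling happens outside
* the kernel. Sketch:

static void save_ref(double *c, double alpha, double acc, int is_trmm)
{
    double v = alpha * acc;
    if (!is_trmm)     // #if !defined(TRMMKERNEL)
        v += *c;      // presumably beta is applied before the kernel runs
    *c = v;
}

**********************************************************************/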
+ vmulsd %xmm0 , %xmm8 , %xmm8 + vmulsd %xmm0 , %xmm9 , %xmm9 + vmulsd %xmm0 , %xmm10, %xmm10 + vmulsd %xmm0 , %xmm11, %xmm11 + + vmulsd %xmm0 , %xmm12, %xmm12 + vmulsd %xmm0 , %xmm13, %xmm13 + vmulsd %xmm0 , %xmm14, %xmm14 + vmulsd %xmm0 , %xmm15, %xmm15 + + + leaq (CO1, LDC, 2), %rax + + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4, %xmm4 + vaddsd (CO1, LDC), %xmm5, %xmm5 + vaddsd (%rax), %xmm6, %xmm6 + vaddsd (%rax, LDC), %xmm7, %xmm7 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm6 , (%rax) + vmovsd %xmm7 , (%rax, LDC) + + + leaq (%rax, LDC, 2), %rax + leaq (%rax, LDC, 2), %rbp + +#if !defined(TRMMKERNEL) + + vaddsd (%rax), %xmm8 , %xmm4 + vaddsd (%rax, LDC), %xmm9 , %xmm5 + vaddsd (%rbp), %xmm10, %xmm6 + vaddsd (%rbp, LDC), %xmm11, %xmm7 + +#endif + + vmovsd %xmm4 , (%rax) + vmovsd %xmm5 , (%rax, LDC) + vmovsd %xmm6 , (%rbp) + vmovsd %xmm7 , (%rbp, LDC) + + + leaq (%rax, LDC, 4), %rax + leaq (%rbp, LDC, 4), %rbp + +#if !defined(TRMMKERNEL) + + vaddsd (%rax), %xmm12, %xmm4 + vaddsd (%rax, LDC), %xmm13, %xmm5 + vaddsd (%rbp), %xmm14, %xmm6 + vaddsd (%rbp, LDC), %xmm15, %xmm7 + +#endif + + vmovsd %xmm4 , (%rax) + vmovsd %xmm5 , (%rax, LDC) + vmovsd %xmm6 , (%rbp) + vmovsd %xmm7 , (%rbp, LDC) + + addq $1*SIZE, CO1 +.endm + + + + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT4x4 + + vxorpd %ymm4 , %ymm4 , %ymm4 + vxorpd %ymm5 , %ymm5 , %ymm5 + vxorpd %ymm6 , %ymm6 , %ymm6 + vxorpd %ymm7 , %ymm7 , %ymm7 + +.endm + +.macro KERNEL4x4_I + prefetcht0 A_PR1(AO) + vmovups -12 * SIZE(BO), %ymm1 + vmovups -16 * SIZE(AO), %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm4 + vpermpd $0xb1, %ymm0 , %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm5 + vpermpd $0x1b, %ymm0 , %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm6 + + addq $4*SIZE, BO + vpermpd $0xb1, %ymm0 , %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm7 + vmovups -12 * SIZE(BO), %ymm1 + +.endm + +.macro KERNEL4x4_M1 + prefetcht0 A_PR1(AO) + vmovups -16 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vpermpd $0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vpermpd $0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + + vpermpd $0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vmovups -12 * SIZE(BO), %ymm1 + +.endm + +.macro KERNEL4x4_M2 + vmovups -12 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vpermpd $0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vpermpd $0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + + addq $8*SIZE, AO + vpermpd $0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vmovups -8 * SIZE(BO), %ymm1 + addq $8*SIZE, BO +.endm + + +.macro KERNEL4x4_E + vmovups -12 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vpermpd $0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vpermpd $0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + + addq $8*SIZE, AO + vpermpd $0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + addq $4*SIZE, BO +.endm + +.macro KERNEL4x4_SUB + vmovups -12 * SIZE(BO), %ymm1 + vmovups -16 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vpermpd $0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + addq $4*SIZE, BO + vpermpd $0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + addq $4*SIZE, AO + vpermpd $0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + +.endm + +.macro SAVE4x4 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + vmulpd 
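/**********************************************************************
* KERNEL4x4_I/M1/M2/E form a software pipeline: _I starts with plain
* vmulpd (so no accumulator init is needed), _M1/_M2 alternate
* even/odd k-steps while preloading the next step's B vector, and _E
* drains without reading past the end of the panel. The dispatch in
* .L4_11/.L12_11 follows this shape (sketch with stub names):

#include <stddef.h>

static void kernel_i(void) {}   // first step: vmulpd, preload next B
static void kernel_m(void) {}   // steady state: _M1/_M2 alternating
static void kernel_e(void) {}   // last step: no preload
static void init_acc(void) {}   // INIT4x4: vxorpd the accumulators

static void pipeline(size_t k)
{
    size_t n8 = k >> 3;                      // sarq $3 ; cmpq $2
    if (n8 >= 2) {
        kernel_i();
        for (int u = 0; u < 7; u++) kernel_m();
        for (size_t i = n8 - 2; i != 0; i--) // .L4_12 main loop
            for (int u = 0; u < 8; u++) kernel_m();
        for (int u = 0; u < 7; u++) kernel_m();
        kernel_e();                          // .L4_12a
    } else if (n8 == 1) {                    // .L4_13: test $1
        kernel_i();
        for (int u = 0; u < 6; u++) kernel_m();
        kernel_e();
    } else {
        init_acc();                          // .L4_14
    }
    // the k & 7 remainder then runs KERNEL4x4_SUB (.L4_17)
}

**********************************************************************/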
%ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm6 , %ymm6 + + vpermpd $0xb1 , %ymm5, %ymm5 + vpermpd $0xb1 , %ymm7, %ymm7 + + vblendpd $0x0a, %ymm5, %ymm4, %ymm0 + vblendpd $0x05, %ymm5, %ymm4, %ymm1 + vblendpd $0x0a, %ymm7, %ymm6, %ymm2 + vblendpd $0x05, %ymm7, %ymm6, %ymm3 + + vpermpd $0x1b , %ymm2, %ymm2 + vpermpd $0x1b , %ymm3, %ymm3 + vpermpd $0xb1 , %ymm2, %ymm2 + vpermpd $0xb1 , %ymm3, %ymm3 + + vblendpd $0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $0x03, %ymm2, %ymm0 , %ymm6 + vblendpd $0x03, %ymm3, %ymm1 , %ymm7 + + leaq (CO1, LDC, 2), %rax + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4, %ymm4 + vaddpd (CO1, LDC), %ymm5, %ymm5 + vaddpd (%rax), %ymm6, %ymm6 + vaddpd (%rax, LDC), %ymm7, %ymm7 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm6 , (%rax) + vmovups %ymm7 , (%rax, LDC) + + addq $4*SIZE, CO1 +.endm + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT2x4 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + +.endm + + +.macro KERNEL2x4_SUB + vmovddup -12 * SIZE(BO), %xmm1 + vmovups -16 * SIZE(AO), %xmm0 + vmovddup -11 * SIZE(BO), %xmm2 + vfmadd231pd %xmm0 ,%xmm1 , %xmm4 + vmovddup -10 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm2 , %xmm5 + vmovddup -9 * SIZE(BO), %xmm8 + vfmadd231pd %xmm0 ,%xmm3 , %xmm6 + addq $4*SIZE, BO + vfmadd231pd %xmm0 ,%xmm8 , %xmm7 + addq $2*SIZE, AO + +.endm + + +.macro SAVE2x4 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + vmulpd %xmm0 , %xmm5 , %xmm5 + vmulpd %xmm0 , %xmm6 , %xmm6 + vmulpd %xmm0 , %xmm7 , %xmm7 + + leaq (CO1, LDC, 2), %rax + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %xmm4, %xmm4 + vaddpd (CO1, LDC), %xmm5, %xmm5 + vaddpd (%rax), %xmm6, %xmm6 + vaddpd (%rax, LDC), %xmm7, %xmm7 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (%rax) + vmovups %xmm7 , (%rax, LDC) + + addq $2*SIZE, CO1 +.endm + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT1x4 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + +.endm + + +.macro KERNEL1x4_SUB + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -16 * SIZE(AO), %xmm0 + vmovsd -11 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm1 , %xmm4 + vmovsd -10 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm2 , %xmm5 + vmovsd -9 * SIZE(BO), %xmm8 + vfmadd231sd %xmm0 ,%xmm3 , %xmm6 + addq $4*SIZE, BO + vfmadd231sd %xmm0 ,%xmm8 , %xmm7 + addq $1*SIZE, AO + +.endm + + +.macro SAVE1x4 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm5 , %xmm5 + vmulsd %xmm0 , %xmm6 , %xmm6 + vmulsd %xmm0 , %xmm7 , %xmm7 + + leaq (CO1, LDC, 2), %rax + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4, %xmm4 + vaddsd (CO1, LDC), %xmm5, %xmm5 + vaddsd (%rax), %xmm6, %xmm6 + vaddsd (%rax, LDC), %xmm7, %xmm7 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm6 , (%rax) + vmovsd %xmm7 , (%rax, LDC) + + addq $1*SIZE, CO1 +.endm + + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT4x2 + + vxorpd 
%xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + +.endm + + +.macro KERNEL4x2_SUB + vmovddup -12 * SIZE(BO), %xmm2 + vmovups -16 * SIZE(AO), %xmm0 + vmovups -14 * SIZE(AO), %xmm1 + vmovddup -11 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm2 , %xmm4 + vfmadd231pd %xmm1 ,%xmm2 , %xmm5 + vfmadd231pd %xmm0 ,%xmm3 , %xmm6 + vfmadd231pd %xmm1 ,%xmm3 , %xmm7 + addq $2*SIZE, BO + addq $4*SIZE, AO + +.endm + + +.macro SAVE4x2 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + vmulpd %xmm0 , %xmm5 , %xmm5 + vmulpd %xmm0 , %xmm6 , %xmm6 + vmulpd %xmm0 , %xmm7 , %xmm7 + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1) , %xmm4, %xmm4 + vaddpd 2 * SIZE(CO1) , %xmm5, %xmm5 + vaddpd (CO1, LDC), %xmm6, %xmm6 + vaddpd 2 * SIZE(CO1, LDC), %xmm7, %xmm7 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , 2 * SIZE(CO1) + vmovups %xmm6 , (CO1, LDC) + vmovups %xmm7 , 2 * SIZE(CO1, LDC) + + addq $4*SIZE, CO1 +.endm + + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT2x2 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm6 , %xmm6 , %xmm6 + +.endm + + +.macro KERNEL2x2_SUB + vmovddup -12 * SIZE(BO), %xmm2 + vmovups -16 * SIZE(AO), %xmm0 + vmovddup -11 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm2 , %xmm4 + vfmadd231pd %xmm0 ,%xmm3 , %xmm6 + addq $2*SIZE, BO + addq $2*SIZE, AO + +.endm + + +.macro SAVE2x2 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + vmulpd %xmm0 , %xmm6 , %xmm6 + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1) , %xmm4, %xmm4 + vaddpd (CO1, LDC), %xmm6, %xmm6 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm6 , (CO1, LDC) + + addq $2*SIZE, CO1 +.endm + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT1x2 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + +.endm + + +.macro KERNEL1x2_SUB + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -16 * SIZE(AO), %xmm0 + vmovsd -11 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm1 , %xmm4 + vfmadd231sd %xmm0 ,%xmm2 , %xmm5 + addq $2*SIZE, BO + addq $1*SIZE, AO + +.endm + + +.macro SAVE1x2 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm5 , %xmm5 + + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4, %xmm4 + vaddsd (CO1, LDC), %xmm5, %xmm5 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + + addq $1*SIZE, CO1 +.endm + + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT4x1 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + +.endm + + +.macro KERNEL4x1_SUB + vmovddup -12 * SIZE(BO), %xmm2 + vmovups -16 * SIZE(AO), %xmm0 + vmovups -14 * SIZE(AO), %xmm1 + vfmadd231pd %xmm0 ,%xmm2 , %xmm4 + vfmadd231pd %xmm1 ,%xmm2 , %xmm5 + addq $1*SIZE, BO + addq $4*SIZE, AO + +.endm + + +.macro SAVE4x1 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + vmulpd %xmm0 , %xmm5 , %xmm5 + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1) , %xmm4, %xmm4 + vaddpd 2 * SIZE(CO1) , %xmm5, %xmm5 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , 2 * SIZE(CO1) + + addq $4*SIZE, CO1 +.endm + + +/******************************************************************************************/ 
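/**********************************************************************
* The 2x1 and 1x1 macros below are the scalar cleanup path: vmovsd /
* vfmadd231sd one element at a time, so any M and N are handled
* correctly no matter how the vector paths divide the matrix. The 1x1
* case reduces to a dot product (reference sketch):

#include <stddef.h>

static void kernel1x1_ref(const double *a, const double *b,
                          double *acc, size_t k)
{
    for (size_t i = 0; i < k; i++)
        *acc += a[i] * b[i];   // one vfmadd231sd per k-step
}

**********************************************************************/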
+/******************************************************************************************/ + +.macro INIT2x1 + + vxorpd %xmm4 , %xmm4 , %xmm4 + +.endm + + +.macro KERNEL2x1_SUB + vmovddup -12 * SIZE(BO), %xmm2 + vmovups -16 * SIZE(AO), %xmm0 + vfmadd231pd %xmm0 ,%xmm2 , %xmm4 + addq $1*SIZE, BO + addq $2*SIZE, AO + +.endm + + +.macro SAVE2x1 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1) , %xmm4, %xmm4 + +#endif + + vmovups %xmm4 , (CO1) + + addq $2*SIZE, CO1 +.endm + + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT1x1 + + vxorpd %xmm4 , %xmm4 , %xmm4 + +.endm + + +.macro KERNEL1x1_SUB + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -16 * SIZE(AO), %xmm0 + vfmadd231sd %xmm0 ,%xmm1 , %xmm4 + addq $1*SIZE, BO + addq $1*SIZE, AO + +.endm + + +.macro SAVE1x1 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4, %xmm4 + +#endif + + vmovsd %xmm4 , (CO1) + + addq $1*SIZE, CO1 +.endm + + +/*******************************************************************************************/ + +#if !defined(TRMMKERNEL) + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + + vmovups %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $12, %rdi + divq %rdi // N / 12 + movq %rax, Ndiv12 // N / 12 + movq %rdx, Nmod12 // N % 12 + + + movq Ndiv12, J + cmpq $0, J + je .L4_0 + ALIGN_4 + +.L12_01: + // copy to sub buffer + movq K, %rax + salq $2,%rax // K * 4 ; read 2 values + movq B, BO1 + leaq (B,%rax, SIZE), BO2 // next offset to BO2 + leaq (BO2,%rax, SIZE), BO3 // next offset to BO2 + + + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $1 , %rax // K / 2 + jz .L12_01a_2 + ALIGN_4 + +.L12_01a_1: + + prefetcht0 512(BO1) + prefetcht0 512(BO2) + prefetcht0 512(BO3) + prefetchw 512(BO) + + + vmovups 0 * SIZE(BO1), %ymm1 + vmovups 4 * SIZE(BO1), %ymm5 + vmovups 0 * SIZE(BO2), %ymm2 + vmovups 4 * SIZE(BO2), %ymm6 + vmovups 0 * SIZE(BO3), %ymm3 + vmovups 4 * SIZE(BO3), %ymm7 + + vmovups %ymm1, 0 * SIZE(BO) + vmovups %ymm2, 4 * SIZE(BO) + vmovups %ymm3, 8 * SIZE(BO) + + vmovups %ymm5, 12 * SIZE(BO) + vmovups %ymm6, 16 * SIZE(BO) + vmovups %ymm7, 20 * SIZE(BO) + + addq $8 * SIZE ,BO1 + addq $8 * SIZE ,BO2 + addq $8 * SIZE ,BO3 + addq $24 *SIZE ,BO + + decq %rax + jnz .L12_01a_1 + + + +.L12_01a_2: + + movq K, %rax + andq $1, %rax // K % 2 + jz .L12_03c + ALIGN_4 + + +.L12_02b: + + vmovups 0 * 
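/**********************************************************************
* .L12_01a_1 packs B before the compute loops: three 4-column panels
* (BO1, BO2, BO3, each K rows deep) are interleaved into BUFFER1 as
* contiguous 12-value rows, two k-steps per iteration, with prefetcht0
* on the sources and prefetchw on the destination. Reference sketch:

#include <stddef.h>

static void pack_b12(double *bo, const double *b1, const double *b2,
                     const double *b3, size_t k)
{
    for (size_t i = 0; i < k; i++) {             // unrolled 2x in the .S
        for (int j = 0; j < 4; j++) bo[j]     = b1[4*i + j];
        for (int j = 0; j < 4; j++) bo[4 + j] = b2[4*i + j];
        for (int j = 0; j < 4; j++) bo[8 + j] = b3[4*i + j];
        bo += 12;
    }
}

**********************************************************************/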
SIZE(BO1), %ymm1 + vmovups 0 * SIZE(BO2), %ymm2 + vmovups 0 * SIZE(BO3), %ymm3 + vmovups %ymm1, 0 * SIZE(BO) + vmovups %ymm2, 4 * SIZE(BO) + vmovups %ymm3, 8 * SIZE(BO) + addq $4*SIZE,BO1 + addq $4*SIZE,BO2 + addq $4*SIZE,BO3 + addq $12*SIZE,BO + decq %rax + jnz .L12_02b + +.L12_03c: + + movq BO3, B // next offset of B + +.L12_10: + movq C, CO1 + leaq (C, LDC, 8), C + leaq (C, LDC, 4), C // c += 12 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L12_20 + + ALIGN_4 + +.L12_11: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + movq K, %rax + + sarq $3, %rax // K / 8 + cmpq $2, %rax + + jl .L12_13 + + + KERNEL4x12_I + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + subq $2, %rax + je .L12_12a + + .align 32 + +.L12_12: + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + dec %rax + jne .L12_12 + +.L12_12a: + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_E + + jmp .L12_16 + + +.L12_13: + + test $1, %rax + jz .L12_14 + + KERNEL4x12_I + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_E + + jmp .L12_16 + + +.L12_14: + + INIT4x12 + + +.L12_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L12_19 + + ALIGN_4 + +.L12_17: + + KERNEL4x12_SUB + + dec %rax + jne .L12_17 + ALIGN_4 + + +.L12_19: + + SAVE4x12 + + decq I # i -- + jne .L12_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L12_20: + // Test rest of M + + testq $3, M + jz .L12_100 // to next 16 lines of N + + +.L12_30: + testq $2, M + jz .L12_40 + + ALIGN_4 + +.L12_31: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT2x12 + + movq K, %rax + + sarq $3, %rax + je .L12_36 + ALIGN_4 + +.L12_32: + + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + + dec %rax + jne .L12_32 + ALIGN_4 + +.L12_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L12_39 + + ALIGN_4 + +.L12_37: + + KERNEL2x12_SUB + + dec %rax + jne .L12_37 + ALIGN_4 + + +.L12_39: + + SAVE2x12 + + ALIGN_4 + +.L12_40: + testq $1, M + jz .L12_100 // to next 3 lines of N + + ALIGN_4 + +.L12_41: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT1x12 + + movq K, %rax + + sarq $3,%rax + je .L12_46 + + ALIGN_4 + +.L12_42: + + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + + + dec %rax + jne .L12_42 + ALIGN_4 + +.L12_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L12_49 + + ALIGN_4 + +.L12_47: + + KERNEL1x12_SUB + + dec %rax + jne .L12_47 + ALIGN_4 + + +.L12_49: + + SAVE1x12 + + ALIGN_4 + +.L12_100: + + decq J // j -- + jg .L12_01 + + +.L4_0: + + cmpq $0, Nmod12 // N % 12 == 0 + je .L999 + + movq Nmod12, J + sarq $2, J // j = j / 4 + je .L2_0 + +.L4_10: + movq C, CO1 + leaq (C, LDC, 4), C // c += 4 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L4_20 + + ALIGN_4 + +.L4_11: + movq B, BO + addq $12 * SIZE, BO + + movq K, %rax + + sarq $3, %rax // K / 8 + cmpq $2, %rax + jl .L4_13 + + + KERNEL4x4_I + KERNEL4x4_M2 
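/**********************************************************************
* Column dispatch: N is cut into 12-wide stripes first (divq by 12
* into Ndiv12/Nmod12, handled by the .L12_* sections), and the
* remainder is finished by the 4-, 2- and 1-column paths (.L4_*,
* .L2_*, .L1_*). Sketch with stub names:

#include <stddef.h>

static void stripe12(void) {}
static void stripe4(void)  {}
static void stripe2(void)  {}
static void stripe1(void)  {}

static void drive_n(size_t n)
{
    size_t rem = n % 12;
    for (size_t j = n / 12; j != 0; j--) stripe12();
    for (size_t j = rem >> 2; j != 0; j--) stripe4();  // sarq $2, J
    if (rem & 2) stripe2();                            // testq $2, J
    if (rem & 1) stripe1();                            // testq $1, J
}

**********************************************************************/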
+ KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + subq $2, %rax + je .L4_12a + + .align 32 + +.L4_12: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + dec %rax + jne .L4_12 + +.L4_12a: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + jmp .L4_16 + + +.L4_13: + + test $1, %rax + jz .L4_14 + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + jmp .L4_16 + + +.L4_14: + + INIT4x4 + + +.L4_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L4_19 + + ALIGN_4 + +.L4_17: + + KERNEL4x4_SUB + + dec %rax + jne .L4_17 + ALIGN_4 + + +.L4_19: + + SAVE4x4 + + decq I # i -- + jg .L4_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L4_20: + // Test rest of M + + testq $3, M + jz .L4_100 // to next 16 lines of N + + +.L4_30: + testq $2, M + jz .L4_40 + + ALIGN_4 + +.L4_31: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT2x4 + + movq K, %rax + + sarq $3, %rax + je .L4_36 + ALIGN_4 + +.L4_32: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + dec %rax + jne .L4_32 + ALIGN_4 + +.L4_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L4_39 + + ALIGN_4 + +.L4_37: + + KERNEL2x4_SUB + + dec %rax + jne .L4_37 + + +.L4_39: + + SAVE2x4 + +.L4_40: + testq $1, M + jz .L4_100 // to next 3 lines of N + + ALIGN_4 + +.L4_41: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT1x4 + + movq K, %rax + + sarq $3,%rax + je .L4_46 + + ALIGN_4 + +.L4_42: + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + dec %rax + jne .L4_42 + ALIGN_4 + +.L4_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L4_49 + + ALIGN_4 + +.L4_47: + + KERNEL1x4_SUB + + dec %rax + jne .L4_47 + ALIGN_4 + + +.L4_49: + + SAVE1x4 + + ALIGN_4 + +.L4_100: + + movq K, %rax + salq $2, %rax // * 4 + leaq (B , %rax, SIZE), B + decq J // j -- + jg .L4_10 + + + + +/***************************************************************************************************************/ + +.L2_0: + + movq Nmod12, J + testq $2, J + je .L1_0 + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L2_20 + + ALIGN_4 + +.L2_11: + movq B, BO + addq $12 * SIZE, BO + + INIT4x2 + + movq K, %rax + sarq $3, %rax // K / 8 + + je .L2_16 + + .align 32 + +.L2_12: + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + dec %rax + jne .L2_12 + + +.L2_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_19 + + ALIGN_4 + +.L2_17: + + KERNEL4x2_SUB + + dec %rax + jne .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE4x2 + + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $3, M + jz .L2_100 // to next 16 lines of N + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: + movq B, BO // first buffer to BO + addq 
$12 * SIZE, BO + + INIT2x2 + + movq K, %rax + + sarq $3, %rax + je .L2_36 + ALIGN_4 + +.L2_32: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + dec %rax + jne .L2_32 + +.L2_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_39 + + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + dec %rax + jne .L2_37 + + +.L2_39: + + SAVE2x2 + +.L2_40: + testq $1, M + jz .L2_100 // to next 3 lines of N + +.L2_41: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT1x2 + + movq K, %rax + + sarq $3,%rax + je .L2_46 + + ALIGN_4 + +.L2_42: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + dec %rax + jne .L2_42 + +.L2_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_49 + + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + dec %rax + jne .L2_47 + +.L2_49: + + SAVE1x2 + +.L2_100: + + movq K, %rax + salq $1, %rax // * 2 + leaq (B , %rax, SIZE), B + +/***************************************************************************************************************/ + +.L1_0: + + movq Nmod12, J + testq $1, J + je .L999 + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L1_20 + + ALIGN_4 + +.L1_11: + movq B, BO + addq $12 * SIZE, BO + + INIT4x1 + + movq K, %rax + + sarq $3, %rax // K / 8 + je .L1_16 + + .align 32 + +.L1_12: + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + dec %rax + jne .L1_12 + + +.L1_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_19 + + ALIGN_4 + +.L1_17: + + KERNEL4x1_SUB + + dec %rax + jne .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE4x1 + + decq I # i -- + jg .L1_11 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $3, M + jz .L1_100 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT2x1 + + movq K, %rax + + sarq $3, %rax + je .L1_36 + ALIGN_4 + +.L1_32: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + + dec %rax + jne .L1_32 + +.L1_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_39 + + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + dec %rax + jne .L1_37 + +.L1_39: + + SAVE2x1 + +.L1_40: + testq $1, M + jz .L1_100 // to next 3 lines of N + + +.L1_41: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT1x1 + + movq K, %rax + + sarq $3,%rax + je .L1_46 + + ALIGN_4 + +.L1_42: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + dec %rax + jne .L1_42 + +.L1_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_49 + + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + dec %rax + jne .L1_47 + + +.L1_49: + + SAVE1x1 + +.L1_100: + + + + +.L999: + vzeroupper + + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 
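/**********************************************************************
* .L999 issues vzeroupper before restoring registers and returning:
* leaving the upper ymm halves dirty would make later SSE code in the
* caller pay AVX/SSE transition penalties. The intrinsic equivalent,
* for reference:

#include <immintrin.h>

static void leave_avx_region(void)
{
    _mm256_zeroupper();   // clears bits 255:128 of every ymm register
}

**********************************************************************/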
144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + +#else +/************************************************************************************* +* TRMM Kernel +*************************************************************************************/ + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + vmovsd OLD_OFFSET, %xmm12 +#endif + vmovups %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + vmovsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $4, %rdi + divq %rdi // N / 4 + movq %rax, Ndiv12 // N / 4 + movq %rdx, Nmod12 // N % 4 + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + + + movq Ndiv12, J + cmpq $0, J + je .L2_0 + ALIGN_4 + +.L4_10: + movq C, CO1 + leaq (C, LDC, 4), C // c += 4 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L4_20 + + ALIGN_4 + +.L4_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,4), BO // add number of values in B + leaq (AO,%rax,4), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + sarq $3, %rax // K / 8 + cmpq $2, %rax + jl .L4_13 + + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + subq $2, %rax + je .L4_12a + + .align 32 + +.L4_12: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + dec %rax + jne .L4_12 + +.L4_12a: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + jmp .L4_16 + + +.L4_13: + + test $1, %rax + jz .L4_14 + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + jmp 
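/**********************************************************************
* The TRMM build replaces the plain K trip count with KK/KKK
* bookkeeping: depending on LEFT/TRANSA, a tile either multiplies the
* trailing K - KK values or the leading KK plus the tile width, and
* AO/BO are pre-advanced to match. A simplified sketch of the
* trip-count selection:

static long trmm_kkk(long k, long kk, int left, int transa,
                     int m_tile, int n_tile)
{
    if ((left && !transa) || (!left && transa))
        return k - kk;                        // movq K,%rax ; subq KK,%rax
    return kk + (left ? m_tile : n_tile);     // addq $4, %rax paths
}

**********************************************************************/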
.L4_16 + + +.L4_14: + + INIT4x4 + + +.L4_16: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L4_19 + + ALIGN_4 + +.L4_17: + + KERNEL4x4_SUB + + dec %rax + jne .L4_17 + ALIGN_4 + + +.L4_19: + + SAVE4x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 4), BO // number of values in B + leaq (AO, %rax, 4), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK // number of values in A +#endif + + decq I # i -- + jg .L4_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L4_20: + // Test rest of M + + testq $3, M + jz .L4_100 // to next 16 lines of N + + +.L4_30: + testq $2, M + jz .L4_40 + + ALIGN_4 + +.L4_31: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,4), BO // add number of values in B + leaq (AO,%rax,2), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT2x4 + + sarq $3, %rax + je .L4_36 + ALIGN_4 + +.L4_32: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + dec %rax + jne .L4_32 + ALIGN_4 + +.L4_36: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L4_39 + + ALIGN_4 + +.L4_37: + + KERNEL2x4_SUB + + dec %rax + jne .L4_37 + + +.L4_39: + + SAVE2x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 4), BO // number of values in B + leaq (AO, %rax, 2), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK // number of values in A +#endif + + +.L4_40: + testq $1, M + jz .L4_100 // to next 3 lines of N + + ALIGN_4 + +.L4_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,4), BO // add number of values in B + leaq (AO,%rax,1), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT1x4 + + sarq $3,%rax + je .L4_46 + + ALIGN_4 + +.L4_42: + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + dec %rax + jne .L4_42 + ALIGN_4 + +.L4_46: + movq KKK, %rax + + andq $7, %rax 
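+	// the unrolled loop above consumes K in blocks of 8 (sarq $3), so the
+	// tail loop below runs the remaining K & 7 iterations one
+	// KERNEL1x4_SUB at a time; the "# if (k & 1)" comment that follows
+	// appears to be a leftover from an older, less unrolled kernel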
# if (k & 1) + je .L4_49 + + ALIGN_4 + +.L4_47: + + KERNEL1x4_SUB + + dec %rax + jne .L4_47 + ALIGN_4 + + +.L4_49: + + SAVE1x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 4), BO // number of values in B + leaq (AO, %rax, 1), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK // number of values in A +#endif + +.L4_100: + +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $4, KK // number of values in B +#endif + + + movq K, %rax + salq $2, %rax // * 4 + leaq (B , %rax, SIZE), B + decq J // j -- + jg .L4_10 + + + + +/***************************************************************************************************************/ + +.L2_0: + + movq Nmod12, J + testq $2, J + je .L1_0 + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L2_20 + + ALIGN_4 + +.L2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,2), BO // add number of values in B + leaq (AO,%rax,4), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT4x2 + + sarq $3, %rax // K / 8 + + je .L2_16 + + .align 32 + +.L2_12: + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + dec %rax + jne .L2_12 + + +.L2_16: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L2_19 + + ALIGN_4 + +.L2_17: + + KERNEL4x2_SUB + + dec %rax + jne .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 2), BO // number of values in B + leaq (AO, %rax, 4), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK // number of values in A +#endif + + + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $3, M + jz .L2_100 // to next 16 lines of N + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,2), BO // add number of values in B + leaq (AO,%rax,2), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || 
(!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT2x2 + + sarq $3, %rax + je .L2_36 + ALIGN_4 + +.L2_32: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + dec %rax + jne .L2_32 + +.L2_36: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L2_39 + + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + dec %rax + jne .L2_37 + + +.L2_39: + + SAVE2x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 2), BO // number of values in B + leaq (AO, %rax, 2), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK // number of values in A +#endif + + +.L2_40: + testq $1, M + jz .L2_100 // to next 3 lines of N + +.L2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,2), BO // add number of values in B + leaq (AO,%rax,1), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT1x2 + + sarq $3,%rax + je .L2_46 + + ALIGN_4 + +.L2_42: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + dec %rax + jne .L2_42 + +.L2_46: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L2_49 + + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + dec %rax + jne .L2_47 + +.L2_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax * SIZE + leaq (BO, %rax, 2), BO // number of values in B + leaq (AO, %rax, 1), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK // number of values in A +#endif + + +.L2_100: + + +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK // number of values in B +#endif + + movq K, %rax + salq $1, %rax // * 2 + leaq (B , %rax, SIZE), B + +/***************************************************************************************************************/ + +.L1_0: + + movq Nmod12, J + testq $1, J + je .L999 + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L1_20 + + ALIGN_4 + +.L1_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,1), BO // add number of values in B 
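+	// TRMM bookkeeping: start this 4x1 tile KK iterations into the packed
+	// panels, with rax = KK * SIZE; B advances KK*1 values (done above)
+	// and A advances KK*4 values (done just below)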
+ leaq (AO,%rax,4), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT4x1 + + sarq $3, %rax // K / 8 + je .L1_16 + + .align 32 + +.L1_12: + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + dec %rax + jne .L1_12 + + +.L1_16: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L1_19 + + ALIGN_4 + +.L1_17: + + KERNEL4x1_SUB + + dec %rax + jne .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE4x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax * SIZE + leaq (BO, %rax, 1), BO // number of values in B + leaq (AO, %rax, 4), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK // number of values in A +#endif + + + decq I # i -- + jg .L1_11 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $3, M + jz .L1_100 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,1), BO // add number of values in B + leaq (AO,%rax,2), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT2x1 + + sarq $3, %rax + je .L1_36 + ALIGN_4 + +.L1_32: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + + dec %rax + jne .L1_32 + +.L1_36: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L1_39 + + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + dec %rax + jne .L1_37 + +.L1_39: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax * SIZE + leaq (BO, %rax, 1), BO // number of values in B + leaq (AO, %rax, 2), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK // number of values in A +#endif + + +.L1_40: + testq $1, M + jz .L1_100 // to next 3 lines of N + + +.L1_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,1), BO // add number of values in B + leaq (AO,%rax,1), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && 
!defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT1x1 + + sarq $3,%rax + je .L1_46 + + ALIGN_4 + +.L1_42: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + dec %rax + jne .L1_42 + +.L1_46: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L1_49 + + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + dec %rax + jne .L1_47 + + +.L1_49: + + SAVE1x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax * SIZE + leaq (BO, %rax, 1), BO // number of values in B + leaq (AO, %rax, 1), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK // number of values in A +#endif + + + +.L1_100: + + +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $1, KK // number of values in B +#endif + + + +.L999: + + vzeroupper + + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + + + + +#endif diff --git a/param.h b/param.h index 38ac15cf5..b6c1f0301 100644 --- a/param.h +++ b/param.h @@ -1182,14 +1182,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #define SGEMM_DEFAULT_UNROLL_M 16 -#define DGEMM_DEFAULT_UNROLL_M 16 +#define DGEMM_DEFAULT_UNROLL_M 4 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 8 #define ZGEMM_DEFAULT_UNROLL_M 4 #define XGEMM_DEFAULT_UNROLL_M 1 #define SGEMM_DEFAULT_UNROLL_N 4 -#define DGEMM_DEFAULT_UNROLL_N 2 +#define DGEMM_DEFAULT_UNROLL_N 4 #define QGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_N 2 @@ -1221,17 +1221,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#else #define SGEMM_DEFAULT_P 768 -#define DGEMM_DEFAULT_P 192 +#define DGEMM_DEFAULT_P 512 #define CGEMM_DEFAULT_P 384 #define ZGEMM_DEFAULT_P 192 #define SGEMM_DEFAULT_Q 168 -#define DGEMM_DEFAULT_Q 128 +#define DGEMM_DEFAULT_Q 256 #define CGEMM_DEFAULT_Q 168 #define ZGEMM_DEFAULT_Q 168 #define SGEMM_DEFAULT_R sgemm_r -#define DGEMM_DEFAULT_R dgemm_r +//#define DGEMM_DEFAULT_R dgemm_r +#define DGEMM_DEFAULT_R 13824 #define CGEMM_DEFAULT_R cgemm_r #define ZGEMM_DEFAULT_R zgemm_r From afe44b0241864889918a5f3390950574fad84657 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Mon, 28 Oct 2013 14:23:48 +0100 Subject: [PATCH 029/127] tests and code cleanup of gemm_kernels for HASWELL --- kernel/x86_64/cgemm_kernel_8x2_haswell.S | 93 ++++++++++----------- kernel/x86_64/sgemm_kernel_16x4_haswell.S | 93 ++++++++++----------- kernel/x86_64/zgemm_kernel_4x2_haswell.S | 98 +++++++++++------------ param.h | 8 +- 4 files changed, 136 insertions(+), 156 deletions(-) diff --git a/kernel/x86_64/cgemm_kernel_8x2_haswell.S b/kernel/x86_64/cgemm_kernel_8x2_haswell.S index bac773969..9729e6d70 100644 --- a/kernel/x86_64/cgemm_kernel_8x2_haswell.S +++ b/kernel/x86_64/cgemm_kernel_8x2_haswell.S @@ -1,61 +1,51 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. */ -/*********************************************************************/ +/********************************************************************************* +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ /********************************************************************* -* 2013/10/19 Saar -* BLASTEST : +* 2013/10/28 Saar +* BLASTEST : OK * CTEST : OK * TEST : OK * -* 2013/08/16 Saar +* 2013/10/28 Saar * Parameter: * CGEMM_DEFAULT_UNROLL_N 2 * CGEMM_DEFAULT_UNROLL_M 8 -* CGEMM_DEFAULT_P 224 -* CGEMM_DEFAULT_Q 224 +* CGEMM_DEFAULT_P 384 +* CGEMM_DEFAULT_Q 192 +* A_PR1 512 +* B_PR1 512 * -* BLASTEST: OK +* Performance at 6912x6912x6912: +* 1 thread: 84 GFLOPS (SANDYBRIDGE: 60) (MKL: 86) +* 2 threads: 153 GFLOPS (SANDYBRIDGE: 114) (MKL: 155) +* 3 threads: 224 GFLOPS (SANDYBRIDGE: 162) (MKL: 222) +* 4 threads: 278 GFLOPS (SANDYBRIDGE: 223) (MKL: 279) * -* Performance: -* 1 thread: 2.04 times faster than sandybridge -* 4 threads: 1.96 times faster than sandybridge -* -* Compile for FMA3: OK * *********************************************************************/ @@ -235,8 +225,8 @@ #endif -#define A_PR1 384 -#define B_PR1 192 +#define A_PR1 512 +#define B_PR1 512 /***************************************************************************************************************************/ @@ -338,6 +328,9 @@ vmovups %ymm10 , (CO1, LDC) vmovups %ymm14 , 8 * SIZE(CO1, LDC) + prefetcht0 64(CO1) + prefetcht0 64(CO1, LDC) + .endm /***************************************************************************************************************************/ diff --git a/kernel/x86_64/sgemm_kernel_16x4_haswell.S b/kernel/x86_64/sgemm_kernel_16x4_haswell.S index c6489277d..78adbafbb 100644 --- a/kernel/x86_64/sgemm_kernel_16x4_haswell.S +++ b/kernel/x86_64/sgemm_kernel_16x4_haswell.S @@ -1,61 +1,51 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. 
Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. */ -/*********************************************************************/ +/********************************************************************************* +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************************/ /********************************************************************* -* 2013/10/19 Saar -* BLASTEST : +* 2013/10/28 Saar +* BLASTEST : OK * CTEST : OK * TEST : OK * -* 2013/08/15 Saar +* 2013/10/28 Saar * Parameter: * SGEMM_DEFAULT_UNROLL_N 4 * SGEMM_DEFAULT_UNROLL_M 16 * SGEMM_DEFAULT_P 768 -* SGEMM_DEFAULT_Q 168 +* SGEMM_DEFAULT_Q 384 +* A_PR1 512 +* B_PR1 512 * -* BLASTEST: OK -* -* Performance: -* 1 thread: 2.22 times faster than sandybridge -* 4 threads: 2.26 times faster than sandybridge * -* Compile for FMA3: OK +* Performance at 9216x9216x9216: +* 1 thread: 86 GFLOPS (SANDYBRIDGE: 59) (MKL: 83) +* 2 threads: 157 GFLOPS (SANDYBRIDGE: 116) (MKL: 155) +* 3 threads: 235 GFLOPS (SANDYBRIDGE: 165) (MKL: 230) +* 4 threads: 288 GFLOPS (SANDYBRIDGE: 223) (MKL: 267) * *********************************************************************/ @@ -162,8 +152,8 @@ #endif -#define A_PR1 384 -#define B_PR1 192 +#define A_PR1 512 +#define B_PR1 512 /******************************************************************************************* * 4 lines of N @@ -230,6 +220,11 @@ vmovups %ymm10, (CO2, LDC) vmovups %ymm11, 8 * SIZE(CO2, LDC) + prefetcht0 64(CO1) + prefetcht0 64(CO1, LDC) + prefetcht0 64(CO2) + prefetcht0 64(CO2, LDC) + .endm diff --git a/kernel/x86_64/zgemm_kernel_4x2_haswell.S b/kernel/x86_64/zgemm_kernel_4x2_haswell.S index f4b8142ce..949f90bea 100644 --- a/kernel/x86_64/zgemm_kernel_4x2_haswell.S +++ b/kernel/x86_64/zgemm_kernel_4x2_haswell.S @@ -1,62 +1,53 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. 
*/ -/*********************************************************************/ - -/********************************************************************* -* 2013/10/19 Saar -* BLASTEST : +/********************************************************************************* +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + +/******************************************************************************** +* 2013/10/28 Saar +* BLASTEST : OK * CTEST : OK * TEST : OK * -* 2013/08/16 Saar +* 2013/10/28 Saar * Parameter: * ZGEMM_DEFAULT_UNROLL_N 2 * ZGEMM_DEFAULT_UNROLL_M 4 -* ZGEMM_DEFAULT_P 112 -* ZGEMM_DEFAULT_Q 224 +* ZGEMM_DEFAULT_P 256 +* ZGEMM_DEFAULT_Q 128 +* A_PR1 512 +* B_PR1 512 * * -* Performance: -* 1 thread: 1.80 times faster than sandybridge -* 4 threads: 1.74 times faster than sandybridge +* Performance at 4608x4608x4608: +* 1 thread: 43 GFLOPS (SANDYBRIDGE: 29) (MKL: 53) +* 2 threads: 85 GFLOPS (SANDYBRIDGE: 59) (MKL: 100) +* 3 threads: 122 GFLOPS (SANDYBRIDGE: 86) (MKL: 138) +* 4 threads: 156 GFLOPS (SANDYBRIDGE: 108) (MKL: 172) * -* Compile for FMA3: OK -* -*********************************************************************/ +********************************************************************************/ #define ASSEMBLER @@ -232,8 +223,8 @@ #endif -#define A_PR1 384 -#define B_PR1 192 +#define A_PR1 512 +#define B_PR1 512 /***************************************************************************************************/ .macro KERNEL4x2_SUB @@ -335,7 +326,8 @@ vmovups %ymm10 , (CO1, LDC) vmovups %ymm14 , 4 * SIZE(CO1, LDC) - + prefetcht0 64(CO1) + prefetcht0 64(CO1, LDC) .endm diff --git a/param.h b/param.h index b6c1f0301..dd613fbf1 100644 --- a/param.h +++ b/param.h @@ -1223,12 +1223,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define SGEMM_DEFAULT_P 768 #define DGEMM_DEFAULT_P 512 #define CGEMM_DEFAULT_P 384 -#define ZGEMM_DEFAULT_P 192 +#define ZGEMM_DEFAULT_P 256 -#define SGEMM_DEFAULT_Q 168 +#define SGEMM_DEFAULT_Q 384 #define DGEMM_DEFAULT_Q 256 -#define CGEMM_DEFAULT_Q 168 -#define ZGEMM_DEFAULT_Q 168 +#define CGEMM_DEFAULT_Q 192 +#define ZGEMM_DEFAULT_Q 128 #define SGEMM_DEFAULT_R sgemm_r //#define DGEMM_DEFAULT_R dgemm_r From 7bccff1512f1e03c8c9e7598ca1879f72443b48f Mon Sep 17 00:00:00 2001 From: wernsaar Date: Tue, 29 Oct 2013 22:53:04 +0100 Subject: [PATCH 030/127] added sgemm_kernel for PILEDRIVER --- kernel/x86_64/KERNEL.PILEDRIVER | 2 +- kernel/x86_64/sgemm_kernel_16x2_piledriver.S | 5258 ++++++++++++++++++ param.h | 4 +- 3 files changed, 5261 insertions(+), 3 deletions(-) create mode 100644 kernel/x86_64/sgemm_kernel_16x2_piledriver.S diff --git a/kernel/x86_64/KERNEL.PILEDRIVER b/kernel/x86_64/KERNEL.PILEDRIVER index c2ed50ba1..2bc0ae709 100644 --- a/kernel/x86_64/KERNEL.PILEDRIVER +++ b/kernel/x86_64/KERNEL.PILEDRIVER @@ -7,7 +7,7 @@ DAXPYKERNEL = daxpy_bulldozer.S DDOTKERNEL = ddot_bulldozer.S DCOPYKERNEL = dcopy_bulldozer.S -SGEMMKERNEL = sgemm_kernel_16x2_bulldozer.S +SGEMMKERNEL = sgemm_kernel_16x2_piledriver.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = ../generic/gemm_tcopy_16.c SGEMMONCOPY = gemm_ncopy_2_bulldozer.S diff --git a/kernel/x86_64/sgemm_kernel_16x2_piledriver.S b/kernel/x86_64/sgemm_kernel_16x2_piledriver.S new file mode 100644 index 000000000..dcfed6bc5 --- /dev/null +++ b/kernel/x86_64/sgemm_kernel_16x2_piledriver.S @@ -0,0 +1,5258 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/********************************************************************* +* +* 2013/10/18 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* +* 2013/10/29 Saar +* +* Parameter: +* UNROLL_M 16 +* UNROLL_N 2 +* SGEMM_P 768 +* SGEMM_Q 192 +* SGEMM_R 12288 +* A_PR1 384 +* B_PR1 192 +* +* Performance at m x n on AMD 8320 (ACML-Version: 5.3.1): +* +* 6144x6144 168.2 GFLOPS with 8 threads on 4 modules (ACML: 158.0 ) (BULLDOZER: 167.4 ) +* 6144x6144 162.7 GFLOPS with 4 threads on 4 modules (ACML: 157.6 ) (BULLDOZER: 159.0 ) +* 6144x6144 82.0 GFLOPS with 2 threads on 2 modules (ACML: 81.4 ) (BULLDOZER: 80.3 ) +* 6144x6144 41.3 GFLOPS with 1 threads on 1 modules (ACML: 41.1 ) (BULLDOZER: 40.4 ) +* +* Performance at m x n on AMD 6380 (ACML-Version: 5.3.1): +* +* 12288x12288 469.5 GFLOPS with 32 threads on 16 modules (ACML: 375.3 ) (BULLDOZER: 445.5 ) +* 12288x12288 442.9 GFLOPS with 16 threads on 16 modules (ACML: 378.5 ) (BULLDOZER: 416.3 ) +* 12288x12288 265.1 GFLOPS with 8 threads on 8 modules (ACML: 218.5 ) (BULLDOZER: 261.5 ) +* 6144x6144 139.7 GFLOPS with 4 threads on 4 modules (ACML: 116.0 ) (BULLDOZER: 137.7 ) +* 6144x6144 70.9 GFLOPS with 2 threads on 2 modules (ACML: 67.4 ) (BULLDOZER: 69.5 ) +* 6144x6144 35.6 GFLOPS with 1 threads on 1 modules (ACML: 36.1 ) (BULLDOZER: 35.1 ) +* +*********************************************************************/ + + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 512*8*4 +#define LB2_OFFSET 512*8*2 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) +#define BUFFER1 128(%rsp) +#define BUFFER2 LB2_OFFSET+128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + + + +#define A_PR1 384 +#define B_PR1 192 + +/******************************************************************************************* +* 3 lines of N +*******************************************************************************************/ + +#define KERNEL16x3_1(xx) \ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps 
%xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ + +#define KERNEL16x3_2(xx) \ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ + +#define KERNEL16x3_3(xx) \ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ + +#define KERNEL16x3_4(xx) \ + vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + addq $12, BI ;\ + vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + addq $64, %rax ;\ + vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ + +#define KERNEL16x3_SUB(xx) \ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps 
%xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ + + +/*******************************************************************************************/ + +#define KERNEL8x3_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + +#define KERNEL8x3_2(xx) \ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + +#define KERNEL8x3_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + +#define KERNEL8x3_4(xx) \ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + addq $12, BI ;\ + addq $32, %rax ;\ + +#define KERNEL8x3_SUB(xx) \ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + + +/*******************************************************************************************/ + +#define KERNEL4x3_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps 
%xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL4x3_2(xx) \ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL4x3_3(xx) \ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL4x3_4(xx) \ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + addq $12, BI ;\ + addq $16, %rax ;\ + +#define KERNEL4x3_SUB(xx) \ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +/*******************************************************************************************/ + +#define KERNEL2x3_1(xx) \ + vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ + +#define KERNEL2x3_2(xx) \ + vmovss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ + +#define KERNEL2x3_3(xx) \ + vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ + +#define KERNEL2x3_4(xx) \ + vmovss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss 
%xmm10,%xmm2,%xmm0,%xmm10 ;\ + vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ + addq $12, BI ;\ + addq $8, %rax ;\ + +#define KERNEL2x3_SUB(xx) \ + vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ + +/*******************************************************************************************/ + +#define KERNEL1x3_1(xx) \ + vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL1x3_2(xx) \ + vmovss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL1x3_3(xx) \ + vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL1x3_4(xx) \ + vmovss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + addq $12, BI ;\ + addq $4, %rax ;\ + +#define KERNEL1x3_SUB(xx) \ + vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +/*******************************************************************************************/ + +/******************************************************************************************* +* 2 lines of N +*******************************************************************************************/ + +#define KERNEL16x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + +#define KERNEL16x2_2(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps 
%xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + +#define KERNEL16x2_3(xx) \ + prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + +#define KERNEL16x2_4(xx) \ + prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + addq $8, BI ;\ + addq $64, %rax ;\ + +#define KERNEL16x2_SUB(xx) \ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + + +/*******************************************************************************************/ + +#define KERNEL8x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + +#define KERNEL8x2_2(xx) \ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + +#define KERNEL8x2_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 
;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + +#define KERNEL8x2_4(xx) \ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + addq $8, BI ;\ + addq $32, %rax ;\ + +#define KERNEL8x2_SUB(xx) \ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + + +/*******************************************************************************************/ + +#define KERNEL4x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL4x2_2(xx) \ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL4x2_3(xx) \ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL4x2_4(xx) \ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + addq $8, BI ;\ + addq $16, %rax ;\ + +#define KERNEL4x2_SUB(xx) \ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +/*******************************************************************************************/ + +#define KERNEL2x2_1(xx) \ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + +#define KERNEL2x2_2(xx) \ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + +#define KERNEL2x2_3(xx) \ + vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + +#define KERNEL2x2_4(xx) \ + vmovss 2 * 
SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + addq $8, BI ;\ + addq $8, %rax ;\ + +#define KERNEL2x2_SUB(xx) \ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + +/*******************************************************************************************/ + +#define KERNEL1x2_1(xx) \ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL1x2_2(xx) \ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL1x2_3(xx) \ + vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL1x2_4(xx) \ + vmovss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + addq $8, BI ;\ + addq $4, %rax ;\ + +#define KERNEL1x2_SUB(xx) \ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +/*******************************************************************************************/ + +/******************************************************************************************* +* 1 line of N +*******************************************************************************************/ + +#define KERNEL16x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + +#define KERNEL16x1_2(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + +#define KERNEL16x1_3(xx) \ + prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups 8 * 
SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + +#define KERNEL16x1_4(xx) \ + prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + addq $4, BI ;\ + addq $64, %rax ;\ + +#define KERNEL16x1_SUB(xx) \ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + + +/*******************************************************************************************/ + +#define KERNEL8x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + +#define KERNEL8x1_2(xx) \ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + +#define KERNEL8x1_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + +#define KERNEL8x1_4(xx) \ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + addq $4, BI ;\ + addq $32, %rax ;\ + +#define KERNEL8x1_SUB(xx) \ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + + +/*******************************************************************************************/ + +#define KERNEL4x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL4x1_2(xx) \ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL4x1_3(xx) \ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL4x1_4(xx) \ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + addq $4, BI ;\ + addq $16, %rax ;\ + +#define KERNEL4x1_SUB(xx) \ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + 
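+
+/* Macro naming used throughout this file: KERNEL<M>x<N>_<i> updates an
+ * <M>-row by <N>-column tile of C. _1 .. _4 are four consecutive K
+ * iterations of the 8-fold unrolled inner loop (the BI/%rax index
+ * updates are folded into _4), and _SUB is the single-iteration
+ * variant used to drain the K % 8 remainder. */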
+/*******************************************************************************************/ + +#define KERNEL2x1_1(xx) \ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + +#define KERNEL2x1_2(xx) \ + vmovss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + +#define KERNEL2x1_3(xx) \ + vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + +#define KERNEL2x1_4(xx) \ + vmovss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + addq $4, BI ;\ + addq $8, %rax ;\ + +#define KERNEL2x1_SUB(xx) \ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + +/*******************************************************************************************/ + +#define KERNEL1x1_1(xx) \ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL1x1_2(xx) \ + vmovss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL1x1_3(xx) \ + vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL1x1_4(xx) \ + vmovss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + addq $4, BI ;\ + addq $4, %rax ;\ + +#define KERNEL1x1_SUB(xx) \ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +/*******************************************************************************************/ + +#if !defined(TRMMKERNEL) + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $6, %rdi + divq %rdi // N / 6 + movq %rax, Ndiv6 // N / 6 + movq %rdx, Nmod6 // N % 6 + + + movq Ndiv6, J 
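+
+ // J counts blocks of 6 columns of B. Each block is handled as two
+ // passes of 3 columns (.L6_* reads BUFFER1, .L7_* reads BUFFER2);
+ // the N % 6 remainder is processed afterwards in blocks of 2
+ // (.L2_01) and 1 (.L1_01).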
+ cmpq $0, J + je .L2_0 + ALIGN_4 + +.L6_01: + // copy to sub buffer + movq K, %rax + salq $1,%rax // K * 2 ; read 2 values + movq B, BO1 + leaq (B,%rax, SIZE), BO2 // next offset to BO2 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $3 , %rax // K / 8 + jz .L6_01a_2 + ALIGN_4 + +.L6_01a_1: + + prefetcht0 512(BO1) + prefetcht0 512(BO2) + prefetchw 512(BO) + + vmovsd 0 * SIZE(BO1), %xmm0 + vmovsd 2 * SIZE(BO1), %xmm2 + vmovsd 4 * SIZE(BO1), %xmm4 + vmovsd 6 * SIZE(BO1), %xmm6 + vmovss 0 * SIZE(BO2), %xmm1 + vmovss 2 * SIZE(BO2), %xmm3 + vmovss 4 * SIZE(BO2), %xmm5 + vmovss 6 * SIZE(BO2), %xmm7 + vmovsd %xmm0, 0*SIZE(BO) + vmovss %xmm1, 2*SIZE(BO) + vmovsd %xmm2, 3*SIZE(BO) + vmovss %xmm3, 5*SIZE(BO) + vmovsd %xmm4, 6*SIZE(BO) + vmovss %xmm5, 8*SIZE(BO) + vmovsd %xmm6, 9*SIZE(BO) + vmovss %xmm7,11*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + vmovsd 0 * SIZE(BO1), %xmm0 + vmovsd 2 * SIZE(BO1), %xmm2 + vmovsd 4 * SIZE(BO1), %xmm4 + vmovsd 6 * SIZE(BO1), %xmm6 + vmovss 0 * SIZE(BO2), %xmm1 + vmovss 2 * SIZE(BO2), %xmm3 + vmovss 4 * SIZE(BO2), %xmm5 + vmovss 6 * SIZE(BO2), %xmm7 + vmovsd %xmm0, 0*SIZE(BO) + vmovss %xmm1, 2*SIZE(BO) + vmovsd %xmm2, 3*SIZE(BO) + vmovss %xmm3, 5*SIZE(BO) + vmovsd %xmm4, 6*SIZE(BO) + vmovss %xmm5, 8*SIZE(BO) + vmovsd %xmm6, 9*SIZE(BO) + vmovss %xmm7,11*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + decq %rax + jnz .L6_01a_1 + + + +.L6_01a_2: + + movq K, %rax + andq $7, %rax // K % 8 + jz .L6_02c + ALIGN_4 + + +.L6_02b: + + vmovsd 0 * SIZE(BO1), %xmm0 + vmovss 0 * SIZE(BO2), %xmm2 + vmovsd %xmm0, 0*SIZE(BO) + vmovss %xmm2, 2*SIZE(BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO2 + addq $3*SIZE,BO + decq %rax + jnz .L6_02b + +.L6_02c: + + movq K, %rax + salq $1,%rax // K * 2 + leaq (B,%rax, SIZE), BO1 // next offset to BO1 + leaq (BO1,%rax, SIZE), BO2 // next offset to BO2 + leaq BUFFER2, BO // second buffer to BO + movq K, %rax + sarq $3 , %rax // K / 8 + jz .L6_02c_2 + ALIGN_4 + +.L6_02c_1: + + prefetcht0 512(BO2) + prefetchw 512(BO) + + vmovsd 0 * SIZE(BO2), %xmm0 + vmovsd 2 * SIZE(BO2), %xmm2 + vmovsd 4 * SIZE(BO2), %xmm4 + vmovsd 6 * SIZE(BO2), %xmm6 + vmovss 1 * SIZE(BO1), %xmm1 + vmovss 3 * SIZE(BO1), %xmm3 + vmovss 5 * SIZE(BO1), %xmm5 + vmovss 7 * SIZE(BO1), %xmm7 + vmovss %xmm1, 0*SIZE(BO) + vmovsd %xmm0, 1*SIZE(BO) + vmovss %xmm3, 3*SIZE(BO) + vmovsd %xmm2, 4*SIZE(BO) + vmovss %xmm5, 6*SIZE(BO) + vmovsd %xmm4, 7*SIZE(BO) + vmovss %xmm7, 9*SIZE(BO) + vmovsd %xmm6,10*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + + vmovsd 0 * SIZE(BO2), %xmm0 + vmovsd 2 * SIZE(BO2), %xmm2 + vmovsd 4 * SIZE(BO2), %xmm4 + vmovsd 6 * SIZE(BO2), %xmm6 + vmovss 1 * SIZE(BO1), %xmm1 + vmovss 3 * SIZE(BO1), %xmm3 + vmovss 5 * SIZE(BO1), %xmm5 + vmovss 7 * SIZE(BO1), %xmm7 + vmovss %xmm1, 0*SIZE(BO) + vmovsd %xmm0, 1*SIZE(BO) + vmovss %xmm3, 3*SIZE(BO) + vmovsd %xmm2, 4*SIZE(BO) + vmovss %xmm5, 6*SIZE(BO) + vmovsd %xmm4, 7*SIZE(BO) + vmovss %xmm7, 9*SIZE(BO) + vmovsd %xmm6,10*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + decq %rax + jnz .L6_02c_1 + + +.L6_02c_2: + + movq K, %rax + andq $7, %rax // K % 8 + jz .L6_03c + ALIGN_4 + +.L6_03b: + + vmovss 1*SIZE(BO1), %xmm0 + vmovsd 0*SIZE(BO2), %xmm1 + vmovss %xmm0, 0*SIZE(BO) + vmovsd %xmm1, 1*SIZE(BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO2 + addq $3*SIZE,BO + decq %rax + jnz .L6_03b + + +.L6_03c: + + movq BO2, B // next offset of B + +.L6_10: + movq C, CO1 + leaq (C, LDC, 2), C + leaq (C, LDC, 1), C // c += 3 * ldc + + + movq A, AO // 
aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L6_20 + + ALIGN_4 + +.L6_11: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L6_16 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_12: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x3_1(xxx) + KERNEL16x3_2(xxx) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + KERNEL16x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI, SIZE) + KERNEL16x3_2(xxx) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + je .L6_16 + + KERNEL16x3_1(xxx) + KERNEL16x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI, SIZE) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + KERNEL16x3_1(xxx) + KERNEL16x3_2(xxx) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + je .L6_16 + + jmp .L6_12 + ALIGN_4 + +.L6_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_17: + + KERNEL16x3_SUB(xxx) + addq $3, BI + addq $16, %rax + jl .L6_17 + ALIGN_4 + + +.L6_19: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + vfmaddps 8 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 + vfmaddps 12 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 + + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + vmovups %xmm10, 8 * SIZE(CO1) + vmovups %xmm13,12 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + vmovups %xmm11, 8 * SIZE(CO1, LDC) + vmovups %xmm14,12 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) + vmovups %xmm12, 8 * SIZE(CO1, LDC, 2) + vmovups %xmm15,12 * SIZE(CO1, LDC, 2) + + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L6_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L6_20: + // Test rest of M + + testq $15, M + jz .L7_10 // to next 3 lines of N + + testq $8, M + jz .L6_21pre + ALIGN_4 + +/**************************************************************************/ + +.L6_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_20_2: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + KERNEL8x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI, SIZE) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + je .L6_20_6 + + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI, SIZE) + 
KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + je .L6_20_6 + + jmp .L6_20_2 + ALIGN_4 + +.L6_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_20_7: + + KERNEL8x3_SUB(xxx) + addq $3, BI + addq $8, %rax + jl .L6_20_7 + ALIGN_4 + + +.L6_20_9: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) + + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L6_21pre: + + testq $4, M + jz .L6_30 + ALIGN_4 + +.L6_21: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_26 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_22: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI, SIZE) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L6_26 + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI, SIZE) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L6_26 + + jmp .L6_22 + ALIGN_4 + +.L6_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_27: + + KERNEL4x3_SUB(xxx) + addq $3, BI + addq $4, %rax + jl .L6_27 + ALIGN_4 + + +.L6_29: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (CO1, LDC, 2) + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L6_30: + testq $2, M + jz .L6_40 + + ALIGN_4 + +.L6_31: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_36 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_32: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI,SIZE) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L6_36 + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,SIZE) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + 
KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L6_36 + + jmp .L6_32 + ALIGN_4 + +.L6_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_37: + + KERNEL2x3_SUB(xxx) + addq $3, BI + addq $2, %rax + jl .L6_37 + ALIGN_4 + + +.L6_39: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 + vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddss 1 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 + + + vmovss %xmm4 , (CO1) + vmovss %xmm8 , 1 * SIZE(CO1) + vmovss %xmm5 , (CO1, LDC) + vmovss %xmm10, 1 * SIZE(CO1, LDC) + vmovss %xmm6 , (CO1, LDC, 2) + vmovss %xmm12, 1 * SIZE(CO1, LDC, 2) + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L6_40: + testq $1, M + jz .L7_10 // to next 3 lines of N + + ALIGN_4 + +.L6_41: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_46 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_42: + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L6_46 + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L6_46 + + jmp .L6_42 + ALIGN_4 + +.L6_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_47: + + KERNEL1x3_SUB(xxx) + addq $3, BI + addq $1, %rax + jl .L6_47 + ALIGN_4 + + +.L6_49: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , (CO1, LDC) + vmovss %xmm6 , (CO1, LDC, 2) + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + +/***************************************************************************************************************/ + +.L7_10: + movq C, CO1 + leaq (C, LDC, 2), C + leaq (C, LDC, 1), C // c += 3 * ldc + + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L7_20 + + ALIGN_4 + +.L7_11: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L7_16 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_12: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x3_1(xxx) + KERNEL16x3_2(xxx) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + KERNEL16x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI, SIZE) + KERNEL16x3_2(xxx) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + je .L7_16 + + KERNEL16x3_1(xxx) + KERNEL16x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI, SIZE) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + KERNEL16x3_1(xxx) + 
KERNEL16x3_2(xxx) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + je .L7_16 + + jmp .L7_12 + ALIGN_4 + +.L7_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_17: + + KERNEL16x3_SUB(xxx) + addq $3, BI + addq $16, %rax + jl .L7_17 + ALIGN_4 + + +.L7_19: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + vfmaddps 8 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 + vfmaddps 12 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 + + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + vmovups %xmm10, 8 * SIZE(CO1) + vmovups %xmm13,12 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + vmovups %xmm11, 8 * SIZE(CO1, LDC) + vmovups %xmm14,12 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) + vmovups %xmm12, 8 * SIZE(CO1, LDC, 2) + vmovups %xmm15,12 * SIZE(CO1, LDC, 2) + + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L7_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L7_20: + // Test rest of M + + testq $15, M + jz .L7_60 // to next 3 lines of N + + testq $8, M + jz .L7_21pre + ALIGN_4 + +/**************************************************************************/ + +.L7_20_1: + leaq BUFFER2, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_20_2: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + KERNEL8x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI, SIZE) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + je .L7_20_6 + + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI, SIZE) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + je .L7_20_6 + + jmp .L7_20_2 + ALIGN_4 + +.L7_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_20_7: + + KERNEL8x3_SUB(xxx) + addq $3, BI + addq $8, %rax + jl .L7_20_7 + ALIGN_4 + +.L7_20_9: + + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + + vmovups 
%xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) + + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L7_21pre: + + testq $4, M + jz .L7_30 + ALIGN_4 + +.L7_21: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_26 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_22: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI, SIZE) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L7_26 + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI, SIZE) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L7_26 + + jmp .L7_22 + ALIGN_4 + +.L7_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_27: + + KERNEL4x3_SUB(xxx) + addq $3, BI + addq $4, %rax + jl .L7_27 + ALIGN_4 + + +.L7_29: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps (CO1, LDC, 2),%xmm0, %xmm6 ,%xmm6 + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (CO1, LDC, 2) + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L7_30: + testq $2, M + jz .L7_40 + + ALIGN_4 + +.L7_31: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_36 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_32: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI,SIZE) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L7_36 + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,SIZE) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L7_36 + + jmp .L7_32 + ALIGN_4 + +.L7_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_37: + + KERNEL2x3_SUB(xxx) + addq $3, BI + addq $2, %rax + jl .L7_37 + ALIGN_4 + + +.L7_39: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 + vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddss 1 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 + + vmovss %xmm4 , (CO1) + vmovss %xmm8 , 1 * SIZE(CO1) + vmovss %xmm5 , 
(CO1, LDC) + vmovss %xmm10, 1 * SIZE(CO1, LDC) + vmovss %xmm6 , (CO1, LDC, 2) + vmovss %xmm12, 1 * SIZE(CO1, LDC, 2) + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L7_40: + testq $1, M + jz .L7_60 // to next 3 lines of N + + ALIGN_4 + +.L7_41: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_46 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_42: + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L7_46 + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L7_46 + + jmp .L7_42 + ALIGN_4 + +.L7_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_47: + + KERNEL1x3_SUB(xxx) + addq $3, BI + addq $1, %rax + jl .L7_47 + ALIGN_4 + + +.L7_49: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , (CO1, LDC) + vmovss %xmm6 , (CO1, LDC, 2) + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + +.L7_60: + + decq J // j -- + jg .L6_01 + + +.L2_0: + cmpq $0, Nmod6 // N % 6 == 0 + je .L999 + +/************************************************************************************************ +* Loop for Nmod6 / 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + sarq $1, J // j = j / 2 + je .L1_0 + ALIGN_4 + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + +.L2_11: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + je .L2_16 + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + 
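+
+ // K % 8 tail: BI and %rax were negated above so that they count up
+ // towards zero; the jl after each _SUB step exits once the remainder
+ // iterations are drained.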
+.L2_17: + + KERNEL16x2_SUB(xxx) + addq $2, BI + addq $16, %rax + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + vmovups %xmm10, 8 * SIZE(CO1) + vmovups %xmm13,12 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + vmovups %xmm11, 8 * SIZE(CO1, LDC) + vmovups %xmm14,12 * SIZE(CO1, LDC) + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 3 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_2: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_20_6 + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + + KERNEL8x2_SUB(xxx) + addq $2, BI + addq $8, %rax + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L2_21pre: + + testq $4, M + jz .L2_30 + ALIGN_4 + +.L2_21: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 1 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + prefetcht0 B_PR1(BO,BI, 
SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB(xxx) + addq $2, BI + addq $4, %rax + jl .L2_27 + ALIGN_4 + + +.L2_29: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB(xxx) + addq $2, BI + addq $2, %rax + jl .L2_37 + ALIGN_4 + + +.L2_39: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 + + vmovss %xmm4 , (CO1) + vmovss %xmm8 , 1 * SIZE(CO1) + vmovss %xmm5 , (CO1, LDC) + vmovss %xmm10, 1 * SIZE(CO1, LDC) + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + addq $2, BI + addq $1, %rax + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vmovss ALPHA, %xmm0 + + 
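+ // 1x2 write-back: each vfmaddss below computes alpha * accumulator + C
+ // in a single fused multiply-add before the results are stored to
+ // both columns.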
vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , (CO1, LDC) + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + +.L2_60: + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovss (BO1), %xmm0 + vmovss %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + je .L1_16 + + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB(xxx) + addq $1, BI + addq $16, %rax + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + vmovups %xmm10, 8 * SIZE(CO1) + vmovups %xmm13,12 * SIZE(CO1) + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_20_6 + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je 
.L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB(xxx) + addq $1, BI + addq $8, %rax + jl .L1_20_7 + ALIGN_4 + + +.L1_20_9: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB(xxx) + addq $1, BI + addq $4, %rax + jl .L1_27 + ALIGN_4 + + +.L1_29: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + + vmovups %xmm4 , (CO1) + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB(xxx) + addq $1, BI + addq $2, %rax + jl .L1_37 + ALIGN_4 + + +.L1_39: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 + + vmovss %xmm4 , (CO1) + vmovss %xmm8 , 1 * SIZE(CO1) + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + 
ALIGN_4 + +.L1_42: + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + addq $1, BI + addq $1, %rax + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + + vmovss %xmm4 , (CO1) + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + +#else +/************************************************************************************* +* TRMM Kernel +*************************************************************************************/ + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $2, %rdi + divq %rdi // N / 6 + movq %rax, Ndiv6 // N / 6 + movq %rdx, Nmod6 // N % 6 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + movq Ndiv6, J + cmpq $0, J + je .L1_0 + ALIGN_4 + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + 
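+
+ // TRMM 16x2 tile: KK tracks the diagonal offset. Each tile advances
+ // BO/AO by KK, derives the effective trip count KKK from KK according
+ // to the LEFT/TRANSA combination, and (for LEFT) bumps KK by the tile
+ // height after the write-back.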
+.L2_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + je .L2_16 + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL16x2_SUB(xxx) + addq $2, BI + addq $16, %rax + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vbroadcastss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + +#else + vmulps %xmm0, %xmm4,%xmm4 + vmulps %xmm0, %xmm7,%xmm7 + vmulps %xmm0, %xmm10,%xmm10 + vmulps %xmm0, %xmm13,%xmm13 + + vmulps %xmm0, %xmm5,%xmm5 + vmulps %xmm0, %xmm8,%xmm8 + vmulps %xmm0, %xmm11,%xmm11 + vmulps %xmm0, %xmm14,%xmm14 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + vmovups %xmm10, 8 * SIZE(CO1) + vmovups %xmm13,12 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + vmovups %xmm11, 8 * SIZE(CO1, LDC) + vmovups %xmm14,12 * SIZE(CO1, LDC) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + 
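+
+/* The M < 16 tiles that follow (8, 4, 2 and 1 rows) repeat the same
+ * TRMM pattern: offset BO/AO by KK, compute KKK, run the unrolled K
+ * loop, then scale the accumulators by alpha and store, overwriting C
+ * rather than accumulating into it (the TRMMKERNEL branches use plain
+ * multiplies instead of vfmaddps). */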
+/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 3 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_2: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_20_6 + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + + KERNEL8x2_SUB(xxx) + addq $2, BI + addq $8, %rax + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + vbroadcastss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + +#else + vmulps %xmm0, %xmm4,%xmm4 + vmulps %xmm0, %xmm7,%xmm7 + + vmulps %xmm0, %xmm5,%xmm5 + vmulps %xmm0, %xmm8,%xmm8 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L2_21pre: + + testq $4, M + jz .L2_30 + ALIGN_4 + +.L2_21: +#if !defined(TRMMKERNEL) || \ + 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 1 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB(xxx) + addq $2, BI + addq $4, %rax + jl .L2_27 + ALIGN_4 + + +.L2_29: + + vbroadcastss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + +#else + vmulps %xmm0, %xmm4,%xmm4 + vmulps %xmm0, %xmm5,%xmm5 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, 
%rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB(xxx) + addq $2, BI + addq $2, %rax + jl .L2_37 + ALIGN_4 + + +.L2_39: + + vmovss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 + +#else + vmulss %xmm0, %xmm4,%xmm4 + vmulss %xmm0, %xmm8,%xmm8 + vmulss %xmm0, %xmm5,%xmm5 + vmulss %xmm0, %xmm10,%xmm10 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm8 , 1 * SIZE(CO1) + vmovss %xmm5 , (CO1, LDC) + vmovss %xmm10, 1 * SIZE(CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + 
KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + addq $2, BI + addq $1, %rax + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vmovss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + +#else + vmulss %xmm0, %xmm4,%xmm4 + vmulss %xmm0, %xmm5,%xmm5 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , (CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovss (BO1), %xmm0 + vmovss %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + je .L1_16 + + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + 
KERNEL16x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB(xxx) + addq $1, BI + addq $16, %rax + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vbroadcastss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + +#else + vmulps %xmm0, %xmm4,%xmm4 + vmulps %xmm0, %xmm7,%xmm7 + vmulps %xmm0, %xmm10,%xmm10 + vmulps %xmm0, %xmm13,%xmm13 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + vmovups %xmm10, 8 * SIZE(CO1) + vmovups %xmm13,12 * SIZE(CO1) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_20_6 + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB(xxx) + addq $1, BI + addq $8, %rax + jl .L1_20_7 
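
/* Editorial note (not part of the original patch): as in the wider
   blocks, K is split once per tile: andq $-8 selects the part handled
   by the eight-step unrolled loop (.L1_20_2), and andq $7 leaves the
   remainder for the single-step KERNEL8x1_SUB loop above (.L1_20_7).
   A C sketch of the split, with illustrative names: */

static float dot_unrolled(const float *a, const float *b, long k)
{
    float acc = 0.0f;
    long k_main = k & ~7L;           /* andq $-8, %rax */
    long i = 0;
    for (; i < k_main; i += 8)       /* unrolled KERNEL*_1..4 pairs */
        for (long u = 0; u < 8; u++)
            acc += a[i + u] * b[i + u];
    for (; i < k; i++)               /* andq $7, %rax: KERNEL*_SUB  */
        acc += a[i] * b[i];
    return acc;
}
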
+ ALIGN_4 + + +.L1_20_9: + + vbroadcastss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + +#else + vmulps %xmm0, %xmm4,%xmm4 + vmulps %xmm0, %xmm7,%xmm7 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB(xxx) + addq $1, BI + addq $4, %rax + jl .L1_27 + ALIGN_4 + + +.L1_29: + + vbroadcastss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + +#else + vmulps %xmm0, %xmm4,%xmm4 + +#endif + + vmovups %xmm4 , (CO1) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq 
KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB(xxx) + addq $1, BI + addq $2, %rax + jl .L1_37 + ALIGN_4 + + +.L1_39: + + vmovss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 + +#else + vmulss %xmm0, %xmm4,%xmm4 + vmulss %xmm0, %xmm8,%xmm8 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm8 , 1 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + 
jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + addq $1, BI + addq $1, %rax + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vmovss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + +#else + vmulss %xmm0, %xmm4,%xmm4 + +#endif + + vmovss %xmm4 , (CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + + + + +#endif diff --git a/param.h b/param.h index adfe867f8..a833085d3 100644 --- a/param.h +++ b/param.h @@ -358,7 +358,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define XGEMM_DEFAULT_P 56 #if defined(ARCH_X86_64) -#define SGEMM_DEFAULT_Q 168 +#define SGEMM_DEFAULT_Q 192 #define DGEMM_DEFAULT_Q 168 #else #define SGEMM_DEFAULT_Q 224 @@ -369,7 +369,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM_DEFAULT_Q 224 #define XGEMM_DEFAULT_Q 224 -#define SGEMM_DEFAULT_R sgemm_r +#define SGEMM_DEFAULT_R 12288 #define QGEMM_DEFAULT_R qgemm_r #define DGEMM_DEFAULT_R 12288 #define CGEMM_DEFAULT_R cgemm_r From 1cf4b974b2dd271f4014550190a9e05e5bca229c Mon Sep 17 00:00:00 2001 From: wernsaar Date: Wed, 30 Oct 2013 09:12:17 +0100 Subject: [PATCH 031/127] added zgemm_kernel for Piledriver --- kernel/x86_64/KERNEL.PILEDRIVER | 2 +- kernel/x86_64/zgemm_kernel_2x2_piledriver.S | 1428 +++++++++++++++++++ 2 files changed, 1429 insertions(+), 1 deletion(-) create mode 100644 kernel/x86_64/zgemm_kernel_2x2_piledriver.S diff --git a/kernel/x86_64/KERNEL.PILEDRIVER b/kernel/x86_64/KERNEL.PILEDRIVER index 2bc0ae709..5a5b42b64 100644 --- a/kernel/x86_64/KERNEL.PILEDRIVER +++ b/kernel/x86_64/KERNEL.PILEDRIVER @@ -36,7 +36,7 @@ CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) -ZGEMMKERNEL = zgemm_kernel_2x2_bulldozer.S +ZGEMMKERNEL = zgemm_kernel_2x2_piledriver.S ZGEMMINCOPY = ZGEMMITCOPY = ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c diff --git a/kernel/x86_64/zgemm_kernel_2x2_piledriver.S b/kernel/x86_64/zgemm_kernel_2x2_piledriver.S new file mode 100644 index 000000000..9f1392d78 --- /dev/null +++ b/kernel/x86_64/zgemm_kernel_2x2_piledriver.S @@ -0,0 +1,1428 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/********************************************************************* +* +* 2013/10/30 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* +* 2013/10/30 Saar +* +* Parameter: +* UNROLL_M 2 +* UNROLL_N 2 +* ZGEMM_P 384 +* ZGEMM_Q 168 +* A_PR1 512 +* B_PR1 256 +* +* Performance at m x n on AMD 8320 (ACML-Version: 5.3.1): +* +* 3456x3456 82.4 GFLOPS with 8 threads on 4 modules (ACML: 76.3 ) (BULLDOZER: 81.0 ) +* 3456x3456 79.9 GFLOPS with 4 threads on 4 modules (ACML: 69.9 ) (BULLDOZER: 74.6 ) +* 3456x3456 40.4 GFLOPS with 2 threads on 2 modules (ACML: 35.8 ) (BULLDOZER: 37.9 ) +* 3456x3456 20.3 GFLOPS with 1 threads on 1 modules (ACML: 18.1 ) (BULLDOZER: 19.2 ) +* +* Performance at m x n on AMD 6380 (ACML-Version: 5.3.1): +* +* 6912x6912 227.5 GFLOPS with 32 threads on 16 modules (ACML: 166.3 ) (BULLDOZER: 228.5 ) +* 6912x6912 211.6 GFLOPS with 16 threads on 16 modules (ACML: 169.5 ) (BULLDOZER: 204.3 ) +* 6912x6912 123.5 GFLOPS with 8 threads on 8 modules (ACML: 92.7 ) (BULLDOZER: 117.0 ) +* 3456x3456 64.1 GFLOPS with 4 threads on 4 modules (ACML: 49.1 ) (BULLDOZER: 61.7 ) +* 3456x3456 33.4 GFLOPS with 2 threads on 2 modules (ACML: 28.1 ) (BULLDOZER: 30.9 ) +* 3456x3456 17.0 GFLOPS with 1 threads on 1 modules (ACML: 15.2 ) (BULLDOZER: 15.7 ) +* +*********************************************************************/ + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 320 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + 
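
/* Editorial note (not part of the original patch): the VFMADD_R and
   VFMADD_I selection below keeps b_re*a and b_im*a in separate
   accumulator registers for the whole tile; the store-back code then
   combines them with vshufpd/vaddsubpd (the "swap high and low 64
   bytes" comments there mean 64 bits) and applies the complex alpha
   with the same pattern. A minimal C sketch for the non-conjugated
   (NN) case, one complex element at a time, with illustrative names: */

static void zacc_step(double acc_r[2], double acc_i[2],
                      const double a[2],         /* { re, im }        */
                      double b_re, double b_im)  /* vmovddup operands */
{
    acc_r[0] += b_re * a[0];  acc_r[1] += b_re * a[1];  /* VFMADD_R */
    acc_i[0] += b_im * a[0];  acc_i[1] += b_im * a[1];  /* VFMADD_I */
}

static void zacc_store(double c[2], const double acc_r[2],
                       const double acc_i[2],
                       double alpha_r, double alpha_i)
{
    /* vshufpd $0x01 swaps the acc_i lanes, vaddsubpd merges them */
    double t_re = acc_r[0] - acc_i[1];
    double t_im = acc_r[1] + acc_i[0];   /* t = sum over k of a*b */
    /* alpha: two vmulpd plus one vaddsubpd, then vaddpd with C   */
    c[0] += alpha_r * t_re - alpha_i * t_im;
    c[1] += alpha_r * t_im + alpha_i * t_re;
}
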
+#define L_BUFFER_SIZE 512*8*4 +#define LB2_OFFSET 512*8*2 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA_R 48(%rsp) +#define ALPHA_I 56(%rsp) +#define OFFSET 64(%rsp) +#define KK 72(%rsp) +#define KKK 80(%rsp) +#define BUFFER1 128(%rsp) +#define BUFFER2 LB2_OFFSET+128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define VFMADD_R vfmaddpd +#define VFMADD_I vfmaddpd +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define VFMADD_R vfnmaddpd +#define VFMADD_I vfmaddpd +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define VFMADD_R vfmaddpd +#define VFMADD_I vfnmaddpd +#else +#define VFMADD_R vfnmaddpd +#define VFMADD_I vfnmaddpd +#endif + + +#define A_PR1 512 +#define B_PR1 256 + +#define KERNEL2x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL2x2_2(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL2x2_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL2x2_4(xx) \ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup 5 * SIZE(BO, BI, 
SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + addq $16, BI ;\ + addq $16, %rax ;\ + + +#define KERNEL2x2_SUB(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + addq $4, BI ;\ + addq $4, %rax ;\ + +/************************************************************************************************/ + +#define KERNEL1x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_2(xx) \ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_3(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_4(xx) \ + vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $16, BI ;\ + addq $8 , %rax ;\ + + +#define KERNEL1x2_SUB(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $4, BI ;\ + addq $2, %rax ;\ + +/************************************************************************************************/ + +#define KERNEL2x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + 
vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL2x1_2(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL2x1_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL2x1_4(xx) \ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + addq $8, BI ;\ + addq $16, %rax ;\ + + +#define KERNEL2x1_SUB(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + addq $2, BI ;\ + addq $4, %rax ;\ + + +/************************************************************************************************/ + +#define KERNEL1x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_2(xx) \ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_3(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_4(xx) \ + vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $8, BI ;\ + addq $8, %rax ;\ + + +#define KERNEL1x1_SUB(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $2, BI ;\ + addq $2, %rax ;\ + + +/************************************************************************************************/ + + + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + 
movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA_R + vmovsd %xmm1, ALPHA_I + + salq $ZBASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $2, %rdi + divq %rdi // N / 2 + movq %rax, Ndiv6 // N / 2 + movq %rdx, Nmod6 // N % 2 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + +.L2_0: + + movq Ndiv6, J + cmpq $0, J + je .L1_0 + ALIGN_4 + + + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovups (BO1), %xmm0 + vmovups 2 * SIZE(BO1), %xmm1 + vmovups %xmm0, (BO) + vmovups %xmm1, 2 * SIZE(BO) + addq $4*SIZE,BO1 + addq $4*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $8 * SIZE, AO + + movq M, I + sarq $1, I // i = (m >> 1) + je .L2_40 + + ALIGN_4 + +.L2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 
B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL2x2_SUB(xxx) + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + vshufpd $0x01, %xmm15, %xmm15, %xmm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm13,%xmm12, %xmm12 + vaddsubpd %xmm15,%xmm14, %xmm14 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $0x01, %xmm10, %xmm10, %xmm11 + vshufpd $0x01, %xmm12, %xmm12, %xmm13 + vshufpd $0x01, %xmm14, %xmm14, %xmm15 + +#else + vaddsubpd %xmm8, %xmm9 ,%xmm9 + vaddsubpd %xmm10, %xmm11,%xmm11 + vaddsubpd %xmm12, %xmm13,%xmm13 + vaddsubpd %xmm14, %xmm15,%xmm15 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm11, %xmm10 + vmovapd %xmm13, %xmm12 + vmovapd %xmm15, %xmm14 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + vshufpd $0x01, %xmm15, %xmm15, %xmm15 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm10, %xmm0, %xmm10 + vmulpd %xmm12, %xmm0, %xmm12 + vmulpd %xmm14, %xmm0, %xmm14 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm11, %xmm1, %xmm11 + vmulpd %xmm13, %xmm1, %xmm13 + vmulpd %xmm15, %xmm1, %xmm15 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm13,%xmm12, %xmm12 + vaddsubpd %xmm15,%xmm14, %xmm14 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 + + vaddpd (CO1, LDC), %xmm10, %xmm10 + vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 2 * SIZE(CO1) + + vmovups %xmm10 , (CO1, LDC) + vmovups %xmm14 , 2 * SIZE(CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L2_11 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index 
for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_46 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $0x01, %xmm10, %xmm10, %xmm11 + +#else + vaddsubpd %xmm8, %xmm9, %xmm9 + vaddsubpd %xmm10,%xmm11, %xmm11 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm11, %xmm10 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm11, %xmm1, %xmm11 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd (CO1, LDC), %xmm10, %xmm10 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm10 , (CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 
lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $8 * SIZE, AO + + movq M, I + sarq $1, I // i = (m >> 1) + je .L1_40 + + ALIGN_4 + +.L1_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL2x1_SUB(xxx) + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm13,%xmm12 , %xmm12 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $0x01, %xmm12, %xmm12, %xmm13 + +#else + vaddsubpd %xmm8, %xmm9 , %xmm9 + vaddsubpd %xmm12,%xmm13, %xmm13 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm13, %xmm12 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , 
%xmm9, %xmm9 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm12, %xmm0, %xmm12 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm13, %xmm1, %xmm13 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm13, %xmm12, %xmm12 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 2 * SIZE(CO1) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L1_11 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_46 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8, %xmm8 + + 
vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + +#else + vaddsubpd %xmm8, %xmm9, %xmm9 + + vmovapd %xmm9, %xmm8 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + + vaddsubpd %xmm9 ,%xmm8, %xmm8 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + +#endif + + vmovups %xmm8 , (CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE From e172b70ea29054f4353dfcd0e6f0120e50b5a100 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 31 Oct 2013 08:38:17 +0100 Subject: [PATCH 032/127] added cgemm_kernel for Piledriver --- kernel/x86_64/KERNEL.PILEDRIVER | 2 +- kernel/x86_64/cgemm_kernel_4x2_piledriver.S | 1920 +++++++++++++++++++ param.h | 12 +- 3 files changed, 1929 insertions(+), 5 deletions(-) create mode 100644 kernel/x86_64/cgemm_kernel_4x2_piledriver.S diff --git a/kernel/x86_64/KERNEL.PILEDRIVER b/kernel/x86_64/KERNEL.PILEDRIVER index 5a5b42b64..6c262c774 100644 --- a/kernel/x86_64/KERNEL.PILEDRIVER +++ b/kernel/x86_64/KERNEL.PILEDRIVER @@ -27,7 +27,7 @@ DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) -CGEMMKERNEL = cgemm_kernel_4x2_bulldozer.S +CGEMMKERNEL = cgemm_kernel_4x2_piledriver.S CGEMMINCOPY = ../generic/zgemm_ncopy_4.c CGEMMITCOPY = ../generic/zgemm_tcopy_4.c CGEMMONCOPY = ../generic/zgemm_ncopy_2.c diff --git a/kernel/x86_64/cgemm_kernel_4x2_piledriver.S b/kernel/x86_64/cgemm_kernel_4x2_piledriver.S new file mode 100644 index 000000000..931316285 --- /dev/null +++ b/kernel/x86_64/cgemm_kernel_4x2_piledriver.S @@ -0,0 +1,1920 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +/********************************************************************* +* +* 2013/10/31 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* +* 2013/10/31 Saar +* +* Parameter: +* UNROLL_M 4 +* UNROLL_N 2 +* CGEMM_P 768 +* CGEMM_Q 168 +* A_PR1 512 +* B_PR1 256 +* +* Performance at m x n on AMD 8320 (ACML-Version: 5.3.1): +* +* 4608x4608 154.0 GFLOPS with 8 threads on 4 modules (ACML: 111.7 ) (BULLDOZER: 153.9 ) +* 4608x4608 148.3 GFLOPS with 4 threads on 4 modules (ACML: 96.0 ) (BULLDOZER: 143.2 ) +* 3456x3456 74.3 GFLOPS with 2 threads on 2 modules (ACML: 47.3 ) (BULLDOZER: 72.3 ) +* 3456x3456 37.3 GFLOPS with 1 threads on 1 modules (ACML: 24.2 ) (BULLDOZER: 36.5 ) +* +* Performance at m x n on AMD 6380 (ACML-Version: 5.3.1): +* +* 6912x6912 421.5 GFLOPS with 32 threads on 16 modules (ACML: 266.6 ) (BULLDOZER: 422.5 ) +* 6912x6912 407.0 GFLOPS with 16 threads on 16 modules (ACML: 271.5 ) (BULLDOZER: 404.7 ) +* 6912x6912 234.2 GFLOPS with 8 threads on 8 modules (ACML: 164.0 ) (BULLDOZER: 230.5 ) +* 4608x4608 123.1 GFLOPS with 4 threads on 4 modules (ACML: 87.9 ) (BULLDOZER: 120.9 ) +* 3456x3456 62.6 GFLOPS with 2 threads on 2 modules (ACML: 44.5 ) (BULLDOZER: 62.1 ) +* 3456x3456 31.8 GFLOPS with 1 threads on 1 modules (ACML: 22.6 ) (BULLDOZER: 31.4 ) +* +*********************************************************************/ + + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 320 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 512*8*4 +#define LB2_OFFSET 512*8*2 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA_R 48(%rsp) +#define ALPHA_I 56(%rsp) +#define OFFSET 64(%rsp) +#define KK 72(%rsp) +#define KKK 80(%rsp) +#define BUFFER1 128(%rsp) +#define BUFFER2 LB2_OFFSET+128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 
+#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define VFMADD_R vfmaddps +#define VFMADD_I vfmaddps +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define VFMADD_R vfnmaddps +#define VFMADD_I vfmaddps +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define VFMADD_R vfmaddps +#define VFMADD_I vfnmaddps +#else +#define VFMADD_R vfnmaddps +#define VFMADD_I vfnmaddps +#endif + + + +#define A_PR1 512 +#define B_PR1 256 + +#define KERNEL4x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL4x2_2(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL4x2_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL4x2_4(xx) \ + vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + addq $16, BI ;\ + addq $32, %rax ;\ + + +#define KERNEL4x2_SUB(xx) \ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + 
VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + addq $4, BI ;\ + addq $8, %rax ;\ + +/************************************************************************************************/ + +#define KERNEL2x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL2x2_2(xx) \ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL2x2_3(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL2x2_4(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $16, BI ;\ + addq $16, %rax ;\ + + +#define KERNEL2x2_SUB(xx) \ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $4, BI ;\ + addq $4, %rax ;\ + +/************************************************************************************************/ + +#define KERNEL1x2_1(xx) \ + vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_2(xx) \ + vmovsd -14 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + VFMADD_R 
%xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_3(xx) \ + vmovsd -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_4(xx) \ + vmovsd -10 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $16, BI ;\ + addq $8, %rax ;\ + + +#define KERNEL1x2_SUB(xx) \ + vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $4, BI ;\ + addq $2, %rax ;\ + + + +/************************************************************************************************/ + +#define KERNEL4x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL4x1_2(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL4x1_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL4x1_4(xx) \ + vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + addq $8, BI ;\ + addq $32, %rax ;\ + + +#define KERNEL4x1_SUB(xx) \ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -12 * SIZE(AO, %rax, 
SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + addq $2, BI ;\ + addq $8, %rax ;\ + + +/************************************************************************************************/ + +#define KERNEL2x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL2x1_2(xx) \ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL2x1_3(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL2x1_4(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $8, BI ;\ + addq $16, %rax ;\ + + +#define KERNEL2x1_SUB(xx) \ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $2, BI ;\ + addq $4, %rax ;\ + + +/************************************************************************************************/ + +#define KERNEL1x1_1(xx) \ + vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_2(xx) \ + vmovsd -14 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_3(xx) \ + vmovsd -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_4(xx) \ + vmovsd -10 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $8, BI ;\ + addq $8, %rax ;\ + + +#define KERNEL1x1_SUB(xx) \ + vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $2, BI ;\ + addq $2, %rax ;\ + + +/************************************************************************************************/ + + + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups 
%xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovss %xmm0, ALPHA_R + vmovss %xmm1, ALPHA_I + + salq $ZBASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $2, %rdi + divq %rdi // N / 2 + movq %rax, Ndiv6 // N / 2 + movq %rdx, Nmod6 // N % 2 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + +.L2_0: + + movq Ndiv6, J + cmpq $0, J + je .L1_0 + ALIGN_4 + + + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $4*SIZE,BO1 + addq $4*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = (m >> 2) + je .L2_20 + + ALIGN_4 + +.L2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, 
%rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL4x2_SUB(xxx) + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + vshufps $0xb1, %xmm13, %xmm13, %xmm13 + vshufps $0xb1, %xmm15, %xmm15, %xmm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + vaddsubps %xmm13,%xmm12, %xmm12 + vaddsubps %xmm15,%xmm14, %xmm14 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $0xb1, %xmm10, %xmm10, %xmm11 + vshufps $0xb1, %xmm12, %xmm12, %xmm13 + vshufps $0xb1, %xmm14, %xmm14, %xmm15 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm10, %xmm11,%xmm11 + vaddsubps %xmm12, %xmm13,%xmm13 + vaddsubps %xmm14, %xmm15,%xmm15 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm11, %xmm10 + vmovaps %xmm13, %xmm12 + vmovaps %xmm15, %xmm14 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + vshufps $0xb1, %xmm13, %xmm13, %xmm13 + vshufps $0xb1, %xmm15, %xmm15, %xmm15 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm10, %xmm0, %xmm10 + vmulps %xmm12, %xmm0, %xmm12 + vmulps %xmm14, %xmm0, %xmm14 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm11, %xmm1, %xmm11 + vmulps %xmm13, %xmm1, %xmm13 + vmulps %xmm15, %xmm1, %xmm15 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + vaddsubps %xmm13,%xmm12, %xmm12 + vaddsubps %xmm15,%xmm14, %xmm14 + + + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + vaddps 4 * SIZE(CO1), %xmm12, %xmm12 + + vaddps (CO1, LDC), %xmm10, %xmm10 + vaddps 4 * SIZE(CO1, LDC), %xmm14, %xmm14 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 4 * SIZE(CO1) + + vmovups %xmm10 , (CO1, LDC) + vmovups %xmm14 , 4 * SIZE(CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L2_11 + ALIGN_4 + + + +/************************************************************************** +* Rest of M +***************************************************************************/ + +.L2_20: + testq $3, M + jz .L2_60 // to next 2 lines of N + + testq $2, M + jz .L2_40 + ALIGN_4 + +.L2_21: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, 
KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_26 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_26 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL2x2_SUB(xxx) + jl .L2_27 + ALIGN_4 + + +.L2_29: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $0xb1, %xmm10, %xmm10, %xmm11 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm10, %xmm11,%xmm11 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm11, %xmm10 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm11, %xmm1, %xmm11 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + + vaddps (CO1, LDC), %xmm10, %xmm10 + +#endif + + vmovups %xmm8 , (CO1) + + vmovups %xmm10 , (CO1, LDC) + + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + + +/**************************************************************************/ +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, 
%rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_46 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $0xb1, %xmm10, %xmm10, %xmm11 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm10, %xmm11,%xmm11 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm11, %xmm10 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm11, %xmm1, %xmm11 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + + +#ifndef TRMMKERNEL + + vmovsd (CO1), %xmm14 + vaddps %xmm14, %xmm8 , %xmm8 + + vmovsd (CO1, LDC), %xmm15 + vaddps %xmm15, %xmm10, %xmm10 + +#endif + + vmovsd %xmm8 , (CO1) + + vmovsd %xmm10 , (CO1, LDC) + + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 
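+*
+* Note: despite the Ndiv6/Nmod6 names (apparently carried over from
+* kernels that walk three columns of N at a time), N was split above as
+* N/2 and N%2, so this loop handles at most one remaining column of B,
+* using the 4x1, 2x1 and 1x1 kernels.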
+*************************************************************************************************/
+
+	movq	Nmod6, J
+	andq	$1, J			// j % 2
+	je	.L999
+	ALIGN_4
+
+.L1_01:
+	// copy to sub buffer
+	movq	B, BO1
+	leaq	BUFFER1, BO		// first buffer to BO
+	movq	K, %rax
+	ALIGN_4
+
+.L1_02b:
+
+	vmovsd	(BO1), %xmm0
+	vmovsd	%xmm0, (BO)
+	addq	$2*SIZE,BO1
+	addq	$2*SIZE,BO
+	decq	%rax
+	jnz	.L1_02b
+
+.L1_02c:
+
+	movq	BO1, B			// next offset of B
+
+.L1_10:
+	movq	C, CO1
+	leaq	(C, LDC, 1), C		// c += 1 * ldc
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+	movq	OFFSET, %rax
+	movq	%rax, KK
+#endif
+
+	movq	A, AO			// aoffset = a
+	addq	$16 * SIZE, AO
+
+	movq	M, I
+	sarq	$2, I			// i = (m >> 2)
+	je	.L1_20
+
+	ALIGN_4
+
+.L1_11:
+
+#if !defined(TRMMKERNEL) || \
+    (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+	leaq	BUFFER1, BO		// first buffer to BO
+	addq	$4 * SIZE, BO
+#else
+	movq	KK, %rax
+	leaq	BUFFER1, BO		// first buffer to BO
+	addq	$4 * SIZE, BO
+	movq	%rax, BI                        //  Index for BO
+	leaq	(,BI,2), BI                     //  BI = BI * 2 ; number of values
+	leaq	(BO, BI, SIZE), BO
+	salq	$3, %rax                        //  rax = rax * 8 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+#endif
+
+	vzeroall
+
+#ifndef TRMMKERNEL
+	movq	K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+	movq	K, %rax
+	subq	KK, %rax
+	movq	%rax, KKK
+#else
+	movq	KK, %rax
+#ifdef LEFT
+	addq	$4, %rax	// number of values in AO
+#else
+	addq	$1, %rax	// number of values in BO
+#endif
+	movq	%rax, KKK
+#endif
+
+
+	andq	$-8, %rax			//  K = K - ( K % 8 )
+	je	.L1_16
+	movq	%rax, BI                        //  Index for BO
+	leaq	( ,BI,2), BI                    //  BI = BI * 2 ; number of values
+
+	salq	$3, %rax                        //  rax = rax * 8 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L1_12:
+
+	prefetcht0	B_PR1(BO,BI,SIZE)
+	KERNEL4x1_1(xxx)
+	KERNEL4x1_2(xxx)
+	KERNEL4x1_3(xxx)
+	KERNEL4x1_4(xxx)
+
+	KERNEL4x1_1(xxx)
+	KERNEL4x1_2(xxx)
+	KERNEL4x1_3(xxx)
+	KERNEL4x1_4(xxx)
+
+	je	.L1_16
+
+	prefetcht0	B_PR1(BO,BI,SIZE)
+	KERNEL4x1_1(xxx)
+	KERNEL4x1_2(xxx)
+	KERNEL4x1_3(xxx)
+	KERNEL4x1_4(xxx)
+
+	KERNEL4x1_1(xxx)
+	KERNEL4x1_2(xxx)
+	KERNEL4x1_3(xxx)
+	KERNEL4x1_4(xxx)
+
+	je	.L1_16
+
+	jmp	.L1_12
+	ALIGN_4
+
+.L1_16:
+#ifndef TRMMKERNEL
+	movq	K, %rax
+#else
+	movq	KKK, %rax
+#endif
+
+	andq	$7, %rax		# if (k & 7)
+	je	.L1_19
+
+	movq	%rax, BI                        //  Index for BO
+	leaq	( ,BI,2), BI                    //  BI = BI * 2 ; number of values
+
+	salq	$3, %rax                        //  rax = rax * 8 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L1_17:
+
+	KERNEL4x1_SUB(xxx)
+	jl	.L1_17
+	ALIGN_4
+
+
+.L1_19:
+
+	vbroadcastss	ALPHA_R, %xmm0
+	vbroadcastss	ALPHA_I, %xmm1
+
+	// swap real and imaginary parts (32-bit lanes within each complex element)
+	vshufps	$0xb1, %xmm9 , %xmm9, %xmm9
+	vshufps	$0xb1, %xmm13, %xmm13, %xmm13
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
+    defined(NR) || defined(NC) || defined(TR) || defined(TC)
+
+	vaddsubps	%xmm9, %xmm8 , %xmm8
+	vaddsubps	%xmm13,%xmm12, %xmm12
+
+	vshufps	$0xb1, %xmm8 , %xmm8, %xmm9
+	vshufps	$0xb1, %xmm12, %xmm12, %xmm13
+
+#else
+	vaddsubps	%xmm8, %xmm9 ,%xmm9
+	vaddsubps	%xmm12, %xmm13,%xmm13
+
+	vmovaps	%xmm9, %xmm8
+	vmovaps	%xmm13, %xmm12
+
+	// swap real and imaginary parts (32-bit lanes within each complex element)
+	vshufps	$0xb1, %xmm9 , %xmm9, %xmm9
+	vshufps	$0xb1, %xmm13, %xmm13, %xmm13
+
+#endif
+
+	// multiply with ALPHA_R
+	vmulps	%xmm8 , %xmm0, %xmm8
+	vmulps	%xmm12, %xmm0, %xmm12
+
+	// multiply with ALPHA_I
+	vmulps	%xmm9 , %xmm1, %xmm9
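+	// xmm9/xmm13 are the [ imag, real ]-swapped copies: they are scaled
+	// by alpha_i while the originals were scaled by alpha_r above; the
+	// vaddsubps below recombines the two into alpha * (re + i*im) for
+	// each complex element.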
+ vmulps %xmm13, %xmm1, %xmm13 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm13,%xmm12, %xmm12 + + + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + vaddps 4 * SIZE(CO1), %xmm12, %xmm12 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 4 * SIZE(CO1) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L1_11 + ALIGN_4 + + + +/************************************************************************** +* Rest of M +***************************************************************************/ + +.L1_20: + testq $3, M + jz .L999 + + testq $2, M + jz .L1_40 + ALIGN_4 + +.L1_21: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_26 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_26 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL2x1_SUB(xxx) + jl .L1_27 + ALIGN_4 + + +.L1_29: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + + vmovaps %xmm9, %xmm8 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, 
%xmm8 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + + vaddsubps %xmm9, %xmm8 , %xmm8 + + + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + +#endif + + vmovups %xmm8 , (CO1) + + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + + +/**************************************************************************/ +.L1_40: + testq $1, M + jz .L999 // to next 2 lines of N + + ALIGN_4 + +.L1_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_46 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + + vmovaps %xmm9, %xmm8 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + + vaddsubps %xmm9, %xmm8 , %xmm8 + + + +#ifndef TRMMKERNEL + + vmovsd (CO1), %xmm14 + vaddps %xmm14, %xmm8 , %xmm8 + 
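+	// Only a single complex float (8 bytes) is live in this 1x1 tail,
+	// so C is read and written with vmovsd instead of a full 16-byte
+	// vmovups.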
+#endif + + vmovsd %xmm8 , (CO1) + + + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + + + + + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/param.h b/param.h index a833085d3..a6990f7c3 100644 --- a/param.h +++ b/param.h @@ -348,25 +348,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(ARCH_X86_64) #define SGEMM_DEFAULT_P 768 #define DGEMM_DEFAULT_P 384 +#define ZGEMM_DEFAULT_P 384 +#define CGEMM_DEFAULT_P 768 #else #define SGEMM_DEFAULT_P 448 #define DGEMM_DEFAULT_P 480 +#define ZGEMM_DEFAULT_P 112 +#define CGEMM_DEFAULT_P 224 #endif #define QGEMM_DEFAULT_P 112 -#define CGEMM_DEFAULT_P 224 -#define ZGEMM_DEFAULT_P 112 #define XGEMM_DEFAULT_P 56 #if defined(ARCH_X86_64) #define SGEMM_DEFAULT_Q 192 #define DGEMM_DEFAULT_Q 168 +#define ZGEMM_DEFAULT_Q 168 +#define CGEMM_DEFAULT_Q 168 #else #define SGEMM_DEFAULT_Q 224 #define DGEMM_DEFAULT_Q 224 +#define ZGEMM_DEFAULT_Q 224 +#define CGEMM_DEFAULT_Q 224 #endif #define QGEMM_DEFAULT_Q 224 -#define CGEMM_DEFAULT_Q 224 -#define ZGEMM_DEFAULT_Q 224 #define XGEMM_DEFAULT_Q 224 #define SGEMM_DEFAULT_R 12288 From 5118a7f4d1f701a23f76c7c4cbcdead7d5566b86 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 31 Oct 2013 11:53:26 +0100 Subject: [PATCH 033/127] small optimizations on dgemm_kernel for Piledriver --- kernel/x86_64/KERNEL.PILEDRIVER | 5 +- kernel/x86_64/dgemm_kernel_8x2_piledriver.S | 142 +++++--------------- param.h | 2 +- 3 files changed, 41 insertions(+), 108 deletions(-) diff --git a/kernel/x86_64/KERNEL.PILEDRIVER b/kernel/x86_64/KERNEL.PILEDRIVER index 6c262c774..abed953c3 100644 --- a/kernel/x86_64/KERNEL.PILEDRIVER +++ b/kernel/x86_64/KERNEL.PILEDRIVER @@ -54,9 +54,10 @@ STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_LT = dtrsm_kernel_LT_8x2_bulldozer.S +DTRSMKERNEL_RN = dtrsm_kernel_RN_8x2_bulldozer.S DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c diff --git a/kernel/x86_64/dgemm_kernel_8x2_piledriver.S b/kernel/x86_64/dgemm_kernel_8x2_piledriver.S index c719e96fc..cc0ebef8a 100644 --- a/kernel/x86_64/dgemm_kernel_8x2_piledriver.S +++ b/kernel/x86_64/dgemm_kernel_8x2_piledriver.S @@ -28,37 +28,38 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/********************************************************************* * -* 2013/10/18 Saar +* 2013/10/31 Saar * BLASTEST : OK * CTEST : OK * TEST : OK * * -* 2013/10/18 Saar +* 2013/10/31 Saar * * Parameter: * UNROLL_M 8 * UNROLL_N 2 -* DGEMM_P 384 +* DGEMM_P 768 * DGEMM_Q 168 * DGEMM_R 12288 -* A_PR1 256 +* A_PR1 512 +* B_PR1 256 * * Performance at m x n on AMD 8320 (ACML-Version: 5.3.1): * -* 6144x6912 84.1 GFLOPS with 8 threads on 4 modules (ACML: 81.4 GFLOPS) -* 6144x6912 81.2 GFLOPS with 4 threads on 4 modules (ACML: 81.3 GFLOPS) -* 6144x6912 40.9 GFLOPS with 2 threads on 2 modules (ACML: 41.8 GFLOPS) -* 6144x6912 20.5 GFLOPS with 1 threads on 1 modules (ACML: 21.0 GFLOPS) +* 4608x4608 83.9 GFLOPS with 8 threads on 4 modules (ACML: 78.4 GFLOPS) +* 4608x4608 80.9 GFLOPS with 4 threads on 4 modules (ACML: 78.4 GFLOPS) +* 4608x4608 41.3 GFLOPS with 2 threads on 2 modules (ACML: 40.9 GFLOPS) +* 4608x4608 20.7 GFLOPS with 1 threads on 1 modules (ACML: 20.8 GFLOPS) * * Performance at m x n on AMD 6380 (ACML-Version: 5.3.1): * -* 12288x13824 244.5 GFLOPS with 32 threads on 16 modules (ACML: 120.3 GFLOPS) !strange thermal behavior -* 12288x13824 233.9 GFLOPS with 16 threads on 16 modules (ACML: 129.5 GFLOPS) !strange thermal behavior -* 12288x13824 138.1 GFLOPS with 8 threads on 8 modules (ACML: 106.5 GFLOPS) -* 6144x6912 73.6 GFLOPS with 4 threads on 4 modules (ACML: 59.4 GFLOPS) -* 6144x6912 36.8 GFLOPS with 2 threads on 2 modules (ACML: 34.9 GFLOPS) -* 6144x6912 18.7 GFLOPS with 1 threads on 1 modules (ACML: 18.7 GFLOPS) +* 13824x13824 234.5 GFLOPS with 32 threads on 16 modules (ACML: 88.5 GFLOPS) !strange thermal behavior +* 13824x13824 241.9 GFLOPS with 16 threads on 16 modules (ACML: 191.5 GFLOPS) !strange thermal behavior +* 9216x9216 137.6 GFLOPS with 8 threads on 8 modules (ACML: 106.5 GFLOPS) +* 4608x4608 75.7 GFLOPS with 4 threads on 4 modules (ACML: 56.3 GFLOPS) +* 4608x4608 38.6 GFLOPS with 2 threads on 2 modules (ACML: 34.1 GFLOPS) +* 4608x4608 19.6 GFLOPS with 1 threads on 1 modules (ACML: 18.3 GFLOPS) * *********************************************************************/ @@ -168,9 +169,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -#define A_PR1 256 +#define A_PR1 512 #define B_PR1 256 -#define C_PR1 256 +#define C_PR1 64 .macro INIT8x3 vxorpd %xmm4 , %xmm4 , %xmm4 @@ -454,9 +455,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE8x3 vmovddup ALPHA, %xmm0 - prefetcht0 C_PR1(CO1) - prefetcht0 C_PR1(CO1,LDC) - prefetcht0 C_PR1(CO1,LDC,2) vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 @@ -489,6 +487,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups %xmm12, 4 * SIZE(CO1, LDC, 2) vmovups %xmm15, 6 * SIZE(CO1, LDC, 2) + prefetcht0 C_PR1(CO1) + prefetcht0 C_PR1(CO1,LDC) + prefetcht0 C_PR1(CO1,LDC,2) + addq $8 * SIZE, CO1 # coffset += 8 .endm @@ -1284,6 +1286,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmpq $3, %rax jl .L6_13 + prefetcht0 B_PR1(BO) + prefetcht0 B_PR1+64(BO) + prefetcht0 B_PR1+128(BO) KERNEL8x3_INIT KERNEL8x3_M2 KERNEL8x3_M3 @@ -1299,22 +1304,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.L6_12: + prefetcht0 B_PR1-24(BO) + prefetcht0 B_PR1+40(BO) KERNEL8x3_M1 KERNEL8x3_M2 KERNEL8x3_M3 KERNEL8x3_M4 KERNEL8x3_M5 + prefetcht0 B_PR1+104(BO) KERNEL8x3_M6 KERNEL8x3_M7 KERNEL8x3_M8 dec %rax - //je .L6_12_E - jne .L6_12 .L6_12_E: + prefetcht0 B_PR1(BO) + prefetcht0 B_PR1+64(BO) KERNEL8x3_M1 KERNEL8x3_M2 KERNEL8x3_M3 @@ -1432,31 +1440,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L6_22: - //prefetcht0 B_PR1(BO,BI,8) KERNEL4x3_1(xxx) KERNEL4x3_2(xxx) KERNEL4x3_3(xxx) - //prefetcht0 B_PR1+64(BO,BI,8) KERNEL4x3_4(xxx) KERNEL4x3_1(xxx) KERNEL4x3_2(xxx) - //prefetcht0 B_PR1+32(BO,BI,8) KERNEL4x3_3(xxx) KERNEL4x3_4(xxx) je .L6_26 - //prefetcht0 B_PR1(BO,BI,8) KERNEL4x3_1(xxx) KERNEL4x3_2(xxx) KERNEL4x3_3(xxx) - //prefetcht0 B_PR1+64(BO,BI,8) KERNEL4x3_4(xxx) KERNEL4x3_1(xxx) KERNEL4x3_2(xxx) - //prefetcht0 B_PR1+32(BO,BI,8) KERNEL4x3_3(xxx) KERNEL4x3_4(xxx) @@ -1548,31 +1550,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L6_32: - //prefetcht0 B_PR1(BO,BI,8) KERNEL2x3_1(xxx) KERNEL2x3_2(xxx) KERNEL2x3_3(xxx) - //prefetcht0 B_PR1+64(BO,BI,8) KERNEL2x3_4(xxx) KERNEL2x3_1(xxx) KERNEL2x3_2(xxx) - //prefetcht0 B_PR1+32(BO,BI,8) KERNEL2x3_3(xxx) KERNEL2x3_4(xxx) je .L6_36 - //prefetcht0 B_PR1(BO,BI,8) KERNEL2x3_1(xxx) KERNEL2x3_2(xxx) KERNEL2x3_3(xxx) - //prefetcht0 B_PR1+64(BO,BI,8) KERNEL2x3_4(xxx) KERNEL2x3_1(xxx) KERNEL2x3_2(xxx) - //prefetcht0 B_PR1+32(BO,BI,8) KERNEL2x3_3(xxx) KERNEL2x3_4(xxx) @@ -1651,31 +1647,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L6_42: - //prefetcht0 B_PR1(BO,BI,8) KERNEL1x3_1(xxx) KERNEL1x3_2(xxx) KERNEL1x3_3(xxx) - //prefetcht0 B_PR1+64(BO,BI,8) KERNEL1x3_4(xxx) KERNEL1x3_1(xxx) KERNEL1x3_2(xxx) - //prefetcht0 B_PR1+32(BO,BI,8) KERNEL1x3_3(xxx) KERNEL1x3_4(xxx) je .L6_46 - //prefetcht0 B_PR1(BO,BI,8) KERNEL1x3_1(xxx) KERNEL1x3_2(xxx) KERNEL1x3_3(xxx) - //prefetcht0 B_PR1+64(BO,BI,8) KERNEL1x3_4(xxx) KERNEL1x3_1(xxx) KERNEL1x3_2(xxx) - //prefetcht0 B_PR1+32(BO,BI,8) KERNEL1x3_3(xxx) KERNEL1x3_4(xxx) @@ -1753,6 +1743,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmpq $3, %rax jl .L7_13 + prefetcht0 B_PR1(BO) + prefetcht0 B_PR1+64(BO) + prefetcht0 B_PR1+128(BO) KERNEL8x3_INIT KERNEL8x3_M2 KERNEL8x3_M3 @@ -1768,22 +1761,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L7_12: + prefetcht0 B_PR1-24(BO) + prefetcht0 B_PR1+40(BO) KERNEL8x3_M1 KERNEL8x3_M2 KERNEL8x3_M3 KERNEL8x3_M4 + prefetcht0 B_PR1+104(BO) KERNEL8x3_M5 KERNEL8x3_M6 KERNEL8x3_M7 KERNEL8x3_M8 dec %rax - //je .L7_12_E - jne .L7_12 .L7_12_E: + prefetcht0 B_PR1(BO) + prefetcht0 B_PR1+64(BO) KERNEL8x3_M1 KERNEL8x3_M2 KERNEL8x3_M3 @@ -1904,31 +1900,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L7_22: - //prefetcht0 B_PR1(BO,BI,8) KERNEL4x3_1(xxx) KERNEL4x3_2(xxx) KERNEL4x3_3(xxx) - //prefetcht0 B_PR1+64(BO,BI,8) KERNEL4x3_4(xxx) KERNEL4x3_1(xxx) KERNEL4x3_2(xxx) - //prefetcht0 B_PR1+32(BO,BI,8) KERNEL4x3_3(xxx) KERNEL4x3_4(xxx) je .L7_26 - //prefetcht0 B_PR1(BO,BI,8) KERNEL4x3_1(xxx) KERNEL4x3_2(xxx) KERNEL4x3_3(xxx) - //prefetcht0 B_PR1+64(BO,BI,8) KERNEL4x3_4(xxx) KERNEL4x3_1(xxx) KERNEL4x3_2(xxx) - //prefetcht0 B_PR1+32(BO,BI,8) KERNEL4x3_3(xxx) KERNEL4x3_4(xxx) @@ -2019,31 +2009,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.L7_32: - //prefetcht0 B_PR1(BO,BI,8) KERNEL2x3_1(xxx) KERNEL2x3_2(xxx) KERNEL2x3_3(xxx) - //prefetcht0 B_PR1+64(BO,BI,8) KERNEL2x3_4(xxx) KERNEL2x3_1(xxx) KERNEL2x3_2(xxx) - //prefetcht0 B_PR1+32(BO,BI,8) KERNEL2x3_3(xxx) KERNEL2x3_4(xxx) je .L7_36 - //prefetcht0 B_PR1(BO,BI,8) KERNEL2x3_1(xxx) KERNEL2x3_2(xxx) KERNEL2x3_3(xxx) - //prefetcht0 B_PR1+64(BO,BI,8) KERNEL2x3_4(xxx) KERNEL2x3_1(xxx) KERNEL2x3_2(xxx) - //prefetcht0 B_PR1+32(BO,BI,8) KERNEL2x3_3(xxx) KERNEL2x3_4(xxx) @@ -2127,31 +2111,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L7_42: - //prefetcht0 B_PR1(BO,BI,8) KERNEL1x3_1(xxx) KERNEL1x3_2(xxx) KERNEL1x3_3(xxx) - //prefetcht0 B_PR1+64(BO,BI,8) KERNEL1x3_4(xxx) KERNEL1x3_1(xxx) KERNEL1x3_2(xxx) - //prefetcht0 B_PR1+32(BO,BI,8) KERNEL1x3_3(xxx) KERNEL1x3_4(xxx) je .L7_46 - //prefetcht0 B_PR1(BO,BI,8) KERNEL1x3_1(xxx) KERNEL1x3_2(xxx) KERNEL1x3_3(xxx) - //prefetcht0 B_PR1+64(BO,BI,8) KERNEL1x3_4(xxx) KERNEL1x3_1(xxx) KERNEL1x3_2(xxx) - //prefetcht0 B_PR1+32(BO,BI,8) KERNEL1x3_3(xxx) KERNEL1x3_4(xxx) @@ -2277,13 +2255,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L2_12: - //prefetcht0 B_PR1(BO,BI,8) KERNEL8x2_1(xxx) KERNEL8x2_2(xxx) KERNEL8x2_3(xxx) KERNEL8x2_4(xxx) - //prefetcht0 B_PR1(BO,BI,8) KERNEL8x2_1(xxx) KERNEL8x2_2(xxx) KERNEL8x2_3(xxx) @@ -2291,13 +2267,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. je .L2_16 - //prefetcht0 B_PR1(BO,BI,8) KERNEL8x2_1(xxx) KERNEL8x2_2(xxx) KERNEL8x2_3(xxx) KERNEL8x2_4(xxx) - //prefetcht0 B_PR1(BO,BI,8) KERNEL8x2_1(xxx) KERNEL8x2_2(xxx) KERNEL8x2_3(xxx) @@ -2399,13 +2373,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L2_22: - //prefetcht0 B_PR1(BO,BI,8) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) KERNEL4x2_4(xxx) - //prefetcht0 B_PR1(BO,BI,8) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) @@ -2413,13 +2385,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. je .L2_26 - //prefetcht0 B_PR1(BO,BI,8) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) KERNEL4x2_4(xxx) - //prefetcht0 B_PR1(BO,BI,8) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) @@ -2503,13 +2473,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L2_32: - //prefetcht0 B_PR1(BO,BI,8) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) - //prefetcht0 B_PR1(BO,BI,8) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) @@ -2517,13 +2485,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. je .L2_36 - //prefetcht0 B_PR1(BO,BI,8) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) - //prefetcht0 B_PR1(BO,BI,8) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) @@ -2600,13 +2566,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L2_42: - //prefetcht0 B_PR1(BO,BI,8) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) - //prefetcht0 B_PR1(BO,BI,8) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) @@ -2614,13 +2578,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. je .L2_46 - //prefetcht0 B_PR1(BO,BI,8) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) - //prefetcht0 B_PR1(BO,BI,8) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) @@ -2743,7 +2705,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.L1_12: - //prefetcht0 B_PR1(BO,BI,8) KERNEL8x1_1(xxx) KERNEL8x1_2(xxx) KERNEL8x1_3(xxx) @@ -2756,7 +2717,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. je .L1_16 - //prefetcht0 B_PR1(BO,BI,8) KERNEL8x1_1(xxx) KERNEL8x1_2(xxx) KERNEL8x1_3(xxx) @@ -2851,7 +2811,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L1_22: - //prefetcht0 B_PR1(BO,BI,8) KERNEL4x1_1(xxx) KERNEL4x1_2(xxx) KERNEL4x1_3(xxx) @@ -2864,7 +2823,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. je .L1_26 - //prefetcht0 B_PR1(BO,BI,8) KERNEL4x1_1(xxx) KERNEL4x1_2(xxx) KERNEL4x1_3(xxx) @@ -2946,7 +2904,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L1_32: - //prefetcht0 B_PR1(BO,BI,8) KERNEL2x1_1(xxx) KERNEL2x1_2(xxx) KERNEL2x1_3(xxx) @@ -3036,7 +2993,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L1_42: - //prefetcht0 B_PR1(BO,BI,8) KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) @@ -3049,7 +3005,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. je .L1_46 - //prefetcht0 B_PR1(BO,BI,8) KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) @@ -3317,13 +3272,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L2_12: - //prefetcht0 B_PR1(BO,BI,8) KERNEL8x2_1(xxx) KERNEL8x2_2(xxx) KERNEL8x2_3(xxx) KERNEL8x2_4(xxx) - //prefetcht0 B_PR1(BO,BI,8) KERNEL8x2_1(xxx) KERNEL8x2_2(xxx) KERNEL8x2_3(xxx) @@ -3331,13 +3284,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. je .L2_16 - //prefetcht0 B_PR1(BO,BI,8) KERNEL8x2_1(xxx) KERNEL8x2_2(xxx) KERNEL8x2_3(xxx) KERNEL8x2_4(xxx) - //prefetcht0 B_PR1(BO,BI,8) KERNEL8x2_1(xxx) KERNEL8x2_2(xxx) KERNEL8x2_3(xxx) @@ -3502,13 +3453,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L2_22: - //prefetcht0 B_PR1(BO,BI,8) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) KERNEL4x2_4(xxx) - //prefetcht0 B_PR1(BO,BI,8) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) @@ -3516,13 +3465,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. je .L2_26 - //prefetcht0 B_PR1(BO,BI,8) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) KERNEL4x2_4(xxx) - //prefetcht0 B_PR1(BO,BI,8) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) @@ -3667,13 +3614,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L2_32: - //prefetcht0 B_PR1(BO,BI,8) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) - //prefetcht0 B_PR1(BO,BI,8) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) @@ -3681,13 +3626,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. je .L2_36 - //prefetcht0 B_PR1(BO,BI,8) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) - //prefetcht0 B_PR1(BO,BI,8) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) @@ -3818,13 +3761,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L2_42: - //prefetcht0 B_PR1(BO,BI,8) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) - //prefetcht0 B_PR1(BO,BI,8) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) @@ -3832,13 +3773,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
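The KERNELmxn_* macro families cleaned up in these hunks all compute the same thing at different widths: one rank-1 update of an m-by-n accumulator block per k step, with m values streamed from the packed A buffer and n values broadcast from the packed B buffer. As a reference point, a 4x2 step in plain C (illustrative signature; the packed panels are assumed contiguous per k index, as the copy routines produce them):

    /* One KERNEL4x2-shaped micro-kernel: C[4][2] += A-panel * B-panel. */
    static void kernel_4x2(long K, const double *A, const double *B,
                           double C[4][2])
    {
        for (long k = 0; k < K; k++)
            for (int i = 0; i < 4; i++)
                for (int j = 0; j < 2; j++)
                    C[i][j] += A[k * 4 + i] * B[k * 2 + j];
    }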
je .L2_46 - //prefetcht0 B_PR1(BO,BI,8) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) - //prefetcht0 B_PR1(BO,BI,8) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) @@ -4023,7 +3962,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L1_12: - //prefetcht0 B_PR1(BO,BI,8) KERNEL8x1_1(xxx) KERNEL8x1_2(xxx) KERNEL8x1_3(xxx) @@ -4036,7 +3974,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. je .L1_16 - //prefetcht0 B_PR1(BO,BI,8) KERNEL8x1_1(xxx) KERNEL8x1_2(xxx) KERNEL8x1_3(xxx) @@ -4186,7 +4123,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L1_22: - //prefetcht0 B_PR1(BO,BI,8) KERNEL4x1_1(xxx) KERNEL4x1_2(xxx) KERNEL4x1_3(xxx) @@ -4199,7 +4135,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. je .L1_26 - //prefetcht0 B_PR1(BO,BI,8) KERNEL4x1_1(xxx) KERNEL4x1_2(xxx) KERNEL4x1_3(xxx) @@ -4335,7 +4270,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L1_32: - //prefetcht0 B_PR1(BO,BI,8) KERNEL2x1_1(xxx) KERNEL2x1_2(xxx) KERNEL2x1_3(xxx) @@ -4476,7 +4410,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L1_42: - //prefetcht0 B_PR1(BO,BI,8) KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) @@ -4489,7 +4422,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. je .L1_46 - //prefetcht0 B_PR1(BO,BI,8) KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) diff --git a/param.h b/param.h index a6990f7c3..c4d15323a 100644 --- a/param.h +++ b/param.h @@ -347,7 +347,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(ARCH_X86_64) #define SGEMM_DEFAULT_P 768 -#define DGEMM_DEFAULT_P 384 +#define DGEMM_DEFAULT_P 768 #define ZGEMM_DEFAULT_P 384 #define CGEMM_DEFAULT_P 768 #else From 02bc36ac79b06142b97e2c7a93632f668f3e6b4e Mon Sep 17 00:00:00 2001 From: wernsaar Date: Fri, 1 Nov 2013 18:22:27 +0100 Subject: [PATCH 034/127] added sgemm_ncopy routine and made some improvements on cgemm_kernel for ARMV7 --- common_arm.h | 34 +-- kernel/arm/KERNEL.ARMV7 | 10 +- kernel/arm/cgemm_kernel_2x2_vfpv3.S | 18 +- kernel/arm/sgemm_ncopy_4_vfpv3.S | 344 ++++++++++++++++++++++++++++ param.h | 16 +- 5 files changed, 393 insertions(+), 29 deletions(-) create mode 100644 kernel/arm/sgemm_ncopy_4_vfpv3.S diff --git a/common_arm.h b/common_arm.h index b61efd7c1..e3d1d4079 100644 --- a/common_arm.h +++ b/common_arm.h @@ -80,31 +80,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
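The common_arm.h hunk that follows swaps the stubbed MIPS-style ll/sc lock for an ARM ldrex/strex loop and gives rpcc a gettimeofday-based clock. A portable C sketch of the intended behaviour (names illustrative; note the committed asm stores 0 with strex where a conventional acquire would store a nonzero token, as the compare-and-swap below does):

    #include <sys/time.h>

    /* Spin until the lock word can be atomically moved from 0 to 1. */
    static inline void blas_lock_sketch(volatile unsigned long *address)
    {
        do {
            while (*address) ;                   /* YIELDING in the real code */
        } while (!__sync_bool_compare_and_swap(address, 0UL, 1UL));
    }

    /* Microsecond wall clock, as in the new gettimeofday-based rpcc. */
    static inline unsigned long rpcc_sketch(void)
    {
        struct timeval tv;
        gettimeofday(&tv, 0);
        return 1000000UL * tv.tv_sec + tv.tv_usec;
    }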
#ifndef ASSEMBLER -static void INLINE blas_lock(volatile unsigned long *address){ +static void __inline blas_lock(volatile BLASULONG *address){ + + int register ret; -// long int ret, val = 1; -/* do { while (*address) {YIELDING;}; __asm__ __volatile__( - "1: ll %0, %3\n" - " ori %2, %0, 1\n" - " sc %2, %1\n" - " beqz %2, 1b\n" - " andi %2, %0, 1\n" - " sync\n" - : "=&r" (val), "=m" (address), "=&r" (ret) - : "m" (address) - : "memory"); + "ldrex r2, [%1] \n\t" + "mov r2, #0 \n\t" + "strex r3, r2, [%1] \n\t" + "mov %0 , r3 \n\t" + : "=r"(ret), "=r"(address) + : "1"(address) + : "memory", "r2" , "r3" + + + ); } while (ret); -*/ + } -static inline unsigned int rpcc(void){ - unsigned long ret=0; +static inline BLASULONG rpcc(void){ + BLASULONG ret=0; + struct timeval tv; + gettimeofday(&tv,NULL); + ret=1000000* tv.tv_sec + tv.tv_usec; return ret; } diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index 43153798e..ec692d5c2 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -89,7 +89,7 @@ ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S SGEMMKERNEL = sgemm_kernel_4x4_vfpv3.S SGEMMINCOPY = SGEMMITCOPY = -SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMONCOPY = sgemm_ncopy_4_vfpv3.S SGEMMOTCOPY = ../generic/gemm_tcopy_4.c SGEMMINCOPYOBJ = SGEMMITCOPYOBJ = @@ -99,12 +99,12 @@ SGEMMOTCOPYOBJ = sgemm_otcopy.o #DGEMMKERNEL = ../generic/gemmkernel_2x2.c #DGEMMKERNEL = dgemm_kernel_4x2_vfpv2.S DGEMMKERNEL = dgemm_kernel_4x4_vfpv3.S -DGEMMINCOPY = dgemm_ncopy_4_vfpv3.S -DGEMMITCOPY = ../generic/gemm_tcopy_4.c +DGEMMINCOPY = +DGEMMITCOPY = DGEMMONCOPY = dgemm_ncopy_4_vfpv3.S DGEMMOTCOPY = ../generic/gemm_tcopy_4.c -DGEMMINCOPYOBJ = dgemm_incopy.o -DGEMMITCOPYOBJ = dgemm_itcopy.o +DGEMMINCOPYOBJ = +DGEMMITCOPYOBJ = DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o diff --git a/kernel/arm/cgemm_kernel_2x2_vfpv3.S b/kernel/arm/cgemm_kernel_2x2_vfpv3.S index abbbac831..4cebcab77 100644 --- a/kernel/arm/cgemm_kernel_2x2_vfpv3.S +++ b/kernel/arm/cgemm_kernel_2x2_vfpv3.S @@ -26,11 +26,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2013/10/16 Saar +* 2013/11/01 Saar * BLASTEST : OK * CTEST : OK * TEST : OK * +* 2013/11/01 Saar +* UNROLL_N 2 +* UNROLL_M 2 +* CGEMM_P 96 +* CGEMM_Q 120 +* CGEMM_R 4096 +* A_PRE 96 +* B_PRE 96 +* C_PRE 64 +* +* Performance on Odroid U2: +* +* 1 Core: 2.59 GFLOPS ATLAS: 2.37 GFLOPS +* 2 Cores: 5.17 GFLOPS ATLAS: 4.46 GFLOPS +* 3 Cores: 7.69 GFLOPS ATLAS: 6.50 GFLOPS +* 4 Cores: 10.22 GFLOPS ATLAS: 8.18 GFLOPS **************************************************************************************/ #define ASSEMBLER diff --git a/kernel/arm/sgemm_ncopy_4_vfpv3.S b/kernel/arm/sgemm_ncopy_4_vfpv3.S new file mode 100644 index 000000000..34fbb3252 --- /dev/null +++ b/kernel/arm/sgemm_ncopy_4_vfpv3.S @@ -0,0 +1,344 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/10/11 Saar +* BLASTEST : xOK +* CTEST : xOK +* TEST : xOK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_A r2 +#define OLD_LDA r3 + + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define LDA [fp, #-260 ] + +#define B [fp, #4 ] + +#define M r0 +#define N r1 +#define A r2 + +#define BO r5 + +#define AO1 r6 +#define AO2 r7 +#define AO3 r8 +#define AO4 r9 + +#define I r3 +#define J r12 + +#define A_PRE 96 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro COPY4x4 + + flds s0 , [ AO1, #0 ] + flds s1 , [ AO2, #0 ] + flds s2 , [ AO3, #0 ] + flds s3 , [ AO4, #0 ] + + flds s4 , [ AO1, #4 ] + flds s8 , [ AO1, #8 ] + flds s12, [ AO1, #12 ] + + flds s5 , [ AO2, #4 ] + add AO1, AO1, #16 + flds s9 , [ AO2, #8 ] + flds s13, [ AO2, #12 ] + + flds s6 , [ AO3, #4 ] + add AO2, AO2, #16 + flds s10, [ AO3, #8 ] + flds s14, [ AO3, #12 ] + + flds s7 , [ AO4, #4 ] + add AO3, AO3, #16 + flds s11, [ AO4, #8 ] + flds s15, [ AO4, #12 ] + + fstmias BO!, { s0 - s3 } + add AO4, AO4, #16 + fstmias BO!, { s4 - s7 } + fstmias BO!, { s8 - s15 } + +.endm + +.macro COPY1x4 + + flds s0 , [ AO1, #0 ] + flds s1 , [ AO2, #0 ] + add AO1, AO1, #4 + flds s2 , [ AO3, #0 ] + add AO2, AO2, #4 + flds s3 , [ AO4, #0 ] + + add AO3, AO3, #4 + fstmias BO!, { s0 - s3 } + add AO4, AO4, #4 + +.endm + +.macro COPY4x2 + + flds s0 , [ AO1, #0 ] + flds s2 , [ AO1, #4 ] + flds s4 , [ AO1, #8 ] + flds s6 , [ AO1, #12 ] + + flds s1 , [ AO2, #0 ] + flds s3 , [ AO2, #4 ] + add AO1, AO1, #16 + flds s5 , [ AO2, #8 ] + flds s7 , [ AO2, #12 ] + + fstmias BO!, { s0 - s7 } + add AO2, AO2, #16 + +.endm + + +.macro COPY1x2 + + flds s0 , [ AO1, #0 ] + flds s1 , [ AO2, #0 ] + add AO1, AO1, #4 + + fstmias BO!, { s0 - s1 } + add AO2, AO2, #4 + +.endm + +.macro 
COPY4x1 + + flds s0 , [ AO1, #0 ] + flds s1 , [ AO1, #4 ] + flds s2 , [ AO1, #8 ] + flds s3 , [ AO1, #12 ] + + fstmias BO!, { s0 - s3 } + add AO1, AO1, #16 + +.endm + + +.macro COPY1x1 + + flds s0 , [ AO1, #0 ] + + fstmias BO!, { s0 } + add AO1, AO1, #4 + +.endm + + + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + + lsl r3, r3, #2 // lda = lda * 4 + str r3, LDA + + sub r4, fp, #128 + vstm r4, { s8 - s15} // store floating point registers + + ldr BO, B + +_L4_BEGIN: + + asrs J, N, #2 // J = N / 4 + ble _L2_BEGIN + +_L4_M4_BEGIN: + + mov AO1, A // AO1 = A + ldr r4 , LDA + add AO2, AO1, r4 + add AO3, AO2, r4 + add AO4, AO3, r4 + add A , AO4, r4 // A = A + 4 * LDA + + asrs I, M, #2 // I = M / 4 + ble _L4_M4_40 + +_L4_M4_20: + + COPY4x4 + + subs I , I , #1 + bne _L4_M4_20 + + +_L4_M4_40: + + ands I, M , #3 + ble _L4_M4_END + +_L4_M4_60: + + COPY1x4 + + subs I , I , #1 + bne _L4_M4_60 + + +_L4_M4_END: + + subs J , J, #1 // j-- + bne _L4_M4_BEGIN + + + +/*********************************************************************************************/ + +_L2_BEGIN: + + tst N, #3 + ble _L999 + + tst N, #2 + ble _L1_BEGIN + +_L2_M4_BEGIN: + + mov AO1, A // AO1 = A + ldr r4 , LDA + add AO2, AO1, r4 + add A , AO2, r4 // A = A + 2 * LDA + + asrs I, M, #2 // I = M / 4 + ble _L2_M4_40 + +_L2_M4_20: + + COPY4x2 + + subs I , I , #1 + bne _L2_M4_20 + + +_L2_M4_40: + + ands I, M , #3 + ble _L2_M4_END + +_L2_M4_60: + + COPY1x2 + + subs I , I , #1 + bne _L2_M4_60 + + +_L2_M4_END: + + +/*********************************************************************************************/ + +_L1_BEGIN: + + tst N, #1 + ble _L999 + + +_L1_M4_BEGIN: + + mov AO1, A // AO1 = A + ldr r4 , LDA + add A , AO1, r4 // A = A + 1 * LDA + + asrs I, M, #2 // I = M / 4 + ble _L1_M4_40 + +_L1_M4_20: + + COPY4x1 + + subs I , I , #1 + bne _L1_M4_20 + + +_L1_M4_40: + + ands I, M , #3 + ble _L1_M4_END + +_L1_M4_60: + + COPY1x1 + + subs I , I , #1 + bne _L1_M4_60 + + +_L1_M4_END: + + + +_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s15} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/param.h b/param.h index 8d9d0fc47..d6284f5cb 100644 --- a/param.h +++ b/param.h @@ -1814,19 +1814,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
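sgemm_ncopy_4_vfpv3.S above is a hand-scheduled replacement for the generic kernel/generic/gemm_ncopy_4.c it displaces in KERNEL.ARMV7: it packs four columns of A at a time into a contiguous buffer, row-interleaved within each 4-column panel, with 2-wide and 1-wide tails. The computation it performs looks roughly like this in C (simplified sketch, names illustrative):

    static void sgemm_ncopy_4_sketch(long m, long n, const float *a,
                                     long lda, float *b)
    {
        long i, j = 0;
        for (; j + 4 <= n; j += 4) {             /* _L4: COPY4x4/COPY1x4 */
            const float *a0 = a + (j + 0) * lda, *a1 = a + (j + 1) * lda;
            const float *a2 = a + (j + 2) * lda, *a3 = a + (j + 3) * lda;
            for (i = 0; i < m; i++) {
                *b++ = a0[i]; *b++ = a1[i]; *b++ = a2[i]; *b++ = a3[i];
            }
        }
        if (n - j >= 2) {                        /* _L2: COPY4x2/COPY1x2 */
            const float *a0 = a + j * lda, *a1 = a + (j + 1) * lda;
            for (i = 0; i < m; i++) { *b++ = a0[i]; *b++ = a1[i]; }
            j += 2;
        }
        if (j < n) {                             /* _L1: COPY4x1/COPY1x1 */
            const float *a0 = a + j * lda;
            for (i = 0; i < m; i++) *b++ = a0[i];
        }
    }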
#define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 2 -#define SGEMM_DEFAULT_P 128 +#define SGEMM_DEFAULT_P 192 #define DGEMM_DEFAULT_P 128 -#define CGEMM_DEFAULT_P 24 +#define CGEMM_DEFAULT_P 96 #define ZGEMM_DEFAULT_P 20 -#define SGEMM_DEFAULT_Q 240 -#define DGEMM_DEFAULT_Q 96 -#define CGEMM_DEFAULT_Q 128 +#define SGEMM_DEFAULT_Q 120 +#define DGEMM_DEFAULT_Q 120 +#define CGEMM_DEFAULT_Q 120 #define ZGEMM_DEFAULT_Q 64 -#define SGEMM_DEFAULT_R 4096 -#define DGEMM_DEFAULT_R 512 -#define CGEMM_DEFAULT_R 512 +#define SGEMM_DEFAULT_R 16384 +#define DGEMM_DEFAULT_R 8192 +#define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 512 From b3eab8fcb7c3e5d0b90e9d2864c0df6257eb68b0 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 2 Nov 2013 09:43:53 +0100 Subject: [PATCH 035/127] minor optimizations on zgemm_kernel for ARMV7 --- kernel/arm/zgemm_kernel_2x2_vfpv3.S | 18 +++++++++++++++++- param.h | 6 +++--- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/kernel/arm/zgemm_kernel_2x2_vfpv3.S b/kernel/arm/zgemm_kernel_2x2_vfpv3.S index 4b01f0429..9c14aec10 100644 --- a/kernel/arm/zgemm_kernel_2x2_vfpv3.S +++ b/kernel/arm/zgemm_kernel_2x2_vfpv3.S @@ -26,11 +26,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2013/10/16 Saar +* 2013/11/02 Saar * BLASTEST : OK * CTEST : OK * TEST : OK * +* 2013/11/02 Saar +* UNROLL_N 2 +* UNROLL_M 2 +* ZGEMM_P 64 +* ZGEMM_Q 120 +* ZGEMM_R 4096 +* A_PRE 96 +* B_PRE 96 +* C_PRE 64 +* +* Performance on Odroid U2: +* +* 1 Core: 1.62 GFLOPS ATLAS: 1.39 GFLOPS +* 2 Cores: 3.20 GFLOPS ATLAS: 2.54 GFLOPS +* 3 Cores: 4.72 GFLOPS ATLAS: 3.76 GFLOPS +* 4 Cores: 5.93 GFLOPS ATLAS: 4.88 GFLOPS **************************************************************************************/ #define ASSEMBLER diff --git a/param.h b/param.h index d6284f5cb..f6895d90b 100644 --- a/param.h +++ b/param.h @@ -1817,17 +1817,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SGEMM_DEFAULT_P 192 #define DGEMM_DEFAULT_P 128 #define CGEMM_DEFAULT_P 96 -#define ZGEMM_DEFAULT_P 20 +#define ZGEMM_DEFAULT_P 64 #define SGEMM_DEFAULT_Q 120 #define DGEMM_DEFAULT_Q 120 #define CGEMM_DEFAULT_Q 120 -#define ZGEMM_DEFAULT_Q 64 +#define ZGEMM_DEFAULT_Q 120 #define SGEMM_DEFAULT_R 16384 #define DGEMM_DEFAULT_R 8192 #define CGEMM_DEFAULT_R 4096 -#define ZGEMM_DEFAULT_R 512 +#define ZGEMM_DEFAULT_R 4096 From 2b801a00a56c8e270e0de628c5e55e93adaeb1b3 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 2 Nov 2013 13:06:11 +0100 Subject: [PATCH 036/127] small optimizations on sgemm_kernel for ARMV7 --- kernel/arm/sgemm_kernel_4x4_vfpv3.S | 131 ++++++++-------------------- kernel/arm/sgemm_ncopy_4_vfpv3.S | 17 +++- param.h | 6 +- 3 files changed, 54 insertions(+), 100 deletions(-) diff --git a/kernel/arm/sgemm_kernel_4x4_vfpv3.S b/kernel/arm/sgemm_kernel_4x4_vfpv3.S index 4746a587e..8bc3e5325 100644 --- a/kernel/arm/sgemm_kernel_4x4_vfpv3.S +++ b/kernel/arm/sgemm_kernel_4x4_vfpv3.S @@ -26,28 +26,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
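The GEMM_P, GEMM_Q and GEMM_R values retuned in these patches parameterize OpenBLAS's three-level cache blocking: P tiles the M dimension of the packed A panel, Q tiles the depth K, and R tiles the strip of B and C that each packed A panel is reused against. A naive rendering of that loop nest (sketch only; the real driver in driver/level3 also packs A and B into buffers and dispatches the assembly micro-kernel):

    static long lmin(long a, long b) { return a < b ? a : b; }

    /* P tiles M, Q tiles K, R tiles N (column-major, alpha = beta = 1). */
    static void gemm_blocked(long m, long n, long k, long P, long Q, long R,
                             const double *A, const double *B, double *C)
    {
        for (long jj = 0; jj < n; jj += R)
            for (long kk = 0; kk < k; kk += Q)
                for (long ii = 0; ii < m; ii += P)
                    for (long j = jj; j < lmin(jj + R, n); j++)
                        for (long p = kk; p < lmin(kk + Q, k); p++)
                            for (long i = ii; i < lmin(ii + P, m); i++)
                                C[i + j * m] += A[i + p * m] * B[p + j * k];
    }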
*****************************************************************************/ /************************************************************************************** -* 2013/10/13 Saar +* 2013/11/02 Saar * BLASTEST : OK * CTEST : OK * TEST : OK * * -* 2013/10/13 Saar +* 2013/11/02 Saar * UNROLL_N 4 * UNROLL_M 4 * DGEMM_P 128 * DGEMM_Q 240 -* DGEMM_R 4096 -* A_PRE 96 -* B_PRE 96 -* C_PRE 64 +* DGEMM_R 12288 +* A_PRE 128 +* B_PRE 128 +* C_PRE 32 * -* Performance on Odroid U2: +* Performance on Odroid U2: * -* 1 Core: 2.60 GFLOPS ATLAS: 2.67 GFLOPS -* 2 Cores: 5.17 GFLOPS ATLAS: 5.25 GFLOPS -* 3 Cores: 7.60 GFLOPS ATLAS: 7.82 GFLOPS -* 4 Cores: 9.98 GFLOPS ATLAS: 9.95 GFLOPS +* 3072x3072 1 Core: 2.62 GFLOPS ATLAS: 2.69 GFLOPS +* 3072x3072 2 Cores: 5.23 GFLOPS ATLAS: 5.27 GFLOPS +* 3072x3072 3 Cores: 7.78 GFLOPS ATLAS: 7.87 GFLOPS +* 3072x3072 4 Cores: 10.10 GFLOPS ATLAS: 9.98 GFLOPS **************************************************************************************/ #define ASSEMBLER @@ -92,9 +92,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define K1 r7 #define BC r12 -#define A_PRE 96 -#define B_PRE 96 -#define C_PRE 64 +#define A_PRE 128 +#define B_PRE 128 +#define C_PRE 32 /************************************************************************************** * Macro definitions @@ -123,10 +123,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x4_I + pld [ AO , #A_PRE ] fldmias AO!, { s0 - s1 } - pld [ AO , #A_PRE-8 ] + pld [ BO , #B_PRE ] fldmias BO!, { s8 - s9 } - pld [ BO , #B_PRE-8 ] fmuls s16 , s0, s8 fldmias AO!, { s2 - s3 } @@ -162,20 +162,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. pld [ AO , #A_PRE ] fmacs s16 , s4, s12 fmacs s17 , s5, s12 - fldmias AO!, { s0 - s1 } + fldmias AO!, { s0 - s3 } fmacs s18 , s6, s12 pld [ BO , #B_PRE ] fmacs s19 , s7, s12 fmacs s20 , s4, s13 - fldmias AO!, { s2 - s3 } + fldmias BO!, { s8 - s11 } fmacs s21 , s5, s13 fmacs s22 , s6, s13 - fldmias BO!, { s8 - s9 } + //fldmias AO!, { s2 - s3 } fmacs s23 , s7, s13 fmacs s24 , s4, s14 - fldmias BO!, { s10 - s11 } + //fldmias BO!, { s10 - s11 } fmacs s25 , s5, s14 fmacs s26 , s6, s14 fmacs s27 , s7, s14 @@ -191,17 +191,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x4_M1 fmacs s16 , s0, s8 - fldmias AO!, { s4 - s5 } + fldmias AO!, { s4 - s7 } fmacs s17 , s1, s8 fmacs s18 , s2, s8 - fldmias AO!, { s6 - s7 } + fldmias BO!, { s12 - s15 } + //fldmias AO!, { s6 - s7 } fmacs s19 , s3, s8 fmacs s20 , s0, s9 - fldmias BO!, { s12 - s13 } fmacs s21 , s1, s9 fmacs s22 , s2, s9 - fldmias BO!, { s14 - s15 } + //fldmias BO!, { s14 - s15 } fmacs s23 , s3, s9 fmacs s24 , s0, s10 @@ -248,10 +248,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x4_SUB flds s8 , [ BO ] - pld [ BO , #B_PRE ] flds s0 , [ AO ] - pld [ AO , #A_PRE ] flds s1 , [ AO, #4 ] fmacs s16 , s0, s8 @@ -284,16 +282,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x4 - pld [ CO1 , #C_PRE ] ldr r3 , LDC add CO2 , CO1, r3 flds s0, ALPHA add r4 , CO2, r3 - pld [ CO2 , #C_PRE ] fldmias CO1, { s8 - s11 } - pld [ r4 , #C_PRE ] fmacs s8 , s0 , s16 flds s12, [CO2] @@ -313,6 +308,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
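The KERNEL4x4_I/_M1/_M2/_E macros touched above form a two-stage software pipeline: _I primes the first operand registers, _M1 and _M2 alternate between the s0-s3/s8-s11 and s4-s7/s12-s15 register sets so each step's fldmias loads overlap the previous step's fmacs work, and _E drains the last set without issuing further loads (the hunks also widen the loads from two-register to four-register fldmias transfers). The same schedule in scalar C (illustrative):

    /* Load the next operands while multiplying the current ones. */
    static double dot_pipelined(const double *x, const double *y, long n)
    {
        double acc = 0.0;
        if (n <= 0) return acc;
        double xv = x[0], yv = y[0];        /* _I : prime the pipe     */
        for (long k = 1; k < n; k++) {
            double xn = x[k], yn = y[k];    /* _M1/_M2: load ahead     */
            acc += xv * yv;                 /* ... overlap with FMA    */
            xv = xn; yv = yn;
        }
        acc += xv * yv;                     /* _E : drain              */
        return acc;
    }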
fmacs s15, s0 , s23 fsts s11, [CO1, #12 ] + pld [ CO1 , #C_PRE ] + fldmias r4, { s8 - s11 } fmacs s8 , s0 , s24 @@ -324,9 +321,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmacs s11, s0 , s27 fsts s15, [CO2, #12 ] + pld [ CO2 , #C_PRE ] + add CO2, r4 , r3 - pld [ CO2 , #C_PRE ] fldmias CO2, { s12 - s15 } @@ -339,7 +337,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fsts s11, [r4 , #12 ] fmacs s15, s0 , s31 + pld [ r4 , #C_PRE ] fstmias CO2, { s12 - s15 } + pld [ CO2 , #C_PRE ] add CO1, CO1, #16 @@ -891,78 +891,29 @@ _L4_M4_20: mov BO, BC - asrs L , K1, #3 // L = L / 8 - cmp L , #3 - blt _L4_M4_30 - .align 5 + asrs L , K1, #1 // L = L / 8 + cmp L , #2 + blt _L4_M4_32 KERNEL4x4_I KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - sub L, L, #2 + subs L, L, #2 + ble _L4_M4_22a + .align 5 _L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 subs L, L, #1 bgt _L4_M4_22 - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 +_L4_M4_22a: - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_E - - b _L4_M4_44 - - -_L4_M4_30: - tst L, #3 - ble _L4_M4_40 - - tst L, #2 - ble _L4_M4_32 - - KERNEL4x4_I - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - - KERNEL4x4_M1 - KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_E @@ -974,13 +925,7 @@ _L4_M4_32: ble _L4_M4_40 KERNEL4x4_I - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 KERNEL4x4_E b _L4_M4_44 @@ -993,7 +938,7 @@ _L4_M4_40: _L4_M4_44: - ands L , K1, #7 // L = L % 8 + ands L , K1, #1 // L = L % 8 ble _L4_M4_100 _L4_M4_46: diff --git a/kernel/arm/sgemm_ncopy_4_vfpv3.S b/kernel/arm/sgemm_ncopy_4_vfpv3.S index 34fbb3252..8af7ed8f2 100644 --- a/kernel/arm/sgemm_ncopy_4_vfpv3.S +++ b/kernel/arm/sgemm_ncopy_4_vfpv3.S @@ -26,10 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2013/10/11 Saar -* BLASTEST : xOK -* CTEST : xOK -* TEST : xOK +* 2013/11/02 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK * **************************************************************************************/ @@ -218,6 +218,15 @@ _L4_M4_BEGIN: _L4_M4_20: + pld [ AO1, #A_PRE ] + pld [ AO2, #A_PRE ] + pld [ AO3, #A_PRE ] + pld [ AO4, #A_PRE ] + COPY4x4 + + subs I , I , #1 + ble _L4_M4_40 + COPY4x4 subs I , I , #1 diff --git a/param.h b/param.h index f6895d90b..ab0ed91b7 100644 --- a/param.h +++ b/param.h @@ -1814,17 +1814,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
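The param.h hunk that follows settles SGEMM on P=128, Q=240. The commits do not state the sizing rule, but across these three patches the P*Q*element footprints of the packed A panels land in a narrow band, consistent with a common cache budget on the Cortex-A9 target; a quick check (P/Q values copied from the final param.h state, the interpretation is inferred):

    #include <stdio.h>

    int main(void)
    {
        static const struct { const char *name; long p, q, elem; } cfg[] = {
            { "SGEMM", 128, 240,  4 },   /* float          */
            { "DGEMM", 128, 120,  8 },   /* double         */
            { "CGEMM",  96, 120,  8 },   /* float complex  */
            { "ZGEMM",  64, 120, 16 },   /* double complex */
        };
        for (int i = 0; i < 4; i++)      /* prints 120/120/90/120 KB */
            printf("%s: P*Q*elem = %6ld bytes (%3ld KB)\n", cfg[i].name,
                   cfg[i].p * cfg[i].q * cfg[i].elem,
                   cfg[i].p * cfg[i].q * cfg[i].elem / 1024);
        return 0;
    }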
#define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 2 -#define SGEMM_DEFAULT_P 192 +#define SGEMM_DEFAULT_P 128 #define DGEMM_DEFAULT_P 128 #define CGEMM_DEFAULT_P 96 #define ZGEMM_DEFAULT_P 64 -#define SGEMM_DEFAULT_Q 120 +#define SGEMM_DEFAULT_Q 240 #define DGEMM_DEFAULT_Q 120 #define CGEMM_DEFAULT_Q 120 #define ZGEMM_DEFAULT_Q 120 -#define SGEMM_DEFAULT_R 16384 +#define SGEMM_DEFAULT_R 12288 #define DGEMM_DEFAULT_R 8192 #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 From e31186efd49370280120f3f97b28ec182f71545b Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 2 Nov 2013 13:12:21 +0100 Subject: [PATCH 037/127] deleted obsolete dgemm_kernel and dtrmm_kernel --- kernel/arm/dgemm_kernel_8x2_vfpv3.S | 1112 -------------------- kernel/arm/dtrmm_kernel_8x2_vfpv3.S | 1521 --------------------------- 2 files changed, 2633 deletions(-) delete mode 100644 kernel/arm/dgemm_kernel_8x2_vfpv3.S delete mode 100644 kernel/arm/dtrmm_kernel_8x2_vfpv3.S diff --git a/kernel/arm/dgemm_kernel_8x2_vfpv3.S b/kernel/arm/dgemm_kernel_8x2_vfpv3.S deleted file mode 100644 index 6c1b0f5fd..000000000 --- a/kernel/arm/dgemm_kernel_8x2_vfpv3.S +++ /dev/null @@ -1,1112 +0,0 @@ -/*************************************************************************** -Copyright (c) 2013, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -/************************************************************************************** -* 2013/09/30 Saar -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* -* -* 2013/09/30 Saar -* UNROLL_N 2 -* UNROLL_M 8 -* DGEMM_P 64 -* DGEMM_Q 64 -* DGEMM_R 512 -* A_PRE 192 -* B_PRE 32 -* C_PRE 64 -* -* Performance on Odroid U2: -* -* 1 Core: 1.42 GFLOPS ATLAS: 1.58 GFLOPS -* 2 Cores: 2.81 GFLOPS ATLAS: - GFLOPS -* 3 Cores: 4.05 GFLOPS ATLAS: - GFLOPS -* 4 Cores: 5.40 GFLOPS ATLAS: 3.88 GFLOPS -**************************************************************************************/ - -#define ASSEMBLER -#include "common.h" - -#define STACKSIZE 252 - -#define OLD_M r0 -#define OLD_N r1 -#define OLD_K r2 -#define OLD_A r3 -#define OLD_ALPHA d0 - -/****************************************************** -* [fp, #-128] - [fp, #-64] is reserved -* for store and restore of floating point -* registers -*******************************************************/ - -#define C [fp, #-248 ] -#define LDC [fp, #-252 ] -#define M [fp, #-256 ] -#define N [fp, #-260 ] -#define K [fp, #-264 ] -#define A [fp, #-268 ] - -#define ALPHA [fp, #-276 ] - -#define B [fp, #4 ] -#define OLD_C [fp, #8 ] -#define OLD_LDC [fp, #12 ] - -#define I r0 -#define J r1 -#define L r2 - -#define AO r5 -#define BO r6 - -#define CO1 r8 -#define CO2 r9 - -#define K1 r7 -#define BC r12 - -#define A_PRE 192 -#define A_PRE1 224 -#define B_PRE 32 -#define C_PRE 64 - -/************************************************************************************** -* Macro definitions -**************************************************************************************/ - -.macro INIT8x2 - - vsub.f64 d8 , d8 , d8 - vmov.f64 d9 , d8 - vmov.f64 d10, d8 - vmov.f64 d11, d8 - vmov.f64 d12, d8 - vmov.f64 d13, d8 - vmov.f64 d14, d8 - vmov.f64 d15, d8 - vmov.f64 d16, d8 - vmov.f64 d17, d8 - vmov.f64 d18, d8 - vmov.f64 d19, d8 - vmov.f64 d20, d8 - vmov.f64 d21, d8 - vmov.f64 d22, d8 - vmov.f64 d23, d8 - -.endm - - - -.macro KERNEL8x2 - - fldmiad BO!, { d24 , d25} - fldd d0, [ AO ] - fmacd d8 , d0, d24 - fldd d1, [ AO , #8 ] - fmacd d16 , d0, d25 - fldd d2, [ AO , #16 ] - fmacd d9 , d1, d24 - fmacd d17 , d1, d25 - fldd d3, [ AO , #24 ] - fmacd d10 , d2, d24 - fmacd d18 , d2, d25 - fldd d4, [ AO , #32 ] - fmacd d11 , d3, d24 - pld [AO , #A_PRE] - fmacd d19 , d3, d25 - fldd d5, [ AO , #40 ] - - fmacd d12 , d4, d24 - fmacd d20 , d4, d25 - fldd d6, [ AO , #48 ] - fmacd d13 , d5, d24 - fmacd d21 , d5, d25 - - fldd d7, [ AO , #56 ] - fmacd d14 , d6, d24 - fmacd d22 , d6, d25 - pld [AO , #A_PRE+32] - fmacd d15 , d7, d24 - add AO, AO, #64 - fmacd d23 , d7, d25 - -.endm - -.macro SAVE8x2 - - vldr d0, ALPHA - - fldd d24, [CO1] - fldd d25, [CO1, #8 ] - - fmacd d24, d0 , d8 - fldd d8 , [CO2] - fldd d26, [CO1, #16] - fmacd d25, d0 , d9 - fldd d9 , [CO2, #8 ] - fldd d27, [CO1, #24] - fmacd d26, d0 , d10 - fldd d10 , [CO2, #16 ] - fldd d28, [CO1, #32] - fmacd d27, d0 , d11 - fldd d11 , [CO2, #24 ] - fldd d29, [CO1, #40] - fmacd d28, d0 , d12 - fldd d12 , [CO2, #32 ] - fldd d30, [CO1, #48] - fmacd d29, d0 , d13 - fldd d13 , [CO2, #40 ] - fldd d31, [CO1, #56] - fmacd d30, d0 , d14 - fldd d14 , [CO2, #48 ] - fmacd d31, d0 , d15 - fldd d15 , [CO2, #56 ] - - - fmacd d8 , d0 , d16 - fstd d24, [CO1] - fmacd d9 , d0 , d17 - fstd d25, [CO1, #8 ] - fstd d8 , [CO2] - fmacd d10, d0 , d18 - fstd d26, [CO1, #16 ] - fstd d9 , [CO2, #8 ] - fmacd d11, d0 , d19 - fstd d27, [CO1, #24 ] - fstd d10, [CO2, #16 ] - fmacd d12, d0 
, d20 - fstd d28, [CO1, #32 ] - fstd d11, [CO2, #24 ] - fmacd d13, d0 , d21 - fstd d29, [CO1, #40 ] - fstd d12, [CO2, #32 ] - fmacd d14, d0 , d22 - fstd d30, [CO1, #48 ] - fstd d13, [CO2, #40 ] - fmacd d15, d0 , d23 - fstd d31, [CO1, #56 ] - fstd d14, [CO2, #48 ] - - add CO1, CO1, #64 - fstd d15, [CO2, #56 ] - add CO2, CO2, #64 - - -.endm - -.macro SAVE8x2_BAD - - vldr d0, ALPHA - vldm CO2, { d24, d25, d26 , d27 , d28 , d29 , d30 , d31 } - - vmul.f64 d8 , d0 , d8 - vmul.f64 d9 , d0 , d9 - vmul.f64 d10, d0 , d10 - vmul.f64 d11, d0 , d11 - vmul.f64 d12, d0 , d12 - vmul.f64 d13, d0 , d13 - vmul.f64 d14, d0 , d14 - vmul.f64 d15, d0 , d15 - - vmul.f64 d16, d0 , d16 - vmul.f64 d17, d0 , d17 - vmul.f64 d18, d0 , d18 - vmul.f64 d19, d0 , d19 - vmul.f64 d20, d0 , d20 - vmul.f64 d21, d0 , d21 - vmul.f64 d22, d0 , d22 - vmul.f64 d23, d0 , d23 - - vldm CO1, { d0 , d1 , d2 , d3 , d4 , d5 , d6 , d7 } - - vadd.f64 d16, d16, d24 - vadd.f64 d17, d17, d25 - vadd.f64 d18, d18, d26 - vadd.f64 d19, d19, d27 - - vadd.f64 d20, d20, d28 - vadd.f64 d21, d21, d29 - vadd.f64 d22, d22, d30 - vadd.f64 d23, d23, d31 - - vadd.f64 d8 , d8 , d0 - vadd.f64 d9 , d9 , d1 - vadd.f64 d10, d10, d2 - vadd.f64 d11, d11, d3 - - vadd.f64 d12, d12, d4 - vadd.f64 d13, d13, d5 - vadd.f64 d14, d14, d6 - vadd.f64 d15, d15, d7 - - vstm CO2!, { d16, d17, d18 , d19 , d20 , d21 , d22 , d23 } - vstm CO1!, { d8 , d9 , d10 , d11 , d12, d13 , d14 , d15 } - -.endm - - - -/*************************************************************************************/ - - -.macro INIT4x2 - - vsub.f64 d8 , d8 , d8 - vsub.f64 d9 , d9 , d9 - vsub.f64 d10, d10, d10 - vsub.f64 d11, d11, d11 - vsub.f64 d12, d12, d12 - vsub.f64 d13, d13, d13 - vsub.f64 d14, d14, d14 - vsub.f64 d15, d15, d15 - -.endm - -.macro KERNEL4x2 - - vldm AO!, { d0, d1 , d2, d3 } - vldm BO!, { d4, d5 } - - vmul.f64 d6 , d0 , d4 - vmul.f64 d7 , d1 , d4 - vadd.f64 d8 , d8 , d6 - vadd.f64 d9 , d9 , d7 - - vmul.f64 d6 , d2 , d4 - vmul.f64 d7 , d3 , d4 - vadd.f64 d10, d10, d6 - vadd.f64 d11, d11, d7 - - vmul.f64 d6 , d0 , d5 - vmul.f64 d7 , d1 , d5 - vadd.f64 d12, d12, d6 - vadd.f64 d13, d13, d7 - - vmul.f64 d6 , d2 , d5 - vmul.f64 d7 , d3 , d5 - vadd.f64 d14, d14, d6 - vadd.f64 d15, d15, d7 - -.endm - -.macro SAVE4x2 - - vldr d0, ALPHA - - vmul.f64 d8 , d0 , d8 - vmul.f64 d9 , d0 , d9 - vmul.f64 d10, d0 , d10 - vmul.f64 d11, d0 , d11 - vmul.f64 d12, d0 , d12 - vmul.f64 d13, d0 , d13 - vmul.f64 d14, d0 , d14 - vmul.f64 d15, d0 , d15 - - vldm CO1, { d0, d1 , d2 , d3 } - vldm CO2, { d4, d5 , d6 , d7 } - - vadd.f64 d8 , d8 , d0 - vadd.f64 d9 , d9 , d1 - vadd.f64 d10, d10, d2 - vadd.f64 d11, d11, d3 - - vadd.f64 d12, d12, d4 - vadd.f64 d13, d13, d5 - vadd.f64 d14, d14, d6 - vadd.f64 d15, d15, d7 - - vstm CO1!, { d8 , d9 , d10 , d11 } - vstm CO2!, { d12, d13 ,d14 , d15 } - -.endm - - - -/*************************************************************************************/ - -.macro INIT2x2 - - vsub.f64 d8 , d8 , d8 - vsub.f64 d9 , d9 , d9 - vsub.f64 d12, d12, d12 - vsub.f64 d13, d13, d13 - -.endm - -.macro KERNEL2x2 - - vldm AO!, { d0, d1 } - vldm BO!, { d4, d5 } - - vmul.f64 d6 , d0 , d4 - vmul.f64 d7 , d1 , d4 - vadd.f64 d8 , d8 , d6 - vadd.f64 d9 , d9 , d7 - - vmul.f64 d6 , d0 , d5 - vmul.f64 d7 , d1 , d5 - vadd.f64 d12, d12, d6 - vadd.f64 d13, d13, d7 - -.endm - -.macro SAVE2x2 - - vldr d0, ALPHA - - vmul.f64 d8 , d0 , d8 - vmul.f64 d9 , d0 , d9 - vmul.f64 d12, d0 , d12 - vmul.f64 d13, d0 , d13 - - vldm CO1, { d0, d1 } - vldm CO2, { d4, d5 } - - vadd.f64 d8 , d8 , d0 - vadd.f64 d9 , d9 , d1 - - 
vadd.f64 d12, d12, d4 - vadd.f64 d13, d13, d5 - - vstm CO1!, { d8 , d9 } - vstm CO2!, { d12, d13 } - -.endm - -/*************************************************************************************/ - -.macro INIT1x2 - - vsub.f64 d8 , d8 , d8 - vsub.f64 d12, d12, d12 - -.endm - -.macro KERNEL1x2 - - vldm AO!, { d0 } - vldm BO!, { d4, d5 } - - vmul.f64 d6 , d0 , d4 - vadd.f64 d8 , d8 , d6 - - vmul.f64 d6 , d0 , d5 - vadd.f64 d12, d12, d6 - -.endm - -.macro SAVE1x2 - - vldr d0, ALPHA - - vmul.f64 d8 , d0 , d8 - vmul.f64 d12, d0 , d12 - - vldm CO1, { d0 } - vldm CO2, { d4 } - - vadd.f64 d8 , d8 , d0 - vadd.f64 d12, d12, d4 - - vstm CO1!, { d8 } - vstm CO2!, { d12} - -.endm - -/*************************************************************************************/ - -.macro INIT8x1 - - vsub.f64 d8 , d8 , d8 - vsub.f64 d9 , d9 , d9 - vsub.f64 d10, d10, d10 - vsub.f64 d11, d11, d11 - vsub.f64 d12, d12, d12 - vsub.f64 d13, d13, d13 - vsub.f64 d14, d14, d14 - vsub.f64 d15, d15, d15 - -.endm - -.macro KERNEL8x1 - - vldm AO!, { d0, d1 , d2, d3, d4 , d5 , d6 , d7 } - vldm BO!, { d24 } - - vmul.f64 d26 , d0 , d24 - vmul.f64 d27 , d1 , d24 - vadd.f64 d8 , d8 , d26 - vadd.f64 d9 , d9 , d27 - - vmul.f64 d28 , d2 , d24 - vmul.f64 d29 , d3 , d24 - vadd.f64 d10 , d10, d28 - vadd.f64 d11 , d11, d29 - - vmul.f64 d26 , d4 , d24 - vmul.f64 d27 , d5 , d24 - vadd.f64 d12 , d12, d26 - vadd.f64 d13 , d13, d27 - - vmul.f64 d28 , d6 , d24 - vmul.f64 d29 , d7 , d24 - vadd.f64 d14 , d14, d28 - vadd.f64 d15 , d15, d29 - - -.endm - -.macro SAVE8x1 - - vldr d0, ALPHA - - vmul.f64 d8 , d0 , d8 - vmul.f64 d9 , d0 , d9 - vmul.f64 d10, d0 , d10 - vmul.f64 d11, d0 , d11 - vmul.f64 d12, d0 , d12 - vmul.f64 d13, d0 , d13 - vmul.f64 d14, d0 , d14 - vmul.f64 d15, d0 , d15 - - vldm CO1, { d0, d1 , d2 , d3 , d4 , d5 , d6 , d7 } - - vadd.f64 d8 , d8 , d0 - vadd.f64 d9 , d9 , d1 - vadd.f64 d10, d10, d2 - vadd.f64 d11, d11, d3 - - vadd.f64 d12, d12, d4 - vadd.f64 d13, d13, d5 - vadd.f64 d14, d14, d6 - vadd.f64 d15, d15, d7 - - vstm CO1!, { d8 , d9 , d10 , d11 , d12, d13 ,d14 , d15 } - -.endm - - -/*************************************************************************************/ - -.macro INIT4x1 - - vsub.f64 d8 , d8 , d8 - vsub.f64 d9 , d9 , d9 - vsub.f64 d10, d10, d10 - vsub.f64 d11, d11, d11 - -.endm - -.macro KERNEL4x1 - - vldm AO!, { d0, d1 , d2, d3 } - vldm BO!, { d4 } - - vmul.f64 d6 , d0 , d4 - vmul.f64 d7 , d1 , d4 - vadd.f64 d8 , d8 , d6 - vadd.f64 d9 , d9 , d7 - - vmul.f64 d6 , d2 , d4 - vmul.f64 d7 , d3 , d4 - vadd.f64 d10, d10, d6 - vadd.f64 d11, d11, d7 - -.endm - -.macro SAVE4x1 - - vldr d0, ALPHA - - vmul.f64 d8 , d0 , d8 - vmul.f64 d9 , d0 , d9 - vmul.f64 d10, d0 , d10 - vmul.f64 d11, d0 , d11 - - vldm CO1, { d0, d1 , d2 , d3 } - - vadd.f64 d8 , d8 , d0 - vadd.f64 d9 , d9 , d1 - vadd.f64 d10, d10, d2 - vadd.f64 d11, d11, d3 - - vstm CO1!, { d8 , d9 , d10 , d11 } - -.endm - -/*************************************************************************************/ - -.macro INIT2x1 - - vsub.f64 d8 , d8 , d8 - vsub.f64 d9 , d9 , d9 - -.endm - -.macro KERNEL2x1 - - vldm AO!, { d0, d1 } - vldm BO!, { d4 } - - vmul.f64 d6 , d0 , d4 - vmul.f64 d7 , d1 , d4 - vadd.f64 d8 , d8 , d6 - vadd.f64 d9 , d9 , d7 - -.endm - -.macro SAVE2x1 - - vldr d0, ALPHA - - vmul.f64 d8 , d0 , d8 - vmul.f64 d9 , d0 , d9 - - vldm CO1, { d0, d1 } - - vadd.f64 d8 , d8 , d0 - vadd.f64 d9 , d9 , d1 - - vstm CO1!, { d8 , d9 } - -.endm - -/*************************************************************************************/ - -.macro INIT1x1 - - 
vsub.f64 d8 , d8 , d8 - -.endm - -.macro KERNEL1x1 - - vldm AO!, { d0 } - vldm BO!, { d4 } - - vmul.f64 d6 , d0 , d4 - vadd.f64 d8 , d8 , d6 - -.endm - -.macro SAVE1x1 - - vldr d0, ALPHA - - vmul.f64 d8 , d0 , d8 - - vldm CO1, { d0 } - - vadd.f64 d8 , d8 , d0 - - vstm CO1!, { d8 } - -.endm - - - -/************************************************************************************** -* End of macro definitions -**************************************************************************************/ - - PROLOGUE - - .align 5 - - push {r4 - r9, fp} - add fp, sp, #24 - sub sp, sp, #STACKSIZE // reserve stack - - str OLD_M, M - str OLD_N, N - str OLD_K, K - str OLD_A, A - vstr OLD_ALPHA, ALPHA - - sub r3, fp, #128 - vstm r3, { d8 - d15} // store floating point registers - - ldr r3, OLD_LDC - lsl r3, r3, #3 // ldc = ldc * 8 - str r3, LDC - - ldr r3, OLD_C - str r3, C - - ldr K1, K - ldr BC, B - - ldr J, N - asrs J, J, #1 // J = J / 2 - ble _L1_BEGIN - -_L2_BEGIN: - - ldr CO1, C // CO1 = C - ldr r4 , LDC - add CO2, CO1, r4 // CO2 = C + LDC - add r3 , CO2, r4 // C = CO2 + LDC - str r3 , C // store C - - ldr AO, A // AO = A - pld [AO , #A_PRE-96] - pld [AO , #A_PRE-64] - pld [AO , #A_PRE-32] - -_L2_M8_BEGIN: - - ldr I, M - asrs I, I, #3 // I = I / 8 - ble _L2_M4_BEGIN - -_L2_M8_20: - - pld [CO1, #C_PRE] - pld [CO1, #C_PRE+32] - pld [CO2, #C_PRE] - pld [CO2, #C_PRE+32] - INIT8x2 - - mov BO, BC - asrs L , K1, #3 // L = L / 8 - ble _L2_M8_40 - .align 5 - -_L2_M8_22: - - pld [BO , #B_PRE] - KERNEL8x2 - KERNEL8x2 - pld [BO , #B_PRE] - KERNEL8x2 - KERNEL8x2 - - pld [BO , #B_PRE] - KERNEL8x2 - KERNEL8x2 - pld [BO , #B_PRE] - KERNEL8x2 - KERNEL8x2 - - subs L, L, #1 - bgt _L2_M8_22 - - -_L2_M8_40: - - ands L , K1, #7 // L = L % 8 - ble _L2_M8_100 - -_L2_M8_42: - - KERNEL8x2 - - subs L, L, #1 - bgt _L2_M8_42 - -_L2_M8_100: - - SAVE8x2 - -_L2_M8_END: - - subs I, I, #1 - bgt _L2_M8_20 - - -_L2_M4_BEGIN: - - ldr I, M - tst I , #7 - ble _L2_END - - tst I , #4 - ble _L2_M2_BEGIN - -_L2_M4_20: - - INIT4x2 - - mov BO, BC - asrs L , K1, #3 // L = L / 8 - ble _L2_M4_40 - .align 5 - -_L2_M4_22: - - KERNEL4x2 - KERNEL4x2 - KERNEL4x2 - KERNEL4x2 - - KERNEL4x2 - KERNEL4x2 - KERNEL4x2 - KERNEL4x2 - - subs L, L, #1 - bgt _L2_M4_22 - - -_L2_M4_40: - - ands L , K1, #7 // L = L % 8 - ble _L2_M4_100 - -_L2_M4_42: - - KERNEL4x2 - - subs L, L, #1 - bgt _L2_M4_42 - -_L2_M4_100: - - SAVE4x2 - -_L2_M4_END: - - - -_L2_M2_BEGIN: - - tst I, #2 // I = I / 2 - ble _L2_M1_BEGIN - -_L2_M2_20: - - INIT2x2 - - mov BO, BC - asrs L , K1, #2 // L = L / 4 - ble _L2_M2_40 - -_L2_M2_22: - - KERNEL2x2 - KERNEL2x2 - KERNEL2x2 - KERNEL2x2 - - subs L, L, #1 - bgt _L2_M2_22 - - -_L2_M2_40: - - ands L , K1, #3 // L = L % 4 - ble _L2_M2_100 - -_L2_M2_42: - - KERNEL2x2 - - subs L, L, #1 - bgt _L2_M2_42 - -_L2_M2_100: - - SAVE2x2 - -_L2_M2_END: - - -_L2_M1_BEGIN: - - tst I, #1 // I = I % 2 - ble _L2_END - -_L2_M1_20: - - INIT1x2 - - mov BO, BC - asrs L , K1, #2 // L = L / 4 - ble _L2_M1_40 - -_L2_M1_22: - - KERNEL1x2 - KERNEL1x2 - KERNEL1x2 - KERNEL1x2 - - subs L, L, #1 - bgt _L2_M1_22 - - -_L2_M1_40: - - ands L , K1, #3 // L = L % 4 - ble _L2_M1_100 - -_L2_M1_42: - - KERNEL1x2 - - subs L, L, #1 - bgt _L2_M1_42 - -_L2_M1_100: - - SAVE1x2 - - -_L2_END: - - mov r3, BC - mov r4, K1 - lsl r4, r4, #4 // k * 2 * 8 - add r3, r3, r4 // B = B + K * 2 * 8 - mov BC, r3 - - subs J , #1 // j-- - bgt _L2_BEGIN - - -_L1_BEGIN: - - ldr J, N - tst J , #1 // J = J % 2 - ble _L999 - - ldr CO1, C // CO1 = C - ldr r4 , LDC - add r3 , CO1, r4 // C = CO1 + LDC - str r3 , C // 
store C - - ldr AO, A // AO = A - - - -_L1_M8_BEGIN: - - ldr I, M - asrs I, I, #3 // I = I / 8 - ble _L1_M4_BEGIN - -_L1_M8_20: - - INIT8x1 - - mov BO, BC - asrs L , K1, #3 // L = L / 8 - ble _L1_M8_40 - -_L1_M8_22: - - KERNEL8x1 - KERNEL8x1 - KERNEL8x1 - KERNEL8x1 - KERNEL8x1 - KERNEL8x1 - KERNEL8x1 - KERNEL8x1 - - subs L, L, #1 - bgt _L1_M8_22 - - -_L1_M8_40: - - ands L , K1, #7 // L = L % 8 - ble _L1_M8_100 - -_L1_M8_42: - - KERNEL8x1 - - subs L, L, #1 - bgt _L1_M8_42 - -_L1_M8_100: - - SAVE8x1 - -_L1_M8_END: - - subs I, I, #1 - bgt _L1_M8_20 - - - - -_L1_M4_BEGIN: - - ldr I, M - tst I, #7 // I = I % 8 - ble _L1_END - - tst I, #4 // I = I % 8 - ble _L1_M2_BEGIN - -_L1_M4_20: - - INIT4x1 - - mov BO, BC - asrs L , K1, #2 // L = L / 4 - ble _L1_M4_40 - -_L1_M4_22: - - KERNEL4x1 - KERNEL4x1 - KERNEL4x1 - KERNEL4x1 - - subs L, L, #1 - bgt _L1_M4_22 - - -_L1_M4_40: - - ands L , K1, #3 // L = L % 4 - ble _L1_M4_100 - -_L1_M4_42: - - KERNEL4x1 - - subs L, L, #1 - bgt _L1_M4_42 - -_L1_M4_100: - - SAVE4x1 - -_L1_M4_END: - - - - -_L1_M2_BEGIN: - - tst I, #2 // I = I % 4 - ble _L1_M1_BEGIN - -_L1_M2_20: - - INIT2x1 - - mov BO, BC - asrs L , K1, #2 // L = L / 4 - ble _L1_M2_40 - -_L1_M2_22: - - KERNEL2x1 - KERNEL2x1 - KERNEL2x1 - KERNEL2x1 - - subs L, L, #1 - bgt _L1_M2_22 - - -_L1_M2_40: - - ands L , K1, #3 // L = L % 4 - ble _L1_M2_100 - -_L1_M2_42: - - KERNEL2x1 - - subs L, L, #1 - bgt _L1_M2_42 - -_L1_M2_100: - - SAVE2x1 - -_L1_M2_END: - - - -_L1_M1_BEGIN: - - tst I, #1 // I = I % 4 - ble _L1_END - -_L1_M1_20: - - INIT1x1 - - mov BO, BC - asrs L , K1, #2 // L = L / 4 - ble _L1_M1_40 - -_L1_M1_22: - - KERNEL1x1 - KERNEL1x1 - KERNEL1x1 - KERNEL1x1 - - subs L, L, #1 - bgt _L1_M1_22 - - -_L1_M1_40: - - ands L , K1, #3 // L = L % 4 - ble _L1_M1_100 - -_L1_M1_42: - - KERNEL1x1 - - subs L, L, #1 - bgt _L1_M1_42 - -_L1_M1_100: - - SAVE1x1 - - -_L1_END: - - - -_L999: - - sub r3, fp, #128 - vldm r3, { d8 - d15} // restore floating point registers - - movs r0, #0 // set return value - sub sp, fp, #24 - pop {r4 - r9, fp} - bx lr - - EPILOGUE - diff --git a/kernel/arm/dtrmm_kernel_8x2_vfpv3.S b/kernel/arm/dtrmm_kernel_8x2_vfpv3.S deleted file mode 100644 index 930616635..000000000 --- a/kernel/arm/dtrmm_kernel_8x2_vfpv3.S +++ /dev/null @@ -1,1521 +0,0 @@ -/*************************************************************************** -Copyright (c) 2013, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -/************************************************************************************** -* 2013/09/28 Saar -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* -**************************************************************************************/ - -#define ASSEMBLER -#include "common.h" - -#define STACKSIZE 252 - -#define OLD_M r0 -#define OLD_N r1 -#define OLD_K r2 -#define OLD_A r3 -#define OLD_ALPHA d0 - -/****************************************************** -* [fp, #-128] - [fp, #-64] is reserved -* for store and restore of floating point -* registers -*******************************************************/ - - -#define KK [fp, #-240 ] -#define KKK [fp, #-244] -#define C [fp, #-248 ] -#define LDC [fp, #-252 ] -#define M [fp, #-256 ] -#define N [fp, #-260 ] -#define K [fp, #-264 ] -#define A [fp, #-268 ] - -#define ALPHA [fp, #-276 ] - -#define B [fp, #4 ] -#define OLD_C [fp, #8 ] -#define OLD_LDC [fp, #12 ] -#define OFFSET [fp, #16 ] - -#define I r0 -#define J r1 -#define L r2 - -#define AO r5 -#define BO r6 - -#define CO1 r8 -#define CO2 r9 - -#define K1 r7 -#define BC r12 - -#define A_PRE 128 -#define A_PRE1 160 -#define B_PRE 128 -#define C_PRE 32 - -/************************************************************************************** -* Macro definitions -**************************************************************************************/ - -.macro INIT8x2 - - vsub.f64 d8 , d8 , d8 - vmov.f64 d9 , d8 - vmov.f64 d10, d8 - vmov.f64 d11, d8 - vmov.f64 d12, d8 - vmov.f64 d13, d8 - vmov.f64 d14, d8 - vmov.f64 d15, d8 - vmov.f64 d16, d8 - vmov.f64 d17, d8 - vmov.f64 d18, d8 - vmov.f64 d19, d8 - vmov.f64 d20, d8 - vmov.f64 d21, d8 - vmov.f64 d22, d8 - vmov.f64 d23, d8 - -.endm - -.macro KERNEL8x2_START - - vldm BO!, { d24 , d25} - vldm AO!, { d0, d1 , d2, d3, d4 , d5 , d6 , d7 } - - vmul.f64 d26 , d0 , d24 - vmul.f64 d29 , d0 , d25 - vmul.f64 d27 , d1 , d24 - vmul.f64 d30 , d1 , d25 - vmul.f64 d28 , d2 , d24 - vmul.f64 d31 , d2 , d25 - - pld [AO , #A_PRE] - - vadd.f64 d8 , d8 , d26 - vadd.f64 d16 , d16, d29 - vadd.f64 d9 , d9 , d27 - vadd.f64 d17 , d17, d30 - vadd.f64 d10 , d10, d28 - vadd.f64 d18 , d18, d31 - - vmul.f64 d26 , d3 , d24 - vmul.f64 d27 , d4 , d24 - vmul.f64 d28 , d5 , d24 - vmul.f64 d29 , d3 , d25 - vmul.f64 d30 , d4 , d25 - vmul.f64 d31 , d5 , d25 - - pld [AO , #A_PRE1] - - vadd.f64 d11 , d11, d26 - vadd.f64 d12 , d12, d27 - vadd.f64 d13 , d13, d28 - vadd.f64 d19 , d19, d29 - vadd.f64 d20 , d20, d30 - vadd.f64 d21 , d21, d31 - - vmul.f64 d26 , d6 , d24 - vmul.f64 d27 , d7 , d24 - vmul.f64 d29 , d6 , d25 - vmul.f64 d30 , d7 , d25 - - vldm BO!, { d24 , d25} - vldm AO!, { d0, d1 , d2, d3, d4 , d5 , d6 , d7 } - - vadd.f64 d14 , d14, d26 - vadd.f64 d15 , d15, d27 - vadd.f64 d22 , d22, d29 - vadd.f64 d23 , d23, d30 - -.endm - - -.macro KERNEL8x2_M - - vmul.f64 d26 , d0 , d24 - vmul.f64 d29 , d0 , d25 - vmul.f64 d27 , d1 , d24 - vmul.f64 d30 , d1 , d25 - vmul.f64 d28 , d2 , d24 - 
vmul.f64 d31 , d2 , d25 - - pld [AO , #A_PRE] - - vadd.f64 d8 , d8 , d26 - vadd.f64 d16 , d16, d29 - vadd.f64 d9 , d9 , d27 - vadd.f64 d17 , d17, d30 - vadd.f64 d10 , d10, d28 - vadd.f64 d18 , d18, d31 - - vmul.f64 d26 , d3 , d24 - vmul.f64 d27 , d4 , d24 - vmul.f64 d28 , d5 , d24 - vmul.f64 d29 , d3 , d25 - vmul.f64 d30 , d4 , d25 - vmul.f64 d31 , d5 , d25 - - pld [AO , #A_PRE1] - - vadd.f64 d11 , d11, d26 - vadd.f64 d12 , d12, d27 - vadd.f64 d13 , d13, d28 - vadd.f64 d19 , d19, d29 - vadd.f64 d20 , d20, d30 - vadd.f64 d21 , d21, d31 - - vmul.f64 d26 , d6 , d24 - vmul.f64 d27 , d7 , d24 - vmul.f64 d29 , d6 , d25 - vmul.f64 d30 , d7 , d25 - - vldm BO!, { d24 , d25} - vldm AO!, { d0, d1 , d2, d3, d4 , d5 , d6 , d7 } - - vadd.f64 d14 , d14, d26 - vadd.f64 d15 , d15, d27 - vadd.f64 d22 , d22, d29 - vadd.f64 d23 , d23, d30 - -.endm - - -.macro KERNEL8x2_END - - vmul.f64 d26 , d0 , d24 - vmul.f64 d29 , d0 , d25 - vmul.f64 d27 , d1 , d24 - vmul.f64 d30 , d1 , d25 - vmul.f64 d28 , d2 , d24 - vmul.f64 d31 , d2 , d25 - - pld [AO , #A_PRE] - - vadd.f64 d8 , d8 , d26 - vadd.f64 d16 , d16, d29 - vadd.f64 d9 , d9 , d27 - vadd.f64 d17 , d17, d30 - vadd.f64 d10 , d10, d28 - vadd.f64 d18 , d18, d31 - - vmul.f64 d26 , d3 , d24 - vmul.f64 d27 , d4 , d24 - vmul.f64 d28 , d5 , d24 - vmul.f64 d29 , d3 , d25 - vmul.f64 d30 , d4 , d25 - vmul.f64 d31 , d5 , d25 - - pld [AO , #A_PRE1] - - vadd.f64 d11 , d11, d26 - vadd.f64 d12 , d12, d27 - vadd.f64 d13 , d13, d28 - vadd.f64 d19 , d19, d29 - vadd.f64 d20 , d20, d30 - vadd.f64 d21 , d21, d31 - - vmul.f64 d26 , d6 , d24 - vmul.f64 d27 , d7 , d24 - vmul.f64 d29 , d6 , d25 - vmul.f64 d30 , d7 , d25 - - vadd.f64 d14 , d14, d26 - vadd.f64 d15 , d15, d27 - vadd.f64 d22 , d22, d29 - vadd.f64 d23 , d23, d30 - -.endm - - - - -.macro KERNEL8x2 - - vldm BO!, { d24 , d25} - vldm AO!, { d0, d1 , d2, d3, d4 , d5 , d6 , d7 } - - vmul.f64 d26 , d0 , d24 - vmul.f64 d29 , d0 , d25 - vmul.f64 d27 , d1 , d24 - vmul.f64 d30 , d1 , d25 - vmul.f64 d28 , d2 , d24 - vmul.f64 d31 , d2 , d25 - - pld [AO , #A_PRE] - - vadd.f64 d8 , d8 , d26 - vadd.f64 d16 , d16, d29 - vadd.f64 d9 , d9 , d27 - vadd.f64 d17 , d17, d30 - vadd.f64 d10 , d10, d28 - vadd.f64 d18 , d18, d31 - - vmul.f64 d26 , d3 , d24 - vmul.f64 d27 , d4 , d24 - vmul.f64 d28 , d5 , d24 - vmul.f64 d29 , d3 , d25 - vmul.f64 d30 , d4 , d25 - vmul.f64 d31 , d5 , d25 - - pld [AO , #A_PRE1] - - vadd.f64 d11 , d11, d26 - vadd.f64 d12 , d12, d27 - vadd.f64 d13 , d13, d28 - vadd.f64 d19 , d19, d29 - vadd.f64 d20 , d20, d30 - vadd.f64 d21 , d21, d31 - - vmul.f64 d26 , d6 , d24 - vmul.f64 d27 , d7 , d24 - vmul.f64 d29 , d6 , d25 - vmul.f64 d30 , d7 , d25 - - vadd.f64 d14 , d14, d26 - vadd.f64 d15 , d15, d27 - vadd.f64 d22 , d22, d29 - vadd.f64 d23 , d23, d30 - -.endm - -.macro SAVE8x2 - - vldr d0, ALPHA - - vmul.f64 d8 , d0 , d8 - vmul.f64 d9 , d0 , d9 - vmul.f64 d10, d0 , d10 - vmul.f64 d11, d0 , d11 - vmul.f64 d12, d0 , d12 - vmul.f64 d13, d0 , d13 - vmul.f64 d14, d0 , d14 - vmul.f64 d15, d0 , d15 - - vmul.f64 d16, d0 , d16 - vmul.f64 d17, d0 , d17 - vmul.f64 d18, d0 , d18 - vmul.f64 d19, d0 , d19 - vmul.f64 d20, d0 , d20 - vmul.f64 d21, d0 , d21 - vmul.f64 d22, d0 , d22 - vmul.f64 d23, d0 , d23 - - vstm CO2!, { d16, d17, d18 , d19 , d20 , d21 , d22 , d23 } - vstm CO1!, { d8 , d9 , d10 , d11 , d12, d13 , d14 , d15 } - -.endm - - -/*************************************************************************************/ - - -.macro INIT4x2 - - vsub.f64 d8 , d8 , d8 - vsub.f64 d9 , d9 , d9 - vsub.f64 d10, d10, d10 - vsub.f64 d11, d11, d11 - 
vsub.f64 d12, d12, d12 - vsub.f64 d13, d13, d13 - vsub.f64 d14, d14, d14 - vsub.f64 d15, d15, d15 - -.endm - -.macro KERNEL4x2 - - vldm AO!, { d0, d1 , d2, d3 } - vldm BO!, { d4, d5 } - - vmul.f64 d6 , d0 , d4 - vmul.f64 d7 , d1 , d4 - vadd.f64 d8 , d8 , d6 - vadd.f64 d9 , d9 , d7 - - vmul.f64 d6 , d2 , d4 - vmul.f64 d7 , d3 , d4 - vadd.f64 d10, d10, d6 - vadd.f64 d11, d11, d7 - - vmul.f64 d6 , d0 , d5 - vmul.f64 d7 , d1 , d5 - vadd.f64 d12, d12, d6 - vadd.f64 d13, d13, d7 - - vmul.f64 d6 , d2 , d5 - vmul.f64 d7 , d3 , d5 - vadd.f64 d14, d14, d6 - vadd.f64 d15, d15, d7 - -.endm - -.macro SAVE4x2 - - vldr d0, ALPHA - - vmul.f64 d8 , d0 , d8 - vmul.f64 d9 , d0 , d9 - vmul.f64 d10, d0 , d10 - vmul.f64 d11, d0 , d11 - vmul.f64 d12, d0 , d12 - vmul.f64 d13, d0 , d13 - vmul.f64 d14, d0 , d14 - vmul.f64 d15, d0 , d15 - - vstm CO1!, { d8 , d9 , d10 , d11 } - vstm CO2!, { d12, d13 ,d14 , d15 } - -.endm - - - -/*************************************************************************************/ - -.macro INIT2x2 - - vsub.f64 d8 , d8 , d8 - vsub.f64 d9 , d9 , d9 - vsub.f64 d12, d12, d12 - vsub.f64 d13, d13, d13 - -.endm - -.macro KERNEL2x2 - - vldm AO!, { d0, d1 } - vldm BO!, { d4, d5 } - - vmul.f64 d6 , d0 , d4 - vmul.f64 d7 , d1 , d4 - vadd.f64 d8 , d8 , d6 - vadd.f64 d9 , d9 , d7 - - vmul.f64 d6 , d0 , d5 - vmul.f64 d7 , d1 , d5 - vadd.f64 d12, d12, d6 - vadd.f64 d13, d13, d7 - -.endm - -.macro SAVE2x2 - - vldr d0, ALPHA - - vmul.f64 d8 , d0 , d8 - vmul.f64 d9 , d0 , d9 - vmul.f64 d12, d0 , d12 - vmul.f64 d13, d0 , d13 - - vstm CO1!, { d8 , d9 } - vstm CO2!, { d12, d13 } - -.endm - -/*************************************************************************************/ - -.macro INIT1x2 - - vsub.f64 d8 , d8 , d8 - vsub.f64 d12, d12, d12 - -.endm - -.macro KERNEL1x2 - - vldm AO!, { d0 } - vldm BO!, { d4, d5 } - - vmul.f64 d6 , d0 , d4 - vadd.f64 d8 , d8 , d6 - - vmul.f64 d6 , d0 , d5 - vadd.f64 d12, d12, d6 - -.endm - -.macro SAVE1x2 - - vldr d0, ALPHA - - vmul.f64 d8 , d0 , d8 - vmul.f64 d12, d0 , d12 - - vstm CO1!, { d8 } - vstm CO2!, { d12} - -.endm - -/*************************************************************************************/ - -.macro INIT8x1 - - vsub.f64 d8 , d8 , d8 - vsub.f64 d9 , d9 , d9 - vsub.f64 d10, d10, d10 - vsub.f64 d11, d11, d11 - vsub.f64 d12, d12, d12 - vsub.f64 d13, d13, d13 - vsub.f64 d14, d14, d14 - vsub.f64 d15, d15, d15 - -.endm - -.macro KERNEL8x1 - - vldm AO!, { d0, d1 , d2, d3, d4 , d5 , d6 , d7 } - vldm BO!, { d24 } - - vmul.f64 d26 , d0 , d24 - vmul.f64 d27 , d1 , d24 - vadd.f64 d8 , d8 , d26 - vadd.f64 d9 , d9 , d27 - - vmul.f64 d28 , d2 , d24 - vmul.f64 d29 , d3 , d24 - vadd.f64 d10 , d10, d28 - vadd.f64 d11 , d11, d29 - - vmul.f64 d26 , d4 , d24 - vmul.f64 d27 , d5 , d24 - vadd.f64 d12 , d12, d26 - vadd.f64 d13 , d13, d27 - - vmul.f64 d28 , d6 , d24 - vmul.f64 d29 , d7 , d24 - vadd.f64 d14 , d14, d28 - vadd.f64 d15 , d15, d29 - - -.endm - -.macro SAVE8x1 - - vldr d0, ALPHA - - vmul.f64 d8 , d0 , d8 - vmul.f64 d9 , d0 , d9 - vmul.f64 d10, d0 , d10 - vmul.f64 d11, d0 , d11 - vmul.f64 d12, d0 , d12 - vmul.f64 d13, d0 , d13 - vmul.f64 d14, d0 , d14 - vmul.f64 d15, d0 , d15 - - vstm CO1!, { d8 , d9 , d10 , d11 , d12, d13 ,d14 , d15 } - -.endm - - -/*************************************************************************************/ - -.macro INIT4x1 - - vsub.f64 d8 , d8 , d8 - vsub.f64 d9 , d9 , d9 - vsub.f64 d10, d10, d10 - vsub.f64 d11, d11, d11 - -.endm - -.macro KERNEL4x1 - - vldm AO!, { d0, d1 , d2, d3 } - vldm BO!, { d4 } - - vmul.f64 d6 , d0 , d4 - 
vmul.f64 d7 , d1 , d4 - vadd.f64 d8 , d8 , d6 - vadd.f64 d9 , d9 , d7 - - vmul.f64 d6 , d2 , d4 - vmul.f64 d7 , d3 , d4 - vadd.f64 d10, d10, d6 - vadd.f64 d11, d11, d7 - -.endm - -.macro SAVE4x1 - - vldr d0, ALPHA - - vmul.f64 d8 , d0 , d8 - vmul.f64 d9 , d0 , d9 - vmul.f64 d10, d0 , d10 - vmul.f64 d11, d0 , d11 - - vstm CO1!, { d8 , d9 , d10 , d11 } - -.endm - -/*************************************************************************************/ - -.macro INIT2x1 - - vsub.f64 d8 , d8 , d8 - vsub.f64 d9 , d9 , d9 - -.endm - -.macro KERNEL2x1 - - vldm AO!, { d0, d1 } - vldm BO!, { d4 } - - vmul.f64 d6 , d0 , d4 - vmul.f64 d7 , d1 , d4 - vadd.f64 d8 , d8 , d6 - vadd.f64 d9 , d9 , d7 - -.endm - -.macro SAVE2x1 - - vldr d0, ALPHA - - vmul.f64 d8 , d0 , d8 - vmul.f64 d9 , d0 , d9 - - vstm CO1!, { d8 , d9 } - -.endm - -/*************************************************************************************/ - -.macro INIT1x1 - - vsub.f64 d8 , d8 , d8 - -.endm - -.macro KERNEL1x1 - - vldm AO!, { d0 } - vldm BO!, { d4 } - - vmul.f64 d6 , d0 , d4 - vadd.f64 d8 , d8 , d6 - -.endm - -.macro SAVE1x1 - - vldr d0, ALPHA - - vmul.f64 d8 , d0 , d8 - - vstm CO1!, { d8 } - -.endm - - - -/************************************************************************************** -* End of macro definitions -**************************************************************************************/ - - PROLOGUE - - .align 5 - - push {r4 - r9, fp} - add fp, sp, #24 - sub sp, sp, #STACKSIZE // reserve stack - - str OLD_M, M - str OLD_N, N - str OLD_K, K - str OLD_A, A - vstr OLD_ALPHA, ALPHA - - sub r3, fp, #128 - vstm r3, { d8 - d15} // store floating point registers - - ldr r3, OLD_LDC - lsl r3, r3, #3 // ldc = ldc * 8 - str r3, LDC - - ldr r3, OLD_C - str r3, C - - ldr BC, B - - ldr r3, OFFSET -#ifndef LEFT - neg r3 , r3 -#endif - str r3 , KK - - ldr J, N - asrs J, J, #1 // J = J / 2 - ble _L1_BEGIN - -_L2_BEGIN: - - ldr CO1, C // CO1 = C - ldr r4 , LDC - add CO2, CO1, r4 // CO2 = C + LDC - add r3 , CO2, r4 // C = CO2 + LDC - str r3 , C // store C - -#if defined(LEFT) - ldr r3 , OFFSET - str r3 , KK -#endif - - ldr AO, A // AO = A - -_L2_M8_BEGIN: - - ldr I, M - asrs I, I, #3 // I = I / 8 - ble _L2_M4_BEGIN - -_L2_M8_20: - - pld [CO1, #C_PRE] - pld [CO2, #C_PRE] - - INIT8x2 - -#if (defined(LEFT) && defined(TRANSA)) || \ - (!defined(LEFT) && !defined(TRANSA)) - - mov BO, BC -#else - mov BO, BC - ldr r3 , KK - lsls r4 , r3 , #4 // 2 double values - add BO , BO , r4 - lsls r4 , r3 , #6 // 8 double values - add AO , AO , r4 - -#endif - -#ifndef TRMMKERNEL - ldr L , K -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - ldr L , K - ldr r3, KK - sub L , L, r3 - str L , KKK -#else - ldr L , KK -#ifdef LEFT - add L , L , #8 // number of values in AO -#else - add L , L , #2 // number of values in BO -#endif - str L , KKK -#endif - - asrs K1 , L, #3 // L = L / 8 - ble _L2_M8_40 - .align 5 - -_L2_M8_22: - - pld [BO , #B_PRE] - KERNEL8x2_START - KERNEL8x2_M - pld [BO , #B_PRE] - KERNEL8x2_M - KERNEL8x2_M - - pld [BO , #B_PRE] - KERNEL8x2_M - KERNEL8x2_M - pld [BO , #B_PRE] - KERNEL8x2_M - KERNEL8x2_END - - subs K1, K1, #1 - bgt _L2_M8_22 - - -_L2_M8_40: - - ands K1 , L, #7 // L = L % 8 - ble _L2_M8_100 - -_L2_M8_42: - - KERNEL8x2 - - subs K1, K1, #1 - bgt _L2_M8_42 - -_L2_M8_100: - - SAVE8x2 - -#if (defined(LEFT) && defined(TRANSA)) || \ - (!defined(LEFT) && !defined(TRANSA)) - ldr r3 , K - ldr r4 , KKK - sub r3 , r3 , r4 - lsls r4 , r3 , #4 // 2 double values - add BO , BO , r4 - lsls r4 , r3 , #6 
// 8 double values - add AO , AO , r4 -#endif - -#if defined(LEFT) - ldr r3 , KK - add r3 , r3 , #8 // number of values in AO - str r3 , KK -#endif - - -_L2_M8_END: - - subs I, I, #1 - bgt _L2_M8_20 - - -_L2_M4_BEGIN: - - ldr I, M - tst I , #7 - ble _L2_END - - tst I , #4 - ble _L2_M2_BEGIN - -_L2_M4_20: - - INIT4x2 - -#if (defined(LEFT) && defined(TRANSA)) || \ - (!defined(LEFT) && !defined(TRANSA)) - - mov BO, BC -#else - mov BO, BC - ldr r3 , KK - lsls r4 , r3 , #4 // 2 double values - add BO , BO , r4 - lsls r4 , r3 , #5 // 4 double values - add AO , AO , r4 - -#endif - -#ifndef TRMMKERNEL - ldr L , K -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - ldr L , K - ldr r3, KK - sub L , L, r3 - str L , KKK -#else - ldr L , KK -#ifdef LEFT - add L , L , #4 // number of values in AO -#else - add L , L , #2 // number of values in BO -#endif - str L , KKK -#endif - - asrs K1, L, #3 // L = L / 8 - ble _L2_M4_40 - .align 5 - -_L2_M4_22: - - KERNEL4x2 - KERNEL4x2 - KERNEL4x2 - KERNEL4x2 - - KERNEL4x2 - KERNEL4x2 - KERNEL4x2 - KERNEL4x2 - - subs K1, K1, #1 - bgt _L2_M4_22 - - -_L2_M4_40: - - ands K1, L, #7 // L = L % 8 - ble _L2_M4_100 - -_L2_M4_42: - - KERNEL4x2 - - subs K1, K1, #1 - bgt _L2_M4_42 - -_L2_M4_100: - - SAVE4x2 - -#if (defined(LEFT) && defined(TRANSA)) || \ - (!defined(LEFT) && !defined(TRANSA)) - ldr r3 , K - ldr r4 , KKK - sub r3 , r3 , r4 - lsls r4 , r3 , #4 // 2 double values - add BO , BO , r4 - lsls r4 , r3 , #5 // 4 double values - add AO , AO , r4 -#endif - -#if defined(LEFT) - ldr r3 , KK - add r3 , r3 , #4 // number of values in AO - str r3 , KK -#endif - -_L2_M4_END: - - - -_L2_M2_BEGIN: - - tst I, #2 // I = I / 2 - ble _L2_M1_BEGIN - -_L2_M2_20: - - INIT2x2 - -#if (defined(LEFT) && defined(TRANSA)) || \ - (!defined(LEFT) && !defined(TRANSA)) - - mov BO, BC -#else - mov BO, BC - ldr r3 , KK - lsls r4 , r3 , #4 // 2 double values - add BO , BO , r4 - lsls r4 , r3 , #4 // 2 double values - add AO , AO , r4 - -#endif - -#ifndef TRMMKERNEL - ldr L , K -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - ldr L , K - ldr r3, KK - sub L , L, r3 - str L , KKK -#else - ldr L , KK -#ifdef LEFT - add L , L , #2 // number of values in AO -#else - add L , L , #2 // number of values in BO -#endif - str L , KKK -#endif - - asrs K1, L, #2 // L = L / 4 - ble _L2_M2_40 - -_L2_M2_22: - - KERNEL2x2 - KERNEL2x2 - KERNEL2x2 - KERNEL2x2 - - subs K1, K1, #1 - bgt _L2_M2_22 - - -_L2_M2_40: - - ands K1, L, #3 // L = L % 4 - ble _L2_M2_100 - -_L2_M2_42: - - KERNEL2x2 - - subs K1, K1, #1 - bgt _L2_M2_42 - -_L2_M2_100: - - SAVE2x2 - -#if (defined(LEFT) && defined(TRANSA)) || \ - (!defined(LEFT) && !defined(TRANSA)) - ldr r3 , K - ldr r4 , KKK - sub r3 , r3 , r4 - lsls r4 , r3 , #4 // 2 double values - add BO , BO , r4 - lsls r4 , r3 , #4 // 2 double values - add AO , AO , r4 -#endif - -#if defined(LEFT) - ldr r3 , KK - add r3 , r3 , #2 // number of values in AO - str r3 , KK -#endif - - -_L2_M2_END: - - -_L2_M1_BEGIN: - - tst I, #1 // I = I % 2 - ble _L2_END - -_L2_M1_20: - - INIT1x2 - -#if (defined(LEFT) && defined(TRANSA)) || \ - (!defined(LEFT) && !defined(TRANSA)) - - mov BO, BC -#else - mov BO, BC - ldr r3 , KK - lsls r4 , r3 , #4 // 2 double values - add BO , BO , r4 - lsls r4 , r3 , #3 // 1 double value - add AO , AO , r4 - -#endif - -#ifndef TRMMKERNEL - ldr L , K -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - ldr L , K - ldr r3, KK - sub L , L, r3 - str L , KKK -#else - ldr L , KK -#ifdef 
LEFT - add L , L , #1 // number of values in AO -#else - add L , L , #2 // number of values in BO -#endif - str L , KKK -#endif - - asrs K1, L, #2 // L = L / 4 - ble _L2_M1_40 - -_L2_M1_22: - - KERNEL1x2 - KERNEL1x2 - KERNEL1x2 - KERNEL1x2 - - subs K1, K1, #1 - bgt _L2_M1_22 - - -_L2_M1_40: - - ands K1, L, #3 // L = L % 4 - ble _L2_M1_100 - -_L2_M1_42: - - KERNEL1x2 - - subs K1, K1, #1 - bgt _L2_M1_42 - -_L2_M1_100: - - SAVE1x2 - -#if (defined(LEFT) && defined(TRANSA)) || \ - (!defined(LEFT) && !defined(TRANSA)) - ldr r3 , K - ldr r4 , KKK - sub r3 , r3 , r4 - lsls r4 , r3 , #4 // 2 double values - add BO , BO , r4 - lsls r4 , r3 , #3 // 1 double value - add AO , AO , r4 -#endif - -#if defined(LEFT) - ldr r3 , KK - add r3 , r3 , #1 // number of values in AO - str r3 , KK -#endif - - - -_L2_END: - - mov r3, BC - ldr r4, K - lsl r4, r4, #4 // k * 2 * 8 - add r3, r3, r4 // B = B + K * 2 * 8 - mov BC, r3 - -#if !defined(LEFT) - ldr r3 , KK - add r3 , r3 , #2 // number of values in BO - str r3 , KK -#endif - - subs J , #1 // j-- - bgt _L2_BEGIN - - -_L1_BEGIN: - - ldr J, N - tst J , #1 // J = J % 2 - ble _L999 - - ldr CO1, C // CO1 = C - ldr r4 , LDC - add r3 , CO1, r4 // C = CO1 + LDC - str r3 , C // store C - -#if defined(LEFT) - ldr r3 , OFFSET - str r3 , KK -#endif - - ldr AO, A // AO = A - - - -_L1_M8_BEGIN: - - ldr I, M - asrs I, I, #3 // I = I / 8 - ble _L1_M4_BEGIN - -_L1_M8_20: - - INIT8x1 - -#if (defined(LEFT) && defined(TRANSA)) || \ - (!defined(LEFT) && !defined(TRANSA)) - - mov BO, BC -#else - mov BO, BC - ldr r3 , KK - lsls r4 , r3 , #3 // 1 double value - add BO , BO , r4 - lsls r4 , r3 , #6 // 8 double values - add AO , AO , r4 - -#endif - -#ifndef TRMMKERNEL - ldr L , K -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - ldr L , K - ldr r3, KK - sub L , L, r3 - str L , KKK -#else - ldr L , KK -#ifdef LEFT - add L , L , #8 // number of values in AO -#else - add L , L , #1 // number of values in BO -#endif - str L , KKK -#endif - - asrs K1, L, #3 // L = L / 8 - ble _L1_M8_40 - -_L1_M8_22: - - KERNEL8x1 - KERNEL8x1 - KERNEL8x1 - KERNEL8x1 - KERNEL8x1 - KERNEL8x1 - KERNEL8x1 - KERNEL8x1 - - subs K1, K1, #1 - bgt _L1_M8_22 - - -_L1_M8_40: - - ands K1, L, #7 // L = L % 8 - ble _L1_M8_100 - -_L1_M8_42: - - KERNEL8x1 - - subs K1, K1, #1 - bgt _L1_M8_42 - -_L1_M8_100: - - SAVE8x1 - -#if (defined(LEFT) && defined(TRANSA)) || \ - (!defined(LEFT) && !defined(TRANSA)) - ldr r3 , K - ldr r4 , KKK - sub r3 , r3 , r4 - lsls r4 , r3 , #3 // 1 double value - add BO , BO , r4 - lsls r4 , r3 , #6 // 8 double values - add AO , AO , r4 -#endif - -#if defined(LEFT) - ldr r3 , KK - add r3 , r3 , #8 // number of values in AO - str r3 , KK -#endif - - -_L1_M8_END: - - subs I, I, #1 - bgt _L1_M8_20 - - - - -_L1_M4_BEGIN: - - ldr I, M - tst I, #7 // I = I % 8 - ble _L1_END - - tst I, #4 // I = I % 8 - ble _L1_M2_BEGIN - -_L1_M4_20: - - INIT4x1 - -#if (defined(LEFT) && defined(TRANSA)) || \ - (!defined(LEFT) && !defined(TRANSA)) - - mov BO, BC -#else - mov BO, BC - ldr r3 , KK - lsls r4 , r3 , #3 // 1 double value - add BO , BO , r4 - lsls r4 , r3 , #5 // 4 double values - add AO , AO , r4 - -#endif - -#ifndef TRMMKERNEL - ldr L , K -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - ldr L , K - ldr r3, KK - sub L , L, r3 - str L , KKK -#else - ldr L , KK -#ifdef LEFT - add L , L , #4 // number of values in AO -#else - add L , L , #1 // number of values in BO -#endif - str L , KKK -#endif - - asrs K1, L, #2 // L = L / 4 - ble _L1_M4_40 - 
-_L1_M4_22: - - KERNEL4x1 - KERNEL4x1 - KERNEL4x1 - KERNEL4x1 - - subs K1, K1, #1 - bgt _L1_M4_22 - - -_L1_M4_40: - - ands K1, L, #3 // L = L % 4 - ble _L1_M4_100 - -_L1_M4_42: - - KERNEL4x1 - - subs K1, K1, #1 - bgt _L1_M4_42 - -_L1_M4_100: - - SAVE4x1 - -#if (defined(LEFT) && defined(TRANSA)) || \ - (!defined(LEFT) && !defined(TRANSA)) - ldr r3 , K - ldr r4 , KKK - sub r3 , r3 , r4 - lsls r4 , r3 , #3 // 1 double value - add BO , BO , r4 - lsls r4 , r3 , #5 // 4 double values - add AO , AO , r4 -#endif - -#if defined(LEFT) - ldr r3 , KK - add r3 , r3 , #4 // number of values in AO - str r3 , KK -#endif - - -_L1_M4_END: - - - - -_L1_M2_BEGIN: - - tst I, #2 // I = I % 4 - ble _L1_M1_BEGIN - -_L1_M2_20: - - INIT2x1 - -#if (defined(LEFT) && defined(TRANSA)) || \ - (!defined(LEFT) && !defined(TRANSA)) - - mov BO, BC -#else - mov BO, BC - ldr r3 , KK - lsls r4 , r3 , #3 // 1 double value - add BO , BO , r4 - lsls r4 , r3 , #4 // 2 double values - add AO , AO , r4 - -#endif - -#ifndef TRMMKERNEL - ldr L , K -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - ldr L , K - ldr r3, KK - sub L , L, r3 - str L , KKK -#else - ldr L , KK -#ifdef LEFT - add L , L , #2 // number of values in AO -#else - add L , L , #1 // number of values in BO -#endif - str L , KKK -#endif - - asrs K1, L, #2 // L = L / 4 - ble _L1_M2_40 - -_L1_M2_22: - - KERNEL2x1 - KERNEL2x1 - KERNEL2x1 - KERNEL2x1 - - subs K1, K1, #1 - bgt _L1_M2_22 - - -_L1_M2_40: - - ands K1 , L, #3 // L = L % 4 - ble _L1_M2_100 - -_L1_M2_42: - - KERNEL2x1 - - subs K1, K1, #1 - bgt _L1_M2_42 - -_L1_M2_100: - - SAVE2x1 - -#if (defined(LEFT) && defined(TRANSA)) || \ - (!defined(LEFT) && !defined(TRANSA)) - ldr r3 , K - ldr r4 , KKK - sub r3 , r3 , r4 - lsls r4 , r3 , #3 // 1 double value - add BO , BO , r4 - lsls r4 , r3 , #4 // 2 double values - add AO , AO , r4 -#endif - -#if defined(LEFT) - ldr r3 , KK - add r3 , r3 , #2 // number of values in AO - str r3 , KK -#endif - - -_L1_M2_END: - - - -_L1_M1_BEGIN: - - tst I, #1 // I = I % 4 - ble _L1_END - -_L1_M1_20: - - INIT1x1 - -#if (defined(LEFT) && defined(TRANSA)) || \ - (!defined(LEFT) && !defined(TRANSA)) - - mov BO, BC -#else - mov BO, BC - ldr r3 , KK - lsls r4 , r3 , #3 // 1 double value - add BO , BO , r4 - lsls r4 , r3 , #3 // 1 double value - add AO , AO , r4 - -#endif - -#ifndef TRMMKERNEL - ldr L , K -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - ldr L , K - ldr r3, KK - sub L , L, r3 - str L , KKK -#else - ldr L , KK -#ifdef LEFT - add L , L , #1 // number of values in AO -#else - add L , L , #1 // number of values in BO -#endif - str L , KKK -#endif - - asrs K1, L, #2 // L = L / 4 - ble _L1_M1_40 - -_L1_M1_22: - - KERNEL1x1 - KERNEL1x1 - KERNEL1x1 - KERNEL1x1 - - subs K1, K1, #1 - bgt _L1_M1_22 - - -_L1_M1_40: - - ands K1 , L, #3 // L = L % 4 - ble _L1_M1_100 - -_L1_M1_42: - - KERNEL1x1 - - subs K1, K1, #1 - bgt _L1_M1_42 - -_L1_M1_100: - - SAVE1x1 - -#if (defined(LEFT) && defined(TRANSA)) || \ - (!defined(LEFT) && !defined(TRANSA)) - ldr r3 , K - ldr r4 , KKK - sub r3 , r3 , r4 - lsls r4 , r3 , #3 // 1 double value - add BO , BO , r4 - lsls r4 , r3 , #3 // 1 double value - add AO , AO , r4 -#endif - -#if defined(LEFT) - ldr r3 , KK - add r3 , r3 , #1 // number of values in AO - str r3 , KK -#endif - - - -_L1_END: - - - -_L999: - - sub r3, fp, #128 - vldm r3, { d8 - d15} // restore floating point registers - - movs r0, #0 // set return value - sub sp, fp, #24 - pop {r4 - r9, fp} - bx lr - - EPILOGUE - From 
5400a9f4e4c30d7a961983f733b81b607d09ba2d Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 3 Nov 2013 10:34:04 +0100 Subject: [PATCH 038/127] redefined functions for TIMING and YIELDING for ARMV7 processor --- common.h | 8 +++++ common_arm.h | 8 +++-- driver/level3/level3.c | 22 +++++++++++- driver/level3/level3_thread.c | 66 +++++++++++++++++++++++++++++++++++ 4 files changed, 100 insertions(+), 4 deletions(-) diff --git a/common.h b/common.h index 418ed25f5..a2775520f 100644 --- a/common.h +++ b/common.h @@ -310,10 +310,18 @@ typedef int blasint; #define YIELDING SwitchToThread() #endif + +#ifdef ARMV7 +#define YIELDING asm volatile ("nop;nop;nop;nop;nop;nop;nop;nop; \n"); +#endif + + #ifndef YIELDING #define YIELDING sched_yield() #endif + + /*** To alloc job_t on heap or statck. please https://github.com/xianyi/OpenBLAS/issues/246 diff --git a/common_arm.h b/common_arm.h index e3d1d4079..8c9752d9f 100644 --- a/common_arm.h +++ b/common_arm.h @@ -104,11 +104,13 @@ static void __inline blas_lock(volatile BLASULONG *address){ } -static inline BLASULONG rpcc(void){ - BLASULONG ret=0; +static inline unsigned long long rpcc(void){ + unsigned long long ret=0; + double v; struct timeval tv; gettimeofday(&tv,NULL); - ret=1000000* tv.tv_sec + tv.tv_usec; + v=(double) tv.tv_sec + (double) tv.tv_usec * 1e-6; + ret = (unsigned long long) ( v * 1000.0d ); return ret; } diff --git a/driver/level3/level3.c b/driver/level3/level3.c index 2fe889527..d87c5f546 100644 --- a/driver/level3/level3.c +++ b/driver/level3/level3.c @@ -36,6 +36,8 @@ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ +// #define TIMING 1 + /* This file is a template for level 3 operation */ #ifndef BETA_OPERATION @@ -341,8 +343,16 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; -#else +#elif defined(ARMV7) + if (min_jj >= 32) min_jj = 32; + else + if (min_jj >= 16) min_jj = 16; + else + if (min_jj >= 8) min_jj = 8; + else + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; +#else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #endif @@ -402,12 +412,22 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, #ifdef TIMING total = (double)outercost + (double)innercost + (double)kernelcost; +#ifdef ARMV7 + + printf( "Copy A : %5.2f Copy B: %5.2f Kernel : %5.2f\n", + innercost / total * 100., outercost / total * 100., + kernelcost / total * 100.); + + +#else + printf( "Copy A : %5.2f Copy B: %5.2f Kernel : %5.2f kernel Effi. : %5.2f Total Effi. : %5.2f\n", innercost / total * 100., outercost / total * 100., kernelcost / total * 100., (double)(m_to - m_from) * (double)(n_to - n_from) * (double)k / (double)kernelcost * 100. * (double)COMPSIZE / 2., (double)(m_to - m_from) * (double)(n_to - n_from) * (double)k / total * 100. * (double)COMPSIZE / 2.); +#endif #endif return 0; diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 3242790fa..56c4d6eca 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -36,6 +36,8 @@ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ +// #define TIMING 1 + #ifndef CACHE_LINE_SIZE #define CACHE_LINE_SIZE 8 #endif @@ -233,6 +235,21 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, BLASLONG l1stride, l2size; #ifdef TIMING + +#ifdef ARMV7 + + unsigned long long rpcc_counter; + unsigned long long copy_A = 0; + unsigned long long copy_B = 0; + unsigned long long kernel = 0; + unsigned long long waiting1 = 0; + unsigned long long waiting2 = 0; + unsigned long long waiting3 = 0; + unsigned long long waiting6[MAX_CPU_NUMBER]; + unsigned long long ops = 0; + +#else + BLASULONG rpcc_counter; BLASULONG copy_A = 0; BLASULONG copy_B = 0; @@ -243,6 +260,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, BLASULONG waiting6[MAX_CPU_NUMBER]; BLASULONG ops = 0; +#endif + for (i = 0; i < args -> nthreads; i++) waiting6[i] = 0; #endif @@ -320,15 +339,35 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, min_l = k - ls; +#ifdef ARMV7_1 + if (min_l >= GEMM_Q / 4 * 2) { + min_l = GEMM_Q / 4; + } else { + if (min_l > GEMM_Q / 4) min_l = (min_l + 1) / 2; + } + +#else if (min_l >= GEMM_Q * 2) { min_l = GEMM_Q; } else { if (min_l > GEMM_Q) min_l = (min_l + 1) / 2; } +#endif l1stride = 1; min_i = m_to - m_from; +#ifdef ARMV7_1 + if (min_i >= GEMM_P / 4 * 2) { + min_i = GEMM_P / 4; + } else { + if (min_i > GEMM_P / 4) { + min_i = (min_i / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1); + } else { + if (args -> nthreads == 1) l1stride = 0; + } + } +#else if (min_i >= GEMM_P * 2) { min_i = GEMM_P; } else { @@ -339,6 +378,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, } } +#endif + START_RPCC(); ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_from, sa); @@ -375,6 +416,14 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; +#elif defined(ARMV7) + if (min_jj >= 16) min_jj = 16; + else + if (min_jj >= 8) min_jj = 8; + else + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + + #else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; @@ -506,6 +555,21 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, STOP_RPCC(waiting3); #ifdef TIMING + +#ifdef ARMV7 + + unsigned long long waiting = waiting1 + waiting2 + waiting3; + unsigned long long total = copy_A + copy_B + kernel + waiting; + + fprintf(stderr, "GEMM [%2ld] Copy_A : %6.2f Copy_B : %6.2f Wait1 : %6.2f Wait2 : %6.2f Wait3 : %6.2f Kernel : %6.2f", + mypos, (double)copy_A /(double)total * 100., (double)copy_B /(double)total * 100., + (double)waiting1 /(double)total * 100., + (double)waiting2 /(double)total * 100., + (double)waiting3 /(double)total * 100., + (double)kernel /(double)total * 100.); + +#else + BLASLONG waiting = waiting1 + waiting2 + waiting3; BLASLONG total = copy_A + copy_B + kernel + waiting; @@ -516,6 +580,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, (double)waiting3 /(double)total * 100., (double)ops/(double)kernel / 4. 
* 100.);

+#endif
+
 #if 0
  fprintf(stderr, "GEMM [%2ld] Copy_A : %6.2ld Copy_B : %6.2ld Wait : %6.2ld\n",
   mypos, copy_A, copy_B, waiting);

From cba97daf3c7dbb8d44e4fb52005bb843ada9df0d Mon Sep 17 00:00:00 2001
From: wernsaar
Date: Sun, 3 Nov 2013 11:04:16 +0100
Subject: [PATCH 039/127] added missing file cblas_noconst.h to the armv7
 branch

---
 cblas_noconst.h | 303 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 303 insertions(+)
 create mode 100644 cblas_noconst.h

diff --git a/cblas_noconst.h b/cblas_noconst.h
new file mode 100644
index 000000000..fd2e940c0
--- /dev/null
+++ b/cblas_noconst.h
@@ -0,0 +1,303 @@
+#ifndef CBLAS_H
+#define CBLAS_H
+
+#include <stddef.h>
+#include "common.h"
+
+#ifdef __cplusplus
+extern "C" {
+	/* Assume C declarations for C++ */
+#endif  /* __cplusplus */
+
+/*Set the number of threads on runtime.*/
+void openblas_set_num_threads(int num_threads);
+void goto_set_num_threads(int num_threads);
+
+/*Get the build configure on runtime.*/
+char* openblas_get_config(void);
+
+/* Get the parallelization type which is used by OpenBLAS */
+int openblas_get_parallel(void);
+/* OpenBLAS is compiled for sequential use */
+#define OPENBLAS_SEQUENTIAL 0
+/* OpenBLAS is compiled using normal threading model */
+#define OPENBLAS_THREAD 1
+/* OpenBLAS is compiled using OpenMP threading model */
+#define OPENBLAS_OPENMP 2
+
+
+#define CBLAS_INDEX size_t
+
+typedef enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102} CBLAS_ORDER;
+typedef enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, CblasConjNoTrans=114} CBLAS_TRANSPOSE;
+typedef enum CBLAS_UPLO {CblasUpper=121, CblasLower=122} CBLAS_UPLO;
+typedef enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132} CBLAS_DIAG;
+typedef enum CBLAS_SIDE {CblasLeft=141, CblasRight=142} CBLAS_SIDE;
+
+float cblas_sdsdot(blasint n, float alpha, float *x, blasint incx, float *y, blasint incy);
+double cblas_dsdot (blasint n, float *x, blasint incx, float *y, blasint incy);
+float cblas_sdot(blasint n, float *x, blasint incx, float *y, blasint incy);
+double cblas_ddot(blasint n, double *x, blasint incx, double *y, blasint incy);
+
+openblas_complex_float cblas_cdotu(blasint n, float *x, blasint incx, float *y, blasint incy);
+openblas_complex_float cblas_cdotc(blasint n, float *x, blasint incx, float *y, blasint incy);
+openblas_complex_double cblas_zdotu(blasint n, double *x, blasint incx, double *y, blasint incy);
+openblas_complex_double cblas_zdotc(blasint n, double *x, blasint incx, double *y, blasint incy);
+
+void cblas_cdotu_sub(blasint n, float *x, blasint incx, float *y, blasint incy, openblas_complex_float *ret);
+void cblas_cdotc_sub(blasint n, float *x, blasint incx, float *y, blasint incy, openblas_complex_float *ret);
+void cblas_zdotu_sub(blasint n, double *x, blasint incx, double *y, blasint incy, openblas_complex_double *ret);
+void cblas_zdotc_sub(blasint n, double *x, blasint incx, double *y, blasint incy, openblas_complex_double *ret);
+
+float cblas_sasum (blasint n, float *x, blasint incx);
+double cblas_dasum (blasint n, double *x, blasint incx);
+float cblas_scasum(blasint n, float *x, blasint incx);
+double cblas_dzasum(blasint n, double *x, blasint incx);
+
+float cblas_snrm2 (blasint N, float *X, blasint incX);
+double cblas_dnrm2 (blasint N, double *X, blasint incX);
+float cblas_scnrm2(blasint N, float *X, blasint incX);
+double cblas_dznrm2(blasint N, double *X, blasint incX);
+
+CBLAS_INDEX cblas_isamax(blasint n, float *x, blasint incx);
+CBLAS_INDEX cblas_idamax(blasint n, double 
*x, blasint incx); +CBLAS_INDEX cblas_icamax(blasint n, float *x, blasint incx); +CBLAS_INDEX cblas_izamax(blasint n, double *x, blasint incx); + +void cblas_saxpy(blasint n, float alpha, float *x, blasint incx, float *y, blasint incy); +void cblas_daxpy(blasint n, double alpha, double *x, blasint incx, double *y, blasint incy); +void cblas_caxpy(blasint n, float *alpha, float *x, blasint incx, float *y, blasint incy); +void cblas_zaxpy(blasint n, double *alpha, double *x, blasint incx, double *y, blasint incy); + +void cblas_scopy(blasint n, float *x, blasint incx, float *y, blasint incy); +void cblas_dcopy(blasint n, double *x, blasint incx, double *y, blasint incy); +void cblas_ccopy(blasint n, float *x, blasint incx, float *y, blasint incy); +void cblas_zcopy(blasint n, double *x, blasint incx, double *y, blasint incy); + +void cblas_sswap(blasint n, float *x, blasint incx, float *y, blasint incy); +void cblas_dswap(blasint n, double *x, blasint incx, double *y, blasint incy); +void cblas_cswap(blasint n, float *x, blasint incx, float *y, blasint incy); +void cblas_zswap(blasint n, double *x, blasint incx, double *y, blasint incy); + +void cblas_srot(blasint N, float *X, blasint incX, float *Y, blasint incY, float c, float s); +void cblas_drot(blasint N, double *X, blasint incX, double *Y, blasint incY, double c, double s); + +void cblas_srotg(float *a, float *b, float *c, float *s); +void cblas_drotg(double *a, double *b, double *c, double *s); + +void cblas_srotm(blasint N, float *X, blasint incX, float *Y, blasint incY, float *P); +void cblas_drotm(blasint N, double *X, blasint incX, double *Y, blasint incY, double *P); + +void cblas_srotmg(float *d1, float *d2, float *b1, float b2, float *P); +void cblas_drotmg(double *d1, double *d2, double *b1, double b2, double *P); + +void cblas_sscal(blasint N, float alpha, float *X, blasint incX); +void cblas_dscal(blasint N, double alpha, double *X, blasint incX); +void cblas_cscal(blasint N, float *alpha, float *X, blasint incX); +void cblas_zscal(blasint N, double *alpha, double *X, blasint incX); +void cblas_csscal(blasint N, float alpha, float *X, blasint incX); +void cblas_zdscal(blasint N, double alpha, double *X, blasint incX); + +void cblas_sgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, + float alpha, float *a, blasint lda, float *x, blasint incx, float beta, float *y, blasint incy); +void cblas_dgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, + double alpha, double *a, blasint lda, double *x, blasint incx, double beta, double *y, blasint incy); +void cblas_cgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, + float *alpha, float *a, blasint lda, float *x, blasint incx, float *beta, float *y, blasint incy); +void cblas_zgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, + double *alpha, double *a, blasint lda, double *x, blasint incx, double *beta, double *y, blasint incy); + +void cblas_sger (enum CBLAS_ORDER order, blasint M, blasint N, float alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda); +void cblas_dger (enum CBLAS_ORDER order, blasint M, blasint N, double alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda); +void cblas_cgeru(enum CBLAS_ORDER order, blasint M, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda); +void cblas_cgerc(enum CBLAS_ORDER order, blasint M, blasint N, float *alpha, float *X, blasint 
incX, float *Y, blasint incY, float *A, blasint lda); +void cblas_zgeru(enum CBLAS_ORDER order, blasint M, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda); +void cblas_zgerc(enum CBLAS_ORDER order, blasint M, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda); + +void cblas_strsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); +void cblas_dtrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); +void cblas_ctrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); +void cblas_ztrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); + +void cblas_strmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); +void cblas_dtrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); +void cblas_ctrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); +void cblas_ztrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); + +void cblas_ssyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A, blasint lda); +void cblas_dsyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *A, blasint lda); +void cblas_cher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A, blasint lda); +void cblas_zher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *A, blasint lda); + +void cblas_ssyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,blasint N, float alpha, float *X, + blasint incX, float *Y, blasint incY, float *A, blasint lda); +void cblas_dsyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, + blasint incX, double *Y, blasint incY, double *A, blasint lda); +void cblas_cher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *X, blasint incX, + float *Y, blasint incY, float *A, blasint lda); +void cblas_zher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *X, blasint incX, + double *Y, blasint incY, double *A, blasint lda); + +void cblas_sgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, + blasint KL, blasint KU, float alpha, float *A, blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY); +void cblas_dgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, + blasint KL, blasint KU, double alpha, double *A, blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY); +void cblas_cgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, + blasint KL, blasint KU, float *alpha, float *A, 
blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY); +void cblas_zgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, + blasint KL, blasint KU, double *alpha, double *A, blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY); + +void cblas_ssbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, float alpha, float *A, + blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY); +void cblas_dsbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, double alpha, double *A, + blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY); + + +void cblas_stbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); +void cblas_dtbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); +void cblas_ctbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); +void cblas_ztbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); + +void cblas_stbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); +void cblas_dtbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); +void cblas_ctbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); +void cblas_ztbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); + +void cblas_stpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, float *Ap, float *X, blasint incX); +void cblas_dtpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, double *Ap, double *X, blasint incX); +void cblas_ctpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, float *Ap, float *X, blasint incX); +void cblas_ztpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, double *Ap, double *X, blasint incX); + +void cblas_stpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, float *Ap, float *X, blasint incX); +void cblas_dtpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, double *Ap, double *X, blasint incX); +void cblas_ctpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, float *Ap, float *X, blasint incX); +void cblas_ztpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, double *Ap, double *X, blasint incX); + +void cblas_ssymv(enum CBLAS_ORDER order, 
enum CBLAS_UPLO Uplo, blasint N, float alpha, float *A, + blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY); +void cblas_dsymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *A, + blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY); +void cblas_chemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *A, + blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY); +void cblas_zhemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *A, + blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY); + + +void cblas_sspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *Ap, + float *X, blasint incX, float beta, float *Y, blasint incY); +void cblas_dspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *Ap, + double *X, blasint incX, double beta, double *Y, blasint incY); + +void cblas_sspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *Ap); +void cblas_dspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *Ap); + +void cblas_chpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A); +void cblas_zhpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X,blasint incX, double *A); + +void cblas_sspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *Y, blasint incY, float *A); +void cblas_dspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *Y, blasint incY, double *A); +void cblas_chpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *Ap); +void cblas_zhpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *Ap); + +void cblas_chbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, + float *alpha, float *A, blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY); +void cblas_zhbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, + double *alpha, double *A, blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY); + +void cblas_chpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, + float *alpha, float *Ap, float *X, blasint incX, float *beta, float *Y, blasint incY); +void cblas_zhpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, + double *alpha, double *Ap, double *X, blasint incX, double *beta, double *Y, blasint incY); + +void cblas_sgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, + float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); +void cblas_dgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, + double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); +void cblas_cgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, + float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); +void cblas_zgemm(enum CBLAS_ORDER 
Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, + double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); + +void cblas_ssymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, + float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); +void cblas_dsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, + double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); +void cblas_csymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, + float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); +void cblas_zsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, + double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); + +void cblas_ssyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, + blasint N, blasint K, float alpha, float *A, blasint lda, float beta, float *C, blasint ldc); +void cblas_dsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, + blasint N, blasint K, double alpha, double *A, blasint lda, double beta, double *C, blasint ldc); +void cblas_csyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, + blasint N, blasint K, float *alpha, float *A, blasint lda, float *beta, float *C, blasint ldc); +void cblas_zsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, + blasint N, blasint K, double *alpha, double *A, blasint lda, double *beta, double *C, blasint ldc); + +void cblas_ssyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, + blasint N, blasint K, float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); +void cblas_dsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, + blasint N, blasint K, double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); +void cblas_csyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, + blasint N, blasint K, float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); +void cblas_zsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, + blasint N, blasint K, double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); + +void cblas_strmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, + enum CBLAS_DIAG Diag, blasint M, blasint N, float alpha, float *A, blasint lda, float *B, blasint ldb); +void cblas_dtrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, + enum CBLAS_DIAG Diag, blasint M, blasint N, double alpha, double *A, blasint lda, double *B, blasint ldb); +void cblas_ctrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, + enum CBLAS_DIAG Diag, blasint M, blasint N, float *alpha, float *A, blasint lda, float *B, blasint ldb); +void cblas_ztrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, + enum CBLAS_DIAG Diag, blasint M, blasint N, double *alpha, double *A, blasint lda, double *B, 
blasint ldb); + +void cblas_strsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, + enum CBLAS_DIAG Diag, blasint M, blasint N, float alpha, float *A, blasint lda, float *B, blasint ldb); +void cblas_dtrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, + enum CBLAS_DIAG Diag, blasint M, blasint N, double alpha, double *A, blasint lda, double *B, blasint ldb); +void cblas_ctrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, + enum CBLAS_DIAG Diag, blasint M, blasint N, float *alpha, float *A, blasint lda, float *B, blasint ldb); +void cblas_ztrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, + enum CBLAS_DIAG Diag, blasint M, blasint N, double *alpha, double *A, blasint lda, double *B, blasint ldb); + +void cblas_chemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, + float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); +void cblas_zhemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, + double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); + +void cblas_cherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, + float alpha, float *A, blasint lda, float beta, float *C, blasint ldc); +void cblas_zherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, + double alpha, double *A, blasint lda, double beta, double *C, blasint ldc); + +void cblas_cher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, + float *alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); +void cblas_zher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, + double *alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); + +void cblas_xerbla(blasint p, char *rout, char *form, ...); + +#ifdef __cplusplus +} +#endif /* __cplusplus */ + +#endif From 95aedfa0ffd57d02cc271d579ed32a56ab6758a5 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 3 Nov 2013 11:19:32 +0100 Subject: [PATCH 040/127] added missing file arm/Makefile in lapack/laswp --- lapack/laswp/arm/Makefile | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 lapack/laswp/arm/Makefile diff --git a/lapack/laswp/arm/Makefile b/lapack/laswp/arm/Makefile new file mode 100644 index 000000000..434c82a84 --- /dev/null +++ b/lapack/laswp/arm/Makefile @@ -0,0 +1,33 @@ +TOPDIR = ../../.. 
+include ../../../Makefile.system + +ifeq ($(CORE), CORE2) +LASWP = ../generic/laswp_k_2.c +ZLASWP = ../generic/zlaswp_k_2.c +endif + +ifeq ($(CORE), OPTERON) +LASWP = ../generic/laswp_k_1.c +ZLASWP = ../generic/zlaswp_k_1.c +endif + +ifeq ($(CORE), PRESCOTT) +LASWP = ../generic/laswp_k_1.c +ZLASWP = ../generic/zlaswp_k_1.c +endif + +ifeq ($(DYNAMIC_ARCH), 1) +LASWP = ../generic/laswp_k_4.c +ZLASWP = ../generic/zlaswp_k_4.c +endif + +ifndef LASWP +LASWP = ../generic/laswp_k.c +endif + +ifndef ZLASWP +ZLASWP = ../generic/zlaswp_k.c +endif + +include ../generic/Makefile + From 370e3834a93e9d74a5ffacd5b92b9ecc6ed0411c Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 3 Nov 2013 11:54:39 +0100 Subject: [PATCH 041/127] added missing file kernel/arm/Makefile --- kernel/arm/Makefile | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 kernel/arm/Makefile diff --git a/kernel/arm/Makefile b/kernel/arm/Makefile new file mode 100644 index 000000000..efae70d7b --- /dev/null +++ b/kernel/arm/Makefile @@ -0,0 +1,2 @@ +clean :: + From 6216ab8a7ea408704e156218a98b64554e053edc Mon Sep 17 00:00:00 2001 From: wernsaar Date: Mon, 4 Nov 2013 08:33:04 +0100 Subject: [PATCH 042/127] removed obsolete gemm_kernels from haswell branch --- kernel/x86/dgemm_kernel_16x2_haswell.S | 5404 ------------------------ kernel/x86/sgemm_kernel_16x4_haswell.S | 3167 -------------- 2 files changed, 8571 deletions(-) delete mode 100644 kernel/x86/dgemm_kernel_16x2_haswell.S delete mode 100644 kernel/x86/sgemm_kernel_16x4_haswell.S diff --git a/kernel/x86/dgemm_kernel_16x2_haswell.S b/kernel/x86/dgemm_kernel_16x2_haswell.S deleted file mode 100644 index 27a604855..000000000 --- a/kernel/x86/dgemm_kernel_16x2_haswell.S +++ /dev/null @@ -1,5404 +0,0 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. 
*/ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. */ -/*********************************************************************/ - -/********************************************************************* -* 2013/08/15 Saar -* Parameter: -* SGEMM_DEFAULT_UNROLL_N 2 -* SGEMM_DEFAULT_UNROLL_M 16 -* SGEMM_DEFAULT_P 384 -* SGEMM_DEFAULT_Q 168 -* -* BLASTEST: OK -* -* Performance: -* 1 thread: 2.31 times faster than sandybridge -* 4 threads: 2.26 times faster than sandybridge -* -* Compile for FMA3: OK -* -*********************************************************************/ - - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define BI %rbp -#define SP %rbx - -#define BO1 %rdi -#define BO2 %r15 - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 - -#else - -#define STACKSIZE 256 - -#define OLD_A 40 + STACKSIZE(%rsp) -#define OLD_B 48 + STACKSIZE(%rsp) -#define OLD_C 56 + STACKSIZE(%rsp) -#define OLD_LDC 64 + STACKSIZE(%rsp) -#define OLD_OFFSET 72 + STACKSIZE(%rsp) - -#endif - -#define L_BUFFER_SIZE 512*8*4 -#define LB2_OFFSET 512*8*2 - -#define Ndiv6 24(%rsp) -#define Nmod6 32(%rsp) -#define N 40(%rsp) -#define ALPHA 48(%rsp) -#define OFFSET 56(%rsp) -#define KK 64(%rsp) -#define KKK 72(%rsp) -#define BUFFER1 128(%rsp) -#define BUFFER2 LB2_OFFSET+128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $0, 4096 * 4(%rsp);\ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - -#if defined(BULLDOZER) - -.macro VFMADD231PD_ y0,y1,y2 - vfmaddpd \y0,\y1,\y2,\y0 -.endm - -.macro VFMADD231SD_ x0,x1,x2 - vfmaddsd \x0,\x1,\x2,\x0 -.endm - -#else - -.macro VFMADD231PD_ y0,y1,y2 - vfmadd231pd \y0,\y1,\y2 -.endm - -.macro VFMADD231SD_ x0,x1,x2 - vfmadd231sd \x0,\x1,\x2 -.endm - -#endif - - -#define A_PR1 384 -#define B_PR1 192 - -/******************************************************************************************* -* 3 lines of N -*******************************************************************************************/ - -.macro KERNEL16x3_1 - prefetcht0 A_PR1(AO, %rax, SIZE) - vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - prefetcht0 64+A_PR1(AO, %rax, SIZE) - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - VFMADD231PD_ %ymm9,%ymm3,%ymm0 - vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - VFMADD231PD_ %ymm11,%ymm2,%ymm0 - VFMADD231PD_ %ymm12,%ymm3,%ymm0 - vmovups -20 * SIZE(AO, %rax, SIZE), 
%ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 - VFMADD231PD_ %ymm14,%ymm2,%ymm0 - vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm1 - vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm15,%ymm3,%ymm0 -.endm - - - - -.macro KERNEL16x3_2 - prefetcht0 128+A_PR1(AO, %rax, SIZE) - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - prefetcht0 A_PR1+64(AO,%rax,SIZE) - VFMADD231PD_ %ymm9,%ymm3,%ymm0 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - prefetcht0 192+A_PR1(AO, %rax, SIZE) - VFMADD231PD_ %ymm11,%ymm2,%ymm0 - VFMADD231PD_ %ymm12,%ymm3,%ymm0 - vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 - VFMADD231PD_ %ymm14,%ymm2,%ymm0 - vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 - vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm15,%ymm3,%ymm0 -.endm - -.macro KERNEL16x3_3 - prefetcht0 256+A_PR1(AO, %rax, SIZE) - vmovups 0 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - vmovups 4 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - prefetcht0 320+A_PR1(AO, %rax, SIZE) - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - VFMADD231PD_ %ymm9,%ymm3,%ymm0 - vmovups 8 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - VFMADD231PD_ %ymm11,%ymm2,%ymm0 - VFMADD231PD_ %ymm12,%ymm3,%ymm0 - vmovups 12 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 - VFMADD231PD_ %ymm14,%ymm2,%ymm0 - vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm1 - vbroadcastsd 4 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm15,%ymm3,%ymm0 -.endm - -.macro KERNEL16x3_4 - prefetcht0 384+A_PR1(AO, %rax, SIZE) - vmovups 16 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd 5 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - vmovups 20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - prefetcht0 448+A_PR1(AO, %rax, SIZE) - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - VFMADD231PD_ %ymm9,%ymm3,%ymm0 - vmovups 24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - VFMADD231PD_ %ymm11,%ymm2,%ymm0 - addq $12, BI - VFMADD231PD_ %ymm12,%ymm3,%ymm0 - vmovups 28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 - VFMADD231PD_ %ymm14,%ymm2,%ymm0 - addq $64, %rax - VFMADD231PD_ %ymm15,%ymm3,%ymm0 -.endm - -.macro KERNEL16x3_SUB - vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - VFMADD231PD_ %ymm9,%ymm3,%ymm0 - vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - VFMADD231PD_ %ymm11,%ymm2,%ymm0 - VFMADD231PD_ %ymm12,%ymm3,%ymm0 - vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 - VFMADD231PD_ %ymm14,%ymm2,%ymm0 - VFMADD231PD_ %ymm15,%ymm3,%ymm0 - addq $3 , BI - addq $16, %rax -.endm - -.macro SAVE16x3 - - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - vmulpd %ymm0 , %ymm7 , %ymm7 - vmulpd %ymm0 , 
%ymm10, %ymm10 - vmulpd %ymm0 , %ymm13, %ymm13 - - vmulpd %ymm0 , %ymm5 , %ymm5 - vmulpd %ymm0 , %ymm8 , %ymm8 - vmulpd %ymm0 , %ymm11, %ymm11 - vmulpd %ymm0 , %ymm14, %ymm14 - - vmulpd %ymm0 , %ymm6 , %ymm6 - vmulpd %ymm0 , %ymm9 , %ymm9 - vmulpd %ymm0 , %ymm12, %ymm12 - vmulpd %ymm0 , %ymm15, %ymm15 - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4,%ymm4 - vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 - vaddpd 8 * SIZE(CO1), %ymm10,%ymm10 - vaddpd 12 * SIZE(CO1), %ymm13,%ymm13 - - vaddpd (CO1, LDC), %ymm5,%ymm5 - vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8 - vaddpd 8 * SIZE(CO1, LDC), %ymm11,%ymm11 - vaddpd 12 * SIZE(CO1, LDC), %ymm14,%ymm14 - - vaddpd (CO1, LDC, 2), %ymm6,%ymm6 - vaddpd 4 * SIZE(CO1, LDC, 2), %ymm9,%ymm9 - vaddpd 8 * SIZE(CO1, LDC, 2), %ymm12,%ymm12 - vaddpd 12 * SIZE(CO1, LDC, 2), %ymm15,%ymm15 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm7 , 4 * SIZE(CO1) - vmovups %ymm10, 8 * SIZE(CO1) - vmovups %ymm13,12 * SIZE(CO1) - - vmovups %ymm5 , (CO1, LDC) - vmovups %ymm8 , 4 * SIZE(CO1, LDC) - vmovups %ymm11, 8 * SIZE(CO1, LDC) - vmovups %ymm14,12 * SIZE(CO1, LDC) - - vmovups %ymm6 , (CO1, LDC, 2) - vmovups %ymm9 , 4 * SIZE(CO1, LDC, 2) - vmovups %ymm12, 8 * SIZE(CO1, LDC, 2) - vmovups %ymm15,12 * SIZE(CO1, LDC, 2) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL8x3_1 - prefetcht0 A_PR1(AO, %rax, SIZE) - vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - VFMADD231PD_ %ymm9,%ymm3,%ymm0 -.endm - -.macro KERNEL8x3_2 - prefetcht0 64+A_PR1(AO, %rax, SIZE) - vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - VFMADD231PD_ %ymm9,%ymm3,%ymm0 -.endm - -.macro KERNEL8x3_3 - prefetcht0 128+A_PR1(AO, %rax, SIZE) - vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - VFMADD231PD_ %ymm9,%ymm3,%ymm0 -.endm - -.macro KERNEL8x3_4 - prefetcht0 192+A_PR1(AO, %rax, SIZE) - vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd 4 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd 5 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - VFMADD231PD_ %ymm9,%ymm3,%ymm0 - addq $12, BI - addq $32, %rax -.endm - -.macro KERNEL8x3_SUB - vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ 
%ymm5,%ymm2,%ymm0 - vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - VFMADD231PD_ %ymm9,%ymm3,%ymm0 - addq $3 , BI - addq $8 , %rax -.endm - -.macro SAVE8x3 - - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - vmulpd %ymm0 , %ymm7 , %ymm7 - - vmulpd %ymm0 , %ymm5 , %ymm5 - vmulpd %ymm0 , %ymm8 , %ymm8 - - vmulpd %ymm0 , %ymm6 , %ymm6 - vmulpd %ymm0 , %ymm9 , %ymm9 - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4,%ymm4 - vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 - - vaddpd (CO1, LDC), %ymm5,%ymm5 - vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8 - - vaddpd (CO1, LDC, 2), %ymm6,%ymm6 - vaddpd 4 * SIZE(CO1, LDC, 2), %ymm9,%ymm9 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm7 , 4 * SIZE(CO1) - - vmovups %ymm5 , (CO1, LDC) - vmovups %ymm8 , 4 * SIZE(CO1, LDC) - - vmovups %ymm6 , (CO1, LDC, 2) - vmovups %ymm9 , 4 * SIZE(CO1, LDC, 2) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL4x3_1 - prefetcht0 A_PR1(AO, %rax, SIZE) - vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 -.endm - -.macro KERNEL4x3_2 - vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 -.endm - -.macro KERNEL4x3_3 - prefetcht0 A_PR1(AO, %rax, SIZE) - vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 -.endm - -.macro KERNEL4x3_4 - vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd 4 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd 5 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - addq $12, BI - addq $16, %rax -.endm - -.macro KERNEL4x3_SUB - vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - addq $3 , BI - addq $4 , %rax -.endm - -.macro SAVE4x3 - - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - vmulpd %ymm0 , %ymm5 , %ymm5 - vmulpd %ymm0 , %ymm6 , %ymm6 - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4,%ymm4 - vaddpd (CO1, LDC), %ymm5,%ymm5 - vaddpd (CO1, LDC, 2), %ymm6,%ymm6 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , (CO1, LDC) - vmovups %ymm6 , (CO1, LDC, 2) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL2x3_1 - prefetcht0 A_PR1(AO, %rax, SIZE) - vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SD_ 
%xmm6,%xmm3,%xmm0 - vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 - VFMADD231SD_ %xmm10,%xmm2,%xmm0 - VFMADD231SD_ %xmm12,%xmm3,%xmm0 -.endm - -.macro KERNEL2x3_2 - vmovsd -3 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -2 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd -1 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SD_ %xmm6,%xmm3,%xmm0 - vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 - VFMADD231SD_ %xmm10,%xmm2,%xmm0 - VFMADD231SD_ %xmm12,%xmm3,%xmm0 -.endm - -.macro KERNEL2x3_3 - vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -28 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd 2 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SD_ %xmm6,%xmm3,%xmm0 - vmovsd -27 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 - VFMADD231SD_ %xmm10,%xmm2,%xmm0 - VFMADD231SD_ %xmm12,%xmm3,%xmm0 -.endm - -.macro KERNEL2x3_4 - vmovsd 3 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -26 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd 4 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd 5 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SD_ %xmm6,%xmm3,%xmm0 - vmovsd -25 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 - VFMADD231SD_ %xmm10,%xmm2,%xmm0 - VFMADD231SD_ %xmm12,%xmm3,%xmm0 - addq $12, BI - addq $8, %rax -.endm - -.macro KERNEL2x3_SUB - vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SD_ %xmm6,%xmm3,%xmm0 - vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 - VFMADD231SD_ %xmm10,%xmm2,%xmm0 - VFMADD231SD_ %xmm12,%xmm3,%xmm0 - addq $3 , BI - addq $2 , %rax -.endm - -.macro SAVE2x3 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - vmulsd %xmm0 , %xmm8 , %xmm8 - vmulsd %xmm0 , %xmm5 , %xmm5 - vmulsd %xmm0 , %xmm10, %xmm10 - vmulsd %xmm0 , %xmm6 , %xmm6 - vmulsd %xmm0 , %xmm12, %xmm12 - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4,%xmm4 - vaddsd 1 * SIZE(CO1), %xmm8,%xmm8 - vaddsd (CO1, LDC), %xmm5,%xmm5 - vaddsd 1 * SIZE(CO1, LDC), %xmm10,%xmm10 - vaddsd (CO1, LDC, 2), %xmm6,%xmm6 - vaddsd 1 * SIZE(CO1, LDC, 2), %xmm12,%xmm12 - -#endif - - vmovsd %xmm4 , (CO1) - vmovsd %xmm8 , 1 * SIZE(CO1) - vmovsd %xmm5 , (CO1, LDC) - vmovsd %xmm10, 1 * SIZE(CO1, LDC) - vmovsd %xmm6 , (CO1, LDC, 2) - vmovsd %xmm12, 1 * SIZE(CO1, LDC, 2) - -.endm - -/*******************************************************************************************/ - -.macro KERNEL1x3_1 - vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SD_ %xmm6,%xmm3,%xmm0 -.endm - -.macro KERNEL1x3_2 - vmovsd -3 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -2 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd -1 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SD_ %xmm6,%xmm3,%xmm0 -.endm - -.macro KERNEL1x3_3 - vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - 
vmovsd 2 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SD_ %xmm6,%xmm3,%xmm0 -.endm - -.macro KERNEL1x3_4 - vmovsd 3 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd 4 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd 5 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SD_ %xmm6,%xmm3,%xmm0 - addq $12, BI - addq $4, %rax -.endm - -.macro KERNEL1x3_SUB - vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SD_ %xmm6,%xmm3,%xmm0 - addq $3 , BI - addq $1 , %rax -.endm - -.macro SAVE1x3 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - vmulsd %xmm0 , %xmm5 , %xmm5 - vmulsd %xmm0 , %xmm6 , %xmm6 - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4,%xmm4 - vaddsd (CO1, LDC), %xmm5,%xmm5 - vaddsd (CO1, LDC, 2), %xmm6,%xmm6 - -#endif - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - vmovsd %xmm6 , (CO1, LDC, 2) - -.endm - - -/*******************************************************************************************/ - -/******************************************************************************************* -* 2 lines of N -*******************************************************************************************/ - -.macro KERNEL16x2_1 - prefetcht0 A_PR1(AO, %rax, SIZE) - vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - prefetcht0 64+A_PR1(AO, %rax, SIZE) - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - VFMADD231PD_ %ymm11,%ymm2,%ymm0 - vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 - VFMADD231PD_ %ymm14,%ymm2,%ymm0 -.endm - -.macro KERNEL16x2_2 - prefetcht0 128+A_PR1(AO, %rax, SIZE) - vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - prefetcht0 192+A_PR1(AO, %rax, SIZE) - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - VFMADD231PD_ %ymm11,%ymm2,%ymm0 - vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 - VFMADD231PD_ %ymm14,%ymm2,%ymm0 -.endm - -.macro KERNEL16x2_3 - prefetcht0 256+A_PR1(AO, %rax, SIZE) - vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 - vmovups 0 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vmovups 4 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - prefetcht0 320+A_PR1(AO, %rax, SIZE) - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - vmovups 8 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - VFMADD231PD_ %ymm11,%ymm2,%ymm0 - vmovups 12 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 - VFMADD231PD_ %ymm14,%ymm2,%ymm0 -.endm - -.macro KERNEL16x2_4 - prefetcht0 384+A_PR1(AO, %rax, SIZE) - vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm1 - vmovups 16 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ 
%ymm5,%ymm2,%ymm0 - vmovups 20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - prefetcht0 448+A_PR1(AO, %rax, SIZE) - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - vmovups 24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - VFMADD231PD_ %ymm11,%ymm2,%ymm0 - vmovups 28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 - VFMADD231PD_ %ymm14,%ymm2,%ymm0 - addq $8, BI - addq $64, %rax -.endm - -.macro KERNEL16x2_SUB - vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - VFMADD231PD_ %ymm11,%ymm2,%ymm0 - vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 - VFMADD231PD_ %ymm14,%ymm2,%ymm0 - addq $2, BI - addq $16, %rax -.endm - -.macro SAVE16x2 - - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - vmulpd %ymm0 , %ymm7 , %ymm7 - vmulpd %ymm0 , %ymm10, %ymm10 - vmulpd %ymm0 , %ymm13, %ymm13 - - vmulpd %ymm0 , %ymm5 , %ymm5 - vmulpd %ymm0 , %ymm8 , %ymm8 - vmulpd %ymm0 , %ymm11, %ymm11 - vmulpd %ymm0 , %ymm14, %ymm14 - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4,%ymm4 - vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 - vaddpd 8 * SIZE(CO1), %ymm10,%ymm10 - vaddpd 12 * SIZE(CO1), %ymm13,%ymm13 - - vaddpd (CO1, LDC), %ymm5,%ymm5 - vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8 - vaddpd 8 * SIZE(CO1, LDC), %ymm11,%ymm11 - vaddpd 12 * SIZE(CO1, LDC), %ymm14,%ymm14 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm7 , 4 * SIZE(CO1) - vmovups %ymm10, 8 * SIZE(CO1) - vmovups %ymm13,12 * SIZE(CO1) - - vmovups %ymm5 , (CO1, LDC) - vmovups %ymm8 , 4 * SIZE(CO1, LDC) - vmovups %ymm11, 8 * SIZE(CO1, LDC) - vmovups %ymm14,12 * SIZE(CO1, LDC) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL8x2_1 - prefetcht0 A_PR1(AO, %rax, SIZE) - vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - VFMADD231PD_ %ymm8,%ymm2,%ymm0 -.endm - -.macro KERNEL8x2_2 - prefetcht0 64+A_PR1(AO, %rax, SIZE) - vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - VFMADD231PD_ %ymm8,%ymm2,%ymm0 -.endm - -.macro KERNEL8x2_3 - prefetcht0 128+A_PR1(AO, %rax, SIZE) - vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - VFMADD231PD_ %ymm8,%ymm2,%ymm0 -.endm - -.macro KERNEL8x2_4 - prefetcht0 192+A_PR1(AO, %rax, SIZE) - vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - 
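	// end of the 4-way unrolled 8x2 step: the two addq's below advance
	// the packed-B index by 4 k-steps x 2 values and the packed-A index
	// by 4 x 8 values; the last addq also leaves the flags that the
	// surrounding loop's je exit tests.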
addq $8, BI - addq $32, %rax -.endm - -.macro KERNEL8x2_SUB - vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - addq $2, BI - addq $8 , %rax -.endm - -.macro SAVE8x2 - - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - vmulpd %ymm0 , %ymm7 , %ymm7 - - vmulpd %ymm0 , %ymm5 , %ymm5 - vmulpd %ymm0 , %ymm8 , %ymm8 - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4,%ymm4 - vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 - - vaddpd (CO1, LDC), %ymm5,%ymm5 - vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm7 , 4 * SIZE(CO1) - - vmovups %ymm5 , (CO1, LDC) - vmovups %ymm8 , 4 * SIZE(CO1, LDC) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL4x2_1 - prefetcht0 A_PR1(AO, %rax, SIZE) - vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 -.endm - -.macro KERNEL4x2_2 - vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 -.endm - -.macro KERNEL4x2_3 - prefetcht0 64+A_PR1(AO, %rax, SIZE) - vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 -.endm - -.macro KERNEL4x2_4 - vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - addq $8, BI - addq $16, %rax -.endm - -.macro KERNEL4x2_SUB - vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - addq $2, BI - addq $4 , %rax -.endm - -.macro SAVE4x2 - - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - vmulpd %ymm0 , %ymm5 , %ymm5 - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4,%ymm4 - vaddpd (CO1, LDC), %ymm5,%ymm5 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , (CO1, LDC) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL2x2_1 - prefetcht0 A_PR1(AO, %rax, SIZE) - vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 - VFMADD231SD_ %xmm10,%xmm2,%xmm0 -.endm - -.macro KERNEL2x2_2 - vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -1 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 - VFMADD231SD_ %xmm10,%xmm2,%xmm0 -.endm - -.macro KERNEL2x2_3 - vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -28 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ 
%xmm5,%xmm2,%xmm0 - vmovsd -27 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 - VFMADD231SD_ %xmm10,%xmm2,%xmm0 -.endm - -.macro KERNEL2x2_4 - vmovsd 2 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -26 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd 3 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd -25 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 - VFMADD231SD_ %xmm10,%xmm2,%xmm0 - addq $8, BI - addq $8, %rax -.endm - -.macro KERNEL2x2_SUB - vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 - VFMADD231SD_ %xmm10,%xmm2,%xmm0 - addq $2, BI - addq $2, %rax -.endm - -.macro SAVE2x2 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - vmulsd %xmm0 , %xmm8 , %xmm8 - vmulsd %xmm0 , %xmm5 , %xmm5 - vmulsd %xmm0 , %xmm10, %xmm10 - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4,%xmm4 - vaddsd 1 * SIZE(CO1), %xmm8,%xmm8 - vaddsd (CO1, LDC), %xmm5,%xmm5 - vaddsd 1 * SIZE(CO1, LDC), %xmm10,%xmm10 - -#endif - - vmovsd %xmm4 , (CO1) - vmovsd %xmm8 , 1 * SIZE(CO1) - vmovsd %xmm5 , (CO1, LDC) - vmovsd %xmm10, 1 * SIZE(CO1, LDC) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL1x2_1 - vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 -.endm - -.macro KERNEL1x2_2 - vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -1 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 -.endm - -.macro KERNEL1x2_3 - vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 -.endm - -.macro KERNEL1x2_4 - vmovsd 2 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd 3 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - addq $8, BI - addq $4, %rax -.endm - -.macro KERNEL1x2_SUB - vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - addq $2, BI - addq $1, %rax -.endm - -.macro SAVE1x2 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - vmulsd %xmm0 , %xmm5 , %xmm5 - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4,%xmm4 - vaddsd (CO1, LDC), %xmm5,%xmm5 - -#endif - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - -.endm - - -/*******************************************************************************************/ - -/******************************************************************************************* -* 1 line of N -*******************************************************************************************/ - -.macro KERNEL16x1_1 - vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 -.endm - -.macro 
KERNEL16x1_2 - vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 -.endm - -.macro KERNEL16x1_3 - vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 - vmovups 0 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vmovups 4 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - vmovups 8 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - vmovups 12 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 -.endm - -.macro KERNEL16x1_4 - vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm1 - vmovups 16 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vmovups 20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - vmovups 24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - vmovups 28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 - addq $4, BI - addq $64, %rax -.endm - -.macro KERNEL16x1_SUB - vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 - addq $1, BI - addq $16, %rax -.endm - -.macro SAVE16x1 - - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - vmulpd %ymm0 , %ymm7 , %ymm7 - vmulpd %ymm0 , %ymm10, %ymm10 - vmulpd %ymm0 , %ymm13, %ymm13 - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4,%ymm4 - vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 - vaddpd 8 * SIZE(CO1), %ymm10,%ymm10 - vaddpd 12 * SIZE(CO1), %ymm13,%ymm13 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm7 , 4 * SIZE(CO1) - vmovups %ymm10, 8 * SIZE(CO1) - vmovups %ymm13,12 * SIZE(CO1) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL8x1_1 - vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 -.endm - -.macro KERNEL8x1_2 - vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 -.endm - -.macro KERNEL8x1_3 - vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 -.endm - -.macro KERNEL8x1_4 - vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - addq $4, BI - addq $32, %rax -.endm - -.macro KERNEL8x1_SUB - vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - addq $1, BI - addq $8 , %rax -.endm - -.macro SAVE8x1 - - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - vmulpd %ymm0 , %ymm7 , %ymm7 - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4,%ymm4 - 
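	// GEMM path: the alpha-scaled tile is accumulated into what is
	// already in C (C += alpha * A * B); the TRMM build compiles these
	// accumulating loads out and stores the scaled products directly.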
vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm7 , 4 * SIZE(CO1) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL4x1_1 - vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 -.endm - -.macro KERNEL4x1_2 - vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 -.endm - -.macro KERNEL4x1_3 - vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 -.endm - -.macro KERNEL4x1_4 - vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - addq $4, BI - addq $16, %rax -.endm - -.macro KERNEL4x1_SUB - vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - addq $1, BI - addq $4 , %rax -.endm - -.macro SAVE4x1 - - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4,%ymm4 - -#endif - - vmovups %ymm4 , (CO1) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL2x1_1 - vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 -.endm - -.macro KERNEL2x1_2 - vmovsd -1 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 -.endm - -.macro KERNEL2x1_3 - vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -28 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -27 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 -.endm - -.macro KERNEL2x1_4 - vmovsd 1 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -26 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -25 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 - addq $4, BI - addq $8, %rax -.endm - -.macro KERNEL2x1_SUB - vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 - addq $1, BI - addq $2 , %rax -.endm - -.macro SAVE2x1 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - vmulsd %xmm0 , %xmm8 , %xmm8 - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4,%xmm4 - vaddsd 1 * SIZE(CO1), %xmm8,%xmm8 - -#endif - - vmovsd %xmm4 , (CO1) - vmovsd %xmm8 , 1 * SIZE(CO1) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL1x1_1 - vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 -.endm - -.macro KERNEL1x1_2 - vmovsd -1 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 -.endm - -.macro KERNEL1x1_3 - vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 -.endm - -.macro KERNEL1x1_4 - vmovsd 1 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - addq $4, BI - addq $4, %rax -.endm - -.macro KERNEL1x1_SUB - vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -32 * SIZE(AO, 
%rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - addq $1, BI - addq $1 , %rax -.endm - -.macro SAVE1x1 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4,%xmm4 - -#endif - - vmovsd %xmm4 , (CO1) - -.endm - - -/*******************************************************************************************/ - -#if !defined(TRMMKERNEL) - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - movups %xmm6, 64(%rsp) - movups %xmm7, 80(%rsp) - movups %xmm8, 96(%rsp) - movups %xmm9, 112(%rsp) - movups %xmm10, 128(%rsp) - movups %xmm11, 144(%rsp) - movups %xmm12, 160(%rsp) - movups %xmm13, 176(%rsp) - movups %xmm14, 192(%rsp) - movups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC - - vmovaps %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $6, %rdi - divq %rdi // N / 6 - movq %rax, Ndiv6 // N / 6 - movq %rdx, Nmod6 // N % 6 - - - movq Ndiv6, J - cmpq $0, J - je .L2_0 - ALIGN_4 - -.L6_01: - // copy to sub buffer - movq K, %rax - salq $1,%rax // K * 2 ; read 2 values - movq B, BO1 - leaq (B,%rax, SIZE), BO2 // next offset to BO2 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $3 , %rax // K / 8 - jz .L6_01a_2 - ALIGN_4 - -.L6_01a_1: - - prefetcht0 512(BO1) - prefetcht0 512(BO2) - prefetchw 512(BO) - - - vmovups 0 * SIZE(BO1), %xmm0 - vmovups 2 * SIZE(BO1), %xmm2 - vmovups 4 * SIZE(BO1), %xmm4 - vmovups 6 * SIZE(BO1), %xmm6 - vmovsd 0 * SIZE(BO2), %xmm1 - vmovsd 2 * SIZE(BO2), %xmm3 - vmovsd 4 * SIZE(BO2), %xmm5 - vmovsd 6 * SIZE(BO2), %xmm7 - vmovups %xmm0, 0*SIZE(BO) - vmovsd %xmm1, 2*SIZE(BO) - vmovups %xmm2, 3*SIZE(BO) - vmovsd %xmm3, 5*SIZE(BO) - vmovups %xmm4, 6*SIZE(BO) - vmovsd %xmm5, 8*SIZE(BO) - vmovups %xmm6, 9*SIZE(BO) - vmovsd %xmm7,11*SIZE(BO) - addq $8*SIZE,BO1 - addq $8*SIZE,BO2 - addq $12*SIZE,BO - - vmovups 0 * SIZE(BO1), %xmm0 - vmovups 2 * SIZE(BO1), %xmm2 - vmovups 4 * SIZE(BO1), %xmm4 - vmovups 6 * SIZE(BO1), %xmm6 - vmovsd 0 * SIZE(BO2), %xmm1 - vmovsd 2 * SIZE(BO2), %xmm3 - vmovsd 4 * SIZE(BO2), %xmm5 - vmovsd 6 * SIZE(BO2), %xmm7 - vmovups %xmm0, 0*SIZE(BO) - vmovsd %xmm1, 2*SIZE(BO) - vmovups %xmm2, 3*SIZE(BO) - vmovsd %xmm3, 5*SIZE(BO) - vmovups %xmm4, 6*SIZE(BO) - vmovsd %xmm5, 8*SIZE(BO) - vmovups %xmm6, 9*SIZE(BO) - vmovsd %xmm7,11*SIZE(BO) - addq $8*SIZE,BO1 - addq $8*SIZE,BO2 - addq $12*SIZE,BO - - decq %rax - jnz .L6_01a_1 - - - -.L6_01a_2: - - movq K, %rax - andq $7, %rax // K % 8 - jz .L6_02c - ALIGN_4 - - -.L6_02b: - - vmovups 0 * SIZE(BO1), %xmm0 - vmovsd 0 * SIZE(BO2), %xmm2 - vmovups %xmm0, 0*SIZE(BO) - vmovsd %xmm2, 2*SIZE(BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO2 - addq $3*SIZE,BO - decq %rax - jnz .L6_02b - -.L6_02c: - - movq K, %rax - salq $1,%rax // K * 2 - leaq (B,%rax, SIZE), BO1 // next offset to BO1 - leaq (BO1,%rax, SIZE), BO2 // next offset to BO2 - leaq BUFFER2, BO // second buffer to BO - movq K, %rax - sarq $3 , %rax 
// K / 8 - jz .L6_02c_2 - ALIGN_4 - -.L6_02c_1: - - prefetcht0 512(BO2) - prefetchw 512(BO) - - vmovups 0 * SIZE(BO2), %xmm0 - vmovups 2 * SIZE(BO2), %xmm2 - vmovups 4 * SIZE(BO2), %xmm4 - vmovups 6 * SIZE(BO2), %xmm6 - vmovsd 1 * SIZE(BO1), %xmm1 - vmovsd 3 * SIZE(BO1), %xmm3 - vmovsd 5 * SIZE(BO1), %xmm5 - vmovsd 7 * SIZE(BO1), %xmm7 - vmovsd %xmm1, 0*SIZE(BO) - vmovups %xmm0, 1*SIZE(BO) - vmovsd %xmm3, 3*SIZE(BO) - vmovups %xmm2, 4*SIZE(BO) - vmovsd %xmm5, 6*SIZE(BO) - vmovups %xmm4, 7*SIZE(BO) - vmovsd %xmm7, 9*SIZE(BO) - vmovups %xmm6,10*SIZE(BO) - addq $8*SIZE,BO1 - addq $8*SIZE,BO2 - addq $12*SIZE,BO - - - vmovups 0 * SIZE(BO2), %xmm0 - vmovups 2 * SIZE(BO2), %xmm2 - vmovups 4 * SIZE(BO2), %xmm4 - vmovups 6 * SIZE(BO2), %xmm6 - vmovsd 1 * SIZE(BO1), %xmm1 - vmovsd 3 * SIZE(BO1), %xmm3 - vmovsd 5 * SIZE(BO1), %xmm5 - vmovsd 7 * SIZE(BO1), %xmm7 - vmovsd %xmm1, 0*SIZE(BO) - vmovups %xmm0, 1*SIZE(BO) - vmovsd %xmm3, 3*SIZE(BO) - vmovups %xmm2, 4*SIZE(BO) - vmovsd %xmm5, 6*SIZE(BO) - vmovups %xmm4, 7*SIZE(BO) - vmovsd %xmm7, 9*SIZE(BO) - vmovups %xmm6,10*SIZE(BO) - addq $8*SIZE,BO1 - addq $8*SIZE,BO2 - addq $12*SIZE,BO - - decq %rax - jnz .L6_02c_1 - - -.L6_02c_2: - - movq K, %rax - andq $7, %rax // K % 8 - jz .L6_03c - ALIGN_4 - -.L6_03b: - - vmovsd 1*SIZE(BO1), %xmm0 - vmovups 0*SIZE(BO2), %xmm1 - vmovsd %xmm0, 0*SIZE(BO) - vmovups %xmm1, 1*SIZE(BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO2 - addq $3*SIZE,BO - decq %rax - jnz .L6_03b - - -.L6_03c: - - movq BO2, B // next offset of B - -.L6_10: - movq C, CO1 - leaq (C, LDC, 2), C - leaq (C, LDC, 1), C // c += 3 * ldc - - - movq A, AO // aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L6_20 - - ALIGN_4 - -.L6_11: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L6_16 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_12: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL16x3_1 - KERNEL16x3_2 - KERNEL16x3_3 - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL16x3_4 - - KERNEL16x3_1 - KERNEL16x3_2 - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL16x3_3 - KERNEL16x3_4 - - je .L6_16 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL16x3_1 - KERNEL16x3_2 - KERNEL16x3_3 - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL16x3_4 - - KERNEL16x3_1 - KERNEL16x3_2 - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL16x3_3 - KERNEL16x3_4 - - je .L6_16 - - jmp .L6_12 - ALIGN_4 - -.L6_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_17: - - KERNEL16x3_SUB - - jl .L6_17 - ALIGN_4 - - -.L6_19: - - SAVE16x3 - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L6_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L6_20: - // Test rest of M - - testq $15, M - jz .L7_10 // to next 3 lines of N - - testq $8, M - jz .L6_21pre - ALIGN_4 - -/**************************************************************************/ - -.L6_20_1: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je 
.L6_20_6 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_20_2: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x3_1 - KERNEL8x3_2 - KERNEL8x3_3 - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL8x3_4 - - KERNEL8x3_1 - KERNEL8x3_2 - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL8x3_3 - KERNEL8x3_4 - - je .L6_20_6 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x3_1 - KERNEL8x3_2 - KERNEL8x3_3 - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL8x3_4 - - KERNEL8x3_1 - KERNEL8x3_2 - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL8x3_3 - KERNEL8x3_4 - - je .L6_20_6 - - jmp .L6_20_2 - ALIGN_4 - -.L6_20_6: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_20_9 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_20_7: - - KERNEL8x3_SUB - - jl .L6_20_7 - ALIGN_4 - - -.L6_20_9: - - SAVE8x3 - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L6_21pre: - - testq $4, M - jz .L6_30 - ALIGN_4 - -.L6_21: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_26 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_22: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x3_1 - KERNEL4x3_2 - KERNEL4x3_3 - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL4x3_4 - - KERNEL4x3_1 - KERNEL4x3_2 - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL4x3_3 - KERNEL4x3_4 - - je .L6_26 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x3_1 - KERNEL4x3_2 - KERNEL4x3_3 - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL4x3_4 - - KERNEL4x3_1 - KERNEL4x3_2 - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL4x3_3 - KERNEL4x3_4 - - je .L6_26 - - jmp .L6_22 - ALIGN_4 - -.L6_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_27: - - KERNEL4x3_SUB - - jl .L6_27 - ALIGN_4 - - -.L6_29: - - SAVE4x3 - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L6_30: - testq $2, M - jz .L6_40 - - ALIGN_4 - -.L6_31: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_36 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_32: - - KERNEL2x3_1 - KERNEL2x3_2 - KERNEL2x3_3 - KERNEL2x3_4 - - KERNEL2x3_1 - KERNEL2x3_2 - KERNEL2x3_3 - KERNEL2x3_4 - - je .L6_36 - - KERNEL2x3_1 - KERNEL2x3_2 - KERNEL2x3_3 - KERNEL2x3_4 - - KERNEL2x3_1 - KERNEL2x3_2 - KERNEL2x3_3 - KERNEL2x3_4 - - je .L6_36 - - jmp .L6_32 - ALIGN_4 - -.L6_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - 
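	// %rax and BI are now negative offsets from the advanced AO/BO; the
	// KERNEL*_SUB tail loops below add to them each k-step and run while
	// the result stays negative (jl), so no separate counter is needed.
	// A C sketch of this indexing scheme for the 2x3 tail (illustrative
	// only; the names are invented, not part of this file):
	//
	//	static void tail_2x3(long ktail, const double *ao_end,
	//	                     const double *bo_end, double acc[3][2])
	//	{
	//		// ao_end/bo_end point one block past the packed panels
	//		for (long ai = -2 * ktail, bi = -3 * ktail; ai < 0;
	//		     ai += 2, bi += 3)
	//			for (int j = 0; j < 3; j++)
	//				for (int i = 0; i < 2; i++)
	//					acc[j][i] += ao_end[ai + i] * bo_end[bi + j];
	//	}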
ALIGN_4 - -.L6_37: - - KERNEL2x3_SUB - - jl .L6_37 - ALIGN_4 - - -.L6_39: - - SAVE2x3 - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L6_40: - testq $1, M - jz .L7_10 // to next 3 lines of N - - ALIGN_4 - -.L6_41: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_46 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_42: - - KERNEL1x3_1 - KERNEL1x3_2 - KERNEL1x3_3 - KERNEL1x3_4 - - KERNEL1x3_1 - KERNEL1x3_2 - KERNEL1x3_3 - KERNEL1x3_4 - - je .L6_46 - - KERNEL1x3_1 - KERNEL1x3_2 - KERNEL1x3_3 - KERNEL1x3_4 - - KERNEL1x3_1 - KERNEL1x3_2 - KERNEL1x3_3 - KERNEL1x3_4 - - je .L6_46 - - jmp .L6_42 - ALIGN_4 - -.L6_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_47: - - KERNEL1x3_SUB - - jl .L6_47 - ALIGN_4 - - -.L6_49: - - SAVE1x3 - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - - -/***************************************************************************************************************/ - -.L7_10: - movq C, CO1 - leaq (C, LDC, 2), C - leaq (C, LDC, 1), C // c += 3 * ldc - - - movq A, AO // aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L7_20 - - ALIGN_4 - -.L7_11: - leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L7_16 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_12: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL16x3_1 - KERNEL16x3_2 - KERNEL16x3_3 - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL16x3_4 - - KERNEL16x3_1 - KERNEL16x3_2 - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL16x3_3 - KERNEL16x3_4 - - je .L7_16 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL16x3_1 - KERNEL16x3_2 - KERNEL16x3_3 - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL16x3_4 - - KERNEL16x3_1 - KERNEL16x3_2 - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL16x3_3 - KERNEL16x3_4 - - je .L7_16 - - jmp .L7_12 - ALIGN_4 - -.L7_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_17: - - KERNEL16x3_SUB - - jl .L7_17 - ALIGN_4 - - -.L7_19: - - SAVE16x3 - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L7_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L7_20: - // Test rest of M - - testq $15, M - jz .L7_60 // to next 3 lines of N - - testq $8, M - jz .L7_21pre - ALIGN_4 - -/**************************************************************************/ - -.L7_20_1: - leaq BUFFER2, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_20_6 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO 
- negq BI - negq %rax - ALIGN_4 - -.L7_20_2: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x3_1 - KERNEL8x3_2 - KERNEL8x3_3 - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL8x3_4 - - KERNEL8x3_1 - KERNEL8x3_2 - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL8x3_3 - KERNEL8x3_4 - - je .L7_20_6 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x3_1 - KERNEL8x3_2 - KERNEL8x3_3 - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL8x3_4 - - KERNEL8x3_1 - KERNEL8x3_2 - prefetcht0 B_PR1+128(BO,BI,8) - KERNEL8x3_3 - KERNEL8x3_4 - - je .L7_20_6 - - jmp .L7_20_2 - ALIGN_4 - -.L7_20_6: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_20_9 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_20_7: - - KERNEL8x3_SUB - - jl .L7_20_7 - ALIGN_4 - -.L7_20_9: - - SAVE8x3 - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L7_21pre: - - testq $4, M - jz .L7_30 - ALIGN_4 - -.L7_21: - leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_26 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_22: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x3_1 - KERNEL4x3_2 - KERNEL4x3_3 - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL4x3_4 - - KERNEL4x3_1 - KERNEL4x3_2 - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL4x3_3 - KERNEL4x3_4 - - je .L7_26 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x3_1 - KERNEL4x3_2 - KERNEL4x3_3 - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL4x3_4 - - KERNEL4x3_1 - KERNEL4x3_2 - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL4x3_3 - KERNEL4x3_4 - - je .L7_26 - - jmp .L7_22 - ALIGN_4 - -.L7_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_27: - - KERNEL4x3_SUB - - jl .L7_27 - ALIGN_4 - - -.L7_29: - - SAVE4x3 - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L7_30: - testq $2, M - jz .L7_40 - - ALIGN_4 - -.L7_31: - leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_36 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_32: - - KERNEL2x3_1 - KERNEL2x3_2 - KERNEL2x3_3 - KERNEL2x3_4 - - KERNEL2x3_1 - KERNEL2x3_2 - KERNEL2x3_3 - KERNEL2x3_4 - - je .L7_36 - - KERNEL2x3_1 - KERNEL2x3_2 - KERNEL2x3_3 - KERNEL2x3_4 - - KERNEL2x3_1 - KERNEL2x3_2 - KERNEL2x3_3 - KERNEL2x3_4 - - je .L7_36 - - jmp .L7_32 - ALIGN_4 - -.L7_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_37: - - KERNEL2x3_SUB - - jl .L7_37 - ALIGN_4 - - -.L7_39: - - SAVE2x3 - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L7_40: - testq $1, M - jz .L7_60 // to next 3 lines of N - - ALIGN_4 
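	// Reminder for this second pass: a 6-wide block of N is processed as
	// two 3-wide halves. The copy loops at .L6_01/.L6_02c split the
	// packed B - which appears to arrive as 2-column panels, two values
	// per k - into BUFFER1 (columns 0-2) and BUFFER2 (columns 3-5); the
	// .L6_* code ran the 3-column kernels over BUFFER1, and this .L7_*
	// code repeats them over BUFFER2. A C sketch of the repack under that
	// layout assumption (names invented):
	//
	//	static void repack_b6(long K, const double *p0, const double *p1,
	//	                      const double *p2, double *buf1, double *buf2)
	//	{
	//		for (long k = 0; k < K; k++) {
	//			buf1[3*k+0] = p0[2*k];		// columns 0..2
	//			buf1[3*k+1] = p0[2*k+1];
	//			buf1[3*k+2] = p1[2*k];
	//			buf2[3*k+0] = p1[2*k+1];	// columns 3..5
	//			buf2[3*k+1] = p2[2*k];
	//			buf2[3*k+2] = p2[2*k+1];
	//		}
	//	}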
- -.L7_41: - leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_46 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_42: - - KERNEL1x3_1 - KERNEL1x3_2 - KERNEL1x3_3 - KERNEL1x3_4 - - KERNEL1x3_1 - KERNEL1x3_2 - KERNEL1x3_3 - KERNEL1x3_4 - - je .L7_46 - - KERNEL1x3_1 - KERNEL1x3_2 - KERNEL1x3_3 - KERNEL1x3_4 - - KERNEL1x3_1 - KERNEL1x3_2 - KERNEL1x3_3 - KERNEL1x3_4 - - je .L7_46 - - jmp .L7_42 - ALIGN_4 - -.L7_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_47: - - KERNEL1x3_SUB - - jl .L7_47 - ALIGN_4 - - -.L7_49: - - SAVE1x3 - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - -.L7_60: - - decq J // j -- - jg .L6_01 - - -.L2_0: - cmpq $0, Nmod6 // N % 6 == 0 - je .L999 - -/************************************************************************************************ -* Loop for Nmod6 / 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - sarq $1, J // j = j / 2 - je .L1_0 - ALIGN_4 - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $2, %rax // K / 4 - jz .L2_01b - ALIGN_4 - -.L2_01a: - prefetcht0 512(BO1) - prefetchw 512(BO) - - vmovups (BO1), %xmm0 - vmovups 2*SIZE(BO1), %xmm1 - vmovups 4*SIZE(BO1), %xmm2 - vmovups 6*SIZE(BO1), %xmm3 - - vmovups %xmm0, (BO) - vmovups %xmm1, 2*SIZE(BO) - vmovups %xmm2, 4*SIZE(BO) - vmovups %xmm3, 6*SIZE(BO) - - addq $8*SIZE,BO1 - addq $8*SIZE,BO - decq %rax - jnz .L2_01a - - -.L2_01b: - - movq K, %rax - andq $3, %rax // K % 4 - jz .L2_02d - ALIGN_4 - -.L2_02c: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L2_02c - -.L2_02d: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - - - movq A, AO // aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L2_20 - - ALIGN_4 - -.L2_11: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL16x2_1 - KERNEL16x2_2 - KERNEL16x2_3 - KERNEL16x2_4 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL16x2_1 - KERNEL16x2_2 - KERNEL16x2_3 - KERNEL16x2_4 - - je .L2_16 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL16x2_1 - KERNEL16x2_2 - KERNEL16x2_3 - KERNEL16x2_4 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL16x2_1 - KERNEL16x2_2 - KERNEL16x2_3 - KERNEL16x2_4 - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL16x2_SUB - - jl .L2_17 - ALIGN_4 - - -.L2_19: - - SAVE16x2 - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L2_11 - 
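	// one 16x2 tile per .L2_11 trip: eight ymm accumulators (four 4-wide
	// row quarters x two columns) are filled by the unrolled loop above,
	// then SAVE16x2 scales them by alpha and writes the tile back to C.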
	ALIGN_4

/**************************************************************************
* Rest of M
***************************************************************************/
.L2_20:
	// Test rest of M

	testq	$15, M
	jz	.L2_60		// to next 2 lines of N

	testq	$8, M
	jz	.L2_21pre
	ALIGN_4

/**************************************************************************/

.L2_20_1:
	leaq	BUFFER1, BO		// first buffer to BO
	addq	$4 * SIZE, BO

	vzeroall

	movq	K, %rax

	andq	$-8, %rax
	je	.L2_20_6
	movq	%rax, BI		// Index for BO
	leaq	(BI,BI,1), BI		// BI = BI * 2 ; number of values

	salq	$3, %rax		// rax = rax * 8 ; number of values
	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L2_20_2:

	prefetcht0	B_PR1(BO,BI,8)
	KERNEL8x2_1
	KERNEL8x2_2
	KERNEL8x2_3
	KERNEL8x2_4

	prefetcht0	B_PR1(BO,BI,8)
	KERNEL8x2_1
	KERNEL8x2_2
	KERNEL8x2_3
	KERNEL8x2_4

	je	.L2_20_6

	prefetcht0	B_PR1(BO,BI,8)
	KERNEL8x2_1
	KERNEL8x2_2
	KERNEL8x2_3
	KERNEL8x2_4

	prefetcht0	B_PR1(BO,BI,8)
	KERNEL8x2_1
	KERNEL8x2_2
	KERNEL8x2_3
	KERNEL8x2_4

	je	.L2_20_6

	jmp	.L2_20_2
	ALIGN_4

.L2_20_6:
	movq	K, %rax

	andq	$7, %rax		# if (k % 8)
	je	.L2_20_9

	movq	%rax, BI		// Index for BO
	leaq	(BI,BI,1), BI		// BI = BI * 2 ; number of values

	salq	$3, %rax		// rax = rax * 8 ; number of values
	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L2_20_7:

	KERNEL8x2_SUB

	jl	.L2_20_7
	ALIGN_4


.L2_20_9:

	SAVE8x2

	addq	$8 * SIZE, CO1		# coffset += 8
	ALIGN_4



/**************************************************************************/

.L2_21pre:

	testq	$4, M
	jz	.L2_30
	ALIGN_4

.L2_21:
	leaq	BUFFER1, BO		// first buffer to BO
	addq	$4 * SIZE, BO

	vzeroall

	movq	K, %rax

	andq	$-8, %rax
	je	.L2_26
	movq	%rax, BI		// Index for BO
	leaq	(BI,BI,1), BI		// BI = BI * 2 ; number of values

	salq	$2, %rax		// rax = rax * 4 ; number of values
	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L2_22:

	prefetcht0	B_PR1(BO,BI,8)
	KERNEL4x2_1
	KERNEL4x2_2
	KERNEL4x2_3
	KERNEL4x2_4

	prefetcht0	B_PR1(BO,BI,8)
	KERNEL4x2_1
	KERNEL4x2_2
	KERNEL4x2_3
	KERNEL4x2_4

	je	.L2_26

	prefetcht0	B_PR1(BO,BI,8)
	KERNEL4x2_1
	KERNEL4x2_2
	KERNEL4x2_3
	KERNEL4x2_4

	prefetcht0	B_PR1(BO,BI,8)
	KERNEL4x2_1
	KERNEL4x2_2
	KERNEL4x2_3
	KERNEL4x2_4

	je	.L2_26

	jmp	.L2_22
	ALIGN_4

.L2_26:
	movq	K, %rax

	andq	$7, %rax		# if (k % 8)
	je	.L2_29

	movq	%rax, BI		// Index for BO
	leaq	(BI,BI,1), BI		// BI = BI * 2 ; number of values

	salq	$2, %rax		// rax = rax * 4 ; number of values
	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L2_27:

	KERNEL4x2_SUB

	jl	.L2_27
	ALIGN_4


.L2_29:

	SAVE4x2

	addq	$4 * SIZE, CO1		# coffset += 4
	ALIGN_4


.L2_30:
	testq	$2, M
	jz	.L2_40

	ALIGN_4

.L2_31:
	leaq	BUFFER1, BO		// first buffer to BO
	addq	$4 * SIZE, BO

	vzeroall

	movq	K, %rax

	andq	$-8, %rax
	je	.L2_36
	movq	%rax, BI		// Index for BO
	leaq	(BI,BI,1), BI		// BI = BI * 2 ; number of values

	salq	$1, %rax		// rax = rax * 2 ; number of values
	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L2_32:

	KERNEL2x2_1
KERNEL2x2_2 - KERNEL2x2_3 - KERNEL2x2_4 - - je .L2_36 - - jmp .L2_32 - ALIGN_4 - -.L2_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB - - jl .L2_37 - ALIGN_4 - - -.L2_39: - - SAVE2x2 - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_46 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_3 - KERNEL1x2_4 - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_3 - KERNEL1x2_4 - - je .L2_46 - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_3 - KERNEL1x2_4 - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_3 - KERNEL1x2_4 - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB - - jl .L2_47 - ALIGN_4 - - -.L2_49: - - SAVE1x2 - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - -.L2_60: - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $1*SIZE,BO1 - addq $1*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - - - movq A, AO // aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L1_20 - - ALIGN_4 - -.L1_11: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL16x1_1 - KERNEL16x1_2 - KERNEL16x1_3 - KERNEL16x1_4 - - KERNEL16x1_1 - KERNEL16x1_2 - KERNEL16x1_3 - KERNEL16x1_4 - - je .L1_16 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL16x1_1 - KERNEL16x1_2 - KERNEL16x1_3 - KERNEL16x1_4 - - KERNEL16x1_1 - KERNEL16x1_2 - KERNEL16x1_3 - KERNEL16x1_4 - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL16x1_SUB - - jl .L1_17 - ALIGN_4 - - -.L1_19: - - SAVE16x1 - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L1_11 - ALIGN_4 - 
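	// The leftover rows (M % 16) are peeled off below by testing the
	// low bits of M, one block size at a time; in C terms (illustrative
	// fragment only):
	//
	//	if (m & 8) { /* one 8x1 tile */ }
	//	if (m & 4) { /* one 4x1 tile */ }
	//	if (m & 2) { /* one 2x1 tile */ }
	//	if (m & 1) { /* one 1x1 tile */ }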
-/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $15, M - jz .L999 - - testq $8, M - jz .L1_21pre - ALIGN_4 - -/**************************************************************************/ - -.L1_20_1: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_20_6 - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_2: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x1_1 - KERNEL8x1_2 - KERNEL8x1_3 - KERNEL8x1_4 - - KERNEL8x1_1 - KERNEL8x1_2 - KERNEL8x1_3 - KERNEL8x1_4 - - je .L1_20_6 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x1_1 - KERNEL8x1_2 - KERNEL8x1_3 - KERNEL8x1_4 - - KERNEL8x1_1 - KERNEL8x1_2 - KERNEL8x1_3 - KERNEL8x1_4 - - je .L1_20_6 - - jmp .L1_20_2 - ALIGN_4 - -.L1_20_6: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_20_9 - - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_7: - - KERNEL8x1_SUB - - jl .L1_20_7 - ALIGN_4 - - -.L1_20_9: - - SAVE8x1 - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L1_21pre: - - testq $4, M - jz .L1_30 - ALIGN_4 - -.L1_21: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_26 - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_3 - KERNEL4x1_4 - - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_3 - KERNEL4x1_4 - - je .L1_26 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_3 - KERNEL4x1_4 - - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_3 - KERNEL4x1_4 - - je .L1_26 - - jmp .L1_22 - ALIGN_4 - -.L1_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_29 - - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - KERNEL4x1_SUB - - jl .L1_27 - ALIGN_4 - - -.L1_29: - - SAVE4x1 - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_36 - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_32: - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_3 - KERNEL2x1_4 - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_3 - KERNEL2x1_4 - - je .L1_36 - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_3 - KERNEL2x1_4 - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_3 - KERNEL2x1_4 - - je .L1_36 - - jmp .L1_32 - ALIGN_4 - -.L1_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_39 - - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB - - jl .L1_37 - ALIGN_4 - - -.L1_39: - - SAVE2x1 - - addq $2 * SIZE, 
CO1		# coffset += 2
	ALIGN_4

.L1_40:
	testq	$1, M
	jz	.L999

	ALIGN_4

.L1_41:
	leaq	BUFFER1, BO		// first buffer to BO
	addq	$2 * SIZE, BO

	vzeroall

	movq	K, %rax

	andq	$-8, %rax
	je	.L1_46
	movq	%rax, BI		// Index for BO

	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L1_42:

	KERNEL1x1_1
	KERNEL1x1_2
	KERNEL1x1_3
	KERNEL1x1_4

	KERNEL1x1_1
	KERNEL1x1_2
	KERNEL1x1_3
	KERNEL1x1_4

	je	.L1_46

	KERNEL1x1_1
	KERNEL1x1_2
	KERNEL1x1_3
	KERNEL1x1_4

	KERNEL1x1_1
	KERNEL1x1_2
	KERNEL1x1_3
	KERNEL1x1_4

	je	.L1_46

	jmp	.L1_42
	ALIGN_4

.L1_46:
	movq	K, %rax

	andq	$7, %rax		# if (k % 8)
	je	.L1_49

	movq	%rax, BI		// Index for BO

	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L1_47:

	KERNEL1x1_SUB

	jl	.L1_47
	ALIGN_4


.L1_49:

	SAVE1x1

	addq	$1 * SIZE, CO1		# coffset += 1
	ALIGN_4


.L999:
	movq	SP, %rsp
	movq	(%rsp), %rbx
	movq	8(%rsp), %rbp
	movq	16(%rsp), %r12
	movq	24(%rsp), %r13
	movq	32(%rsp), %r14
	movq	40(%rsp), %r15

#ifdef WINDOWS_ABI
	movq	48(%rsp), %rdi
	movq	56(%rsp), %rsi
	movups	64(%rsp), %xmm6
	movups	80(%rsp), %xmm7
	movups	96(%rsp), %xmm8
	movups	112(%rsp), %xmm9
	movups	128(%rsp), %xmm10
	movups	144(%rsp), %xmm11
	movups	160(%rsp), %xmm12
	movups	176(%rsp), %xmm13
	movups	192(%rsp), %xmm14
	movups	208(%rsp), %xmm15
#endif

	addq	$STACKSIZE, %rsp
	ret

	EPILOGUE


#else
/*************************************************************************************
* TRMM Kernel
*************************************************************************************/


	PROLOGUE
	PROFCODE

	subq	$STACKSIZE, %rsp
	movq	%rbx, (%rsp)
	movq	%rbp, 8(%rsp)
	movq	%r12, 16(%rsp)
	movq	%r13, 24(%rsp)
	movq	%r14, 32(%rsp)
	movq	%r15, 40(%rsp)

	vzeroupper

#ifdef WINDOWS_ABI
	movq	%rdi, 48(%rsp)
	movq	%rsi, 56(%rsp)
	movups	%xmm6, 64(%rsp)
	movups	%xmm7, 80(%rsp)
	movups	%xmm8, 96(%rsp)
	movups	%xmm9, 112(%rsp)
	movups	%xmm10, 128(%rsp)
	movups	%xmm11, 144(%rsp)
	movups	%xmm12, 160(%rsp)
	movups	%xmm13, 176(%rsp)
	movups	%xmm14, 192(%rsp)
	movups	%xmm15, 208(%rsp)

	movq	ARG1, OLD_M
	movq	ARG2, OLD_N
	movq	ARG3, OLD_K
	movq	OLD_A, A
	movq	OLD_B, B
	movq	OLD_C, C
	movq	OLD_LDC, LDC
#ifdef TRMMKERNEL
	movsd	OLD_OFFSET, %xmm12
#endif
	vmovaps	%xmm3, %xmm0

#else
	movq	STACKSIZE + 8(%rsp), LDC
#ifdef TRMMKERNEL
	movsd	STACKSIZE + 16(%rsp), %xmm12
#endif

#endif

	movq	%rsp, SP	# save old stack
	subq	$128 + L_BUFFER_SIZE, %rsp
	andq	$-4096, %rsp	# align stack

	STACK_TOUCH

	cmpq	$0, OLD_M
	je	.L999

	cmpq	$0, OLD_N
	je	.L999

	cmpq	$0, OLD_K
	je	.L999

	movq	OLD_M, M
	movq	OLD_N, N
	movq	OLD_K, K

	vmovsd	%xmm0, ALPHA

	salq	$BASE_SHIFT, LDC

	movq	N, %rax
	xorq	%rdx, %rdx
	movq	$2, %rdi
	divq	%rdi			// N / 2
	movq	%rax, Ndiv6		// N / 2 (stored in the Ndiv6 slot)
	movq	%rdx, Nmod6		// N % 2



#ifdef TRMMKERNEL
	vmovsd	%xmm12, OFFSET
	vmovsd	%xmm12, KK
#ifndef LEFT
	negq	KK
#endif
#endif

	movq	Ndiv6, J
	cmpq	$0, J
	je	.L1_0
	ALIGN_4

.L2_01:
	// copy to sub buffer
	movq	B, BO1
	leaq	BUFFER1, BO		// first buffer to BO
	movq	K, %rax
	sarq	$2, %rax		// K / 4
	jz	.L2_01b
	ALIGN_4

.L2_01a:
	prefetcht0	512(BO1)
	prefetchw	512(BO)

	vmovups	(BO1), %xmm0
	vmovups	2*SIZE(BO1), %xmm1
	vmovups	4*SIZE(BO1), %xmm2
	vmovups	6*SIZE(BO1), %xmm3

	vmovups	%xmm0, (BO)
	vmovups	%xmm1, 2*SIZE(BO)
	vmovups	%xmm2, 4*SIZE(BO)
	vmovups	%xmm3, 6*SIZE(BO)
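	// 4 k-steps of the 2-column panel (8 doubles) are copied per trip;
	// the .L2_02c tail below moves the remaining K % 4 pairs.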
-
-	addq	$8*SIZE,BO1
-	addq	$8*SIZE,BO
-	decq	%rax
-	jnz	.L2_01a
-
-
-.L2_01b:
-
-	movq	K, %rax
-	andq	$3, %rax		// K % 4
-	jz	.L2_02d
-	ALIGN_4
-
-.L2_02c:
-
-	vmovups	(BO1), %xmm0
-	vmovups	%xmm0, (BO)
-	addq	$2*SIZE,BO1
-	addq	$2*SIZE,BO
-	decq	%rax
-	jnz	.L2_02c
-
-.L2_02d:
-
-	movq	BO1, B			// next offset of B
-
-.L2_10:
-	movq	C, CO1
-	leaq	(C, LDC, 2), C		// c += 2 * ldc
-
-#if defined(TRMMKERNEL) && defined(LEFT)
-	movq	OFFSET, %rax
-	movq	%rax, KK
-#endif
-
-	movq	A, AO			// aoffset = a
-	addq	$32 * SIZE, AO
-
-	movq	M, I
-	sarq	$4, I			// i = (m >> 4)
-	je	.L2_20
-
-	ALIGN_4
-
-.L2_11:
-#if !defined(TRMMKERNEL) || \
-	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
-	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
-	leaq	BUFFER1, BO		// first buffer to BO
-	addq	$4 * SIZE, BO
-#else
-	movq	KK, %rax
-	leaq	BUFFER1, BO		// first buffer to BO
-	addq	$4 * SIZE, BO
-	movq	%rax, BI		// Index for BO
-	leaq	(BI,BI,1), BI		// BI = BI * 2 ; number of values
-	leaq	(BO, BI, SIZE), BO
-	salq	$4, %rax		// rax = rax * 16 ; number of values
-	leaq	(AO, %rax, SIZE), AO
-#endif
-
-
-	vzeroall
-
-#ifndef TRMMKERNEL
-	movq	K, %rax
-#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
-	movq	K, %rax
-	subq	KK, %rax
-	movq	%rax, KKK
-#else
-	movq	KK, %rax
-#ifdef LEFT
-	addq	$16, %rax		// number of values in AO
-#else
-	addq	$2, %rax		// number of values in BO
-#endif
-	movq	%rax, KKK
-#endif
-
-	andq	$-8, %rax		// K = K - ( K % 8 )
-	je	.L2_16
-	movq	%rax, BI		// Index for BO
-	leaq	(BI,BI,1), BI		// BI = BI * 2 ; number of values
-
-	salq	$4, %rax		// rax = rax * 16 ; number of values
-	leaq	(AO, %rax, SIZE), AO
-	leaq	(BO, BI, SIZE), BO
-	negq	BI
-	negq	%rax
-	ALIGN_4
-
-.L2_12:
-
-	prefetcht0	B_PR1(BO,BI,8)
-	KERNEL16x2_1
-	KERNEL16x2_2
-	KERNEL16x2_3
-	KERNEL16x2_4
-
-	prefetcht0	B_PR1(BO,BI,8)
-	KERNEL16x2_1
-	KERNEL16x2_2
-	KERNEL16x2_3
-	KERNEL16x2_4
-
-	je	.L2_16
-
-	prefetcht0	B_PR1(BO,BI,8)
-	KERNEL16x2_1
-	KERNEL16x2_2
-	KERNEL16x2_3
-	KERNEL16x2_4
-
-	prefetcht0	B_PR1(BO,BI,8)
-	KERNEL16x2_1
-	KERNEL16x2_2
-	KERNEL16x2_3
-	KERNEL16x2_4
-
-	je	.L2_16
-
-	jmp	.L2_12
-	ALIGN_4
-
-.L2_16:
-#ifndef TRMMKERNEL
-	movq	K, %rax
-#else
-	movq	KKK, %rax
-#endif
-
-	andq	$7, %rax		# if (k & 7)
-	je	.L2_19
-
-	movq	%rax, BI		// Index for BO
-	leaq	(BI,BI,1), BI		// BI = BI * 2 ; number of values
-
-	salq	$4, %rax		// rax = rax * 16 ; number of values
-	leaq	(AO, %rax, SIZE), AO
-	leaq	(BO, BI, SIZE), BO
-	negq	BI
-	negq	%rax
-	ALIGN_4
-
-.L2_17:
-
-	KERNEL16x2_SUB
-
-	jl	.L2_17
-	ALIGN_4
-
-
-.L2_19:
-
-	SAVE16x2
-
-#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
-	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
-	movq	K, %rax
-	subq	KKK, %rax
-	movq	%rax, BI		// Index for BO
-	leaq	(BI,BI,1), BI		// BI = BI * 2 ; number of values
-	leaq	(BO, BI, SIZE), BO
-	salq	$4, %rax		// rax = rax * 16 ; number of values
-	leaq	(AO, %rax, SIZE), AO
-#endif
-
-
-#if defined(TRMMKERNEL) && defined(LEFT)
-	addq	$16, KK
-#endif
-
-	addq	$16 * SIZE, CO1		# coffset += 16
-	decq	I			# i --
-	jg	.L2_11
-	ALIGN_4
-
-/**************************************************************************
-* Rest of M
-***************************************************************************/
-.L2_20:
-	// Test rest of M
-
-	testq	$15, M
-	jz	.L2_60			// to next 2 lines of N
-
-	testq	$8, M
-	jz	.L2_21pre
-	ALIGN_4
-
-/**************************************************************************/
-
-.L2_20_1:
-#if !defined(TRMMKERNEL) || \
-	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
-	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
-	leaq	BUFFER1, BO		// first buffer to BO
-	addq	$4 * SIZE, BO
-#else
-	movq	KK, %rax
-	leaq	BUFFER1, BO		// first buffer to BO
-	addq	$4 * SIZE, BO
-	movq	%rax, BI		// Index for BO
-	leaq	(BI,BI,1), BI		// BI = BI * 2 ; number of values
-	leaq	(BO, BI, SIZE), BO
-	salq	$3, %rax		// rax = rax * 8 ; number of values
-	leaq	(AO, %rax, SIZE), AO
-#endif
-
-
-	vzeroall
-
-#ifndef TRMMKERNEL
-	movq	K, %rax
-#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
-	movq	K, %rax
-	subq	KK, %rax
-	movq	%rax, KKK
-#else
-	movq	KK, %rax
-#ifdef LEFT
-	addq	$8, %rax		// number of values in A
-#else
-	addq	$2, %rax		// number of values in BO
-#endif
-	movq	%rax, KKK
-#endif
-
-
-	andq	$-8, %rax
-	je	.L2_20_6
-	movq	%rax, BI		// Index for BO
-	leaq	(BI,BI,1), BI		// BI = BI * 2 ; number of values
-
-	salq	$3, %rax		// rax = rax * 8 ; number of values
-	leaq	(AO, %rax, SIZE), AO
-	leaq	(BO, BI, SIZE), BO
-	negq	BI
-	negq	%rax
-	ALIGN_4
-
-.L2_20_2:
-
-	prefetcht0	B_PR1(BO,BI,8)
-	KERNEL8x2_1
-	KERNEL8x2_2
-	KERNEL8x2_3
-	KERNEL8x2_4
-
-	prefetcht0	B_PR1(BO,BI,8)
-	KERNEL8x2_1
-	KERNEL8x2_2
-	KERNEL8x2_3
-	KERNEL8x2_4
-
-	je	.L2_20_6
-
-	prefetcht0	B_PR1(BO,BI,8)
-	KERNEL8x2_1
-	KERNEL8x2_2
-	KERNEL8x2_3
-	KERNEL8x2_4
-
-	prefetcht0	B_PR1(BO,BI,8)
-	KERNEL8x2_1
-	KERNEL8x2_2
-	KERNEL8x2_3
-	KERNEL8x2_4
-
-	je	.L2_20_6
-
-	jmp	.L2_20_2
-	ALIGN_4
-
-.L2_20_6:
-#ifndef TRMMKERNEL
-	movq	K, %rax
-#else
-	movq	KKK, %rax
-#endif
-
-	andq	$7, %rax		# if (k & 7)
-	je	.L2_20_9
-
-	movq	%rax, BI		// Index for BO
-	leaq	(BI,BI,1), BI		// BI = BI * 2 ; number of values
-
-	salq	$3, %rax		// rax = rax * 8 ; number of values
-	leaq	(AO, %rax, SIZE), AO
-	leaq	(BO, BI, SIZE), BO
-	negq	BI
-	negq	%rax
-	ALIGN_4
-
-.L2_20_7:
-
-	KERNEL8x2_SUB
-
-	jl	.L2_20_7
-	ALIGN_4
-
-
-.L2_20_9:
-
-	SAVE8x2
-
-#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
-	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
-	movq	K, %rax
-	subq	KKK, %rax
-	movq	%rax, BI		// Index for BO
-	leaq	(BI,BI,1), BI		// BI = BI * 2 ; number of values
-	leaq	(BO, BI, SIZE), BO
-	salq	$3, %rax		// rax = rax * 8 ; number of values
-	leaq	(AO, %rax, SIZE), AO
-#endif
-
-
-#if defined(TRMMKERNEL) && defined(LEFT)
-	addq	$8, KK
-#endif
-
-	addq	$8 * SIZE, CO1		# coffset += 8
-	ALIGN_4
-
-
-
-/**************************************************************************/
-
-.L2_21pre:
-
-	testq	$4, M
-	jz	.L2_30
-	ALIGN_4
-
-.L2_21:
-#if !defined(TRMMKERNEL) || \
-	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
-	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
-	leaq	BUFFER1, BO		// first buffer to BO
-	addq	$4 * SIZE, BO
-#else
-	movq	KK, %rax
-	leaq	BUFFER1, BO		// first buffer to BO
-	addq	$4 * SIZE, BO
-	movq	%rax, BI		// Index for BO
-	leaq	(BI,BI,1), BI		// BI = BI * 2 ; number of values
-	leaq	(BO, BI, SIZE), BO
-	salq	$2, %rax		// rax = rax * 4 ; number of values
-	leaq	(AO, %rax, SIZE), AO
-#endif
-
-
-	vzeroall
-
-#ifndef TRMMKERNEL
-	movq	K, %rax
-#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
-	movq	K, %rax
-	subq	KK, %rax
-	movq	%rax, KKK
-#else
-	movq	KK, %rax
-#ifdef LEFT
-	addq	$4, %rax		// number of values in A
-#else
-	addq	$2, %rax		// number of values in BO
-#endif
-	movq	%rax, KKK
-#endif
-
-
-	andq	$-8, %rax
-	je	.L2_26
-	movq	%rax, BI		// Index for BO
-	leaq	(BI,BI,1), BI		// BI = BI * 2 ; number of values
-
-	salq	$2, %rax		// rax = rax * 4 ; number of values
-	leaq	(AO,
%rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_22: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_3 - KERNEL4x2_4 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_3 - KERNEL4x2_4 - - je .L2_26 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_3 - KERNEL4x2_4 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_3 - KERNEL4x2_4 - - je .L2_26 - - jmp .L2_22 - ALIGN_4 - -.L2_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_27: - - KERNEL4x2_SUB - - jl .L2_27 - ALIGN_4 - - -.L2_29: - - SAVE4x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_36 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_32: - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_3 - KERNEL2x2_4 - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_3 - KERNEL2x2_4 - - je .L2_36 - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_3 - KERNEL2x2_4 - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_3 - KERNEL2x2_4 - - je .L2_36 - - jmp .L2_32 - ALIGN_4 - -.L2_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB - - jl .L2_37 - ALIGN_4 - - -.L2_39: - - SAVE2x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, 
SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L2_46 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_3 - KERNEL1x2_4 - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_3 - KERNEL1x2_4 - - je .L2_46 - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_3 - KERNEL1x2_4 - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_3 - KERNEL1x2_4 - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB - - jl .L2_47 - ALIGN_4 - - -.L2_49: - - SAVE1x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - - - -.L2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK -#endif - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $1*SIZE,BO1 - addq $1*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L1_20 - - ALIGN_4 - -.L1_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && 
!defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL16x1_1 - KERNEL16x1_2 - KERNEL16x1_3 - KERNEL16x1_4 - - KERNEL16x1_1 - KERNEL16x1_2 - KERNEL16x1_3 - KERNEL16x1_4 - - je .L1_16 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL16x1_1 - KERNEL16x1_2 - KERNEL16x1_3 - KERNEL16x1_4 - - KERNEL16x1_1 - KERNEL16x1_2 - KERNEL16x1_3 - KERNEL16x1_4 - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL16x1_SUB - - jl .L1_17 - ALIGN_4 - - -.L1_19: - - SAVE16x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L1_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $15, M - jz .L999 - - testq $8, M - jz .L1_21pre - ALIGN_4 - -/**************************************************************************/ - -.L1_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_20_6 - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_2: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x1_1 - KERNEL8x1_2 - KERNEL8x1_3 - KERNEL8x1_4 - - 
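/*
 * Editor's note (applies to all the unrolled loops in this file): AO and
 * BO have been advanced one packed panel past the data and %rax/BI are
 * negated, so they count from a negative value up toward zero; the kernel
 * macros bump the index, and the "je" after each unrolled group exits as
 * soon as it reaches zero, with no separate loop-counter register. A rough
 * C sketch of the idiom, with illustrative names:
 *
 *     // a_end points one past the packed panel; i counts -n .. 0
 *     for (long i = -n; i != 0; i++) {
 *         process(a_end + i);   // indexing back from the end, the add
 *     }                         // that advances i also sets the flags
 */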
KERNEL8x1_1 - KERNEL8x1_2 - KERNEL8x1_3 - KERNEL8x1_4 - - je .L1_20_6 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x1_1 - KERNEL8x1_2 - KERNEL8x1_3 - KERNEL8x1_4 - - KERNEL8x1_1 - KERNEL8x1_2 - KERNEL8x1_3 - KERNEL8x1_4 - - je .L1_20_6 - - jmp .L1_20_2 - ALIGN_4 - -.L1_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_20_9 - - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_7: - - KERNEL8x1_SUB - - jl .L1_20_7 - ALIGN_4 - - -.L1_20_9: - - SAVE8x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L1_21pre: - - testq $4, M - jz .L1_30 - ALIGN_4 - -.L1_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_26 - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_3 - KERNEL4x1_4 - - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_3 - KERNEL4x1_4 - - je .L1_26 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_3 - KERNEL4x1_4 - - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_3 - KERNEL4x1_4 - - je .L1_26 - - jmp .L1_22 - ALIGN_4 - -.L1_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_29 - - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - KERNEL4x1_SUB - - jl .L1_27 - ALIGN_4 - - -.L1_29: - - SAVE4x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && 
!defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_36 - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_32: - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_3 - KERNEL2x1_4 - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_3 - KERNEL2x1_4 - - je .L1_36 - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_3 - KERNEL2x1_4 - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_3 - KERNEL2x1_4 - - je .L1_36 - - jmp .L1_32 - ALIGN_4 - -.L1_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_39 - - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB - - jl .L1_37 - ALIGN_4 - - -.L1_39: - - SAVE2x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L1_46 - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_3 - KERNEL1x1_4 - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_3 - KERNEL1x1_4 - - je .L1_46 - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_3 - KERNEL1x1_4 - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_3 - KERNEL1x1_4 - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB - - jl .L1_47 - ALIGN_4 - - -.L1_49: - - SAVE1x1 - -#if 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - -.L999: - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - movups 64(%rsp), %xmm6 - movups 80(%rsp), %xmm7 - movups 96(%rsp), %xmm8 - movups 112(%rsp), %xmm9 - movups 128(%rsp), %xmm10 - movups 144(%rsp), %xmm11 - movups 160(%rsp), %xmm12 - movups 176(%rsp), %xmm13 - movups 192(%rsp), %xmm14 - movups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - - - - -#endif diff --git a/kernel/x86/sgemm_kernel_16x4_haswell.S b/kernel/x86/sgemm_kernel_16x4_haswell.S deleted file mode 100644 index 9c0334b23..000000000 --- a/kernel/x86/sgemm_kernel_16x4_haswell.S +++ /dev/null @@ -1,3167 +0,0 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. 
*/ -/*********************************************************************/ - -/********************************************************************* -* 2013/08/15 Saar -* Parameter: -* SGEMM_DEFAULT_UNROLL_N 4 -* SGEMM_DEFAULT_UNROLL_M 16 -* SGEMM_DEFAULT_P 768 -* SGEMM_DEFAULT_Q 168 -* -* BLASTEST: OK -* -* Performance: -* 1 thread: 2.22 times faster than sandybridge -* 4 threads: 2.26 times faster than sandybridge -* -* Compile for FMA3: OK -* -*********************************************************************/ - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define BI %rbp -#define SP %rbx - -#define BO1 %rdi -#define CO2 %rdx - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 - -#else - -#define STACKSIZE 256 - -#define OLD_A 40 + STACKSIZE(%rsp) -#define OLD_B 48 + STACKSIZE(%rsp) -#define OLD_C 56 + STACKSIZE(%rsp) -#define OLD_LDC 64 + STACKSIZE(%rsp) -#define OLD_OFFSET 72 + STACKSIZE(%rsp) - -#endif - -#define L_BUFFER_SIZE 512*8*4 -#define LB2_OFFSET 512*8*2 - -#define Ndiv6 24(%rsp) -#define Nmod6 32(%rsp) -#define N 40(%rsp) -#define ALPHA 48(%rsp) -#define OFFSET 56(%rsp) -#define KK 64(%rsp) -#define KKK 72(%rsp) -#define BUFFER1 128(%rsp) -#define BUFFER2 LB2_OFFSET+128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $0, 4096 * 4(%rsp);\ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - -#if defined(BULLDOZER) || defined(PILEDRIVER) - -.macro VFMADD231PS_ y0,y1,y2 - vfmaddps \y0,\y1,\y2,\y0 -.endm - -.macro VFMADD231SS_ x0,x1,x2 - vfmaddss \x0,\x1,\x2,\x0 -.endm - -#else - -.macro VFMADD231PS_ y0,y1,y2 - vfmadd231ps \y0,\y1,\y2 -.endm - -.macro VFMADD231SS_ x0,x1,x2 - vfmadd231ss \x0,\x1,\x2 -.endm - -#endif - - -#define A_PR1 384 -#define B_PR1 192 - -/******************************************************************************************* -* 4 lines of N -*******************************************************************************************/ - -.macro KERNEL16x4_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 - vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PS_ %ymm4,%ymm2,%ymm0 - VFMADD231PS_ %ymm5,%ymm2,%ymm1 - VFMADD231PS_ %ymm6,%ymm3,%ymm0 - VFMADD231PS_ %ymm7,%ymm3,%ymm1 - vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2 - vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PS_ %ymm8,%ymm2,%ymm0 - VFMADD231PS_ %ymm9,%ymm2,%ymm1 - VFMADD231PS_ %ymm10,%ymm3,%ymm0 - VFMADD231PS_ %ymm11,%ymm3,%ymm1 - addq $4 , BI - addq $16, %rax -.endm - -.macro SAVE16x4 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm5 , %ymm5 - vmulps %ymm0 , %ymm6 , %ymm6 - vmulps %ymm0 , %ymm7 , %ymm7 - vmulps %ymm0 , %ymm8 , %ymm8 - vmulps %ymm0 , %ymm9 , %ymm9 - vmulps %ymm0 , %ymm10, %ymm10 - vmulps %ymm0 , %ymm11, %ymm11 - - -#if !defined(TRMMKERNEL) - - vaddps 
(CO1), %ymm4,%ymm4 - vaddps 8 * SIZE(CO1), %ymm5,%ymm5 - - vaddps (CO1, LDC), %ymm6,%ymm6 - vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7 - - vaddps (CO2), %ymm8,%ymm8 - vaddps 8 * SIZE(CO2), %ymm9,%ymm9 - - vaddps (CO2, LDC), %ymm10,%ymm10 - vaddps 8 * SIZE(CO2, LDC), %ymm11,%ymm11 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , 8 * SIZE(CO1) - - vmovups %ymm6 , (CO1, LDC) - vmovups %ymm7 , 8 * SIZE(CO1, LDC) - - vmovups %ymm8 , (CO2) - vmovups %ymm9 , 8 * SIZE(CO2) - - vmovups %ymm10, (CO2, LDC) - vmovups %ymm11, 8 * SIZE(CO2, LDC) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL8x4_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 - vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PS_ %ymm4,%ymm2,%ymm0 - VFMADD231PS_ %ymm6,%ymm3,%ymm0 - vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2 - vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PS_ %ymm8,%ymm2,%ymm0 - VFMADD231PS_ %ymm10,%ymm3,%ymm0 - addq $4 , BI - addq $8 , %rax -.endm - -.macro SAVE8x4 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm6 , %ymm6 - vmulps %ymm0 , %ymm8 , %ymm8 - vmulps %ymm0 , %ymm10, %ymm10 - - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - vaddps (CO1, LDC), %ymm6,%ymm6 - vaddps (CO2), %ymm8,%ymm8 - vaddps (CO2, LDC), %ymm10,%ymm10 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm6 , (CO1, LDC) - vmovups %ymm8 , (CO2) - vmovups %ymm10, (CO2, LDC) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL4x4_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231PS_ %xmm4,%xmm2,%xmm0 - VFMADD231PS_ %xmm6,%xmm3,%xmm0 - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231PS_ %xmm8,%xmm2,%xmm0 - VFMADD231PS_ %xmm10,%xmm3,%xmm0 - addq $4 , BI - addq $4 , %rax -.endm - -.macro SAVE4x4 - - vbroadcastss ALPHA, %xmm0 - - vmulps %xmm0 , %xmm4 , %xmm4 - vmulps %xmm0 , %xmm6 , %xmm6 - vmulps %xmm0 , %xmm8 , %xmm8 - vmulps %xmm0 , %xmm10, %xmm10 - - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %xmm4,%xmm4 - vaddps (CO1, LDC), %xmm6,%xmm6 - vaddps (CO2), %xmm8,%xmm8 - vaddps (CO2, LDC), %xmm10,%xmm10 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm6 , (CO1, LDC) - vmovups %xmm8 , (CO2) - vmovups %xmm10, (CO2, LDC) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL2x4_SUB - vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 - vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 - vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SS_ %xmm4,%xmm2,%xmm0 - VFMADD231SS_ %xmm5,%xmm2,%xmm1 - VFMADD231SS_ %xmm6,%xmm3,%xmm0 - VFMADD231SS_ %xmm7,%xmm3,%xmm1 - vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 - vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SS_ %xmm8,%xmm2,%xmm0 - VFMADD231SS_ %xmm9,%xmm2,%xmm1 - VFMADD231SS_ %xmm10,%xmm3,%xmm0 - VFMADD231SS_ %xmm11,%xmm3,%xmm1 - addq $4 , BI - addq $2, %rax -.endm - -.macro SAVE2x4 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm5 , %xmm5 - vmulss %xmm0 , %xmm6 , %xmm6 - vmulss %xmm0 , %xmm7 , %xmm7 - vmulss %xmm0 , %xmm8 , %xmm8 - vmulss %xmm0 , %xmm9 , %xmm9 - vmulss %xmm0 , %xmm10, %xmm10 - vmulss %xmm0 , %xmm11, %xmm11 - - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %xmm4,%xmm4 - vaddps 1 * 
SIZE(CO1), %xmm5,%xmm5 - - vaddps (CO1, LDC), %xmm6,%xmm6 - vaddps 1 * SIZE(CO1, LDC), %xmm7,%xmm7 - - vaddps (CO2), %xmm8,%xmm8 - vaddps 1 * SIZE(CO2), %xmm9,%xmm9 - - vaddps (CO2, LDC), %xmm10,%xmm10 - vaddps 1 * SIZE(CO2, LDC), %xmm11,%xmm11 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , 1 * SIZE(CO1) - - vmovss %xmm6 , (CO1, LDC) - vmovss %xmm7 , 1 * SIZE(CO1, LDC) - - vmovss %xmm8 , (CO2) - vmovss %xmm9 , 1 * SIZE(CO2) - - vmovss %xmm10, (CO2, LDC) - vmovss %xmm11, 1 * SIZE(CO2, LDC) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL1x4_SUB - vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 - vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SS_ %xmm4,%xmm2,%xmm0 - VFMADD231SS_ %xmm6,%xmm3,%xmm0 - vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 - vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SS_ %xmm8,%xmm2,%xmm0 - VFMADD231SS_ %xmm10,%xmm3,%xmm0 - addq $4 , BI - addq $1, %rax -.endm - -.macro SAVE1x4 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm6 , %xmm6 - vmulss %xmm0 , %xmm8 , %xmm8 - vmulss %xmm0 , %xmm10, %xmm10 - - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %xmm4,%xmm4 - vaddps (CO1, LDC), %xmm6,%xmm6 - vaddps (CO2), %xmm8,%xmm8 - vaddps (CO2, LDC), %xmm10,%xmm10 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm6 , (CO1, LDC) - vmovss %xmm8 , (CO2) - vmovss %xmm10, (CO2, LDC) - -.endm - - -/*******************************************************************************************/ - -/******************************************************************************************* -* 2 lines of N -*******************************************************************************************/ - -.macro KERNEL16x2_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 - vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PS_ %ymm4,%ymm2,%ymm0 - VFMADD231PS_ %ymm5,%ymm2,%ymm1 - VFMADD231PS_ %ymm6,%ymm3,%ymm0 - VFMADD231PS_ %ymm7,%ymm3,%ymm1 - addq $2 , BI - addq $16, %rax -.endm - -.macro SAVE16x2 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm5 , %ymm5 - vmulps %ymm0 , %ymm6 , %ymm6 - vmulps %ymm0 , %ymm7 , %ymm7 - - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - vaddps 8 * SIZE(CO1), %ymm5,%ymm5 - - vaddps (CO1, LDC), %ymm6,%ymm6 - vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , 8 * SIZE(CO1) - - vmovups %ymm6 , (CO1, LDC) - vmovups %ymm7 , 8 * SIZE(CO1, LDC) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL8x2_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 - vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PS_ %ymm4,%ymm2,%ymm0 - VFMADD231PS_ %ymm6,%ymm3,%ymm0 - addq $2 , BI - addq $8 , %rax -.endm - -.macro SAVE8x2 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm6 , %ymm6 - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - vaddps (CO1, LDC), %ymm6,%ymm6 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm6 , (CO1, LDC) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL4x2_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231PS_ 
%xmm4,%xmm2,%xmm0 - VFMADD231PS_ %xmm6,%xmm3,%xmm0 - addq $2 , BI - addq $4 , %rax -.endm - -.macro SAVE4x2 - - vbroadcastss ALPHA, %xmm0 - - vmulps %xmm0 , %xmm4 , %xmm4 - vmulps %xmm0 , %xmm6 , %xmm6 - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %xmm4,%xmm4 - vaddps (CO1, LDC), %xmm6,%xmm6 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm6 , (CO1, LDC) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL2x2_SUB - vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 - vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 - vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SS_ %xmm4,%xmm2,%xmm0 - VFMADD231SS_ %xmm5,%xmm2,%xmm1 - VFMADD231SS_ %xmm6,%xmm3,%xmm0 - VFMADD231SS_ %xmm7,%xmm3,%xmm1 - addq $2 , BI - addq $2, %rax -.endm - -.macro SAVE2x2 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm5 , %xmm5 - vmulss %xmm0 , %xmm6 , %xmm6 - vmulss %xmm0 , %xmm7 , %xmm7 - - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %xmm4,%xmm4 - vaddps 1 * SIZE(CO1), %xmm5,%xmm5 - - vaddps (CO1, LDC), %xmm6,%xmm6 - vaddps 1 * SIZE(CO1, LDC), %xmm7,%xmm7 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , 1 * SIZE(CO1) - - vmovss %xmm6 , (CO1, LDC) - vmovss %xmm7 , 1 * SIZE(CO1, LDC) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL1x2_SUB - vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 - vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SS_ %xmm4,%xmm2,%xmm0 - VFMADD231SS_ %xmm6,%xmm3,%xmm0 - addq $2 , BI - addq $1, %rax -.endm - -.macro SAVE1x2 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm6 , %xmm6 - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %xmm4,%xmm4 - vaddps (CO1, LDC), %xmm6,%xmm6 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm6 , (CO1, LDC) - -.endm - - -/*******************************************************************************************/ - -/******************************************************************************************* -* 1 line of N -*******************************************************************************************/ - -.macro KERNEL16x1_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PS_ %ymm4,%ymm2,%ymm0 - VFMADD231PS_ %ymm5,%ymm2,%ymm1 - addq $1 , BI - addq $16, %rax -.endm - -.macro SAVE16x1 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm5 , %ymm5 - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - vaddps 8 * SIZE(CO1), %ymm5,%ymm5 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , 8 * SIZE(CO1) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL8x1_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PS_ %ymm4,%ymm2,%ymm0 - addq $1 , BI - addq $8 , %rax -.endm - -.macro SAVE8x1 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - -#endif - - vmovups %ymm4 , (CO1) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL4x1_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231PS_ %xmm4,%xmm2,%xmm0 - addq $1 , BI - addq $4 , %rax -.endm - -.macro SAVE4x1 - - 
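/*
 * Editor's note: every SAVE*x* macro in this file follows one pattern:
 * scale the accumulators by ALPHA, add the existing C tile unless this is
 * the TRMM build (which overwrites C), then store back. A C sketch for an
 * m-element strip (illustrative names only):
 *
 *     for (int i = 0; i < m; i++) {
 *     #ifndef TRMMKERNEL
 *         C[i] = alpha * acc[i] + C[i];   // GEMM path: accumulate into C
 *     #else
 *         C[i] = alpha * acc[i];          // TRMM path: overwrite C
 *     #endif
 *     }
 */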
vbroadcastss ALPHA, %xmm0 - - vmulps %xmm0 , %xmm4 , %xmm4 - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %xmm4,%xmm4 - -#endif - - vmovups %xmm4 , (CO1) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL2x1_SUB - vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 - vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 - vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SS_ %xmm4,%xmm2,%xmm0 - VFMADD231SS_ %xmm5,%xmm2,%xmm1 - addq $1 , BI - addq $2, %rax -.endm - -.macro SAVE2x1 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm5 , %xmm5 - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %xmm4,%xmm4 - vaddps 1 * SIZE(CO1), %xmm5,%xmm5 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , 1 * SIZE(CO1) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL1x1_SUB - vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 - vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SS_ %xmm4,%xmm2,%xmm0 - addq $1 , BI - addq $1, %rax -.endm - -.macro SAVE1x1 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %xmm4,%xmm4 - -#endif - - vmovss %xmm4 , (CO1) - -.endm - - -/*******************************************************************************************/ - -/************************************************************************************* -* TRMM Kernel -*************************************************************************************/ - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - movups %xmm6, 64(%rsp) - movups %xmm7, 80(%rsp) - movups %xmm8, 96(%rsp) - movups %xmm9, 112(%rsp) - movups %xmm10, 128(%rsp) - movups %xmm11, 144(%rsp) - movups %xmm12, 160(%rsp) - movups %xmm13, 176(%rsp) - movups %xmm14, 192(%rsp) - movups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - vmovsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - movsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovss %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $4, %rdi - divq %rdi // N / 4 - movq %rax, Ndiv6 // N / 4 - movq %rdx, Nmod6 // N % 4 - - - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - - movq Ndiv6, J - cmpq $0, J - je .L2_0 - ALIGN_4 - -/*******************************************************************************************/ - -.L4_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $2, %rax // K / 4 - jz .L4_01b - ALIGN_4 - - -.L4_01a: - prefetcht0 512(BO1) - prefetchw 512(BO) - - vmovups (BO1), %xmm0 - vmovups 4*SIZE(BO1), %xmm1 - vmovups 8*SIZE(BO1), %xmm2 - vmovups 12*SIZE(BO1), %xmm3 - - vmovups %xmm0, (BO) - vmovups %xmm1, 4*SIZE(BO) - vmovups %xmm2, 8*SIZE(BO) - vmovups %xmm3,12*SIZE(BO) - - addq $16*SIZE,BO1 - addq $16*SIZE,BO - decq 
%rax - jnz .L4_01a - - -.L4_01b: - - movq K, %rax - andq $3, %rax // K % 4 - jz .L4_02d - ALIGN_4 - -.L4_02c: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $4*SIZE,BO1 - addq $4*SIZE,BO - decq %rax - jnz .L4_02c - -.L4_02d: - - movq BO1, B // next offset of B - -.L4_10: - movq C, CO1 - leaq (C, LDC, 2), CO2 - leaq (C, LDC, 4), C // c += 4 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L4_20 - - ALIGN_4 - -.L4_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L4_16 - movq %rax, BI // Index for BO - leaq (,BI,4) , BI // BI = BI * 4 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_12: - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - je .L4_16 - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - je .L4_16 - - jmp .L4_12 - ALIGN_4 - -.L4_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_19 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_17: - - KERNEL16x4_SUB - - jl .L4_17 - ALIGN_4 - - -.L4_19: - - SAVE16x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if 
defined(TRMMKERNEL) && defined(LEFT)
-	addq	$16, KK
-#endif
-
-	addq	$16 * SIZE, CO1		# coffset += 16
-	addq	$16 * SIZE, CO2		# coffset += 16
-	decq	I			# i --
-	jg	.L4_11
-	ALIGN_4
-
-/**************************************************************************
-* Rest of M
-***************************************************************************/
-.L4_20:
-	// Test rest of M
-
-	testq	$15, M
-	jz	.L4_60			// to next 4 lines of N
-
-	testq	$8, M
-	jz	.L4_21pre
-	ALIGN_4
-
-/**************************************************************************/
-
-.L4_20_1:
-#if !defined(TRMMKERNEL) || \
-	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
-	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
-	leaq	BUFFER1, BO		// first buffer to BO
-	addq	$4 * SIZE, BO
-#else
-	movq	KK, %rax
-	leaq	BUFFER1, BO		// first buffer to BO
-	addq	$4 * SIZE, BO
-	movq	%rax, BI		// Index for BO
-	leaq	(,BI, 4), BI		// BI = BI * 4 ; number of values
-	leaq	(BO, BI, SIZE), BO
-	salq	$3, %rax		// rax = rax * 8 ; number of values
-	leaq	(AO, %rax, SIZE), AO
-#endif
-
-
-	vzeroall
-
-#ifndef TRMMKERNEL
-	movq	K, %rax
-#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
-	movq	K, %rax
-	subq	KK, %rax
-	movq	%rax, KKK
-#else
-	movq	KK, %rax
-#ifdef LEFT
-	addq	$8, %rax		// number of values in A
-#else
-	addq	$4, %rax		// number of values in BO
-#endif
-	movq	%rax, KKK
-#endif
-
-
-	andq	$-8, %rax
-	je	.L4_20_6
-	movq	%rax, BI		// Index for BO
-	leaq	(,BI,4), BI		// BI = BI * 4 ; number of values
-
-	salq	$3, %rax		// rax = rax * 8 ; number of values
-	leaq	(AO, %rax, SIZE), AO
-	leaq	(BO, BI, SIZE), BO
-	negq	BI
-	negq	%rax
-	ALIGN_4
-
-.L4_20_2:
-
-	KERNEL8x4_SUB
-	KERNEL8x4_SUB
-	KERNEL8x4_SUB
-	KERNEL8x4_SUB
-
-	KERNEL8x4_SUB
-	KERNEL8x4_SUB
-	KERNEL8x4_SUB
-	KERNEL8x4_SUB
-
-	je	.L4_20_6
-
-	KERNEL8x4_SUB
-	KERNEL8x4_SUB
-	KERNEL8x4_SUB
-	KERNEL8x4_SUB
-
-	KERNEL8x4_SUB
-	KERNEL8x4_SUB
-	KERNEL8x4_SUB
-	KERNEL8x4_SUB
-
-	je	.L4_20_6
-
-	jmp	.L4_20_2
-	ALIGN_4
-
-.L4_20_6:
-#ifndef TRMMKERNEL
-	movq	K, %rax
-#else
-	movq	KKK, %rax
-#endif
-
-	andq	$7, %rax		# if (k & 7)
-	je	.L4_20_9
-
-	movq	%rax, BI		// Index for BO
-	leaq	(,BI,4), BI		// BI = BI * 4 ; number of values
-
-	salq	$3, %rax		// rax = rax * 8 ; number of values
-	leaq	(AO, %rax, SIZE), AO
-	leaq	(BO, BI, SIZE), BO
-	negq	BI
-	negq	%rax
-	ALIGN_4
-
-.L4_20_7:
-
-	KERNEL8x4_SUB
-
-	jl	.L4_20_7
-	ALIGN_4
-
-
-.L4_20_9:
-
-	SAVE8x4
-
-#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
-	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
-	movq	K, %rax
-	subq	KKK, %rax
-	movq	%rax, BI		// Index for BO
-	leaq	(,BI, 4), BI		// BI = BI * 4 ; number of values
-	leaq	(BO, BI, SIZE), BO
-	salq	$3, %rax		// rax = rax * 8 ; number of values
-	leaq	(AO, %rax, SIZE), AO
-#endif
-
-
-#if defined(TRMMKERNEL) && defined(LEFT)
-	addq	$8, KK
-#endif
-
-	addq	$8 * SIZE, CO1		# coffset += 8
-	addq	$8 * SIZE, CO2		# coffset += 8
-	ALIGN_4
-
-
-
-/**************************************************************************/
-
-.L4_21pre:
-
-	testq	$4, M
-	jz	.L4_30
-	ALIGN_4
-
-.L4_21:
-#if !defined(TRMMKERNEL) || \
-	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
-	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
-	leaq	BUFFER1, BO		// first buffer to BO
-	addq	$4 * SIZE, BO
-#else
-	movq	KK, %rax
-	leaq	BUFFER1, BO		// first buffer to BO
-	addq	$4 * SIZE, BO
-	movq	%rax, BI		// Index for BO
-	leaq	(,BI, 4), BI		// BI = BI * 4 ; number of values
-	leaq	(BO, BI, SIZE), BO
-	salq	$2, %rax		// rax = rax *
4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L4_26 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_22: - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - je .L4_26 - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - je .L4_26 - - jmp .L4_22 - ALIGN_4 - -.L4_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_29 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_27: - - KERNEL4x4_SUB - - jl .L4_27 - ALIGN_4 - - -.L4_29: - - SAVE4x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - addq $4 * SIZE, CO2 # coffset += 4 - ALIGN_4 - - -.L4_30: - testq $2, M - jz .L4_40 - - ALIGN_4 - -.L4_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L4_36 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_32: - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - je .L4_36 - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - je .L4_36 - - jmp .L4_32 - ALIGN_4 - -.L4_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_39 - - movq 
%rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_37: - - KERNEL2x4_SUB - - jl .L4_37 - ALIGN_4 - - -.L4_39: - - SAVE2x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - addq $2 * SIZE, CO2 # coffset += 2 - ALIGN_4 - -.L4_40: - testq $1, M - jz .L4_60 // to next 4 lines of N - - ALIGN_4 - -.L4_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L4_46 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_42: - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - je .L4_46 - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - je .L4_46 - - jmp .L4_42 - ALIGN_4 - -.L4_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_49 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_47: - - KERNEL1x4_SUB - - jl .L4_47 - ALIGN_4 - - -.L4_49: - - SAVE1x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - addq $1 * SIZE, CO2 # coffset += 1 - ALIGN_4 - - - - - -.L4_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $4, KK -#endif - - decq J // j -- - jg .L4_01 // next 4 lines of N - - - -/*******************************************************************************************/ -.L2_0: - - movq Nmod6, J - andq $3, J // j % 4 - je .L999 - - movq Nmod6, J - andq $2, J // j % 4 - je .L1_0 - -.L2_01: - - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $2, %rax // K / 4 - jz 
.L2_01b - ALIGN_4 - -.L2_01a: - - vmovsd (BO1), %xmm0 - vmovsd 2*SIZE(BO1), %xmm1 - vmovsd 4*SIZE(BO1), %xmm2 - vmovsd 6*SIZE(BO1), %xmm3 - - vmovsd %xmm0, (BO) - vmovsd %xmm1, 2*SIZE(BO) - vmovsd %xmm2, 4*SIZE(BO) - vmovsd %xmm3, 6*SIZE(BO) - - addq $8*SIZE,BO1 - addq $8*SIZE,BO - decq %rax - jnz .L2_01a - - -.L2_01b: - - movq K, %rax - andq $3, %rax // K % 4 - jz .L2_02d - ALIGN_4 - -.L2_02c: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L2_02c - -.L2_02d: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L2_20 - - ALIGN_4 - -.L2_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - je .L2_16 - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL16x2_SUB - - jl .L2_17 - ALIGN_4 - - -.L2_19: - - SAVE16x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $15, M - jz .L2_60 // to next 2 lines of N - - testq $8, M - jz .L2_21pre - ALIGN_4 - 
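
The tail handling that begins here follows a fixed pattern: once the 16-wide main loop over M is exhausted, the testq $15 / $8 / $4 / $2 / $1 chain selects at most one 8-, 4-, 2- and 1-wide block for the leftover rows. A minimal C sketch of that decomposition (illustrative stub names, not OpenBLAS source):

    /* Stubs standing in for the KERNEL16x2 ... KERNEL1x2 inner loops. */
    static void block16(void) {}
    static void block8(void)  {}
    static void block4(void)  {}
    static void block2(void)  {}
    static void block1(void)  {}

    static void m_loop(long m)
    {
        long i;

        for (i = m >> 4; i > 0; i--)    /* sarq $4, I : 16-wide main blocks */
            block16();
        if (m & 8)                      /* testq $8, M */
            block8();
        if (m & 4)                      /* testq $4, M */
            block4();
        if (m & 2)                      /* testq $2, M */
            block2();
        if (m & 1)                      /* testq $1, M */
            block1();
    }

Each narrower block reuses the same K loop and only changes how many rows of A it touches, which is why the surrounding bookkeeping (the BI scaling and the AO/BO advances) repeats with a different shift count at every width.
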
-/**************************************************************************/ - -.L2_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_20_6 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_2: - - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - je .L2_20_6 - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - je .L2_20_6 - - jmp .L2_20_2 - ALIGN_4 - -.L2_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_20_9 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_7: - - KERNEL8x2_SUB - - jl .L2_20_7 - ALIGN_4 - - -.L2_20_9: - - SAVE8x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L2_21pre: - - testq $4, M - jz .L2_30 - ALIGN_4 - -.L2_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_26 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 1 ; 
number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_22: - - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - je .L2_26 - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - je .L2_26 - - jmp .L2_22 - ALIGN_4 - -.L2_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_27: - - KERNEL4x2_SUB - - jl .L2_27 - ALIGN_4 - - -.L2_29: - - SAVE4x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_36 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_32: - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_36 - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_36 - - jmp .L2_32 - ALIGN_4 - -.L2_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB - - jl .L2_37 - ALIGN_4 - - -.L2_39: - - SAVE2x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; 
number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L2_46 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - je .L2_46 - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB - - jl .L2_47 - ALIGN_4 - - -.L2_49: - - SAVE1x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - - - -.L2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK -#endif - - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovss (BO1), %xmm0 - vmovss %xmm0, (BO) - addq $1*SIZE,BO1 - addq $1*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L1_20 - - ALIGN_4 - -.L1_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && 
!defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - - je .L1_16 - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL16x1_SUB - - jl .L1_17 - ALIGN_4 - - -.L1_19: - - SAVE16x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L1_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $15, M - jz .L999 - - testq $8, M - jz .L1_21pre - ALIGN_4 - -/**************************************************************************/ - -.L1_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_20_6 - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_2: - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - KERNEL8x1_SUB - KERNEL8x1_SUB - 
KERNEL8x1_SUB - KERNEL8x1_SUB - - je .L1_20_6 - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - je .L1_20_6 - - jmp .L1_20_2 - ALIGN_4 - -.L1_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_20_9 - - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_7: - - KERNEL8x1_SUB - - jl .L1_20_7 - ALIGN_4 - - -.L1_20_9: - - SAVE8x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L1_21pre: - - testq $4, M - jz .L1_30 - ALIGN_4 - -.L1_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_26 - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_26 - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_26 - - jmp .L1_22 - ALIGN_4 - -.L1_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_29 - - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - KERNEL4x1_SUB - - jl .L1_27 - ALIGN_4 - - -.L1_29: - - SAVE4x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first 
buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_36 - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_32: - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_36 - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_36 - - jmp .L1_32 - ALIGN_4 - -.L1_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_39 - - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB - - jl .L1_37 - ALIGN_4 - - -.L1_39: - - SAVE2x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L1_46 - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_46 - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB - - jl .L1_47 - ALIGN_4 - - -.L1_49: - - SAVE1x1 - -#if 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - -.L999: - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - movups 64(%rsp), %xmm6 - movups 80(%rsp), %xmm7 - movups 96(%rsp), %xmm8 - movups 112(%rsp), %xmm9 - movups 128(%rsp), %xmm10 - movups 144(%rsp), %xmm11 - movups 160(%rsp), %xmm12 - movups 176(%rsp), %xmm13 - movups 192(%rsp), %xmm14 - movups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - - - - From 82015beaefcc8c702393bb9d20d4552a694753f3 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Tue, 5 Nov 2013 19:31:22 +0100 Subject: [PATCH 043/127] added zgemm_ncopy_2_vfpv3.S and made assembler labels unique --- kernel/arm/KERNEL.ARMV7 | 2 +- kernel/arm/dgemm_kernel_4x4_vfpv3.S | 305 ++++++++++++---------------- kernel/arm/dgemm_ncopy_4_vfpv3.S | 87 ++++---- kernel/arm/zgemm_kernel_2x2_vfpv3.S | 150 +++++++------- kernel/arm/zgemm_ncopy_2_vfpv3.S | 254 +++++++++++++++++++++++ 5 files changed, 511 insertions(+), 287 deletions(-) create mode 100644 kernel/arm/zgemm_ncopy_2_vfpv3.S diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index ec692d5c2..e30261698 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -115,7 +115,7 @@ CGEMMONCOPYOBJ = cgemm_oncopy.o CGEMMOTCOPYOBJ = cgemm_otcopy.o ZGEMMKERNEL = zgemm_kernel_2x2_vfpv3.S -ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMONCOPY = zgemm_ncopy_2_vfpv3.S ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMONCOPYOBJ = zgemm_oncopy.o ZGEMMOTCOPYOBJ = zgemm_otcopy.o diff --git a/kernel/arm/dgemm_kernel_4x4_vfpv3.S b/kernel/arm/dgemm_kernel_4x4_vfpv3.S index dfe3e3634..7d83def94 100644 --- a/kernel/arm/dgemm_kernel_4x4_vfpv3.S +++ b/kernel/arm/dgemm_kernel_4x4_vfpv3.S @@ -77,7 +77,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define B [fp, #4 ] #define C [fp, #8 ] -#define OLD_LDC [fp, #12 ] +#define OLDdgemm_kernel_LDC [fp, #12 ] #define I r0 #define J r1 @@ -883,7 +883,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. sub r3, fp, #128 vstm r3, { d8 - d15} // store floating point registers - ldr r3, OLD_LDC + ldr r3, OLDdgemm_kernel_LDC lsl r3, r3, #3 // ldc = ldc * 8 str r3, LDC @@ -892,9 +892,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ldr J, N asrs J, J, #2 // J = J / 4 - ble _L2_BEGIN + ble dgemm_kernel_L2_BEGIN -_L4_BEGIN: +dgemm_kernel_L4_BEGIN: ldr CO1, C // CO1 = C ldr r4 , LDC @@ -908,21 +908,19 @@ _L4_BEGIN: -_L4_M4_BEGIN: +dgemm_kernel_L4_M4_BEGIN: ldr I, M asrs I, I, #2 // I = I / 4 - ble _L4_M2_BEGIN + ble dgemm_kernel_L4_M2_BEGIN -_L4_M4_20: +dgemm_kernel_L4_M4_20: mov BO, BC asrs L , K1, #3 // L = L / 8 - cmp L , #3 - blt _L4_M4_30 - .align 5 - + cmp L , #2 + blt dgemm_kernel_L4_M4_32 KERNEL4x4_I @@ -935,9 +933,11 @@ _L4_M4_20: KERNEL4x4_M1 KERNEL4x4_M2 - sub L, L, #2 + subs L, L, #2 + ble dgemm_kernel_L4_M4_22a + .align 5 -_L4_M4_22: +dgemm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 @@ -950,55 +950,26 @@ _L4_M4_22: KERNEL4x4_M2 subs L, L, #1 - bgt _L4_M4_22 + bgt dgemm_kernel_L4_M4_22 - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 +dgemm_kernel_L4_M4_22a: KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 - KERNEL4x4_E - - b _L4_M4_44 - - -_L4_M4_30: - tst L, #3 - ble _L4_M4_40 - - tst L, #2 - ble _L4_M4_32 - - KERNEL4x4_I KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_E - b _L4_M4_44 + b dgemm_kernel_L4_M4_44 -_L4_M4_32: +dgemm_kernel_L4_M4_32: tst L, #1 - ble _L4_M4_40 + ble dgemm_kernel_L4_M4_40 KERNEL4x4_I KERNEL4x4_M2 @@ -1010,54 +981,54 @@ _L4_M4_32: KERNEL4x4_M1 KERNEL4x4_E - b _L4_M4_44 + b dgemm_kernel_L4_M4_44 -_L4_M4_40: +dgemm_kernel_L4_M4_40: INIT4x4 -_L4_M4_44: +dgemm_kernel_L4_M4_44: ands L , K1, #7 // L = L % 8 - ble _L4_M4_100 + ble dgemm_kernel_L4_M4_100 -_L4_M4_46: +dgemm_kernel_L4_M4_46: KERNEL4x4_SUB subs L, L, #1 - bne _L4_M4_46 + bne dgemm_kernel_L4_M4_46 -_L4_M4_100: +dgemm_kernel_L4_M4_100: SAVE4x4 -_L4_M4_END: +dgemm_kernel_L4_M4_END: subs I, I, #1 - bne _L4_M4_20 + bne dgemm_kernel_L4_M4_20 -_L4_M2_BEGIN: +dgemm_kernel_L4_M2_BEGIN: ldr I, M tst I , #3 - ble _L4_END + ble dgemm_kernel_L4_END tst I, #2 // I = I / 2 - ble _L4_M1_BEGIN + ble dgemm_kernel_L4_M1_BEGIN -_L4_M2_20: +dgemm_kernel_L4_M2_20: INIT2x4 mov BO, BC asrs L , K1, #3 // L = L / 8 - ble _L4_M2_40 + ble dgemm_kernel_L4_M2_40 -_L4_M2_22: +dgemm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB @@ -1070,42 +1041,42 @@ _L4_M2_22: KERNEL2x4_SUB subs L, L, #1 - bgt _L4_M2_22 + bgt dgemm_kernel_L4_M2_22 -_L4_M2_40: +dgemm_kernel_L4_M2_40: ands L , K1, #7 // L = L % 8 - ble _L4_M2_100 + ble dgemm_kernel_L4_M2_100 -_L4_M2_42: +dgemm_kernel_L4_M2_42: KERNEL2x4_SUB subs L, L, #1 - bgt _L4_M2_42 + bgt dgemm_kernel_L4_M2_42 -_L4_M2_100: +dgemm_kernel_L4_M2_100: SAVE2x4 -_L4_M2_END: +dgemm_kernel_L4_M2_END: -_L4_M1_BEGIN: +dgemm_kernel_L4_M1_BEGIN: tst I, #1 // I = I % 2 - ble _L4_END + ble dgemm_kernel_L4_END -_L4_M1_20: +dgemm_kernel_L4_M1_20: INIT1x4 mov BO, BC asrs L , K1, #3 // L = L / 8 - ble _L4_M1_40 + ble dgemm_kernel_L4_M1_40 -_L4_M1_22: +dgemm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB @@ -1117,27 +1088,27 @@ _L4_M1_22: KERNEL1x4_SUB subs L, L, #1 - bgt _L4_M1_22 + bgt dgemm_kernel_L4_M1_22 -_L4_M1_40: +dgemm_kernel_L4_M1_40: ands L , K1, #7 // L = L % 8 - ble _L4_M1_100 + ble dgemm_kernel_L4_M1_100 -_L4_M1_42: +dgemm_kernel_L4_M1_42: KERNEL1x4_SUB subs L, L, #1 - bgt _L4_M1_42 + bgt dgemm_kernel_L4_M1_42 -_L4_M1_100: +dgemm_kernel_L4_M1_100: SAVE1x4 -_L4_END: +dgemm_kernel_L4_END: mov r3, BC mov r4, K1 @@ -1146,20 +1117,20 @@ _L4_END: mov BC, r3 subs J , #1 // j-- - bgt _L4_BEGIN + bgt dgemm_kernel_L4_BEGIN 
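
Every size variant in this file uses the same K-loop shape seen above: asrs L, K1, #3 drives a body of eight KERNEL*_SUB steps per iteration, and ands L, K1, #7 drains the remaining ranks one at a time. A hedged C sketch of that split (kernel_step is a stand-in for one *_SUB macro expansion, not OpenBLAS source):

    static void kernel_step(void) { /* one rank-1 update (a KERNEL*_SUB) */ }

    static void k_loop(long k)
    {
        long l;

        for (l = k >> 3; l > 0; l--) {  /* asrs L, K1, #3 : unrolled by 8 */
            kernel_step(); kernel_step(); kernel_step(); kernel_step();
            kernel_step(); kernel_step(); kernel_step(); kernel_step();
        }
        for (l = k & 7; l > 0; l--)     /* ands L, K1, #7 : scalar tail */
            kernel_step();
    }

Unrolling by eight amortizes the subs/bgt loop overhead across eight multiply-accumulate groups, while the tail loop keeps the kernel correct for any K.
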
/*********************************************************************************************/ -_L2_BEGIN: +dgemm_kernel_L2_BEGIN: ldr J , N tst J , #3 - ble _L999 + ble dgemm_kernel_L999 tst J , #2 - ble _L1_BEGIN + ble dgemm_kernel_L1_BEGIN ldr CO1, C // CO1 = C ldr r4 , LDC @@ -1168,28 +1139,25 @@ _L2_BEGIN: str r3 , C // store C ldr AO, A // AO = A - //pld [AO , #A_PRE-96] - //pld [AO , #A_PRE-64] - //pld [AO , #A_PRE-32] -_L2_M4_BEGIN: +dgemm_kernel_L2_M4_BEGIN: ldr I, M asrs I, I, #2 // I = I / 4 - ble _L2_M2_BEGIN + ble dgemm_kernel_L2_M2_BEGIN -_L2_M4_20: +dgemm_kernel_L2_M4_20: INIT4x2 mov BO, BC asrs L , K1, #3 // L = L / 8 - ble _L2_M4_40 + ble dgemm_kernel_L2_M4_40 .align 5 -_L2_M4_22: +dgemm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB @@ -1201,49 +1169,49 @@ _L2_M4_22: KERNEL4x2_SUB subs L, L, #1 - bgt _L2_M4_22 + bgt dgemm_kernel_L2_M4_22 -_L2_M4_40: +dgemm_kernel_L2_M4_40: ands L , K1, #7 // L = L % 8 - ble _L2_M4_100 + ble dgemm_kernel_L2_M4_100 -_L2_M4_42: +dgemm_kernel_L2_M4_42: KERNEL4x2_SUB subs L, L, #1 - bgt _L2_M4_42 + bgt dgemm_kernel_L2_M4_42 -_L2_M4_100: +dgemm_kernel_L2_M4_100: SAVE4x2 -_L2_M4_END: +dgemm_kernel_L2_M4_END: subs I, I, #1 - bgt _L2_M4_20 + bgt dgemm_kernel_L2_M4_20 -_L2_M2_BEGIN: +dgemm_kernel_L2_M2_BEGIN: ldr I, M tst I , #3 - ble _L2_END + ble dgemm_kernel_L2_END tst I, #2 // I = I / 2 - ble _L2_M1_BEGIN + ble dgemm_kernel_L2_M1_BEGIN -_L2_M2_20: +dgemm_kernel_L2_M2_20: INIT2x2 mov BO, BC asrs L , K1, #3 // L = L / 8 - ble _L2_M2_40 + ble dgemm_kernel_L2_M2_40 -_L2_M2_22: +dgemm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB @@ -1256,42 +1224,42 @@ _L2_M2_22: KERNEL2x2_SUB subs L, L, #1 - bgt _L2_M2_22 + bgt dgemm_kernel_L2_M2_22 -_L2_M2_40: +dgemm_kernel_L2_M2_40: ands L , K1, #7 // L = L % 8 - ble _L2_M2_100 + ble dgemm_kernel_L2_M2_100 -_L2_M2_42: +dgemm_kernel_L2_M2_42: KERNEL2x2_SUB subs L, L, #1 - bgt _L2_M2_42 + bgt dgemm_kernel_L2_M2_42 -_L2_M2_100: +dgemm_kernel_L2_M2_100: SAVE2x2 -_L2_M2_END: +dgemm_kernel_L2_M2_END: -_L2_M1_BEGIN: +dgemm_kernel_L2_M1_BEGIN: tst I, #1 // I = I % 2 - ble _L2_END + ble dgemm_kernel_L2_END -_L2_M1_20: +dgemm_kernel_L2_M1_20: INIT1x2 mov BO, BC asrs L , K1, #3 // L = L / 8 - ble _L2_M1_40 + ble dgemm_kernel_L2_M1_40 -_L2_M1_22: +dgemm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB @@ -1303,27 +1271,27 @@ _L2_M1_22: KERNEL1x2_SUB subs L, L, #1 - bgt _L2_M1_22 + bgt dgemm_kernel_L2_M1_22 -_L2_M1_40: +dgemm_kernel_L2_M1_40: ands L , K1, #7 // L = L % 8 - ble _L2_M1_100 + ble dgemm_kernel_L2_M1_100 -_L2_M1_42: +dgemm_kernel_L2_M1_42: KERNEL1x2_SUB subs L, L, #1 - bgt _L2_M1_42 + bgt dgemm_kernel_L2_M1_42 -_L2_M1_100: +dgemm_kernel_L2_M1_100: SAVE1x2 -_L2_END: +dgemm_kernel_L2_END: mov r3, BC mov r4, K1 @@ -1333,11 +1301,11 @@ _L2_END: /*********************************************************************************************/ -_L1_BEGIN: +dgemm_kernel_L1_BEGIN: ldr J , N tst J , #1 - ble _L999 + ble dgemm_kernel_L999 ldr CO1, C // CO1 = C @@ -1346,28 +1314,25 @@ _L1_BEGIN: str r3 , C // store C ldr AO, A // AO = A - //pld [AO , #A_PRE-96] - //pld [AO , #A_PRE-64] - //pld [AO , #A_PRE-32] -_L1_M4_BEGIN: +dgemm_kernel_L1_M4_BEGIN: ldr I, M asrs I, I, #2 // I = I / 4 - ble _L1_M2_BEGIN + ble dgemm_kernel_L1_M2_BEGIN -_L1_M4_20: +dgemm_kernel_L1_M4_20: INIT4x1 mov BO, BC asrs L , K1, #3 // L = L / 8 - ble _L1_M4_40 + ble dgemm_kernel_L1_M4_40 .align 5 -_L1_M4_22: +dgemm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB @@ -1379,49 +1344,49 @@ _L1_M4_22: KERNEL4x1_SUB subs L, L, 
#1 - bgt _L1_M4_22 + bgt dgemm_kernel_L1_M4_22 -_L1_M4_40: +dgemm_kernel_L1_M4_40: ands L , K1, #7 // L = L % 8 - ble _L1_M4_100 + ble dgemm_kernel_L1_M4_100 -_L1_M4_42: +dgemm_kernel_L1_M4_42: KERNEL4x1_SUB subs L, L, #1 - bgt _L1_M4_42 + bgt dgemm_kernel_L1_M4_42 -_L1_M4_100: +dgemm_kernel_L1_M4_100: SAVE4x1 -_L1_M4_END: +dgemm_kernel_L1_M4_END: subs I, I, #1 - bgt _L1_M4_20 + bgt dgemm_kernel_L1_M4_20 -_L1_M2_BEGIN: +dgemm_kernel_L1_M2_BEGIN: ldr I, M tst I , #3 - ble _L1_END + ble dgemm_kernel_L1_END tst I, #2 // I = I / 2 - ble _L1_M1_BEGIN + ble dgemm_kernel_L1_M1_BEGIN -_L1_M2_20: +dgemm_kernel_L1_M2_20: INIT2x1 mov BO, BC asrs L , K1, #3 // L = L / 8 - ble _L1_M2_40 + ble dgemm_kernel_L1_M2_40 -_L1_M2_22: +dgemm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB @@ -1434,42 +1399,42 @@ _L1_M2_22: KERNEL2x1_SUB subs L, L, #1 - bgt _L1_M2_22 + bgt dgemm_kernel_L1_M2_22 -_L1_M2_40: +dgemm_kernel_L1_M2_40: ands L , K1, #7 // L = L % 8 - ble _L1_M2_100 + ble dgemm_kernel_L1_M2_100 -_L1_M2_42: +dgemm_kernel_L1_M2_42: KERNEL2x1_SUB subs L, L, #1 - bgt _L1_M2_42 + bgt dgemm_kernel_L1_M2_42 -_L1_M2_100: +dgemm_kernel_L1_M2_100: SAVE2x1 -_L1_M2_END: +dgemm_kernel_L1_M2_END: -_L1_M1_BEGIN: +dgemm_kernel_L1_M1_BEGIN: tst I, #1 // I = I % 2 - ble _L1_END + ble dgemm_kernel_L1_END -_L1_M1_20: +dgemm_kernel_L1_M1_20: INIT1x1 mov BO, BC asrs L , K1, #3 // L = L / 8 - ble _L1_M1_40 + ble dgemm_kernel_L1_M1_40 -_L1_M1_22: +dgemm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB @@ -1481,30 +1446,30 @@ _L1_M1_22: KERNEL1x1_SUB subs L, L, #1 - bgt _L1_M1_22 + bgt dgemm_kernel_L1_M1_22 -_L1_M1_40: +dgemm_kernel_L1_M1_40: ands L , K1, #7 // L = L % 8 - ble _L1_M1_100 + ble dgemm_kernel_L1_M1_100 -_L1_M1_42: +dgemm_kernel_L1_M1_42: KERNEL1x1_SUB subs L, L, #1 - bgt _L1_M1_42 + bgt dgemm_kernel_L1_M1_42 -_L1_M1_100: +dgemm_kernel_L1_M1_100: SAVE1x1 -_L1_END: +dgemm_kernel_L1_END: -_L999: +dgemm_kernel_L999: sub r3, fp, #128 vldm r3, { d8 - d15} // restore floating point registers diff --git a/kernel/arm/dgemm_ncopy_4_vfpv3.S b/kernel/arm/dgemm_ncopy_4_vfpv3.S index bdb63bfdd..ad6692e50 100644 --- a/kernel/arm/dgemm_ncopy_4_vfpv3.S +++ b/kernel/arm/dgemm_ncopy_4_vfpv3.S @@ -26,10 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2013/10/11 Saar -* BLASTEST : xOK -* CTEST : xOK -* TEST : xOK +* 2013/11/05 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK * **************************************************************************************/ @@ -68,7 +68,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define I r3 #define J r12 -#define A_PRE 96 +#define A_PRE 256 /************************************************************************************** * Macro definitions @@ -76,6 +76,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY4x4 + pld [ AO1, #A_PRE ] + pld [ AO2, #A_PRE ] + pld [ AO3, #A_PRE ] + pld [ AO4, #A_PRE ] + fldd d0 , [ AO1, #0 ] fldd d1 , [ AO2, #0 ] fldd d2 , [ AO3, #0 ] @@ -199,12 +204,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ldr BO, B -_L4_BEGIN: +dgemm_ncopy_L4_BEGIN: asrs J, N, #2 // J = N / 4 - ble _L2_BEGIN + ble dgemm_ncopy_L2_BEGIN -_L4_M4_BEGIN: +dgemm_ncopy_L4_M4_BEGIN: mov AO1, A // AO1 = A ldr r4 , LDA @@ -214,47 +219,47 @@ _L4_M4_BEGIN: add A , AO4, r4 // A = A + 4 * LDA asrs I, M, #2 // I = M / 4 - ble _L4_M4_40 + ble dgemm_ncopy_L4_M4_40 -_L4_M4_20: +dgemm_ncopy_L4_M4_20: COPY4x4 subs I , I , #1 - bne _L4_M4_20 + bne dgemm_ncopy_L4_M4_20 -_L4_M4_40: +dgemm_ncopy_L4_M4_40: ands I, M , #3 - ble _L4_M4_END + ble dgemm_ncopy_L4_M4_END -_L4_M4_60: +dgemm_ncopy_L4_M4_60: COPY1x4 subs I , I , #1 - bne _L4_M4_60 + bne dgemm_ncopy_L4_M4_60 -_L4_M4_END: +dgemm_ncopy_L4_M4_END: subs J , J, #1 // j-- - bne _L4_M4_BEGIN + bne dgemm_ncopy_L4_M4_BEGIN /*********************************************************************************************/ -_L2_BEGIN: +dgemm_ncopy_L2_BEGIN: tst N, #3 - ble _L999 + ble dgemm_ncopy_L999 tst N, #2 - ble _L1_BEGIN + ble dgemm_ncopy_L1_BEGIN -_L2_M4_BEGIN: +dgemm_ncopy_L2_M4_BEGIN: mov AO1, A // AO1 = A ldr r4 , LDA @@ -262,75 +267,75 @@ _L2_M4_BEGIN: add A , AO2, r4 // A = A + 2 * LDA asrs I, M, #2 // I = M / 4 - ble _L2_M4_40 + ble dgemm_ncopy_L2_M4_40 -_L2_M4_20: +dgemm_ncopy_L2_M4_20: COPY4x2 subs I , I , #1 - bne _L2_M4_20 + bne dgemm_ncopy_L2_M4_20 -_L2_M4_40: +dgemm_ncopy_L2_M4_40: ands I, M , #3 - ble _L2_M4_END + ble dgemm_ncopy_L2_M4_END -_L2_M4_60: +dgemm_ncopy_L2_M4_60: COPY1x2 subs I , I , #1 - bne _L2_M4_60 + bne dgemm_ncopy_L2_M4_60 -_L2_M4_END: +dgemm_ncopy_L2_M4_END: /*********************************************************************************************/ -_L1_BEGIN: +dgemm_ncopy_L1_BEGIN: tst N, #1 - ble _L999 + ble dgemm_ncopy_L999 -_L1_M4_BEGIN: +dgemm_ncopy_L1_M4_BEGIN: mov AO1, A // AO1 = A ldr r4 , LDA add A , AO1, r4 // A = A + 1 * LDA asrs I, M, #2 // I = M / 4 - ble _L1_M4_40 + ble dgemm_ncopy_L1_M4_40 -_L1_M4_20: +dgemm_ncopy_L1_M4_20: COPY4x1 subs I , I , #1 - bne _L1_M4_20 + bne dgemm_ncopy_L1_M4_20 -_L1_M4_40: +dgemm_ncopy_L1_M4_40: ands I, M , #3 - ble _L1_M4_END + ble dgemm_ncopy_L1_M4_END -_L1_M4_60: +dgemm_ncopy_L1_M4_60: COPY1x1 subs I , I , #1 - bne _L1_M4_60 + bne dgemm_ncopy_L1_M4_60 -_L1_M4_END: +dgemm_ncopy_L1_M4_END: -_L999: +dgemm_ncopy_L999: sub r3, fp, #128 vldm r3, { d8 - d15} // restore floating point registers diff --git a/kernel/arm/zgemm_kernel_2x2_vfpv3.S b/kernel/arm/zgemm_kernel_2x2_vfpv3.S index 9c14aec10..2d35028a2 100644 --- a/kernel/arm/zgemm_kernel_2x2_vfpv3.S +++ b/kernel/arm/zgemm_kernel_2x2_vfpv3.S @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2013/11/02 Saar +* 2013/11/05 Saar * BLASTEST : OK * CTEST : OK * TEST : OK @@ -924,9 +924,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ldr J, N asrs J, J, #1 // J = J / 2 - ble _L1_BEGIN + ble zgemm_kernel_L1_BEGIN -_L2_BEGIN: +zgemm_kernel_L2_BEGIN: ldr CO1, C // CO1 = C ldr r4 , LDC @@ -940,19 +940,19 @@ _L2_BEGIN: -_L2_M2_BEGIN: +zgemm_kernel_L2_M2_BEGIN: ldr I, M asrs I, I, #1 // I = I / 2 - ble _L2_M1_BEGIN + ble zgemm_kernel_L2_M1_BEGIN -_L2_M2_20: +zgemm_kernel_L2_M2_20: mov BO, BC asrs L , K1, #3 // L = L / 8 cmp L , #3 - blt _L2_M2_30 + blt zgemm_kernel_L2_M2_30 .align 5 @@ -969,7 +969,7 @@ _L2_M2_20: sub L, L, #2 -_L2_M2_22: +zgemm_kernel_L2_M2_22: KERNEL2x2_M1 KERNEL2x2_M2 @@ -982,7 +982,7 @@ _L2_M2_22: KERNEL2x2_M2 subs L, L, #1 - bgt _L2_M2_22 + bgt zgemm_kernel_L2_M2_22 KERNEL2x2_M1 KERNEL2x2_M2 @@ -994,15 +994,15 @@ _L2_M2_22: KERNEL2x2_M1 KERNEL2x2_E - b _L2_M2_44 + b zgemm_kernel_L2_M2_44 -_L2_M2_30: +zgemm_kernel_L2_M2_30: tst L, #3 - ble _L2_M2_40 + ble zgemm_kernel_L2_M2_40 tst L, #2 - ble _L2_M2_32 + ble zgemm_kernel_L2_M2_32 KERNEL2x2_I KERNEL2x2_M2 @@ -1025,12 +1025,12 @@ _L2_M2_30: KERNEL2x2_M1 KERNEL2x2_E - b _L2_M2_44 + b zgemm_kernel_L2_M2_44 -_L2_M2_32: +zgemm_kernel_L2_M2_32: tst L, #1 - ble _L2_M2_40 + ble zgemm_kernel_L2_M2_40 KERNEL2x2_I KERNEL2x2_M2 @@ -1042,51 +1042,51 @@ _L2_M2_32: KERNEL2x2_M1 KERNEL2x2_E - b _L2_M2_44 + b zgemm_kernel_L2_M2_44 -_L2_M2_40: +zgemm_kernel_L2_M2_40: INIT2x2 -_L2_M2_44: +zgemm_kernel_L2_M2_44: ands L , K1, #7 // L = L % 8 - ble _L2_M2_100 + ble zgemm_kernel_L2_M2_100 -_L2_M2_46: +zgemm_kernel_L2_M2_46: KERNEL2x2_SUB subs L, L, #1 - bne _L2_M2_46 + bne zgemm_kernel_L2_M2_46 -_L2_M2_100: +zgemm_kernel_L2_M2_100: SAVE2x2 -_L2_M2_END: +zgemm_kernel_L2_M2_END: subs I, I, #1 - bne _L2_M2_20 + bne zgemm_kernel_L2_M2_20 -_L2_M1_BEGIN: +zgemm_kernel_L2_M1_BEGIN: ldr I, M tst I, #1 // I = I % 2 - ble _L2_END + ble zgemm_kernel_L2_END -_L2_M1_20: +zgemm_kernel_L2_M1_20: INIT1x2 mov BO, BC asrs L , K1, #3 // L = L / 8 - ble _L2_M1_40 + ble zgemm_kernel_L2_M1_40 -_L2_M1_22: +zgemm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB @@ -1099,27 +1099,27 @@ _L2_M1_22: KERNEL1x2_SUB subs L, L, #1 - bgt _L2_M1_22 + bgt zgemm_kernel_L2_M1_22 -_L2_M1_40: +zgemm_kernel_L2_M1_40: ands L , K1, #7 // L = L % 8 - ble _L2_M1_100 + ble zgemm_kernel_L2_M1_100 -_L2_M1_42: +zgemm_kernel_L2_M1_42: KERNEL1x2_SUB subs L, L, #1 - bgt _L2_M1_42 + bgt zgemm_kernel_L2_M1_42 -_L2_M1_100: +zgemm_kernel_L2_M1_100: SAVE1x2 -_L2_END: +zgemm_kernel_L2_END: mov r3, BC mov r4, K1 @@ -1128,17 +1128,17 @@ _L2_END: mov BC, r3 subs J , #1 // j-- - bgt _L2_BEGIN + bgt zgemm_kernel_L2_BEGIN /*********************************************************************************************/ -_L1_BEGIN: +zgemm_kernel_L1_BEGIN: ldr J , N tst J , #1 - ble _L999 + ble zgemm_kernel_L999 ldr CO1, C // CO1 = C @@ -1148,19 +1148,19 @@ _L1_BEGIN: ldr AO, A // AO = A -_L1_M2_BEGIN: +zgemm_kernel_L1_M2_BEGIN: ldr I, M asrs I, I, #1 // I = I / 2 - ble _L1_M1_BEGIN + ble zgemm_kernel_L1_M1_BEGIN -_L1_M2_20: +zgemm_kernel_L1_M2_20: mov BO, BC asrs L , K1, #3 // L = L / 8 cmp L , #3 - blt _L1_M2_30 + blt zgemm_kernel_L1_M2_30 .align 5 @@ -1177,7 +1177,7 @@ _L1_M2_20: sub L, L, #2 -_L1_M2_22: +zgemm_kernel_L1_M2_22: KERNEL2x1_M1 KERNEL2x1_M2 @@ -1190,7 +1190,7 @@ _L1_M2_22: KERNEL2x1_M2 subs L, L, #1 - bgt _L1_M2_22 + bgt zgemm_kernel_L1_M2_22 KERNEL2x1_M1 KERNEL2x1_M2 @@ -1202,15 +1202,15 @@ _L1_M2_22: KERNEL2x1_M1 KERNEL2x1_E - b _L1_M2_44 + b zgemm_kernel_L1_M2_44 -_L1_M2_30: +zgemm_kernel_L1_M2_30: tst L, #3 - ble _L1_M2_40 + ble zgemm_kernel_L1_M2_40 tst L, #2 - ble _L1_M2_32 + ble zgemm_kernel_L1_M2_32 KERNEL2x1_I KERNEL2x1_M2 
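
The KERNEL2x1_I / _M1 / _M2 / _E staging relabeled here is a software pipeline: _I issues the first loads and multiplies, _M1 and _M2 alternate so that each step's loads can overlap the previous step's arithmetic, and _E finishes from already-loaded operands without fetching more. Roughly, in C (the pair count is illustrative; the real prologue and steady-state lengths come from the cmp L, #3 / sub L, L, #2 setup above):

    /* Stage stubs: each stands for one KERNEL2x1_* macro expansion. */
    static void stage_I(void)  {}  /* prologue: first loads, first muls */
    static void stage_M1(void) {}  /* compute one step, load the next */
    static void stage_M2(void) {}  /* same, on the alternate register set */
    static void stage_E(void)  {}  /* epilogue: compute only, no loads */

    static void pipelined_loop(long pairs)  /* pairs >= 2, illustrative */
    {
        stage_I();
        stage_M2();
        while (--pairs > 0) {
            stage_M1();
            stage_M2();
        }
        stage_M1();
        stage_E();
    }

The plain *_SUB path under zgemm_kernel_L1_M2_44 handles the leftover K ranks (and short K counts), where filling and draining the pipe would cost more than it saves.
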
@@ -1233,12 +1233,12 @@ _L1_M2_30: KERNEL2x1_M1 KERNEL2x1_E - b _L1_M2_44 + b zgemm_kernel_L1_M2_44 -_L1_M2_32: +zgemm_kernel_L1_M2_32: tst L, #1 - ble _L1_M2_40 + ble zgemm_kernel_L1_M2_40 KERNEL2x1_I KERNEL2x1_M2 @@ -1250,51 +1250,51 @@ _L1_M2_32: KERNEL2x1_M1 KERNEL2x1_E - b _L1_M2_44 + b zgemm_kernel_L1_M2_44 -_L1_M2_40: +zgemm_kernel_L1_M2_40: INIT2x1 -_L1_M2_44: +zgemm_kernel_L1_M2_44: ands L , K1, #7 // L = L % 8 - ble _L1_M2_100 + ble zgemm_kernel_L1_M2_100 -_L1_M2_46: +zgemm_kernel_L1_M2_46: KERNEL2x1_SUB subs L, L, #1 - bne _L1_M2_46 + bne zgemm_kernel_L1_M2_46 -_L1_M2_100: +zgemm_kernel_L1_M2_100: SAVE2x1 -_L1_M2_END: +zgemm_kernel_L1_M2_END: subs I, I, #1 - bne _L1_M2_20 + bne zgemm_kernel_L1_M2_20 -_L1_M1_BEGIN: +zgemm_kernel_L1_M1_BEGIN: ldr I, M tst I, #1 // I = I % 2 - ble _L1_END + ble zgemm_kernel_L1_END -_L1_M1_20: +zgemm_kernel_L1_M1_20: INIT1x1 mov BO, BC asrs L , K1, #3 // L = L / 8 - ble _L1_M1_40 + ble zgemm_kernel_L1_M1_40 -_L1_M1_22: +zgemm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB @@ -1307,31 +1307,31 @@ _L1_M1_22: KERNEL1x1_SUB subs L, L, #1 - bgt _L1_M1_22 + bgt zgemm_kernel_L1_M1_22 -_L1_M1_40: +zgemm_kernel_L1_M1_40: ands L , K1, #7 // L = L % 8 - ble _L1_M1_100 + ble zgemm_kernel_L1_M1_100 -_L1_M1_42: +zgemm_kernel_L1_M1_42: KERNEL1x1_SUB subs L, L, #1 - bgt _L1_M1_42 + bgt zgemm_kernel_L1_M1_42 -_L1_M1_100: +zgemm_kernel_L1_M1_100: SAVE1x1 -_L1_END: +zgemm_kernel_L1_END: -_L999: +zgemm_kernel_L999: sub r3, fp, #128 vldm r3, { d8 - d15} // restore floating point registers diff --git a/kernel/arm/zgemm_ncopy_2_vfpv3.S b/kernel/arm/zgemm_ncopy_2_vfpv3.S new file mode 100644 index 000000000..5ff8ee299 --- /dev/null +++ b/kernel/arm/zgemm_ncopy_2_vfpv3.S @@ -0,0 +1,254 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/05 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_A r2 +#define OLD_LDA r3 + + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define LDA [fp, #-260 ] + +#define B [fp, #4 ] + +#define M r0 +#define N r1 +#define A r2 + +#define BO r5 + +#define AO1 r6 +#define AO2 r7 + +#define I r3 +#define J r12 + +#define A_PRE 256 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro COPY2x2 + + pld [ AO1, #A_PRE ] + pld [ AO2, #A_PRE ] + fldd d0 , [ AO1, #0 ] + fldd d1 , [ AO1, #8 ] + fldd d4 , [ AO1, #16 ] + fldd d5 , [ AO1, #24 ] + + fldd d2 , [ AO2, #0 ] + fldd d3 , [ AO2, #8 ] + add AO1, AO1, #32 + fldd d6 , [ AO2, #16 ] + fldd d7 , [ AO2, #24 ] + + fstmiad BO!, { d0 - d7 } + add AO2, AO2, #32 + +.endm + + +.macro COPY1x2 + + fldd d0 , [ AO1, #0 ] + fldd d1 , [ AO1, #8 ] + fldd d2 , [ AO2, #0 ] + fldd d3 , [ AO2, #8 ] + + add AO1, AO1, #16 + fstmiad BO!, { d0 - d3 } + add AO2, AO2, #16 + +.endm + +.macro COPY2x1 + + fldd d0 , [ AO1, #0 ] + fldd d1 , [ AO1, #8 ] + fldd d2 , [ AO1, #16 ] + fldd d3 , [ AO1, #24 ] + + fstmiad BO!, { d0 - d3 } + add AO1, AO1, #32 + +.endm + + +.macro COPY1x1 + + fldd d0 , [ AO1, #0 ] + fldd d1 , [ AO1, #8 ] + + fstmiad BO!, { d0 - d1 } + add AO1, AO1, #16 + +.endm + + + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + + lsl r3, r3, #4 // lda = lda * 8 * 2 + str r3, LDA + + sub r4, fp, #128 + vstm r4, { d8 - d15} // store floating point registers + + ldr BO, B + +/*********************************************************************************************/ + +zgemm_ncopy_L2_BEGIN: + + asrs J, N, #1 // J = N / 2 + ble zgemm_ncopy_L1_BEGIN + +zgemm_ncopy_L2_M2_BEGIN: + + mov AO1, A // AO1 = A + ldr r4 , LDA + add AO2, AO1, r4 + add A , AO2, r4 // A = A + 2 * LDA + + asrs I, M, #1 // I = M / 2 + ble zgemm_ncopy_L2_M2_40 + +zgemm_ncopy_L2_M2_20: + + COPY2x2 + + subs I , I , #1 + bne zgemm_ncopy_L2_M2_20 + + +zgemm_ncopy_L2_M2_40: + + ands I, M , #1 + ble zgemm_ncopy_L2_M2_END + +zgemm_ncopy_L2_M2_60: + + COPY1x2 + + subs I , I , #1 + bne zgemm_ncopy_L2_M2_60 + + +zgemm_ncopy_L2_M2_END: + + subs J , J, #1 // j-- + bne zgemm_ncopy_L2_M2_BEGIN + + +/*********************************************************************************************/ + +zgemm_ncopy_L1_BEGIN: + + tst N, #1 + ble zgemm_ncopy_L999 + + +zgemm_ncopy_L1_M2_BEGIN: + + mov AO1, A // AO1 = A + ldr r4 , LDA + add A , AO1, r4 // A = A + 1 * LDA + + asrs I, M, #1 // I = M / 2 + ble zgemm_ncopy_L1_M2_40 + +zgemm_ncopy_L1_M2_20: + + COPY2x1 + + subs I , I , #1 + bne zgemm_ncopy_L1_M2_20 + + +zgemm_ncopy_L1_M2_40: + + ands I, M , #1 + ble 
zgemm_ncopy_L1_M2_END + +zgemm_ncopy_L1_M2_60: + + COPY1x1 + + subs I , I , #1 + bne zgemm_ncopy_L1_M2_60 + + +zgemm_ncopy_L1_M2_END: + + + +zgemm_ncopy_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + From ac50bccbd250a2222e7aa0222c383187886267be Mon Sep 17 00:00:00 2001 From: wernsaar Date: Tue, 5 Nov 2013 20:21:35 +0100 Subject: [PATCH 044/127] added cgemm_ncopy_2_vfpv3.S and made assembler labels unique --- kernel/arm/KERNEL.ARMV7 | 2 +- kernel/arm/cgemm_kernel_2x2_vfpv3.S | 150 ++++++++-------- kernel/arm/cgemm_ncopy_2_vfpv3.S | 258 +++++++++++++++++++++++++++ kernel/arm/sgemm_kernel_4x4_vfpv3.S | 262 ++++++++++++++-------------- kernel/arm/sgemm_ncopy_4_vfpv3.S | 78 ++++----- 5 files changed, 504 insertions(+), 246 deletions(-) create mode 100644 kernel/arm/cgemm_ncopy_2_vfpv3.S diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index e30261698..cdf370725 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -109,7 +109,7 @@ DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o CGEMMKERNEL = cgemm_kernel_2x2_vfpv3.S -CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMONCOPY = cgemm_ncopy_2_vfpv3.S CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c CGEMMONCOPYOBJ = cgemm_oncopy.o CGEMMOTCOPYOBJ = cgemm_otcopy.o diff --git a/kernel/arm/cgemm_kernel_2x2_vfpv3.S b/kernel/arm/cgemm_kernel_2x2_vfpv3.S index 4cebcab77..3aba68de8 100644 --- a/kernel/arm/cgemm_kernel_2x2_vfpv3.S +++ b/kernel/arm/cgemm_kernel_2x2_vfpv3.S @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2013/11/01 Saar +* 2013/11/05 Saar * BLASTEST : OK * CTEST : OK * TEST : OK @@ -888,9 +888,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ldr J, N asrs J, J, #1 // J = J / 2 - ble _L1_BEGIN + ble cgemm_kernel_L1_BEGIN -_L2_BEGIN: +cgemm_kernel_L2_BEGIN: ldr CO1, C // CO1 = C ldr r4 , LDC @@ -904,19 +904,19 @@ _L2_BEGIN: -_L2_M2_BEGIN: +cgemm_kernel_L2_M2_BEGIN: ldr I, M asrs I, I, #1 // I = I / 2 - ble _L2_M1_BEGIN + ble cgemm_kernel_L2_M1_BEGIN -_L2_M2_20: +cgemm_kernel_L2_M2_20: mov BO, BC asrs L , K1, #3 // L = L / 8 cmp L , #3 - blt _L2_M2_30 + blt cgemm_kernel_L2_M2_30 .align 5 @@ -933,7 +933,7 @@ _L2_M2_20: sub L, L, #2 -_L2_M2_22: +cgemm_kernel_L2_M2_22: KERNEL2x2_M1 KERNEL2x2_M2 @@ -946,7 +946,7 @@ _L2_M2_22: KERNEL2x2_M2 subs L, L, #1 - bgt _L2_M2_22 + bgt cgemm_kernel_L2_M2_22 KERNEL2x2_M1 KERNEL2x2_M2 @@ -958,15 +958,15 @@ _L2_M2_22: KERNEL2x2_M1 KERNEL2x2_E - b _L2_M2_44 + b cgemm_kernel_L2_M2_44 -_L2_M2_30: +cgemm_kernel_L2_M2_30: tst L, #3 - ble _L2_M2_40 + ble cgemm_kernel_L2_M2_40 tst L, #2 - ble _L2_M2_32 + ble cgemm_kernel_L2_M2_32 KERNEL2x2_I KERNEL2x2_M2 @@ -989,12 +989,12 @@ _L2_M2_30: KERNEL2x2_M1 KERNEL2x2_E - b _L2_M2_44 + b cgemm_kernel_L2_M2_44 -_L2_M2_32: +cgemm_kernel_L2_M2_32: tst L, #1 - ble _L2_M2_40 + ble cgemm_kernel_L2_M2_40 KERNEL2x2_I KERNEL2x2_M2 @@ -1006,51 +1006,51 @@ _L2_M2_32: KERNEL2x2_M1 KERNEL2x2_E - b _L2_M2_44 + b cgemm_kernel_L2_M2_44 -_L2_M2_40: +cgemm_kernel_L2_M2_40: INIT2x2 -_L2_M2_44: +cgemm_kernel_L2_M2_44: ands L , K1, #7 // L = L % 8 - ble _L2_M2_100 + ble cgemm_kernel_L2_M2_100 -_L2_M2_46: +cgemm_kernel_L2_M2_46: KERNEL2x2_SUB subs L, L, #1 - bne _L2_M2_46 + bne cgemm_kernel_L2_M2_46 -_L2_M2_100: +cgemm_kernel_L2_M2_100: SAVE2x2 -_L2_M2_END: +cgemm_kernel_L2_M2_END: subs I, I, #1 - bne _L2_M2_20 + bne cgemm_kernel_L2_M2_20 -_L2_M1_BEGIN: +cgemm_kernel_L2_M1_BEGIN: ldr I, M tst I, #1 // I = I % 2 - ble _L2_END + ble cgemm_kernel_L2_END -_L2_M1_20: +cgemm_kernel_L2_M1_20: INIT1x2 mov BO, BC asrs L , K1, #3 // L = L / 8 - ble _L2_M1_40 + ble cgemm_kernel_L2_M1_40 -_L2_M1_22: +cgemm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB @@ -1063,27 +1063,27 @@ _L2_M1_22: KERNEL1x2_SUB subs L, L, #1 - bgt _L2_M1_22 + bgt cgemm_kernel_L2_M1_22 -_L2_M1_40: +cgemm_kernel_L2_M1_40: ands L , K1, #7 // L = L % 8 - ble _L2_M1_100 + ble cgemm_kernel_L2_M1_100 -_L2_M1_42: +cgemm_kernel_L2_M1_42: KERNEL1x2_SUB subs L, L, #1 - bgt _L2_M1_42 + bgt cgemm_kernel_L2_M1_42 -_L2_M1_100: +cgemm_kernel_L2_M1_100: SAVE1x2 -_L2_END: +cgemm_kernel_L2_END: mov r3, BC mov r4, K1 @@ -1092,17 +1092,17 @@ _L2_END: mov BC, r3 subs J , #1 // j-- - bgt _L2_BEGIN + bgt cgemm_kernel_L2_BEGIN /*********************************************************************************************/ -_L1_BEGIN: +cgemm_kernel_L1_BEGIN: ldr J , N tst J , #1 - ble _L999 + ble cgemm_kernel_L999 ldr CO1, C // CO1 = C @@ -1112,19 +1112,19 @@ _L1_BEGIN: ldr AO, A // AO = A -_L1_M2_BEGIN: +cgemm_kernel_L1_M2_BEGIN: ldr I, M asrs I, I, #1 // I = I / 2 - ble _L1_M1_BEGIN + ble cgemm_kernel_L1_M1_BEGIN -_L1_M2_20: +cgemm_kernel_L1_M2_20: mov BO, BC asrs L , K1, #3 // L = L / 8 cmp L , #3 - blt _L1_M2_30 + blt cgemm_kernel_L1_M2_30 .align 5 @@ -1141,7 +1141,7 @@ _L1_M2_20: sub L, L, #2 -_L1_M2_22: +cgemm_kernel_L1_M2_22: KERNEL2x1_M1 KERNEL2x1_M2 @@ -1154,7 +1154,7 @@ _L1_M2_22: KERNEL2x1_M2 subs L, L, #1 - bgt _L1_M2_22 + bgt cgemm_kernel_L1_M2_22 KERNEL2x1_M1 KERNEL2x1_M2 @@ -1166,15 +1166,15 @@ _L1_M2_22: KERNEL2x1_M1 KERNEL2x1_E - b _L1_M2_44 + b cgemm_kernel_L1_M2_44 -_L1_M2_30: +cgemm_kernel_L1_M2_30: tst L, #3 - ble _L1_M2_40 + ble cgemm_kernel_L1_M2_40 tst L, #2 - ble _L1_M2_32 + ble cgemm_kernel_L1_M2_32 KERNEL2x1_I KERNEL2x1_M2 @@ 
-1197,12 +1197,12 @@ _L1_M2_30: KERNEL2x1_M1 KERNEL2x1_E - b _L1_M2_44 + b cgemm_kernel_L1_M2_44 -_L1_M2_32: +cgemm_kernel_L1_M2_32: tst L, #1 - ble _L1_M2_40 + ble cgemm_kernel_L1_M2_40 KERNEL2x1_I KERNEL2x1_M2 @@ -1214,51 +1214,51 @@ _L1_M2_32: KERNEL2x1_M1 KERNEL2x1_E - b _L1_M2_44 + b cgemm_kernel_L1_M2_44 -_L1_M2_40: +cgemm_kernel_L1_M2_40: INIT2x1 -_L1_M2_44: +cgemm_kernel_L1_M2_44: ands L , K1, #7 // L = L % 8 - ble _L1_M2_100 + ble cgemm_kernel_L1_M2_100 -_L1_M2_46: +cgemm_kernel_L1_M2_46: KERNEL2x1_SUB subs L, L, #1 - bne _L1_M2_46 + bne cgemm_kernel_L1_M2_46 -_L1_M2_100: +cgemm_kernel_L1_M2_100: SAVE2x1 -_L1_M2_END: +cgemm_kernel_L1_M2_END: subs I, I, #1 - bne _L1_M2_20 + bne cgemm_kernel_L1_M2_20 -_L1_M1_BEGIN: +cgemm_kernel_L1_M1_BEGIN: ldr I, M tst I, #1 // I = I % 2 - ble _L1_END + ble cgemm_kernel_L1_END -_L1_M1_20: +cgemm_kernel_L1_M1_20: INIT1x1 mov BO, BC asrs L , K1, #3 // L = L / 8 - ble _L1_M1_40 + ble cgemm_kernel_L1_M1_40 -_L1_M1_22: +cgemm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB @@ -1271,31 +1271,31 @@ _L1_M1_22: KERNEL1x1_SUB subs L, L, #1 - bgt _L1_M1_22 + bgt cgemm_kernel_L1_M1_22 -_L1_M1_40: +cgemm_kernel_L1_M1_40: ands L , K1, #7 // L = L % 8 - ble _L1_M1_100 + ble cgemm_kernel_L1_M1_100 -_L1_M1_42: +cgemm_kernel_L1_M1_42: KERNEL1x1_SUB subs L, L, #1 - bgt _L1_M1_42 + bgt cgemm_kernel_L1_M1_42 -_L1_M1_100: +cgemm_kernel_L1_M1_100: SAVE1x1 -_L1_END: +cgemm_kernel_L1_END: -_L999: +cgemm_kernel_L999: sub r3, fp, #128 vldm r3, { s8 - s31} // restore floating point registers diff --git a/kernel/arm/cgemm_ncopy_2_vfpv3.S b/kernel/arm/cgemm_ncopy_2_vfpv3.S new file mode 100644 index 000000000..08fbd5501 --- /dev/null +++ b/kernel/arm/cgemm_ncopy_2_vfpv3.S @@ -0,0 +1,258 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/05 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_A r2 +#define OLD_LDA r3 + + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define LDA [fp, #-260 ] + +#define B [fp, #4 ] + +#define M r0 +#define N r1 +#define A r2 + +#define BO r5 + +#define AO1 r6 +#define AO2 r7 + +#define I r3 +#define J r12 + +#define A_PRE 256 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro COPY2x2 + + flds s0 , [ AO1, #0 ] + flds s1 , [ AO1, #4 ] + flds s4 , [ AO1, #8 ] + flds s5 , [ AO1, #12 ] + + flds s2 , [ AO2, #0 ] + flds s3 , [ AO2, #4 ] + add AO1, AO1, #16 + flds s6 , [ AO2, #8 ] + flds s7 , [ AO2, #12 ] + + fstmias BO!, { s0 - s7 } + add AO2, AO2, #16 + +.endm + + +.macro COPY1x2 + + flds s0 , [ AO1, #0 ] + flds s1 , [ AO1, #4 ] + flds s2 , [ AO2, #0 ] + flds s3 , [ AO2, #4 ] + + add AO1, AO1, #8 + fstmias BO!, { s0 - s3 } + add AO2, AO2, #8 + +.endm + +.macro COPY2x1 + + flds s0 , [ AO1, #0 ] + flds s1 , [ AO1, #4 ] + flds s2 , [ AO1, #8 ] + flds s3 , [ AO1, #12 ] + + fstmias BO!, { s0 - s3 } + add AO1, AO1, #16 + +.endm + + +.macro COPY1x1 + + flds s0 , [ AO1, #0 ] + flds s1 , [ AO1, #4 ] + + fstmias BO!, { s0 - s1 } + add AO1, AO1, #8 + +.endm + + + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + + lsl r3, r3, #3 // lda = lda * 4 * 2 + str r3, LDA + + sub r4, fp, #128 + vstm r4, { s8 - s15} // store floating point registers + + ldr BO, B + +/*********************************************************************************************/ + +cgemm_ncopy_L2_BEGIN: + + asrs J, N, #1 // J = N / 2 + ble cgemm_ncopy_L1_BEGIN + +cgemm_ncopy_L2_M2_BEGIN: + + mov AO1, A // AO1 = A + ldr r4 , LDA + add AO2, AO1, r4 + add A , AO2, r4 // A = A + 2 * LDA + + asrs I, M, #1 // I = M / 2 + ble cgemm_ncopy_L2_M2_40 + +cgemm_ncopy_L2_M2_20: + + pld [ AO1, #A_PRE ] + pld [ AO2, #A_PRE ] + + COPY2x2 + subs I , I , #1 + ble cgemm_ncopy_L2_M2_40 + + COPY2x2 + subs I , I , #1 + bne cgemm_ncopy_L2_M2_20 + + +cgemm_ncopy_L2_M2_40: + + ands I, M , #1 + ble cgemm_ncopy_L2_M2_END + +cgemm_ncopy_L2_M2_60: + + COPY1x2 + + subs I , I , #1 + bne cgemm_ncopy_L2_M2_60 + + +cgemm_ncopy_L2_M2_END: + + subs J , J, #1 // j-- + bne cgemm_ncopy_L2_M2_BEGIN + + +/*********************************************************************************************/ + +cgemm_ncopy_L1_BEGIN: + + tst N, #1 + ble cgemm_ncopy_L999 + + +cgemm_ncopy_L1_M2_BEGIN: + + mov AO1, A // AO1 = A + ldr r4 , LDA + add A , AO1, r4 // A = A + 1 * LDA + + asrs I, M, #1 // I = M / 2 + ble cgemm_ncopy_L1_M2_40 + +cgemm_ncopy_L1_M2_20: + + COPY2x1 + + subs I , I , #1 + bne cgemm_ncopy_L1_M2_20 + + 
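Note (a sketch, not part of the patch): the COPY2x2/COPY1x2/COPY2x1/COPY1x1 macros and the loops above amount to packing pairs of columns of a single-precision complex matrix into a contiguous panel. A rough C equivalent, assuming column-major A with lda counted in complex elements; the function name is illustrative:

    /* Sketch of cgemm_ncopy_2: pack pairs of columns of the complex matrix A
     * into the buffer b so that each row contributes re/im of column j, then
     * re/im of column j+1, matching the s0-s7 store order of COPY2x2 above. */
    static void cgemm_ncopy_2_sketch(long m, long n, const float *a, long lda,
                                     float *b)
    {
        long i, j;
        for (j = 0; j + 1 < n; j += 2) {
            const float *a0 = a + 2 * j * lda;        /* column j   */
            const float *a1 = a0 + 2 * lda;           /* column j+1 */
            for (i = 0; i < m; i++) {
                *b++ = a0[2 * i];  *b++ = a0[2 * i + 1];
                *b++ = a1[2 * i];  *b++ = a1[2 * i + 1];
            }
        }
        if (n & 1) {                                  /* odd last column */
            const float *a0 = a + 2 * j * lda;
            for (i = 0; i < m; i++) {
                *b++ = a0[2 * i];  *b++ = a0[2 * i + 1];
            }
        }
    }

The odd-M row (COPY1x2) and the odd-N column (COPY2x1/COPY1x1) fall out of the same loop structure.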
+cgemm_ncopy_L1_M2_40: + + ands I, M , #1 + ble cgemm_ncopy_L1_M2_END + +cgemm_ncopy_L1_M2_60: + + COPY1x1 + + subs I , I , #1 + bne cgemm_ncopy_L1_M2_60 + + +cgemm_ncopy_L1_M2_END: + + + +cgemm_ncopy_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s15} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/sgemm_kernel_4x4_vfpv3.S b/kernel/arm/sgemm_kernel_4x4_vfpv3.S index 8bc3e5325..4031c28db 100644 --- a/kernel/arm/sgemm_kernel_4x4_vfpv3.S +++ b/kernel/arm/sgemm_kernel_4x4_vfpv3.S @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2013/11/02 Saar +* 2013/11/05 Saar * BLASTEST : OK * CTEST : OK * TEST : OK @@ -865,9 +865,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr J, N asrs J, J, #2 // J = J / 4 - ble _L2_BEGIN + ble sgemm_kernel_L2_BEGIN -_L4_BEGIN: +sgemm_kernel_L4_BEGIN: ldr CO1, C // CO1 = C ldr r4 , LDC @@ -881,19 +881,19 @@ _L4_BEGIN: -_L4_M4_BEGIN: +sgemm_kernel_L4_M4_BEGIN: ldr I, M asrs I, I, #2 // I = I / 4 - ble _L4_M2_BEGIN + ble sgemm_kernel_L4_M2_BEGIN -_L4_M4_20: +sgemm_kernel_L4_M4_20: mov BO, BC asrs L , K1, #1 // L = L / 8 cmp L , #2 - blt _L4_M4_32 + blt sgemm_kernel_L4_M4_32 @@ -901,81 +901,81 @@ _L4_M4_20: KERNEL4x4_M2 subs L, L, #2 - ble _L4_M4_22a + ble sgemm_kernel_L4_M4_22a .align 5 -_L4_M4_22: +sgemm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 subs L, L, #1 - bgt _L4_M4_22 + bgt sgemm_kernel_L4_M4_22 -_L4_M4_22a: +sgemm_kernel_L4_M4_22a: KERNEL4x4_M1 KERNEL4x4_E - b _L4_M4_44 + b sgemm_kernel_L4_M4_44 -_L4_M4_32: +sgemm_kernel_L4_M4_32: tst L, #1 - ble _L4_M4_40 + ble sgemm_kernel_L4_M4_40 KERNEL4x4_I KERNEL4x4_E - b _L4_M4_44 + b sgemm_kernel_L4_M4_44 -_L4_M4_40: +sgemm_kernel_L4_M4_40: INIT4x4 -_L4_M4_44: +sgemm_kernel_L4_M4_44: ands L , K1, #1 // L = L % 8 - ble _L4_M4_100 + ble sgemm_kernel_L4_M4_100 -_L4_M4_46: +sgemm_kernel_L4_M4_46: KERNEL4x4_SUB subs L, L, #1 - bne _L4_M4_46 + bne sgemm_kernel_L4_M4_46 -_L4_M4_100: +sgemm_kernel_L4_M4_100: SAVE4x4 -_L4_M4_END: +sgemm_kernel_L4_M4_END: subs I, I, #1 - bne _L4_M4_20 + bne sgemm_kernel_L4_M4_20 -_L4_M2_BEGIN: +sgemm_kernel_L4_M2_BEGIN: ldr I, M tst I , #3 - ble _L4_END + ble sgemm_kernel_L4_END tst I, #2 // I = I / 2 - ble _L4_M1_BEGIN + ble sgemm_kernel_L4_M1_BEGIN -_L4_M2_20: +sgemm_kernel_L4_M2_20: INIT2x4 mov BO, BC asrs L , K1, #3 // L = L / 8 - ble _L4_M2_40 + ble sgemm_kernel_L4_M2_40 -_L4_M2_22: +sgemm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB @@ -988,42 +988,42 @@ _L4_M2_22: KERNEL2x4_SUB subs L, L, #1 - bgt _L4_M2_22 + bgt sgemm_kernel_L4_M2_22 -_L4_M2_40: +sgemm_kernel_L4_M2_40: ands L , K1, #7 // L = L % 8 - ble _L4_M2_100 + ble sgemm_kernel_L4_M2_100 -_L4_M2_42: +sgemm_kernel_L4_M2_42: KERNEL2x4_SUB subs L, L, #1 - bgt _L4_M2_42 + bgt sgemm_kernel_L4_M2_42 -_L4_M2_100: +sgemm_kernel_L4_M2_100: SAVE2x4 -_L4_M2_END: +sgemm_kernel_L4_M2_END: -_L4_M1_BEGIN: +sgemm_kernel_L4_M1_BEGIN: tst I, #1 // I = I % 2 - ble _L4_END + ble sgemm_kernel_L4_END -_L4_M1_20: +sgemm_kernel_L4_M1_20: INIT1x4 mov BO, BC asrs L , K1, #3 // L = L / 8 - ble _L4_M1_40 + ble sgemm_kernel_L4_M1_40 -_L4_M1_22: +sgemm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB @@ -1035,27 +1035,27 @@ _L4_M1_22: KERNEL1x4_SUB subs L, L, #1 - bgt _L4_M1_22 + bgt sgemm_kernel_L4_M1_22 
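Note: the sgemm hunks above and below are a mechanical rename; every bare _L... branch target gains a sgemm_kernel_ prefix, presumably so each assembly file defines unique symbol names instead of the same _L2_BEGIN/_L999 in every kernel object. The control flow itself is untouched: split K into an unrolled main loop and a scalar tail, as the // L = L / 8 and // L = L % 8 comments indicate (the M4 section steps two ranks per iteration). A self-contained C sketch of that structure for one 4x4 tile, illustrative only; the real kernel keeps the sixteen accumulators in VFP registers:

    /* Structural sketch of the 4x4 micro-kernel's K loop: L = K/8 unrolled
     * passes of eight rank-1 updates each, then an L = K%8 scalar tail. */
    static void tile_4x4_sketch(int K, const float *a, const float *b,
                                float c[4][4])
    {
        int i, j, L;
        for (L = K >> 3; L > 0; L--)          /* asrs L, K1, #3          */
            for (int u = 0; u < 8; u++) {     /* KERNEL4x4_M1/M2 pairs   */
                for (i = 0; i < 4; i++)
                    for (j = 0; j < 4; j++)
                        c[j][i] += a[i] * b[j];
                a += 4; b += 4;               /* next rank of the panels */
            }
        for (L = K & 7; L > 0; L--) {         /* ands L, K1, #7          */
            for (i = 0; i < 4; i++)
                for (j = 0; j < 4; j++)
                    c[j][i] += a[i] * b[j];   /* KERNEL4x4_SUB           */
            a += 4; b += 4;
        }
    }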
-_L4_M1_40: +sgemm_kernel_L4_M1_40: ands L , K1, #7 // L = L % 8 - ble _L4_M1_100 + ble sgemm_kernel_L4_M1_100 -_L4_M1_42: +sgemm_kernel_L4_M1_42: KERNEL1x4_SUB subs L, L, #1 - bgt _L4_M1_42 + bgt sgemm_kernel_L4_M1_42 -_L4_M1_100: +sgemm_kernel_L4_M1_100: SAVE1x4 -_L4_END: +sgemm_kernel_L4_END: mov r3, BC mov r4, K1 @@ -1064,20 +1064,20 @@ _L4_END: mov BC, r3 subs J , #1 // j-- - bgt _L4_BEGIN + bgt sgemm_kernel_L4_BEGIN /*********************************************************************************************/ -_L2_BEGIN: +sgemm_kernel_L2_BEGIN: ldr J , N tst J , #3 - ble _L999 + ble sgemm_kernel_L999 tst J , #2 - ble _L1_BEGIN + ble sgemm_kernel_L1_BEGIN ldr CO1, C // CO1 = C ldr r4 , LDC @@ -1092,22 +1092,22 @@ _L2_BEGIN: -_L2_M4_BEGIN: +sgemm_kernel_L2_M4_BEGIN: ldr I, M asrs I, I, #2 // I = I / 4 - ble _L2_M2_BEGIN + ble sgemm_kernel_L2_M2_BEGIN -_L2_M4_20: +sgemm_kernel_L2_M4_20: INIT4x2 mov BO, BC asrs L , K1, #3 // L = L / 8 - ble _L2_M4_40 + ble sgemm_kernel_L2_M4_40 .align 5 -_L2_M4_22: +sgemm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB @@ -1119,49 +1119,49 @@ _L2_M4_22: KERNEL4x2_SUB subs L, L, #1 - bgt _L2_M4_22 + bgt sgemm_kernel_L2_M4_22 -_L2_M4_40: +sgemm_kernel_L2_M4_40: ands L , K1, #7 // L = L % 8 - ble _L2_M4_100 + ble sgemm_kernel_L2_M4_100 -_L2_M4_42: +sgemm_kernel_L2_M4_42: KERNEL4x2_SUB subs L, L, #1 - bgt _L2_M4_42 + bgt sgemm_kernel_L2_M4_42 -_L2_M4_100: +sgemm_kernel_L2_M4_100: SAVE4x2 -_L2_M4_END: +sgemm_kernel_L2_M4_END: subs I, I, #1 - bgt _L2_M4_20 + bgt sgemm_kernel_L2_M4_20 -_L2_M2_BEGIN: +sgemm_kernel_L2_M2_BEGIN: ldr I, M tst I , #3 - ble _L2_END + ble sgemm_kernel_L2_END tst I, #2 // I = I / 2 - ble _L2_M1_BEGIN + ble sgemm_kernel_L2_M1_BEGIN -_L2_M2_20: +sgemm_kernel_L2_M2_20: INIT2x2 mov BO, BC asrs L , K1, #3 // L = L / 8 - ble _L2_M2_40 + ble sgemm_kernel_L2_M2_40 -_L2_M2_22: +sgemm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB @@ -1174,42 +1174,42 @@ _L2_M2_22: KERNEL2x2_SUB subs L, L, #1 - bgt _L2_M2_22 + bgt sgemm_kernel_L2_M2_22 -_L2_M2_40: +sgemm_kernel_L2_M2_40: ands L , K1, #7 // L = L % 8 - ble _L2_M2_100 + ble sgemm_kernel_L2_M2_100 -_L2_M2_42: +sgemm_kernel_L2_M2_42: KERNEL2x2_SUB subs L, L, #1 - bgt _L2_M2_42 + bgt sgemm_kernel_L2_M2_42 -_L2_M2_100: +sgemm_kernel_L2_M2_100: SAVE2x2 -_L2_M2_END: +sgemm_kernel_L2_M2_END: -_L2_M1_BEGIN: +sgemm_kernel_L2_M1_BEGIN: tst I, #1 // I = I % 2 - ble _L2_END + ble sgemm_kernel_L2_END -_L2_M1_20: +sgemm_kernel_L2_M1_20: INIT1x2 mov BO, BC asrs L , K1, #3 // L = L / 8 - ble _L2_M1_40 + ble sgemm_kernel_L2_M1_40 -_L2_M1_22: +sgemm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB @@ -1221,27 +1221,27 @@ _L2_M1_22: KERNEL1x2_SUB subs L, L, #1 - bgt _L2_M1_22 + bgt sgemm_kernel_L2_M1_22 -_L2_M1_40: +sgemm_kernel_L2_M1_40: ands L , K1, #7 // L = L % 8 - ble _L2_M1_100 + ble sgemm_kernel_L2_M1_100 -_L2_M1_42: +sgemm_kernel_L2_M1_42: KERNEL1x2_SUB subs L, L, #1 - bgt _L2_M1_42 + bgt sgemm_kernel_L2_M1_42 -_L2_M1_100: +sgemm_kernel_L2_M1_100: SAVE1x2 -_L2_END: +sgemm_kernel_L2_END: mov r3, BC mov r4, K1 @@ -1251,11 +1251,11 @@ _L2_END: /*********************************************************************************************/ -_L1_BEGIN: +sgemm_kernel_L1_BEGIN: ldr J , N tst J , #1 - ble _L999 + ble sgemm_kernel_L999 ldr CO1, C // CO1 = C @@ -1270,22 +1270,22 @@ _L1_BEGIN: -_L1_M4_BEGIN: +sgemm_kernel_L1_M4_BEGIN: ldr I, M asrs I, I, #2 // I = I / 4 - ble _L1_M2_BEGIN + ble sgemm_kernel_L1_M2_BEGIN -_L1_M4_20: +sgemm_kernel_L1_M4_20: INIT4x1 mov BO, BC asrs L , K1, #3 // L = L / 
8 - ble _L1_M4_40 + ble sgemm_kernel_L1_M4_40 .align 5 -_L1_M4_22: +sgemm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB @@ -1297,49 +1297,49 @@ _L1_M4_22: KERNEL4x1_SUB subs L, L, #1 - bgt _L1_M4_22 + bgt sgemm_kernel_L1_M4_22 -_L1_M4_40: +sgemm_kernel_L1_M4_40: ands L , K1, #7 // L = L % 8 - ble _L1_M4_100 + ble sgemm_kernel_L1_M4_100 -_L1_M4_42: +sgemm_kernel_L1_M4_42: KERNEL4x1_SUB subs L, L, #1 - bgt _L1_M4_42 + bgt sgemm_kernel_L1_M4_42 -_L1_M4_100: +sgemm_kernel_L1_M4_100: SAVE4x1 -_L1_M4_END: +sgemm_kernel_L1_M4_END: subs I, I, #1 - bgt _L1_M4_20 + bgt sgemm_kernel_L1_M4_20 -_L1_M2_BEGIN: +sgemm_kernel_L1_M2_BEGIN: ldr I, M tst I , #3 - ble _L1_END + ble sgemm_kernel_L1_END tst I, #2 // I = I / 2 - ble _L1_M1_BEGIN + ble sgemm_kernel_L1_M1_BEGIN -_L1_M2_20: +sgemm_kernel_L1_M2_20: INIT2x1 mov BO, BC asrs L , K1, #3 // L = L / 8 - ble _L1_M2_40 + ble sgemm_kernel_L1_M2_40 -_L1_M2_22: +sgemm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB @@ -1352,42 +1352,42 @@ _L1_M2_22: KERNEL2x1_SUB subs L, L, #1 - bgt _L1_M2_22 + bgt sgemm_kernel_L1_M2_22 -_L1_M2_40: +sgemm_kernel_L1_M2_40: ands L , K1, #7 // L = L % 8 - ble _L1_M2_100 + ble sgemm_kernel_L1_M2_100 -_L1_M2_42: +sgemm_kernel_L1_M2_42: KERNEL2x1_SUB subs L, L, #1 - bgt _L1_M2_42 + bgt sgemm_kernel_L1_M2_42 -_L1_M2_100: +sgemm_kernel_L1_M2_100: SAVE2x1 -_L1_M2_END: +sgemm_kernel_L1_M2_END: -_L1_M1_BEGIN: +sgemm_kernel_L1_M1_BEGIN: tst I, #1 // I = I % 2 - ble _L1_END + ble sgemm_kernel_L1_END -_L1_M1_20: +sgemm_kernel_L1_M1_20: INIT1x1 mov BO, BC asrs L , K1, #3 // L = L / 8 - ble _L1_M1_40 + ble sgemm_kernel_L1_M1_40 -_L1_M1_22: +sgemm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB @@ -1399,30 +1399,30 @@ _L1_M1_22: KERNEL1x1_SUB subs L, L, #1 - bgt _L1_M1_22 + bgt sgemm_kernel_L1_M1_22 -_L1_M1_40: +sgemm_kernel_L1_M1_40: ands L , K1, #7 // L = L % 8 - ble _L1_M1_100 + ble sgemm_kernel_L1_M1_100 -_L1_M1_42: +sgemm_kernel_L1_M1_42: KERNEL1x1_SUB subs L, L, #1 - bgt _L1_M1_42 + bgt sgemm_kernel_L1_M1_42 -_L1_M1_100: +sgemm_kernel_L1_M1_100: SAVE1x1 -_L1_END: +sgemm_kernel_L1_END: -_L999: +sgemm_kernel_L999: sub r3, fp, #128 vldm r3, { s8 - s31} // restore floating point registers diff --git a/kernel/arm/sgemm_ncopy_4_vfpv3.S b/kernel/arm/sgemm_ncopy_4_vfpv3.S index 8af7ed8f2..2d8fa2e24 100644 --- a/kernel/arm/sgemm_ncopy_4_vfpv3.S +++ b/kernel/arm/sgemm_ncopy_4_vfpv3.S @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2013/11/02 Saar +* 2013/11/05 Saar * BLASTEST : OK * CTEST : OK * TEST : OK @@ -68,7 +68,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define I r3 #define J r12 -#define A_PRE 96 +#define A_PRE 192 /************************************************************************************** * Macro definitions @@ -199,12 +199,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ldr BO, B -_L4_BEGIN: +sgemm_ncopy_L4_BEGIN: asrs J, N, #2 // J = N / 4 - ble _L2_BEGIN + ble sgemm_ncopy_L2_BEGIN -_L4_M4_BEGIN: +sgemm_ncopy_L4_M4_BEGIN: mov AO1, A // AO1 = A ldr r4 , LDA @@ -214,9 +214,9 @@ _L4_M4_BEGIN: add A , AO4, r4 // A = A + 4 * LDA asrs I, M, #2 // I = M / 4 - ble _L4_M4_40 + ble sgemm_ncopy_L4_M4_40 -_L4_M4_20: +sgemm_ncopy_L4_M4_20: pld [ AO1, #A_PRE ] pld [ AO2, #A_PRE ] @@ -225,45 +225,45 @@ _L4_M4_20: COPY4x4 subs I , I , #1 - ble _L4_M4_40 + ble sgemm_ncopy_L4_M4_40 COPY4x4 subs I , I , #1 - bne _L4_M4_20 + bne sgemm_ncopy_L4_M4_20 -_L4_M4_40: +sgemm_ncopy_L4_M4_40: ands I, M , #3 - ble _L4_M4_END + ble sgemm_ncopy_L4_M4_END -_L4_M4_60: +sgemm_ncopy_L4_M4_60: COPY1x4 subs I , I , #1 - bne _L4_M4_60 + bne sgemm_ncopy_L4_M4_60 -_L4_M4_END: +sgemm_ncopy_L4_M4_END: subs J , J, #1 // j-- - bne _L4_M4_BEGIN + bne sgemm_ncopy_L4_M4_BEGIN /*********************************************************************************************/ -_L2_BEGIN: +sgemm_ncopy_L2_BEGIN: tst N, #3 - ble _L999 + ble sgemm_ncopy_L999 tst N, #2 - ble _L1_BEGIN + ble sgemm_ncopy_L1_BEGIN -_L2_M4_BEGIN: +sgemm_ncopy_L2_M4_BEGIN: mov AO1, A // AO1 = A ldr r4 , LDA @@ -271,75 +271,75 @@ _L2_M4_BEGIN: add A , AO2, r4 // A = A + 2 * LDA asrs I, M, #2 // I = M / 4 - ble _L2_M4_40 + ble sgemm_ncopy_L2_M4_40 -_L2_M4_20: +sgemm_ncopy_L2_M4_20: COPY4x2 subs I , I , #1 - bne _L2_M4_20 + bne sgemm_ncopy_L2_M4_20 -_L2_M4_40: +sgemm_ncopy_L2_M4_40: ands I, M , #3 - ble _L2_M4_END + ble sgemm_ncopy_L2_M4_END -_L2_M4_60: +sgemm_ncopy_L2_M4_60: COPY1x2 subs I , I , #1 - bne _L2_M4_60 + bne sgemm_ncopy_L2_M4_60 -_L2_M4_END: +sgemm_ncopy_L2_M4_END: /*********************************************************************************************/ -_L1_BEGIN: +sgemm_ncopy_L1_BEGIN: tst N, #1 - ble _L999 + ble sgemm_ncopy_L999 -_L1_M4_BEGIN: +sgemm_ncopy_L1_M4_BEGIN: mov AO1, A // AO1 = A ldr r4 , LDA add A , AO1, r4 // A = A + 1 * LDA asrs I, M, #2 // I = M / 4 - ble _L1_M4_40 + ble sgemm_ncopy_L1_M4_40 -_L1_M4_20: +sgemm_ncopy_L1_M4_20: COPY4x1 subs I , I , #1 - bne _L1_M4_20 + bne sgemm_ncopy_L1_M4_20 -_L1_M4_40: +sgemm_ncopy_L1_M4_40: ands I, M , #3 - ble _L1_M4_END + ble sgemm_ncopy_L1_M4_END -_L1_M4_60: +sgemm_ncopy_L1_M4_60: COPY1x1 subs I , I , #1 - bne _L1_M4_60 + bne sgemm_ncopy_L1_M4_60 -_L1_M4_END: +sgemm_ncopy_L1_M4_END: -_L999: +sgemm_ncopy_L999: sub r3, fp, #128 vldm r3, { s8 - s15} // restore floating point registers From 80a2e901b119c65b470deb9758798cb14aabcbba Mon Sep 17 00:00:00 2001 From: wernsaar Date: Wed, 6 Nov 2013 20:01:18 +0100 Subject: [PATCH 045/127] added dgemm_tcopy_4_vfpv3.S and sgemm_tcopy_4_vfpv3.S --- kernel/arm/KERNEL.ARMV7 | 6 +- kernel/arm/dgemm_kernel_4x4_vfpv3.S | 4 +- kernel/arm/dgemm_tcopy_4_vfpv3.S | 408 ++++++++++++++++++++++++++ kernel/arm/sgemm_tcopy_4_vfpv3.S | 430 ++++++++++++++++++++++++++++ 4 files changed, 842 insertions(+), 6 deletions(-) create mode 100644 kernel/arm/dgemm_tcopy_4_vfpv3.S create mode 100644 kernel/arm/sgemm_tcopy_4_vfpv3.S diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index cdf370725..10bc4620a 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -90,19 +90,17 @@ SGEMMKERNEL = sgemm_kernel_4x4_vfpv3.S SGEMMINCOPY = SGEMMITCOPY = SGEMMONCOPY = sgemm_ncopy_4_vfpv3.S -SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMOTCOPY = sgemm_tcopy_4_vfpv3.S SGEMMINCOPYOBJ = SGEMMITCOPYOBJ = SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o -#DGEMMKERNEL = ../generic/gemmkernel_2x2.c -#DGEMMKERNEL = 
dgemm_kernel_4x2_vfpv2.S DGEMMKERNEL = dgemm_kernel_4x4_vfpv3.S DGEMMINCOPY = DGEMMITCOPY = DGEMMONCOPY = dgemm_ncopy_4_vfpv3.S -DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMOTCOPY = dgemm_tcopy_4_vfpv3.S DGEMMINCOPYOBJ = DGEMMITCOPYOBJ = DGEMMONCOPYOBJ = dgemm_oncopy.o diff --git a/kernel/arm/dgemm_kernel_4x4_vfpv3.S b/kernel/arm/dgemm_kernel_4x4_vfpv3.S index 7d83def94..ed7f611f1 100644 --- a/kernel/arm/dgemm_kernel_4x4_vfpv3.S +++ b/kernel/arm/dgemm_kernel_4x4_vfpv3.S @@ -77,7 +77,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define B [fp, #4 ] #define C [fp, #8 ] -#define OLDdgemm_kernel_LDC [fp, #12 ] +#define OLD_LDC [fp, #12 ] #define I r0 #define J r1 @@ -883,7 +883,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. sub r3, fp, #128 vstm r3, { d8 - d15} // store floating point registers - ldr r3, OLDdgemm_kernel_LDC + ldr r3, OLD_LDC lsl r3, r3, #3 // ldc = ldc * 8 str r3, LDC diff --git a/kernel/arm/dgemm_tcopy_4_vfpv3.S b/kernel/arm/dgemm_tcopy_4_vfpv3.S new file mode 100644 index 000000000..88a139ad8 --- /dev/null +++ b/kernel/arm/dgemm_tcopy_4_vfpv3.S @@ -0,0 +1,408 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/06 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_A r2 +#define OLD_LDA r3 + + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define B [fp, #4 ] +#define A [fp, #-248 ] + +#define M r0 +#define N r1 +#define M4 r2 + +#define LDA r5 + +#define AO1 r6 +#define BO1 r7 +#define BO2 r8 +#define BO3 r9 + +#define I r4 +#define J r12 + +#define A_PRE 256 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro COPY4x4 + + pld [ AO1, #A_PRE ] + fldmiad AO1, { d0 - d3 } + + add r3, AO1, LDA + pld [ r3, #A_PRE ] + fldmiad r3, { d4 - d7 } + + add r3, r3, LDA + pld [ r3, #A_PRE ] + fldmiad r3, { d8 - d11 } + + add r3, r3, LDA + pld [ r3, #A_PRE ] + fldmiad r3, { d12 - d15 } + + fstmiad BO1, { d0 - d15 } + add AO1, AO1, #32 + add BO1, BO1, M4 + +.endm + +.macro COPY2x4 + + fldmiad AO1, { d0 - d1 } + + add r3, AO1, LDA + fldmiad r3, { d2 - d3 } + + add r3, r3, LDA + fldmiad r3, { d4 - d5 } + + add r3, r3, LDA + fldmiad r3, { d6 - d7 } + + fstmiad BO2, { d0 - d7 } + add AO1, AO1, #16 + add BO2, BO2, #64 + +.endm + +.macro COPY1x4 + + fldmiad AO1, { d0 } + + add r3, AO1, LDA + fldmiad r3, { d1 } + + add r3, r3, LDA + fldmiad r3, { d2 } + + add r3, r3, LDA + fldmiad r3, { d3 } + + fstmiad BO3, { d0 - d3 } + add AO1, AO1, #8 + add BO3, BO3, #32 + +.endm + +/*************************************************************************************************************************/ + +.macro COPY4x2 + + pld [ AO1, #A_PRE ] + fldmiad AO1, { d0 - d3 } + + add r3, AO1, LDA + pld [ r3, #A_PRE ] + fldmiad r3, { d4 - d7 } + + fstmiad BO1, { d0 - d7 } + add AO1, AO1, #32 + add BO1, BO1, M4 + +.endm + +.macro COPY2x2 + + fldmiad AO1, { d0 - d1 } + + add r3, AO1, LDA + fldmiad r3, { d2 - d3 } + + fstmiad BO2, { d0 - d3 } + add AO1, AO1, #16 + add BO2, BO2, #32 + +.endm + +.macro COPY1x2 + + fldmiad AO1, { d0 } + + add r3, AO1, LDA + fldmiad r3, { d1 } + + fstmiad BO3, { d0 - d1 } + add AO1, AO1, #8 + add BO3, BO3, #16 + +.endm + +/*************************************************************************************************************************/ + +.macro COPY4x1 + + pld [ AO1, #A_PRE ] + fldmiad AO1, { d0 - d3 } + + fstmiad BO1, { d0 - d3 } + add AO1, AO1, #32 + add BO1, BO1, M4 + +.endm + +.macro COPY2x1 + + fldmiad AO1, { d0 - d1 } + + fstmiad BO2, { d0 - d1 } + add AO1, AO1, #16 + add BO2, BO2, #16 + +.endm + +.macro COPY1x1 + + fldmiad AO1, { d0 } + + fstmiad BO3, { d0 } + add AO1, AO1, #8 + add BO3, BO3, #8 + +.endm + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_A, A // store A + + lsl LDA, OLD_LDA, #3 // lda = lda * SIZE + + 
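Note (a sketch, not part of the patch): the rest of the prologue below derives three output cursors. BO1 walks the full four-wide column panels with stride M4 = M * 4 * SIZE, while BO2 and BO3 point at the two-wide and one-wide remainder strips starting at B + (N & -4) * M * SIZE and B + (N & -2) * M * SIZE. This is the same layout as the generic gemm_tcopy_4.c that the KERNEL.ARMV7 hunk above swaps out; in C, with an illustrative name:

    /* Sketch of the tcopy_4 output layout (double precision): b is split
     * into full 4-wide panels, then a 2-wide strip, then a 1-wide strip. */
    static void dgemm_tcopy_4_sketch(long m, long n, const double *a,
                                     long lda, double *b)
    {
        double *b1 = b;                  /* BO1: 4-wide panels             */
        double *b2 = b + m * (n & ~3L);  /* BO2: 2-wide remainder strip    */
        double *b3 = b + m * (n & ~1L);  /* BO3: 1-wide remainder strip    */
        long i, j;

        for (j = 0; j < m; j++) {        /* one source row per iteration   */
            const double *aj = a + j * lda;
            double *bp = b1 + 4 * j;     /* row j's slot inside each panel */
            for (i = 0; i + 3 < n; i += 4) {
                bp[0] = aj[i];     bp[1] = aj[i + 1];
                bp[2] = aj[i + 2]; bp[3] = aj[i + 3];
                bp += 4 * m;             /* next panel: M4 = m * 4 * SIZE  */
            }
            if (n & 2) { *b2++ = aj[i]; *b2++ = aj[i + 1]; i += 2; }
            if (n & 1) { *b3++ = aj[i]; }
        }
    }

The assembly processes four source rows per J pass and advances the stored B base by 16 elements; that is the same addressing collapsed four rows at a time.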
sub r4, fp, #128 + vstm r4, { d8 - d15} // store floating point registers + + lsl r4 , M, #3 // M * SIZE + + ldr r3, B + + and BO2 , N , #-4 + and BO3 , N , #-2 + + mul BO2, BO2, r4 + mul BO3, BO3, r4 + + add BO2 , BO2, r3 + add BO3 , BO3, r3 + + lsl M4, M, #5 // M4 = M * 4 * SIZE + +dgemm_tcopy_L4_BEGIN: + + asrs J, M, #2 // J = N / 4 + ble dgemm_tcopy_L2_BEGIN + +dgemm_tcopy_L4_M4_BEGIN: + + ldr AO1, A // AO1 = A + lsl r3, LDA, #2 // r3 = 4 * LDA + add r3, r3 , AO1 // A = A + 4 * LDA + str r3, A // store A + + ldr BO1, B + add r3, BO1, #128 // B = B + 16 * SIZE + str r3, B + + asrs I, N, #2 // I = M / 4 + ble dgemm_tcopy_L4_M4_40 + +dgemm_tcopy_L4_M4_20: + + COPY4x4 + + subs I , I , #1 + bne dgemm_tcopy_L4_M4_20 + + +dgemm_tcopy_L4_M4_40: + + tst N , #2 + ble dgemm_tcopy_L4_M4_60 + + COPY2x4 + + +dgemm_tcopy_L4_M4_60: + + tst N, #1 + ble dgemm_tcopy_L4_M4_END + + COPY1x4 + + +dgemm_tcopy_L4_M4_END: + + subs J , J, #1 // j-- + bne dgemm_tcopy_L4_M4_BEGIN + + + +/*********************************************************************************************/ + +dgemm_tcopy_L2_BEGIN: + + tst M, #3 + ble dgemm_tcopy_L999 + + tst M, #2 + ble dgemm_tcopy_L1_BEGIN + +dgemm_tcopy_L2_M4_BEGIN: + + ldr AO1, A // AO1 = A + lsl r3, LDA, #1 // r3 = 2 * LDA + add r3, r3 , AO1 // A = A + 2 * LDA + str r3, A // store A + + ldr BO1, B + add r3, BO1, #64 // B = B + 8 * SIZE + str r3, B + + asrs I, N, #2 // I = M / 4 + ble dgemm_tcopy_L2_M4_40 + +dgemm_tcopy_L2_M4_20: + + COPY4x2 + + subs I , I , #1 + bne dgemm_tcopy_L2_M4_20 + + +dgemm_tcopy_L2_M4_40: + + tst N , #2 + ble dgemm_tcopy_L2_M4_60 + + COPY2x2 + +dgemm_tcopy_L2_M4_60: + + tst N , #1 + ble dgemm_tcopy_L2_M4_END + + COPY1x2 + + +dgemm_tcopy_L2_M4_END: + + +/*********************************************************************************************/ + +dgemm_tcopy_L1_BEGIN: + + tst M, #1 + ble dgemm_tcopy_L999 + + +dgemm_tcopy_L1_M4_BEGIN: + + ldr AO1, A // AO1 = A + add r3, LDA , AO1 // A = A + 1 * LDA + str r3, A // store A + + ldr BO1, B + add r3, BO1, #32 // B = B + 4 * SIZE + str r3, B + + asrs I, N, #2 // I = M / 4 + ble dgemm_tcopy_L1_M4_40 + +dgemm_tcopy_L1_M4_20: + + COPY4x1 + + subs I , I , #1 + bne dgemm_tcopy_L1_M4_20 + + +dgemm_tcopy_L1_M4_40: + + tst N , #2 + ble dgemm_tcopy_L1_M4_60 + + COPY2x1 + +dgemm_tcopy_L1_M4_60: + + tst N , #1 + ble dgemm_tcopy_L1_M4_END + + COPY1x1 + + +dgemm_tcopy_L1_M4_END: + + + +dgemm_tcopy_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore floating point registers + + mov r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/sgemm_tcopy_4_vfpv3.S b/kernel/arm/sgemm_tcopy_4_vfpv3.S new file mode 100644 index 000000000..b0a3278ff --- /dev/null +++ b/kernel/arm/sgemm_tcopy_4_vfpv3.S @@ -0,0 +1,430 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/06 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_A r2 +#define OLD_LDA r3 + + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define B [fp, #4 ] +#define A [fp, #-248 ] + +#define M r0 +#define N r1 +#define M4 r2 + +#define LDA r5 + +#define AO1 r6 +#define BO1 r7 +#define BO2 r8 +#define BO3 r9 + +#define I r4 +#define J r12 + +#define A_PRE 256 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro COPY4x4_1 + + pld [ AO1, #A_PRE ] + fldmias AO1, { s0 - s3 } + + add r3, AO1, LDA + pld [ r3, #A_PRE ] + fldmias r3, { s4 - s7 } + + add r3, r3, LDA + pld [ r3, #A_PRE ] + fldmias r3, { s8 - s11 } + + add r3, r3, LDA + pld [ r3, #A_PRE ] + fldmias r3, { s12 - s15 } + + fstmias BO1, { s0 - s15 } + add AO1, AO1, #16 + add BO1, BO1, M4 + +.endm + +.macro COPY4x4_2 + + fldmias AO1, { s0 - s3 } + + add r3, AO1, LDA + fldmias r3, { s4 - s7 } + + add r3, r3, LDA + fldmias r3, { s8 - s11 } + + add r3, r3, LDA + fldmias r3, { s12 - s15 } + + fstmias BO1, { s0 - s15 } + add AO1, AO1, #16 + add BO1, BO1, M4 + +.endm + + +.macro COPY2x4 + + fldmias AO1, { s0 - s1 } + + add r3, AO1, LDA + fldmias r3, { s2 - s3 } + + add r3, r3, LDA + fldmias r3, { s4 - s5 } + + add r3, r3, LDA + fldmias r3, { s6 - s7 } + + fstmias BO2, { s0 - s7 } + add AO1, AO1, #8 + add BO2, BO2, #32 + +.endm + +.macro COPY1x4 + + fldmias AO1, { s0 } + + add r3, AO1, LDA + fldmias r3, { s1 } + + add r3, r3, LDA + fldmias r3, { s2 } + + add r3, r3, LDA + fldmias r3, { s3 } + + fstmias BO3, { s0 - s3 } + add AO1, AO1, #4 + add BO3, BO3, #16 + +.endm + +/*************************************************************************************************************************/ + +.macro COPY4x2 + + fldmias AO1, { s0 - s3 } + + add r3, AO1, LDA + fldmias r3, { s4 - s7 } + + fstmias BO1, { s0 - s7 } + add AO1, AO1, #16 + add BO1, BO1, M4 + +.endm + +.macro COPY2x2 + + fldmias AO1, { s0 - s1 } + + add 
r3, AO1, LDA + fldmias r3, { s2 - s3 } + + fstmias BO2, { s0 - s3 } + add AO1, AO1, #8 + add BO2, BO2, #16 + +.endm + +.macro COPY1x2 + + fldmias AO1, { s0 } + + add r3, AO1, LDA + fldmias r3, { s1 } + + fstmias BO3, { s0 - s1 } + add AO1, AO1, #4 + add BO3, BO3, #8 + +.endm + +/*************************************************************************************************************************/ + +.macro COPY4x1 + + fldmias AO1, { s0 - s3 } + + fstmias BO1, { s0 - s3 } + add AO1, AO1, #16 + add BO1, BO1, M4 + +.endm + +.macro COPY2x1 + + fldmias AO1, { s0 - s1 } + + fstmias BO2, { s0 - s1 } + add AO1, AO1, #8 + add BO2, BO2, #8 + +.endm + +.macro COPY1x1 + + fldmias AO1, { s0 } + + fstmias BO3, { s0 } + add AO1, AO1, #4 + add BO3, BO3, #4 + +.endm + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_A, A // store A + + lsl LDA, OLD_LDA, #2 // lda = lda * SIZE + + sub r4, fp, #128 + vstm r4, { s8 - s15} // store floating point registers + + lsl r4 , M, #2 // M * SIZE + + ldr r3, B + + and BO2 , N , #-4 + and BO3 , N , #-2 + + mul BO2, BO2, r4 + mul BO3, BO3, r4 + + add BO2 , BO2, r3 + add BO3 , BO3, r3 + + lsl M4, M, #4 // M4 = M * 4 * SIZE + +sgemm_tcopy_L4_BEGIN: + + asrs J, M, #2 // J = N / 4 + ble sgemm_tcopy_L2_BEGIN + +sgemm_tcopy_L4_M4_BEGIN: + + ldr AO1, A // AO1 = A + lsl r3, LDA, #2 // r3 = 4 * LDA + add r3, r3 , AO1 // A = A + 4 * LDA + str r3, A // store A + + ldr BO1, B + add r3, BO1, #64 // B = B + 16 * SIZE + str r3, B + + asrs I, N, #2 // I = M / 4 + ble sgemm_tcopy_L4_M4_40 + +sgemm_tcopy_L4_M4_20: + + COPY4x4_1 + + subs I , I , #1 + ble sgemm_tcopy_L4_M4_40 + + COPY4x4_2 + + subs I , I , #1 + bne sgemm_tcopy_L4_M4_20 + + +sgemm_tcopy_L4_M4_40: + + tst N , #2 + ble sgemm_tcopy_L4_M4_60 + + COPY2x4 + + +sgemm_tcopy_L4_M4_60: + + tst N, #1 + ble sgemm_tcopy_L4_M4_END + + COPY1x4 + + +sgemm_tcopy_L4_M4_END: + + subs J , J, #1 // j-- + bne sgemm_tcopy_L4_M4_BEGIN + + + +/*********************************************************************************************/ + +sgemm_tcopy_L2_BEGIN: + + tst M, #3 + ble sgemm_tcopy_L999 + + tst M, #2 + ble sgemm_tcopy_L1_BEGIN + +sgemm_tcopy_L2_M4_BEGIN: + + ldr AO1, A // AO1 = A + lsl r3, LDA, #1 // r3 = 2 * LDA + add r3, r3 , AO1 // A = A + 2 * LDA + str r3, A // store A + + ldr BO1, B + add r3, BO1, #32 // B = B + 8 * SIZE + str r3, B + + asrs I, N, #2 // I = M / 4 + ble sgemm_tcopy_L2_M4_40 + +sgemm_tcopy_L2_M4_20: + + COPY4x2 + + subs I , I , #1 + bne sgemm_tcopy_L2_M4_20 + + +sgemm_tcopy_L2_M4_40: + + tst N , #2 + ble sgemm_tcopy_L2_M4_60 + + COPY2x2 + +sgemm_tcopy_L2_M4_60: + + tst N , #1 + ble sgemm_tcopy_L2_M4_END + + COPY1x2 + + +sgemm_tcopy_L2_M4_END: + + +/*********************************************************************************************/ + +sgemm_tcopy_L1_BEGIN: + + tst M, #1 + ble sgemm_tcopy_L999 + + +sgemm_tcopy_L1_M4_BEGIN: + + ldr AO1, A // AO1 = A + add r3, LDA , AO1 // A = A + 1 * LDA + str r3, A // store A + + ldr BO1, B + add r3, BO1, #16 // B = B + 4 * SIZE + str r3, B + + asrs I, N, #2 // I = M / 4 + ble sgemm_tcopy_L1_M4_40 + +sgemm_tcopy_L1_M4_20: + + COPY4x1 + + subs I , I , #1 + bne sgemm_tcopy_L1_M4_20 + + +sgemm_tcopy_L1_M4_40: + + tst N , #2 + ble sgemm_tcopy_L1_M4_60 + + COPY2x1 + +sgemm_tcopy_L1_M4_60: + + tst N , #1 
+ ble sgemm_tcopy_L1_M4_END + + COPY1x1 + + +sgemm_tcopy_L1_M4_END: + + + +sgemm_tcopy_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s15} // restore floating point registers + + mov r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + From 1e8128f41c54a449de9c36a6fbd8c86d351f5215 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 7 Nov 2013 17:15:50 +0100 Subject: [PATCH 046/127] added cgemm_tcopy_2_vfpv3.S and zgemm_tcopy_2_vfpv3.S --- kernel/arm/cgemm_tcopy_2_vfpv3.S | 243 ++++++++++++++++++++++++++++++ kernel/arm/zgemm_tcopy_2_vfpv3.S | 245 +++++++++++++++++++++++++++++++ 2 files changed, 488 insertions(+) create mode 100644 kernel/arm/cgemm_tcopy_2_vfpv3.S create mode 100644 kernel/arm/zgemm_tcopy_2_vfpv3.S diff --git a/kernel/arm/cgemm_tcopy_2_vfpv3.S b/kernel/arm/cgemm_tcopy_2_vfpv3.S new file mode 100644 index 000000000..9036b994d --- /dev/null +++ b/kernel/arm/cgemm_tcopy_2_vfpv3.S @@ -0,0 +1,243 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/07 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_A r2 +#define OLD_LDA r3 + + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define B [fp, #4 ] +#define A [fp, #-248 ] + +#define M r0 +#define N r1 +#define M4 r2 + +#define LDA r5 + +#define AO1 r6 +#define BO1 r7 +#define BO2 r8 + +#define I r4 +#define J r12 + +#define A_PRE 256 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ +.macro COPY2x2 + + fldmias AO1, { s0 - s3 } + + add r3, AO1, LDA + fldmias r3, { s4 - s7 } + + fstmias BO1, { s0 - s7 } + add AO1, AO1, #16 + add BO1, BO1, M4 + +.endm + +.macro COPY1x2 + + fldmias AO1, { s0 -s1 } + + add r3, AO1, LDA + fldmias r3, { s2 - s3 } + + fstmias BO2, { s0 - s3 } + add AO1, AO1, #8 + add BO2, BO2, #16 + +.endm + +/*************************************************************************************************************************/ +.macro COPY2x1 + + fldmias AO1, { s0 - s3 } + + fstmias BO1, { s0 - s3 } + add AO1, AO1, #16 + add BO1, BO1, M4 + +.endm + +.macro COPY1x1 + + fldmias AO1, { s0 - s1 } + + fstmias BO2, { s0 - s1 } + add AO1, AO1, #8 + add BO2, BO2, #8 + +.endm + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_A, A // store A + + lsl LDA, OLD_LDA, #3 // lda = lda * SIZE * 2 + + sub r4, fp, #128 + vstm r4, { s8 - s15} // store floating point registers + + lsl r4 , M, #3 // M * SIZE * 2 + + ldr r3, B + + and BO2 , N , #-2 + + mul BO2, BO2, r4 + + add BO2 , BO2, r3 + + lsl M4, M, #4 // M4 = M * 2 * SIZE * 2 + +cgemm_tcopy_L2_BEGIN: + + asrs J, M, #1 // J = N / 2 + ble cgemm_tcopy_L1_BEGIN + +cgemm_tcopy_L2_M2_BEGIN: + + ldr AO1, A // AO1 = A + lsl r3, LDA, #1 // r3 = 2 * LDA + add r3, r3 , AO1 // A = A + 2 * LDA + str r3, A // store A + + ldr BO1, B + add r3, BO1, #32 // B = B + 4 * SIZE *2 + str r3, B + + asrs I, N, #1 // I = M / 2 + ble cgemm_tcopy_L2_M2_60 + +cgemm_tcopy_L2_M2_40: + + COPY2x2 + subs I, I, #1 + bne cgemm_tcopy_L2_M2_40 + +cgemm_tcopy_L2_M2_60: + + tst N , #1 + ble cgemm_tcopy_L2_M2_END + + COPY1x2 + + +cgemm_tcopy_L2_M2_END: + + subs J , J, #1 // j-- + bne cgemm_tcopy_L2_M2_BEGIN + +/*********************************************************************************************/ + +cgemm_tcopy_L1_BEGIN: + + tst M, #1 + ble cgemm_tcopy_L999 + + +cgemm_tcopy_L1_M2_BEGIN: + + ldr AO1, A // AO1 = A + add r3, LDA , AO1 // A = A + 1 * LDA + str r3, A // store A + + ldr BO1, B + add r3, BO1, #16 // B = B + 2 * SIZE *2 + str r3, B + + asrs I, N, #1 // I = M / 2 + ble cgemm_tcopy_L1_M2_60 + + +cgemm_tcopy_L1_M2_40: + + COPY2x1 + subs I, I, #1 + bne cgemm_tcopy_L1_M2_40 + 
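Note: same idea as the dgemm/sgemm tcopy files, narrowed to two-wide panels of complex elements. BO1 (set up in the prologue above) steps through the 2-column panels by M4 = M * 2 * SIZE * 2 bytes, and BO2 collects the odd trailing column at B + (N & -2) * M * 2 * SIZE. A C sketch with an illustrative name; one complex element is two floats:

    /* Sketch of cgemm_tcopy_2 (single-precision complex): 2-wide panels,
     * then the odd column; every element moves as a re/im float pair. */
    static void cgemm_tcopy_2_sketch(long m, long n, const float *a,
                                     long lda, float *b)
    {
        float *b1 = b;                      /* BO1: 2-wide panels          */
        float *b2 = b + 2 * m * (n & ~1L);  /* BO2: odd-column remainder   */
        long i, j;

        for (j = 0; j < m; j++) {
            const float *aj = a + 2 * j * lda;  /* row j, lda in elements  */
            float *bp = b1 + 4 * j;             /* 2 complex = 4 floats    */
            for (i = 0; i + 1 < n; i += 2) {
                bp[0] = aj[2 * i];     bp[1] = aj[2 * i + 1];
                bp[2] = aj[2 * i + 2]; bp[3] = aj[2 * i + 3];
                bp += 4 * m;                    /* M4 = m * 2 * SIZE * 2   */
            }
            if (n & 1) { *b2++ = aj[2 * i]; *b2++ = aj[2 * i + 1]; }
        }
    }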
+cgemm_tcopy_L1_M2_60: + + tst N , #1 + ble cgemm_tcopy_L1_M2_END + + COPY1x1 + + +cgemm_tcopy_L1_M2_END: + + + +cgemm_tcopy_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s15} // restore floating point registers + + mov r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/zgemm_tcopy_2_vfpv3.S b/kernel/arm/zgemm_tcopy_2_vfpv3.S new file mode 100644 index 000000000..7e27ca6a6 --- /dev/null +++ b/kernel/arm/zgemm_tcopy_2_vfpv3.S @@ -0,0 +1,245 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/07 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_A r2 +#define OLD_LDA r3 + + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define B [fp, #4 ] +#define A [fp, #-248 ] + +#define M r0 +#define N r1 +#define M4 r2 + +#define LDA r5 + +#define AO1 r6 +#define BO1 r7 +#define BO2 r8 + +#define I r4 +#define J r12 + +#define A_PRE 256 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ +.macro COPY2x2 + + pld [ AO1, #A_PRE ] + fldmiad AO1, { d0 - d3 } + + add r3, AO1, LDA + pld [ r3, #A_PRE ] + fldmiad r3, { d4 - d7 } + + fstmiad BO1, { d0 - d7 } + add AO1, AO1, #32 + add BO1, BO1, M4 + +.endm + +.macro COPY1x2 + + fldmiad AO1, { d0 -d1 } + + add r3, AO1, LDA + fldmiad r3, { d2 - d3 } + + fstmiad BO2, { d0 - d3 } + add AO1, AO1, #16 + add BO2, BO2, #32 + +.endm + +/*************************************************************************************************************************/ +.macro COPY2x1 + + fldmiad AO1, { d0 - d3 } + + fstmiad BO1, { d0 - d3 } + add AO1, AO1, #32 + add BO1, BO1, M4 + +.endm + +.macro COPY1x1 + + fldmiad AO1, { d0 - d1 } + + fstmiad BO2, { d0 - d1 } + add AO1, AO1, #16 + add BO2, BO2, #16 + +.endm + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_A, A // store A + + lsl LDA, OLD_LDA, #4 // lda = lda * SIZE * 2 + + sub r4, fp, #128 + vstm r4, { d8 - d15} // store floating point registers + + lsl r4 , M, #4 // M * SIZE * 2 + + ldr r3, B + + and BO2 , N , #-2 + + mul BO2, BO2, r4 + + add BO2 , BO2, r3 + + lsl M4, M, #5 // M4 = M * 2 * SIZE * 2 + +zgemm_tcopy_L2_BEGIN: + + asrs J, M, #1 // J = N / 2 + ble zgemm_tcopy_L1_BEGIN + +zgemm_tcopy_L2_M2_BEGIN: + + ldr AO1, A // AO1 = A + lsl r3, LDA, #1 // r3 = 2 * LDA + add r3, r3 , AO1 // A = A + 2 * LDA + str r3, A // store A + + ldr BO1, B + add r3, BO1, #64 // B = B + 4 * SIZE *2 + str r3, B + + asrs I, N, #1 // I = M / 2 + ble zgemm_tcopy_L2_M2_60 + +zgemm_tcopy_L2_M2_40: + + COPY2x2 + subs I, I, #1 + bne zgemm_tcopy_L2_M2_40 + +zgemm_tcopy_L2_M2_60: + + tst N , #1 + ble zgemm_tcopy_L2_M2_END + + COPY1x2 + + +zgemm_tcopy_L2_M2_END: + + subs J , J, #1 // j-- + bne zgemm_tcopy_L2_M2_BEGIN + +/*********************************************************************************************/ + +zgemm_tcopy_L1_BEGIN: + + tst M, #1 + ble zgemm_tcopy_L999 + + +zgemm_tcopy_L1_M2_BEGIN: + + ldr AO1, A // AO1 = A + add r3, LDA , AO1 // A = A + 1 * LDA + str r3, A // store A + + ldr BO1, B + add r3, BO1, #32 // B = B + 2 * SIZE *2 + str r3, B + + asrs I, N, #1 // I = M / 2 + ble zgemm_tcopy_L1_M2_60 + + +zgemm_tcopy_L1_M2_40: + + COPY2x1 + subs I, I, #1 + 
bne zgemm_tcopy_L1_M2_40 + +zgemm_tcopy_L1_M2_60: + + tst N , #1 + ble zgemm_tcopy_L1_M2_END + + COPY1x1 + + +zgemm_tcopy_L1_M2_END: + + + +zgemm_tcopy_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore floating point registers + + mov r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + From 8fa93be06ec13bccb1d2e710615bb42e3b694c92 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 7 Nov 2013 17:18:56 +0100 Subject: [PATCH 047/127] added optimized blas level1 copy kernels --- kernel/arm/ccopy_vfpv3.S | 222 ++++++++++++++++++++++++++++++++++++++ kernel/arm/dcopy_vfpv3.S | 222 ++++++++++++++++++++++++++++++++++++++ kernel/arm/scopy_vfpv3.S | 224 +++++++++++++++++++++++++++++++++++++++ kernel/arm/zcopy_vfpv3.S | 223 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 891 insertions(+) create mode 100644 kernel/arm/ccopy_vfpv3.S create mode 100644 kernel/arm/dcopy_vfpv3.S create mode 100644 kernel/arm/scopy_vfpv3.S create mode 100644 kernel/arm/zcopy_vfpv3.S diff --git a/kernel/arm/ccopy_vfpv3.S b/kernel/arm/ccopy_vfpv3.S new file mode 100644 index 000000000..aaba7825e --- /dev/null +++ b/kernel/arm/ccopy_vfpv3.S @@ -0,0 +1,222 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/07 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define N r0 +#define X r1 +#define INC_X r2 +#define OLD_Y r3 + + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define OLD_INC_Y [fp, #4 ] + +#define I r5 +#define Y r6 +#define INC_Y r7 + +#define X_PRE 256 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro COPY_F4 + + pld [ X, #X_PRE ] + fldmias X!, { s0 - s7 } + fstmias Y!, { s0 - s7 } + +.endm + +.macro COPY_F1 + + fldmias X!, { s0 - s1 } + fstmias Y!, { s0 - s1 } + +.endm + + +/*************************************************************************************************************************/ + +.macro COPY_S4 + + nop + fldmias X, { s0 - s1 } + fstmias Y, { s0 - s1 } + add X, X, INC_X + add Y, Y, INC_Y + + fldmias X, { s2 - s3 } + fstmias Y, { s2 - s3 } + add X, X, INC_X + add Y, Y, INC_Y + + fldmias X, { s0 - s1 } + fstmias Y, { s0 - s1 } + add X, X, INC_X + add Y, Y, INC_Y + + fldmias X, { s2 - s3 } + fstmias Y, { s2 - s3 } + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + +.macro COPY_S1 + + fldmias X, { s0 - s1 } + fstmias Y, { s0 - s1 } + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + sub r4, fp, #128 + vstm r4, { s8 - s15} // store floating point registers + + mov Y, OLD_Y + ldr INC_Y, OLD_INC_Y + + cmp N, #0 + ble ccopy_kernel_L999 + + cmp INC_X, #0 + beq ccopy_kernel_L999 + + cmp INC_Y, #0 + beq ccopy_kernel_L999 + + cmp INC_X, #1 + bne ccopy_kernel_S_BEGIN + + cmp INC_Y, #1 + bne ccopy_kernel_S_BEGIN + +ccopy_kernel_F_BEGIN: + + asrs I, N, #2 // I = N / 4 + ble ccopy_kernel_F1 + +ccopy_kernel_F4: + + COPY_F4 + + subs I, I, #1 + bne ccopy_kernel_F4 + +ccopy_kernel_F1: + + ands I, N, #3 + ble ccopy_kernel_L999 + +ccopy_kernel_F10: + + COPY_F1 + + subs I, I, #1 + bne ccopy_kernel_F10 + + b ccopy_kernel_L999 + +ccopy_kernel_S_BEGIN: + + lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 + lsl INC_Y, INC_Y, #3 // INC_Y * SIZE * 2 + + asrs I, N, #2 // I = N / 4 + ble ccopy_kernel_S1 + +ccopy_kernel_S4: + + COPY_S4 + + subs I, I, #1 + bne ccopy_kernel_S4 + +ccopy_kernel_S1: + + ands I, N, #3 + ble ccopy_kernel_L999 + +ccopy_kernel_S10: + + COPY_S1 + + subs I, I, #1 + bne ccopy_kernel_S10 + + + + + + +ccopy_kernel_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s15} // restore floating point registers + + mov r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/dcopy_vfpv3.S b/kernel/arm/dcopy_vfpv3.S new file mode 100644 index 000000000..0fad3c4a6 --- /dev/null +++ b/kernel/arm/dcopy_vfpv3.S @@ -0,0 +1,222 @@ 
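Note: the ccopy kernel above, and the dcopy, scopy and zcopy files that follow, all share one dispatch. Bail out for n <= 0 or a zero increment, take the bulk fldmia/fstmia path when both increments are one, and otherwise scale the increments to byte strides and copy element by element. A C sketch of that dispatch (ccopy shown; the other variants differ only in element type and unroll width):

    /* Sketch of the shared copy-kernel dispatch (complex single shown).
     * Fast path mirrors COPY_F4/COPY_F1, strided path COPY_S4/COPY_S1. */
    static int ccopy_sketch(long n, const float *x, long inc_x,
                            float *y, long inc_y)
    {
        if (n <= 0 || inc_x == 0 || inc_y == 0)
            return 0;                         /* early exit, as above      */
        if (inc_x == 1 && inc_y == 1) {       /* contiguous: COPY_F4/F1    */
            for (long i = 0; i < 2 * n; i++)  /* 2 floats per complex      */
                y[i] = x[i];
        } else {                              /* strided: COPY_S4/S1       */
            for (long i = 0; i < n; i++) {
                y[0] = x[0];
                y[1] = x[1];
                x += 2 * inc_x;               /* lsl INC_X, INC_X, #3      */
                y += 2 * inc_y;               /* lsl INC_Y, INC_Y, #3      */
            }
        }
        return 0;
    }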
+/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/07 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define N r0 +#define X r1 +#define INC_X r2 +#define OLD_Y r3 + + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define OLD_INC_Y [fp, #4 ] + +#define I r5 +#define Y r6 +#define INC_Y r7 + +#define X_PRE 256 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro COPY_F4 + + pld [ X, #X_PRE ] + fldmiad X!, { d0 - d3 } + fstmiad Y!, { d0 - d3 } + +.endm + +.macro COPY_F1 + + fldmiad X!, { d0 } + fstmiad Y!, { d0 } + +.endm + + +/*************************************************************************************************************************/ + +.macro COPY_S4 + + nop + fldmiad X, { d0 } + fstmiad Y, { d0 } + add X, X, INC_X + add Y, Y, INC_Y + + fldmiad X, { d1 } + fstmiad Y, { d1 } + add X, X, INC_X + add Y, Y, INC_Y + + fldmiad X, { d0 } + fstmiad Y, { d0 } + add X, X, INC_X + add Y, Y, INC_Y + + fldmiad X, { d1 } + fstmiad Y, { d1 } + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + +.macro COPY_S1 + + fldmiad X, { d0 } + fstmiad Y, { d0 } + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 
+ + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + sub r4, fp, #128 + vstm r4, { d8 - d15} // store floating point registers + + mov Y, OLD_Y + ldr INC_Y, OLD_INC_Y + + cmp N, #0 + ble dcopy_kernel_L999 + + cmp INC_X, #0 + beq dcopy_kernel_L999 + + cmp INC_Y, #0 + beq dcopy_kernel_L999 + + cmp INC_X, #1 + bne dcopy_kernel_S_BEGIN + + cmp INC_Y, #1 + bne dcopy_kernel_S_BEGIN + +dcopy_kernel_F_BEGIN: + + asrs I, N, #2 // I = N / 4 + ble dcopy_kernel_F1 + +dcopy_kernel_F4: + + COPY_F4 + + subs I, I, #1 + bne dcopy_kernel_F4 + +dcopy_kernel_F1: + + ands I, N, #3 + ble dcopy_kernel_L999 + +dcopy_kernel_F10: + + COPY_F1 + + subs I, I, #1 + bne dcopy_kernel_F10 + + b dcopy_kernel_L999 + +dcopy_kernel_S_BEGIN: + + lsl INC_X, INC_X, #3 // INC_X * SIZE + lsl INC_Y, INC_Y, #3 // INC_Y * SIZE + + asrs I, N, #2 // I = N / 4 + ble dcopy_kernel_S1 + +dcopy_kernel_S4: + + COPY_S4 + + subs I, I, #1 + bne dcopy_kernel_S4 + +dcopy_kernel_S1: + + ands I, N, #3 + ble dcopy_kernel_L999 + +dcopy_kernel_S10: + + COPY_S1 + + subs I, I, #1 + bne dcopy_kernel_S10 + + + + + + +dcopy_kernel_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore floating point registers + + mov r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/scopy_vfpv3.S b/kernel/arm/scopy_vfpv3.S new file mode 100644 index 000000000..e6ceaf2fb --- /dev/null +++ b/kernel/arm/scopy_vfpv3.S @@ -0,0 +1,224 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/07 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define N r0 +#define X r1 +#define INC_X r2 +#define OLD_Y r3 + + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define OLD_INC_Y [fp, #4 ] + +#define I r5 +#define Y r6 +#define INC_Y r7 + +#define X_PRE 256 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro COPY_F8 + + pld [ X, #X_PRE ] + fldmias X!, { s0 - s3 } + fldmias X!, { s4 - s7 } + fstmias Y!, { s0 - s3 } + fstmias Y!, { s4 - s7 } + +.endm + +.macro COPY_F1 + + fldmias X!, { s0 } + fstmias Y!, { s0 } + +.endm + + +/*************************************************************************************************************************/ + +.macro COPY_S4 + + nop + fldmias X, { s0 } + fstmias Y, { s0 } + add X, X, INC_X + add Y, Y, INC_Y + + fldmias X, { s1 } + fstmias Y, { s1 } + add X, X, INC_X + add Y, Y, INC_Y + + fldmias X, { s0 } + fstmias Y, { s0 } + add X, X, INC_X + add Y, Y, INC_Y + + fldmias X, { s1 } + fstmias Y, { s1 } + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + +.macro COPY_S1 + + fldmias X, { s0 } + fstmias Y, { s0 } + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + sub r4, fp, #128 + vstm r4, { s8 - s15} // store floating point registers + + mov Y, OLD_Y + ldr INC_Y, OLD_INC_Y + + cmp N, #0 + ble scopy_kernel_L999 + + cmp INC_X, #0 + beq scopy_kernel_L999 + + cmp INC_Y, #0 + beq scopy_kernel_L999 + + cmp INC_X, #1 + bne scopy_kernel_S_BEGIN + + cmp INC_Y, #1 + bne scopy_kernel_S_BEGIN + +scopy_kernel_F_BEGIN: + + asrs I, N, #3 // I = N / 8 + ble scopy_kernel_F1 + +scopy_kernel_F8: + + COPY_F8 + + subs I, I, #1 + bne scopy_kernel_F8 + +scopy_kernel_F1: + + ands I, N, #7 + ble scopy_kernel_L999 + +scopy_kernel_F10: + + COPY_F1 + + subs I, I, #1 + bne scopy_kernel_F10 + + b scopy_kernel_L999 + +scopy_kernel_S_BEGIN: + + lsl INC_X, INC_X, #2 // INC_X * SIZE + lsl INC_Y, INC_Y, #2 // INC_Y * SIZE + + asrs I, N, #2 // I = N / 4 + ble scopy_kernel_S1 + +scopy_kernel_S4: + + COPY_S4 + + subs I, I, #1 + bne scopy_kernel_S4 + +scopy_kernel_S1: + + ands I, N, #3 + ble scopy_kernel_L999 + +scopy_kernel_S10: + + COPY_S1 + + subs I, I, #1 + bne scopy_kernel_S10 + + + + + + +scopy_kernel_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s15} // restore floating point registers + + mov r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/zcopy_vfpv3.S b/kernel/arm/zcopy_vfpv3.S new file mode 100644 index 000000000..06f892446 --- /dev/null +++ b/kernel/arm/zcopy_vfpv3.S @@ -0,0 +1,223 @@ 
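All three copy kernels in this group (dcopy and scopy above, zcopy below) implement the plain BLAS copy contract. As a reference sketch only — BLASLONG and FLOAT are the types common.h provides, and for zcopy each *y = *x should be read as moving a (re, im) pair of doubles — the semantics are roughly:

	static int copy_ref(BLASLONG n, FLOAT *x, BLASLONG inc_x,
	                    FLOAT *y, BLASLONG inc_y)
	{
	    BLASLONG i;
	    if (n <= 0 || inc_x == 0 || inc_y == 0)
	        return 0;              // mirrors the early branches to L999
	    for (i = 0; i < n; i++) {
	        *y = *x;               // for zcopy: one (re,im) pair per step
	        x += inc_x;
	        y += inc_y;
	    }
	    return 0;                  // matches "mov r0, #0" in the epilogue
	}

The assembly specializes the inc_x == 1 && inc_y == 1 case into the unrolled, pld-prefetched COPY_F* path and handles everything else in the strided COPY_S* path, after pre-scaling both increments by the element size (hence lsl #3, #2 and #4 for double, single and double complex respectively).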
+/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/07 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define N r0 +#define X r1 +#define INC_X r2 +#define OLD_Y r3 + + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define OLD_INC_Y [fp, #4 ] + +#define I r5 +#define Y r6 +#define INC_Y r7 + +#define X_PRE 256 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro COPY_F4 + + pld [ X, #X_PRE ] + pld [ X, #X_PRE+32 ] + fldmiad X!, { d0 - d7 } + fstmiad Y!, { d0 - d7 } + +.endm + +.macro COPY_F1 + + fldmiad X!, { d0 - d1 } + fstmiad Y!, { d0 - d1 } + +.endm + + +/*************************************************************************************************************************/ + +.macro COPY_S4 + + nop + fldmiad X, { d0 - d1 } + fstmiad Y, { d0 - d1 } + add X, X, INC_X + add Y, Y, INC_Y + + fldmiad X, { d2 - d3 } + fstmiad Y, { d2 - d3 } + add X, X, INC_X + add Y, Y, INC_Y + + fldmiad X, { d0 - d1 } + fstmiad Y, { d0 - d1 } + add X, X, INC_X + add Y, Y, INC_Y + + fldmiad X, { d2 - d3 } + fstmiad Y, { d2 - d3 } + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + +.macro COPY_S1 + + fldmiad X, { d0 - d1 } + fstmiad Y, { d0 - d1 } + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + + +/************************************************************************************** +* End of macro definitions 
+**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + sub r4, fp, #128 + vstm r4, { d8 - d15} // store floating point registers + + mov Y, OLD_Y + ldr INC_Y, OLD_INC_Y + + cmp N, #0 + ble zcopy_kernel_L999 + + cmp INC_X, #0 + beq zcopy_kernel_L999 + + cmp INC_Y, #0 + beq zcopy_kernel_L999 + + cmp INC_X, #1 + bne zcopy_kernel_S_BEGIN + + cmp INC_Y, #1 + bne zcopy_kernel_S_BEGIN + +zcopy_kernel_F_BEGIN: + + asrs I, N, #2 // I = N / 4 + ble zcopy_kernel_F1 + +zcopy_kernel_F4: + + COPY_F4 + + subs I, I, #1 + bne zcopy_kernel_F4 + +zcopy_kernel_F1: + + ands I, N, #3 + ble zcopy_kernel_L999 + +zcopy_kernel_F10: + + COPY_F1 + + subs I, I, #1 + bne zcopy_kernel_F10 + + b zcopy_kernel_L999 + +zcopy_kernel_S_BEGIN: + + lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 + lsl INC_Y, INC_Y, #4 // INC_Y * SIZE * 2 + + asrs I, N, #2 // I = N / 4 + ble zcopy_kernel_S1 + +zcopy_kernel_S4: + + COPY_S4 + + subs I, I, #1 + bne zcopy_kernel_S4 + +zcopy_kernel_S1: + + ands I, N, #3 + ble zcopy_kernel_L999 + +zcopy_kernel_S10: + + COPY_S1 + + subs I, I, #1 + bne zcopy_kernel_S10 + + + + + + +zcopy_kernel_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore floating point registers + + mov r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + From c8f1aeb1549c723ac33124b8cd66522881300e47 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 7 Nov 2013 17:22:03 +0100 Subject: [PATCH 048/127] added optimized blas level1 dot kernels for single and double precision --- kernel/arm/KERNEL.ARMV7 | 29 ++-- kernel/arm/ddot_vfpv3.S | 238 +++++++++++++++++++++++++++ kernel/arm/sdot_vfpv3.S | 347 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 597 insertions(+), 17 deletions(-) create mode 100644 kernel/arm/ddot_vfpv3.S create mode 100644 kernel/arm/sdot_vfpv3.S diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index 10bc4620a..7387be1a2 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -45,13 +45,13 @@ DAXPYKERNEL = ../arm/axpy.c CAXPYKERNEL = ../arm/zaxpy.c ZAXPYKERNEL = ../arm/zaxpy.c -SCOPYKERNEL = ../arm/copy.c -DCOPYKERNEL = ../arm/copy.c -CCOPYKERNEL = ../arm/zcopy.c -ZCOPYKERNEL = ../arm/zcopy.c +SCOPYKERNEL = scopy_vfpv3.S +DCOPYKERNEL = dcopy_vfpv3.S +CCOPYKERNEL = ccopy_vfpv3.S +ZCOPYKERNEL = zcopy_vfpv3.S -SDOTKERNEL = ../arm/dot.c -DDOTKERNEL = ../arm/dot.c +SDOTKERNEL = sdot_vfpv3.S +DDOTKERNEL = ddot_vfpv3.S CDOTKERNEL = ../arm/zdot.c ZDOTKERNEL = ../arm/zdot.c @@ -108,15 +108,15 @@ DGEMMOTCOPYOBJ = dgemm_otcopy.o CGEMMKERNEL = cgemm_kernel_2x2_vfpv3.S CGEMMONCOPY = cgemm_ncopy_2_vfpv3.S -CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -CGEMMONCOPYOBJ = cgemm_oncopy.o -CGEMMOTCOPYOBJ = cgemm_otcopy.o +CGEMMOTCOPY = cgemm_tcopy_2_vfpv3.S +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o ZGEMMKERNEL = zgemm_kernel_2x2_vfpv3.S ZGEMMONCOPY = zgemm_ncopy_2_vfpv3.S -ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -ZGEMMONCOPYOBJ = zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o +ZGEMMOTCOPY = zgemm_tcopy_2_vfpv3.S +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c @@ -138,9 +138,4 @@ ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c -CGEMM3MKERNEL = zgemm3m_kernel_8x4_core2.S -ZGEMM3MKERNEL = 
zgemm3m_kernel_4x4_core2.S - - - diff --git a/kernel/arm/ddot_vfpv3.S b/kernel/arm/ddot_vfpv3.S new file mode 100644 index 000000000..12d9e218b --- /dev/null +++ b/kernel/arm/ddot_vfpv3.S @@ -0,0 +1,238 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/07 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define N r0 +#define X r1 +#define INC_X r2 +#define OLD_Y r3 + + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define OLD_INC_Y [fp, #4 ] + +#define I r5 +#define Y r6 +#define INC_Y r7 + +#define X_PRE 256 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + pld [ Y, #X_PRE ] + fldmiad X!, { d8 - d9 } + fldmiad Y!, { d4 - d5} + fmacd d0 , d4, d8 + fldmiad X!, { d10 - d11 } + fmacd d1 , d5, d9 + fldmiad Y!, { d6 - d7 } + fmacd d0 , d6, d10 + fmacd d1 , d7, d11 + +.endm + +.macro KERNEL_F1 + + fldmiad X!, { d4 } + fldmiad Y!, { d8 } + fmacd d0 , d4, d8 + +.endm + + +/*************************************************************************************************************************/ + +.macro KERNEL_S4 + + nop + fldmiad X, { d4 } + fldmiad Y, { d8 } + add X, X, INC_X + add Y, Y, INC_Y + fmacd d0 , d4, d8 + + fldmiad X, { d5 } + fldmiad Y, { d9 } + add X, X, INC_X + add Y, Y, INC_Y + fmacd d1 , d5, d9 + + fldmiad X, { d6 } + fldmiad Y, { d10 } + add X, X, INC_X + add Y, Y, INC_Y + fmacd d0 , d6, d10 + + fldmiad X, { d7 } + fldmiad Y, { d11 } + add X, X, INC_X + add Y, Y, INC_Y + fmacd d1 , d7, d11 + +.endm + + +.macro KERNEL_S1 + + fldmiad X, { d4 } + fldmiad Y, { d8 } + add X, X, INC_X + fmacd d0 , d4, d8 + add Y, Y, INC_Y + +.endm + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + sub r4, fp, #128 + vstm r4, { d8 - d15} // store floating point registers + + mov Y, OLD_Y + ldr INC_Y, OLD_INC_Y + + vsub.f64 d0 , d0 , d0 + vsub.f64 d1 , d1 , d1 + + cmp N, #0 + ble ddot_kernel_L999 + + cmp INC_X, #0 + beq ddot_kernel_L999 + + cmp INC_Y, #0 + beq ddot_kernel_L999 + + cmp INC_X, #1 + bne ddot_kernel_S_BEGIN + + cmp INC_Y, #1 + bne ddot_kernel_S_BEGIN + +ddot_kernel_F_BEGIN: + + asrs I, N, #2 // I = N / 4 + ble ddot_kernel_F1 + +ddot_kernel_F4: + + KERNEL_F4 + + subs I, I, #1 + bne ddot_kernel_F4 + +ddot_kernel_F1: + + ands I, N, #3 + ble ddot_kernel_L999 + +ddot_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne ddot_kernel_F10 + + b ddot_kernel_L999 + +ddot_kernel_S_BEGIN: + + lsl INC_X, INC_X, #3 // INC_X * SIZE + lsl INC_Y, INC_Y, #3 // INC_Y * SIZE + + asrs I, N, #2 // I = N / 4 + ble ddot_kernel_S1 + +ddot_kernel_S4: + + KERNEL_S4 + + subs I, I, #1 + bne ddot_kernel_S4 + +ddot_kernel_S1: + + ands I, N, #3 + ble ddot_kernel_L999 + +ddot_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne ddot_kernel_S10 + + + + + + +ddot_kernel_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore floating point registers + + vadd.f64 d0 , d0, d1 // set return value + sub sp, fp, #24 + 
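+	// the vadd above folds the two partial sums that the unrolled loop kept
+	// in d0 and d1, so back-to-back fmacd results never chain on a single
+	// accumulator; the combined dot product is handed back in d0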
pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/sdot_vfpv3.S b/kernel/arm/sdot_vfpv3.S new file mode 100644 index 000000000..164387482 --- /dev/null +++ b/kernel/arm/sdot_vfpv3.S @@ -0,0 +1,347 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/07 Saar +* BLASTEST : OK +* CTEST : OK (no test for dsdot) +* TEST : OK (no test for dsdot) +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define N r0 +#define X r1 +#define INC_X r2 +#define OLD_Y r3 + + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define OLD_INC_Y [fp, #4 ] + +#define I r5 +#define Y r6 +#define INC_Y r7 + +#define X_PRE 256 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +#if defined(DSDOT) + +.macro KERNEL_F4 + + fldmias X!, { s14 } + fldmias Y!, { s15 } + vmul.f32 s15, s14, s15 + vcvt.f64.f32 d16, s15 + vadd.f64 d0 , d0, d16 + + fldmias X!, { s14 } + fldmias Y!, { s15 } + vmul.f32 s15, s14, s15 + vcvt.f64.f32 d16, s15 + vadd.f64 d0 , d0, d16 + + fldmias X!, { s14 } + fldmias Y!, { s15 } + vmul.f32 s15, s14, s15 + vcvt.f64.f32 d16, s15 + vadd.f64 d0 , d0, d16 + + fldmias X!, { s14 } + fldmias Y!, { s15 } + vmul.f32 s15, s14, s15 + vcvt.f64.f32 d16, s15 + vadd.f64 d0 , d0, d16 + +.endm + +.macro KERNEL_F1 + + fldmias X!, { s14 } + fldmias Y!, { s15 } + vmul.f32 s15, s14, s15 + vcvt.f64.f32 d16, s15 + vadd.f64 d0 , d0, d16 + +.endm + + +.macro KERNEL_S4 + + nop + + fldmias X, { s14 } + fldmias Y, { s15 } + vmul.f32 s15, s14, s15 + vcvt.f64.f32 d16, s15 + vadd.f64 d0 , d0, d16 + add X, X, INC_X + add Y, Y, INC_Y + + fldmias X, { s14 } + fldmias Y, { s15 } + vmul.f32 s15, s14, s15 + vcvt.f64.f32 d16, s15 + vadd.f64 d0 , d0, d16 + add X, X, INC_X + add Y, Y, INC_Y + + fldmias X, { s14 } + fldmias Y, { s15 } + vmul.f32 s15, s14, s15 + vcvt.f64.f32 d16, s15 + vadd.f64 d0 , d0, d16 + add X, X, INC_X + add Y, Y, INC_Y + + fldmias X, { s14 } + fldmias Y, { s15 } + vmul.f32 s15, s14, s15 + vcvt.f64.f32 d16, s15 + vadd.f64 d0 , d0, d16 + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + +.macro KERNEL_S1 + + fldmias X, { s14 } + fldmias Y, { s15 } + vmul.f32 s15, s14, s15 + vcvt.f64.f32 d16, s15 + vadd.f64 d0 , d0, d16 + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + + +#else + +.macro KERNEL_F4 + + fldmias X!, { s8 - s9 } + fldmias Y!, { s4 - s5} + fmacs s0 , s4, s8 + fldmias X!, { s10 - s11 } + fmacs s1 , s5, s9 + fldmias Y!, { s6 - s7 } + fmacs s0 , s6, s10 + fmacs s1 , s7, s11 + +.endm + +.macro KERNEL_F1 + + fldmias X!, { s4 } + fldmias Y!, { s8 } + fmacs s0 , s4, s8 + +.endm + + +.macro KERNEL_S4 + + nop + fldmias X, { s4 } + fldmias Y, { s8 } + add X, X, INC_X + add Y, Y, INC_Y + fmacs s0 , s4, s8 + + fldmias X, { s5 } + fldmias Y, { s9 } + add X, X, INC_X + add Y, Y, INC_Y + fmacs s1 , s5, s9 + + fldmias X, { s6 } + fldmias Y, { s10 } + add X, X, INC_X + add Y, Y, INC_Y + fmacs s0 , s6, s10 + + fldmias X, { s7 } + fldmias Y, { s11 } + add X, X, INC_X + add Y, Y, INC_Y + fmacs s1 , s7, s11 + +.endm + + +.macro KERNEL_S1 + + fldmias X, { s4 } + fldmias Y, { s8 } + add X, X, INC_X + fmacs s0 , s4, s8 + add Y, Y, INC_Y + +.endm + +#endif + +/************************************************************************************** +* End of macro definitions 
+**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + sub r4, fp, #128 + vstm r4, { s8 - s15 } // store floating point registers + + mov Y, OLD_Y + ldr INC_Y, OLD_INC_Y + +#if defined(DSDOT) + + vsub.f64 d0 , d0 , d0 + vsub.f64 d1 , d1 , d1 + +#else + + vsub.f32 s0 , s0 , s0 + vsub.f32 s1 , s1 , s1 + +#endif + + cmp N, #0 + ble sdot_kernel_L999 + + cmp INC_X, #0 + beq sdot_kernel_L999 + + cmp INC_Y, #0 + beq sdot_kernel_L999 + + cmp INC_X, #1 + bne sdot_kernel_S_BEGIN + + cmp INC_Y, #1 + bne sdot_kernel_S_BEGIN + +sdot_kernel_F_BEGIN: + + asrs I, N, #2 // I = N / 4 + ble sdot_kernel_F1 + +sdot_kernel_F4: + + KERNEL_F4 + + subs I, I, #1 + bne sdot_kernel_F4 + +sdot_kernel_F1: + + ands I, N, #3 + ble sdot_kernel_L999 + +sdot_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne sdot_kernel_F10 + + b sdot_kernel_L999 + +sdot_kernel_S_BEGIN: + + lsl INC_X, INC_X, #2 // INC_X * SIZE + lsl INC_Y, INC_Y, #2 // INC_Y * SIZE + + asrs I, N, #2 // I = N / 4 + ble sdot_kernel_S1 + +sdot_kernel_S4: + + KERNEL_S4 + + subs I, I, #1 + bne sdot_kernel_S4 + +sdot_kernel_S1: + + ands I, N, #3 + ble sdot_kernel_L999 + +sdot_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne sdot_kernel_S10 + + + + + + +sdot_kernel_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s15} // restore floating point registers + +#if defined(DSDOT) + + vadd.f64 d0 , d0, d1 // set return value + +#else + + vadd.f32 s0 , s0, s1 // set return value + +#endif + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + From 5b36cc0f4725e4cff6f818dd7de4690a9a28c15a Mon Sep 17 00:00:00 2001 From: wernsaar Date: Fri, 8 Nov 2013 09:08:11 +0100 Subject: [PATCH 049/127] added blas level1 dot kernels for complex and double complex --- kernel/arm/KERNEL.ARMV7 | 4 +- kernel/arm/cdot_vfpv3.S | 284 +++++++++++++++++++++++++++++++++++++++ kernel/arm/zdot_vfpv3.S | 286 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 572 insertions(+), 2 deletions(-) create mode 100644 kernel/arm/cdot_vfpv3.S create mode 100644 kernel/arm/zdot_vfpv3.S diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index 7387be1a2..a6b1d67d2 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -52,8 +52,8 @@ ZCOPYKERNEL = zcopy_vfpv3.S SDOTKERNEL = sdot_vfpv3.S DDOTKERNEL = ddot_vfpv3.S -CDOTKERNEL = ../arm/zdot.c -ZDOTKERNEL = ../arm/zdot.c +CDOTKERNEL = cdot_vfpv3.S +ZDOTKERNEL = zdot_vfpv3.S SNRM2KERNEL = ../arm/nrm2.c DNRM2KERNEL = ../arm/nrm2.c diff --git a/kernel/arm/cdot_vfpv3.S b/kernel/arm/cdot_vfpv3.S new file mode 100644 index 000000000..261808916 --- /dev/null +++ b/kernel/arm/cdot_vfpv3.S @@ -0,0 +1,284 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/08 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define N r0 +#define X r1 +#define INC_X r2 +#define OLD_Y r3 + + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define OLD_INC_Y [fp, #4 ] + +#define I r5 +#define Y r6 +#define INC_Y r7 + +#define X_PRE 256 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + pld [ Y, #X_PRE ] + + fldmias X!, { s4 - s5 } + fldmias Y!, { s8 - s9 } + fmacs s0 , s4, s8 + fmacs s1 , s4, s9 + fldmias X!, { s6 - s7 } + fmacs s2 , s5, s9 + fmacs s3 , s5, s8 + + fldmias Y!, { s10 - s11 } + fmacs s0 , s6, s10 + fmacs s1 , s6, s11 + fmacs s2 , s7, s11 + fmacs s3 , s7, s10 + + + fldmias X!, { s4 - s5 } + fldmias Y!, { s8 - s9 } + fmacs s0 , s4, s8 + fmacs s1 , s4, s9 + fldmias X!, { s6 - s7 } + fmacs s2 , s5, s9 + fmacs s3 , s5, s8 + + fldmias Y!, { s10 - s11 } + fmacs s0 , s6, s10 + fmacs s1 , s6, s11 + fmacs s2 , s7, s11 + fmacs s3 , s7, s10 + +.endm + +.macro KERNEL_F1 + + fldmias X!, { s4 - s5 } + fldmias Y!, { s8 - s9 } + fmacs s0 , s4, s8 + fmacs s1 , s4, s9 + fmacs s2 , s5, s9 + fmacs s3 , s5, s8 + +.endm + + +/*************************************************************************************************************************/ + +.macro KERNEL_S4 + + nop + + fldmias X, { s4 - s5 } + fldmias Y, { s8 - s9 } + fmacs s0 , s4, s8 + fmacs s1 , s4, s9 + fmacs s2 , s5, s9 + fmacs s3 , s5, s8 + add X, X, INC_X + add Y, Y, INC_Y + + fldmias X, { s4 - s5 } + fldmias Y, { s8 - s9 } + fmacs s0 , s4, s8 + fmacs s1 , s4, s9 + fmacs s2 , s5, s9 + fmacs s3 , s5, s8 + add X, X, INC_X + add Y, Y, INC_Y + + fldmias X, { s4 - s5 } + fldmias Y, { s8 - s9 } + fmacs s0 , s4, s8 + fmacs s1 , s4, s9 + fmacs s2 , s5, s9 + fmacs s3 , s5, s8 + add X, X, INC_X + add Y, Y, INC_Y + + fldmias X, { s4 - s5 } + fldmias Y, { s8 - s9 } + fmacs s0 , s4, s8 + fmacs s1 , s4, s9 + fmacs s2 , s5, s9 + fmacs s3 , s5, s8 + add X, X, INC_X + add Y, Y, INC_Y + +.endm + 
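+/* Taken together, the kernels above and below accumulate four partial
+   sums; in C terms:
+       s0 += x_re * y_re;   s1 += x_re * y_im;
+       s2 += x_im * y_im;   s3 += x_im * y_re;
+   so the epilogue can build either variant of the complex dot product:
+   re = s0 - s2, im = s1 + s3 for the plain CDOTU case, or re = s0 + s2,
+   im = s1 - s3 for the conjugated CDOTC case, selected by CONJ. */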
+ +.macro KERNEL_S1 + + fldmias X, { s4 - s5 } + fldmias Y, { s8 - s9 } + fmacs s0 , s4, s8 + fmacs s1 , s4, s9 + fmacs s2 , s5, s9 + fmacs s3 , s5, s8 + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + sub r4, fp, #128 + vstm r4, { s8 - s15} // store floating point registers + + mov Y, OLD_Y + ldr INC_Y, OLD_INC_Y + + vsub.f32 s0 , s0 , s0 + vsub.f32 s1 , s1 , s1 + vsub.f32 s2 , s2 , s2 + vsub.f32 s3 , s3 , s3 + + cmp N, #0 + ble cdot_kernel_L999 + + cmp INC_X, #0 + beq cdot_kernel_L999 + + cmp INC_Y, #0 + beq cdot_kernel_L999 + + cmp INC_X, #1 + bne cdot_kernel_S_BEGIN + + cmp INC_Y, #1 + bne cdot_kernel_S_BEGIN + +cdot_kernel_F_BEGIN: + + asrs I, N, #2 // I = N / 4 + ble cdot_kernel_F1 + +cdot_kernel_F4: + + KERNEL_F4 + + subs I, I, #1 + bne cdot_kernel_F4 + +cdot_kernel_F1: + + ands I, N, #3 + ble cdot_kernel_L999 + +cdot_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne cdot_kernel_F10 + + b cdot_kernel_L999 + +cdot_kernel_S_BEGIN: + + lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 + lsl INC_Y, INC_Y, #3 // INC_Y * SIZE * 2 + + asrs I, N, #2 // I = N / 4 + ble cdot_kernel_S1 + +cdot_kernel_S4: + + KERNEL_S4 + + subs I, I, #1 + bne cdot_kernel_S4 + +cdot_kernel_S1: + + ands I, N, #3 + ble cdot_kernel_L999 + +cdot_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne cdot_kernel_S10 + + + +cdot_kernel_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s15} // restore floating point registers + +#if !defined(CONJ) + vsub.f32 s0 , s0, s2 + vadd.f32 s1 , s1, s3 +#else + vadd.f32 s0 , s0, s2 + vsub.f32 s1 , s1, s3 +#endif + + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/zdot_vfpv3.S b/kernel/arm/zdot_vfpv3.S new file mode 100644 index 000000000..2aa9171b8 --- /dev/null +++ b/kernel/arm/zdot_vfpv3.S @@ -0,0 +1,286 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/08 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define N r0 +#define X r1 +#define INC_X r2 +#define OLD_Y r3 + + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define OLD_INC_Y [fp, #4 ] + +#define I r5 +#define Y r6 +#define INC_Y r7 + +#define X_PRE 256 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + pld [ Y, #X_PRE ] + + fldmiad X!, { d4 - d5 } + fldmiad Y!, { d8 - d9 } + fmacd d0 , d4, d8 + fmacd d1 , d4, d9 + fldmiad X!, { d6 - d7 } + fmacd d2 , d5, d9 + fmacd d3 , d5, d8 + + fldmiad Y!, { d10 - d11 } + fmacd d0 , d6, d10 + fmacd d1 , d6, d11 + pld [ X, #X_PRE ] + fmacd d2 , d7, d11 + fmacd d3 , d7, d10 + + pld [ Y, #X_PRE ] + + fldmiad X!, { d4 - d5 } + fldmiad Y!, { d8 - d9 } + fmacd d0 , d4, d8 + fmacd d1 , d4, d9 + fldmiad X!, { d6 - d7 } + fmacd d2 , d5, d9 + fmacd d3 , d5, d8 + + fldmiad Y!, { d10 - d11 } + fmacd d0 , d6, d10 + fmacd d1 , d6, d11 + fmacd d2 , d7, d11 + fmacd d3 , d7, d10 + +.endm + +.macro KERNEL_F1 + + fldmiad X!, { d4 - d5 } + fldmiad Y!, { d8 - d9 } + fmacd d0 , d4, d8 + fmacd d1 , d4, d9 + fmacd d2 , d5, d9 + fmacd d3 , d5, d8 + +.endm + + +/*************************************************************************************************************************/ + +.macro KERNEL_S4 + + nop + + fldmiad X, { d4 - d5 } + fldmiad Y, { d8 - d9 } + fmacd d0 , d4, d8 + fmacd d1 , d4, d9 + fmacd d2 , d5, d9 + fmacd d3 , d5, d8 + add X, X, INC_X + add Y, Y, INC_Y + + fldmiad X, { d4 - d5 } + fldmiad Y, { d8 - d9 } + fmacd d0 , d4, d8 + fmacd d1 , d4, d9 + fmacd d2 , d5, d9 + fmacd d3 , d5, d8 + add X, X, INC_X + add Y, Y, INC_Y + + fldmiad X, { d4 - d5 } + fldmiad Y, { d8 - d9 } + fmacd d0 , d4, d8 + fmacd d1 , d4, d9 + fmacd d2 , d5, d9 + fmacd d3 , d5, d8 + add X, X, INC_X + add Y, Y, INC_Y + + fldmiad X, { d4 - d5 } + fldmiad Y, { d8 - d9 } + fmacd d0 , d4, d8 + fmacd d1 , d4, d9 + fmacd d2 , d5, d9 + fmacd d3 , d5, d8 + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + +.macro KERNEL_S1 + + fldmiad X, { d4 - d5 } + fldmiad Y, { d8 - d9 } + fmacd d0 , d4, d8 + fmacd d1 , d4, d9 + fmacd d2 , d5, d9 + fmacd d3 , d5, d8 + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + + +/************************************************************************************** +* End of macro definitions 
+**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + sub r4, fp, #128 + vstm r4, { d8 - d15} // store floating point registers + + mov Y, OLD_Y + ldr INC_Y, OLD_INC_Y + + vsub.f64 d0 , d0 , d0 + vsub.f64 d1 , d1 , d1 + vsub.f64 d2 , d2 , d2 + vsub.f64 d3 , d3 , d3 + + cmp N, #0 + ble zdot_kernel_L999 + + cmp INC_X, #0 + beq zdot_kernel_L999 + + cmp INC_Y, #0 + beq zdot_kernel_L999 + + cmp INC_X, #1 + bne zdot_kernel_S_BEGIN + + cmp INC_Y, #1 + bne zdot_kernel_S_BEGIN + +zdot_kernel_F_BEGIN: + + asrs I, N, #2 // I = N / 4 + ble zdot_kernel_F1 + +zdot_kernel_F4: + + KERNEL_F4 + + subs I, I, #1 + bne zdot_kernel_F4 + +zdot_kernel_F1: + + ands I, N, #3 + ble zdot_kernel_L999 + +zdot_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne zdot_kernel_F10 + + b zdot_kernel_L999 + +zdot_kernel_S_BEGIN: + + lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 + lsl INC_Y, INC_Y, #4 // INC_Y * SIZE * 2 + + asrs I, N, #2 // I = N / 4 + ble zdot_kernel_S1 + +zdot_kernel_S4: + + KERNEL_S4 + + subs I, I, #1 + bne zdot_kernel_S4 + +zdot_kernel_S1: + + ands I, N, #3 + ble zdot_kernel_L999 + +zdot_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne zdot_kernel_S10 + + + +zdot_kernel_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore floating point registers + +#if !defined(CONJ) + vsub.f64 d0 , d0, d2 + vadd.f64 d1 , d1, d3 +#else + vadd.f64 d0 , d0, d2 + vsub.f64 d1 , d1, d3 +#endif + + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + From 00f33c0134da349d9809ff8d9ad334d43381b91e Mon Sep 17 00:00:00 2001 From: wernsaar Date: Mon, 11 Nov 2013 14:20:59 +0100 Subject: [PATCH 050/127] added asum_kernel for all precisions and complex --- kernel/arm/KERNEL.ARMV7 | 8 +- kernel/arm/asum_vfpv3.S | 481 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 485 insertions(+), 4 deletions(-) create mode 100644 kernel/arm/asum_vfpv3.S diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index a6b1d67d2..10db665b2 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -35,10 +35,10 @@ DSWAPKERNEL = ../arm/swap.c CSWAPKERNEL = ../arm/zswap.c ZSWAPKERNEL = ../arm/zswap.c -SASUMKERNEL = ../arm/asum.c -DASUMKERNEL = ../arm/asum.c -CASUMKERNEL = ../arm/zasum.c -ZASUMKERNEL = ../arm/zasum.c +SASUMKERNEL = asum_vfpv3.S +DASUMKERNEL = asum_vfpv3.S +CASUMKERNEL = asum_vfpv3.S +ZASUMKERNEL = asum_vfpv3.S SAXPYKERNEL = ../arm/axpy.c DAXPYKERNEL = ../arm/axpy.c diff --git a/kernel/arm/asum_vfpv3.S b/kernel/arm/asum_vfpv3.S new file mode 100644 index 000000000..2b6ceb191 --- /dev/null +++ b/kernel/arm/asum_vfpv3.S @@ -0,0 +1,481 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/11 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define N r0 +#define X r1 +#define INC_X r2 + + +#define I r12 + +#define X_PRE 512 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +#if !defined(COMPLEX) + +#if defined(DOUBLE) + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + fldmiad X!, { d4 - d5 } + vabs.f64 d4, d4 + vadd.f64 d0 , d0, d4 + vabs.f64 d5, d5 + fldmiad X!, { d6 - d7 } + vabs.f64 d6, d6 + vadd.f64 d1 , d1, d5 + vabs.f64 d7, d7 + vadd.f64 d0 , d0, d6 + vadd.f64 d1 , d1, d7 + +.endm + +.macro KERNEL_F1 + + fldmiad X!, { d4 } + vabs.f64 d4, d4 + vadd.f64 d0 , d0, d4 + +.endm + + +.macro KERNEL_S4 + + fldmiad X, { d4 } + vabs.f64 d4, d4 + vadd.f64 d0 , d0, d4 + add X, X, INC_X + + fldmiad X, { d4 } + vabs.f64 d4, d4 + vadd.f64 d0 , d0, d4 + add X, X, INC_X + + fldmiad X, { d4 } + vabs.f64 d4, d4 + vadd.f64 d0 , d0, d4 + add X, X, INC_X + + fldmiad X, { d4 } + vabs.f64 d4, d4 + vadd.f64 d0 , d0, d4 + add X, X, INC_X + +.endm + + +.macro KERNEL_S1 + + fldmiad X, { d4 } + vabs.f64 d4, d4 + vadd.f64 d0 , d0, d4 + add X, X, INC_X + +.endm + +#else + +.macro KERNEL_F4 + + fldmias X!, { s4 - s5 } + vabs.f32 s4, s4 + vadd.f32 s0 , s0, s4 + vabs.f32 s5, s5 + fldmias X!, { s6 - s7 } + vabs.f32 s6, s6 + vadd.f32 s1 , s1, s5 + vabs.f32 s7, s7 + vadd.f32 s0 , s0, s6 + vadd.f32 s1 , s1, s7 + +.endm + +.macro KERNEL_F1 + + fldmias X!, { s4 } + vabs.f32 s4, s4 + vadd.f32 s0 , s0, s4 + +.endm + + +.macro KERNEL_S4 + + fldmias X, { s4 } + vabs.f32 s4, s4 + vadd.f32 s0 , s0, s4 + add X, X, INC_X + + fldmias X, { s4 } + vabs.f32 s4, s4 + vadd.f32 s0 , s0, s4 + add X, X, INC_X + + fldmias X, { s4 } + vabs.f32 s4, s4 + vadd.f32 s0 , s0, s4 + add X, X, INC_X + + fldmias X, { s4 } + vabs.f32 s4, s4 + vadd.f32 s0 , s0, s4 + add X, X, INC_X + +.endm + + +.macro KERNEL_S1 + + fldmias X, { s4 } + vabs.f32 s4, s4 + vadd.f32 s0 , s0, s4 + add X, X, INC_X + +.endm + + +#endif + +#else + +#if defined(DOUBLE) + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + fldmiad X!, { d4 - d5 } + vabs.f64 d4, d4 + vadd.f64 d0 , d0, d4 + vabs.f64 d5, d5 + fldmiad X!, { d6 - d7 } + vabs.f64 d6, d6 + vadd.f64 d1 , d1, d5 + vabs.f64 d7, d7 + vadd.f64 d0 , d0, d6 + vadd.f64 d1 , d1, d7 + + pld [ X, #X_PRE ] + fldmiad X!, { d4 - d5 } + vabs.f64 d4, d4 + vadd.f64 d0 , d0, d4 + vabs.f64 d5, d5 + 
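+	// complex asum accumulates the BLAS 1-norm of each element,
+	// |re| + |im|, which is why both doubles of the pair are folded in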
fldmiad X!, { d6 - d7 } + vabs.f64 d6, d6 + vadd.f64 d1 , d1, d5 + vabs.f64 d7, d7 + vadd.f64 d0 , d0, d6 + vadd.f64 d1 , d1, d7 + + +.endm + +.macro KERNEL_F1 + + fldmiad X!, { d4 } + vabs.f64 d4, d4 + vadd.f64 d0 , d0, d4 + + fldmiad X!, { d4 } + vabs.f64 d4, d4 + vadd.f64 d0 , d0, d4 + + +.endm + + +.macro KERNEL_S4 + + fldmiad X, { d4 -d5 } + vabs.f64 d4, d4 + vadd.f64 d0 , d0, d4 + vabs.f64 d5, d5 + vadd.f64 d0 , d0, d5 + add X, X, INC_X + + fldmiad X, { d4 -d5 } + vabs.f64 d4, d4 + vadd.f64 d0 , d0, d4 + vabs.f64 d5, d5 + vadd.f64 d0 , d0, d5 + add X, X, INC_X + + fldmiad X, { d4 -d5 } + vabs.f64 d4, d4 + vadd.f64 d0 , d0, d4 + vabs.f64 d5, d5 + vadd.f64 d0 , d0, d5 + add X, X, INC_X + + fldmiad X, { d4 -d5 } + vabs.f64 d4, d4 + vadd.f64 d0 , d0, d4 + vabs.f64 d5, d5 + vadd.f64 d0 , d0, d5 + add X, X, INC_X + +.endm + + +.macro KERNEL_S1 + + fldmiad X, { d4 -d5 } + vabs.f64 d4, d4 + vadd.f64 d0 , d0, d4 + vabs.f64 d5, d5 + vadd.f64 d0 , d0, d5 + add X, X, INC_X + +.endm + +#else + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + fldmias X!, { s4 - s5 } + vabs.f32 s4, s4 + vadd.f32 s0 , s0, s4 + vabs.f32 s5, s5 + fldmias X!, { s6 - s7 } + vabs.f32 s6, s6 + vadd.f32 s1 , s1, s5 + vabs.f32 s7, s7 + vadd.f32 s0 , s0, s6 + vadd.f32 s1 , s1, s7 + + fldmias X!, { s4 - s5 } + vabs.f32 s4, s4 + vadd.f32 s0 , s0, s4 + vabs.f32 s5, s5 + fldmias X!, { s6 - s7 } + vabs.f32 s6, s6 + vadd.f32 s1 , s1, s5 + vabs.f32 s7, s7 + vadd.f32 s0 , s0, s6 + vadd.f32 s1 , s1, s7 + + +.endm + +.macro KERNEL_F1 + + fldmias X!, { s4 } + vabs.f32 s4, s4 + vadd.f32 s0 , s0, s4 + + fldmias X!, { s4 } + vabs.f32 s4, s4 + vadd.f32 s0 , s0, s4 + +.endm + + +.macro KERNEL_S4 + + fldmias X, { s4 -s5 } + vabs.f32 s4, s4 + vadd.f32 s0 , s0, s4 + vabs.f32 s5, s5 + vadd.f32 s0 , s0, s5 + add X, X, INC_X + + fldmias X, { s4 -s5 } + vabs.f32 s4, s4 + vadd.f32 s0 , s0, s4 + vabs.f32 s5, s5 + vadd.f32 s0 , s0, s5 + add X, X, INC_X + + fldmias X, { s4 -s5 } + vabs.f32 s4, s4 + vadd.f32 s0 , s0, s4 + vabs.f32 s5, s5 + vadd.f32 s0 , s0, s5 + add X, X, INC_X + + fldmias X, { s4 -s5 } + vabs.f32 s4, s4 + vadd.f32 s0 , s0, s4 + vabs.f32 s5, s5 + vadd.f32 s0 , s0, s5 + add X, X, INC_X + +.endm + + +.macro KERNEL_S1 + + fldmias X, { s4 -s5 } + vabs.f32 s4, s4 + vadd.f32 s0 , s0, s4 + vabs.f32 s5, s5 + vadd.f32 s0 , s0, s5 + add X, X, INC_X + +.endm + +#endif + +#endif + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + +#if defined(DOUBLE) + vsub.f64 d0 , d0 , d0 + vsub.f64 d1 , d1 , d1 +#else + vsub.f32 s0 , s0 , s0 + vsub.f32 s1 , s1 , s1 +#endif + + cmp N, #0 + ble asum_kernel_L999 + + cmp INC_X, #0 + beq asum_kernel_L999 + + cmp INC_X, #1 + bne asum_kernel_S_BEGIN + + +asum_kernel_F_BEGIN: + + asrs I, N, #2 // I = N / 4 + ble asum_kernel_F1 + + .align 5 + +asum_kernel_F4: + +#if !defined(DOUBLE) && !defined(COMPLEX) + pld [ X, #X_PRE ] +#endif + KERNEL_F4 + + subs I, I, #1 + ble asum_kernel_F1 + + KERNEL_F4 + + subs I, I, #1 + bne asum_kernel_F4 + +asum_kernel_F1: + + ands I, N, #3 + ble asum_kernel_L999 + +asum_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne asum_kernel_F10 + + b asum_kernel_L999 + +asum_kernel_S_BEGIN: + +#if defined(COMPLEX) + +#if defined(DOUBLE) + lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 +#else + lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 +#endif + +#else + +#if defined(DOUBLE) + lsl INC_X, INC_X, #3 // INC_X * SIZE +#else + lsl INC_X, 
INC_X, #2 // INC_X * SIZE +#endif + +#endif + + asrs I, N, #2 // I = N / 4 + ble asum_kernel_S1 + + .align 5 + +asum_kernel_S4: + + KERNEL_S4 + + subs I, I, #1 + bne asum_kernel_S4 + +asum_kernel_S1: + + ands I, N, #3 + ble asum_kernel_L999 + +asum_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne asum_kernel_S10 + + +asum_kernel_L999: + + +#if defined(DOUBLE) + vadd.f64 d0 , d0, d1 // set return value +#else + vadd.f32 s0 , s0, s1 // set return value +#endif + + bx lr + + EPILOGUE + From f750103336dff5ddad6cb70177f69277b24a29a6 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Mon, 11 Nov 2013 15:47:56 +0100 Subject: [PATCH 051/127] small optimizations on dot-kernels --- kernel/arm/cdot_vfpv3.S | 4 ++-- kernel/arm/ddot_vfpv3.S | 22 ++++++++++++++++------ kernel/arm/sdot_vfpv3.S | 4 ++-- kernel/arm/zdot_vfpv3.S | 4 ++-- 4 files changed, 22 insertions(+), 12 deletions(-) diff --git a/kernel/arm/cdot_vfpv3.S b/kernel/arm/cdot_vfpv3.S index 261808916..b653888df 100644 --- a/kernel/arm/cdot_vfpv3.S +++ b/kernel/arm/cdot_vfpv3.S @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2013/11/08 Saar +* 2013/11/11 Saar * BLASTEST : OK * CTEST : OK * TEST : OK @@ -56,7 +56,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define Y r6 #define INC_Y r7 -#define X_PRE 256 +#define X_PRE 512 /************************************************************************************** * Macro definitions diff --git a/kernel/arm/ddot_vfpv3.S b/kernel/arm/ddot_vfpv3.S index 12d9e218b..ab819ec98 100644 --- a/kernel/arm/ddot_vfpv3.S +++ b/kernel/arm/ddot_vfpv3.S @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2013/11/07 Saar +* 2013/11/11 Saar * BLASTEST : OK * CTEST : OK * TEST : OK @@ -56,7 +56,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define Y r6 #define INC_Y r7 -#define X_PRE 256 +#define X_PRE 512 /************************************************************************************** * Macro definitions @@ -65,14 +65,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F4 pld [ X, #X_PRE ] + fldmiad X!, { d8 } pld [ Y, #X_PRE ] - fldmiad X!, { d8 - d9 } - fldmiad Y!, { d4 - d5} + fldmiad Y!, { d4 } + fldmiad Y!, { d5 } fmacd d0 , d4, d8 - fldmiad X!, { d10 - d11 } + fldmiad X!, { d9 } + fldmiad Y!, { d6 } fmacd d1 , d5, d9 - fldmiad Y!, { d6 - d7 } + fldmiad X!, { d10 } + fldmiad X!, { d11 } fmacd d0 , d6, d10 + fldmiad Y!, { d7 } fmacd d1 , d7, d11 .endm @@ -173,6 +177,12 @@ ddot_kernel_F_BEGIN: ddot_kernel_F4: + KERNEL_F4 + + subs I, I, #1 + ble ddot_kernel_F1 + + KERNEL_F4 subs I, I, #1 diff --git a/kernel/arm/sdot_vfpv3.S b/kernel/arm/sdot_vfpv3.S index 164387482..794e07317 100644 --- a/kernel/arm/sdot_vfpv3.S +++ b/kernel/arm/sdot_vfpv3.S @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** -* 2013/11/07 Saar +* 2013/11/11 Saar * BLASTEST : OK * CTEST : OK (no test for dsdot) * TEST : OK (no test for dsdot) @@ -56,7 +56,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define Y r6 #define INC_Y r7 -#define X_PRE 256 +#define X_PRE 512 /************************************************************************************** * Macro definitions diff --git a/kernel/arm/zdot_vfpv3.S b/kernel/arm/zdot_vfpv3.S index 2aa9171b8..1a78b5aec 100644 --- a/kernel/arm/zdot_vfpv3.S +++ b/kernel/arm/zdot_vfpv3.S @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2013/11/08 Saar +* 2013/11/11 Saar * BLASTEST : OK * CTEST : OK * TEST : OK @@ -56,7 +56,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define Y r6 #define INC_Y r7 -#define X_PRE 256 +#define X_PRE 512 /************************************************************************************** * Macro definitions From 6da558d2abe339328718ccce7ca7b1b16a8fcae7 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Wed, 13 Nov 2013 17:39:13 +0100 Subject: [PATCH 052/127] changes for compatibility with Pathscale compiler --- common_x86.h | 15 +- common_x86_64.h | 10 + kernel/x86_64/dgemm_kernel_8x2_piledriver.S | 260 ++++++++++---------- 3 files changed, 150 insertions(+), 135 deletions(-) diff --git a/common_x86.h b/common_x86.h index 48517d900..8245f7078 100644 --- a/common_x86.h +++ b/common_x86.h @@ -301,12 +301,25 @@ REALNAME: #define PROFCODE #endif + +#if defined(C_PATHSCALE) || defined(OS_DARWIN) + +#define EPILOGUE \ + .size REALNAME, .-REALNAME; \ + .section .note.GNU-stack,"",@progbits + +#else + #define EPILOGUE \ - .size REALNAME, .-REALNAME; \ + .size REALNAME, .-REALNAME; \ .section .note.GNU-stack,"",%progbits #endif + + +#endif + #ifdef XDOUBLE #define FLD fldt #define FST fstpt diff --git a/common_x86_64.h b/common_x86_64.h index 188903848..4fe23448f 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -372,10 +372,20 @@ REALNAME: #define PROFCODE #endif +#if defined(C_PATHSCALE) || defined(OS_DARWIN) + +#define EPILOGUE \ + .size REALNAME, .-REALNAME; \ + .section .note.GNU-stack,"",@progbits + +#else + #define EPILOGUE \ .size REALNAME, .-REALNAME; \ .section .note.GNU-stack,"",%progbits +#endif + #endif diff --git a/kernel/x86_64/dgemm_kernel_8x2_piledriver.S b/kernel/x86_64/dgemm_kernel_8x2_piledriver.S index cc0ebef8a..8585d45de 100644 --- a/kernel/x86_64/dgemm_kernel_8x2_piledriver.S +++ b/kernel/x86_64/dgemm_kernel_8x2_piledriver.S @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /********************************************************************* * -* 2013/10/31 Saar +* 2013/11/13 Saar * BLASTEST : OK * CTEST : OK * TEST : OK @@ -144,25 +144,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
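The hunk below is the core of the Pathscale change: the GNU-as .macro definitions are replaced by C-preprocessor function-like macros (and the stray BULLDOZER1 guard becomes BULLDOZER). Because .S files are run through cpp before they reach the assembler, the #define form is expanded up front and no longer depends on assembler .macro support, which the Pathscale toolchain evidently did not handle; the price is the extra parentheses at every call site, which is what the rest of this diff rewrites. Schematically, for the FMA4 variant:

	/* before: expanded by the assembler, arguments referenced as \y0 */
	.macro VFMADD231PD_ y1,y2,y0
		vfmaddpd \y0,\y1,\y2,\y0
	.endm

	/* after: expanded by the preprocessor, portable across assemblers */
	#define VFMADD231PD_( y1,y2,y0 )  vfmaddpd y0,y1,y2,y0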
#define STACK_TOUCH #endif -#if defined(BULLDOZER1) +#if defined(BULLDOZER) -.macro VFMADD231PD_ y1,y2,y0 - vfmaddpd \y0,\y1,\y2,\y0 -.endm +#define VFMADD231PD_( y1,y2,y0 ) vfmaddpd y0,y1,y2,y0 -.macro VFMADD231SD_ x1,x2,x0 - vfmaddsd \x0,\x1,\x2,\x0 -.endm +#define VFMADD231SD_( x1,x2,x0 ) vfmaddsd x0,x1,x2,x0 #else -.macro VFMADD231PD_ y1,y2,y0 - vfmadd231pd \y2,\y1,\y0 -.endm +#define VFMADD231PD_( y1,y2,y0 ) vfmadd231pd y2,y1,y0 -.macro VFMADD231SD_ x1,x2,x0 - vfmadd231sd \x2,\x1,\x0 -.endm +#define VFMADD231SD_( x1,x2,x0 ) vfmadd231sd x2,x1,x0 #endif @@ -218,46 +210,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL8x3_M1 vmovups -16 * SIZE(AO), %xmm0 prefetcht0 A_PR1(AO) - VFMADD231PD_ %xmm1,%xmm0,%xmm4 - VFMADD231PD_ %xmm2,%xmm0,%xmm5 - VFMADD231PD_ %xmm3,%xmm0,%xmm6 + VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) vmovups -14 * SIZE(AO), %xmm0 - VFMADD231PD_ %xmm1,%xmm0,%xmm7 - VFMADD231PD_ %xmm2,%xmm0,%xmm8 - VFMADD231PD_ %xmm3,%xmm0,%xmm9 + VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) vmovups -12 * SIZE(AO), %xmm0 - VFMADD231PD_ %xmm1,%xmm0,%xmm10 - VFMADD231PD_ %xmm2,%xmm0,%xmm11 - VFMADD231PD_ %xmm3,%xmm0,%xmm12 + VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) vmovups -10 * SIZE(AO), %xmm0 - VFMADD231PD_ %xmm1,%xmm0,%xmm13 + VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) vmovddup -12 * SIZE(BO), %xmm1 - VFMADD231PD_ %xmm2,%xmm0,%xmm14 + VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) vmovddup -11 * SIZE(BO), %xmm2 - VFMADD231PD_ %xmm3,%xmm0,%xmm15 + VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) .endm .macro KERNEL8x3_M2 vmovups -8 * SIZE(AO), %xmm0 prefetcht0 A_PR1+64(AO) vmovddup -10 * SIZE(BO), %xmm3 - VFMADD231PD_ %xmm1,%xmm0,%xmm4 - VFMADD231PD_ %xmm2,%xmm0,%xmm5 - VFMADD231PD_ %xmm3,%xmm0,%xmm6 + VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) vmovups -6 * SIZE(AO), %xmm0 - VFMADD231PD_ %xmm1,%xmm0,%xmm7 - VFMADD231PD_ %xmm2,%xmm0,%xmm8 - VFMADD231PD_ %xmm3,%xmm0,%xmm9 + VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) vmovups -4 * SIZE(AO), %xmm0 - VFMADD231PD_ %xmm1,%xmm0,%xmm10 - VFMADD231PD_ %xmm2,%xmm0,%xmm11 - VFMADD231PD_ %xmm3,%xmm0,%xmm12 + VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) vmovups -2 * SIZE(AO), %xmm0 - VFMADD231PD_ %xmm1,%xmm0,%xmm13 + VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) vmovddup -9 * SIZE(BO), %xmm1 - VFMADD231PD_ %xmm2,%xmm0,%xmm14 + VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) vmovddup -8 * SIZE(BO), %xmm2 - VFMADD231PD_ %xmm3,%xmm0,%xmm15 + VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) .endm @@ -265,93 +257,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vmovups 0 * SIZE(AO), %xmm0 prefetcht0 A_PR1+128(AO) vmovddup -7 * SIZE(BO), %xmm3 - VFMADD231PD_ %xmm1,%xmm0,%xmm4 - VFMADD231PD_ %xmm2,%xmm0,%xmm5 - VFMADD231PD_ %xmm3,%xmm0,%xmm6 + VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) vmovups 2 * SIZE(AO), %xmm0 - VFMADD231PD_ %xmm1,%xmm0,%xmm7 - VFMADD231PD_ %xmm2,%xmm0,%xmm8 - VFMADD231PD_ %xmm3,%xmm0,%xmm9 + VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) vmovups 4 * SIZE(AO), %xmm0 - VFMADD231PD_ %xmm1,%xmm0,%xmm10 - VFMADD231PD_ %xmm2,%xmm0,%xmm11 - VFMADD231PD_ %xmm3,%xmm0,%xmm12 + VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) vmovups 6 * SIZE(AO), %xmm0 - VFMADD231PD_ %xmm1,%xmm0,%xmm13 + VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) vmovddup -6 * SIZE(BO), %xmm1 - VFMADD231PD_ %xmm2,%xmm0,%xmm14 + VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) vmovddup -5 * SIZE(BO), %xmm2 - VFMADD231PD_ %xmm3,%xmm0,%xmm15 + VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) .endm .macro KERNEL8x3_M4 vmovups 8 * SIZE(AO), %xmm0 prefetcht0 A_PR1+192(AO) vmovddup -4 * SIZE(BO), %xmm3 - VFMADD231PD_ %xmm1,%xmm0,%xmm4 - VFMADD231PD_ %xmm2,%xmm0,%xmm5 - VFMADD231PD_ %xmm3,%xmm0,%xmm6 + VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) vmovups 10 * SIZE(AO), %xmm0 - VFMADD231PD_ %xmm1,%xmm0,%xmm7 - VFMADD231PD_ %xmm2,%xmm0,%xmm8 - VFMADD231PD_ %xmm3,%xmm0,%xmm9 + VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) vmovups 12 * SIZE(AO), %xmm0 - VFMADD231PD_ %xmm1,%xmm0,%xmm10 - VFMADD231PD_ %xmm2,%xmm0,%xmm11 - VFMADD231PD_ %xmm3,%xmm0,%xmm12 + VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) vmovups 14 * SIZE(AO), %xmm0 - VFMADD231PD_ %xmm1,%xmm0,%xmm13 + VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) vmovddup -3 * SIZE(BO), %xmm1 addq $32 * SIZE, AO - VFMADD231PD_ %xmm2,%xmm0,%xmm14 + VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) vmovddup -2 * SIZE(BO), %xmm2 - VFMADD231PD_ %xmm3,%xmm0,%xmm15 + VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) .endm .macro KERNEL8x3_M5 vmovups -16 * SIZE(AO), %xmm0 prefetcht0 A_PR1(AO) vmovddup -1 * SIZE(BO), %xmm3 - VFMADD231PD_ %xmm1,%xmm0,%xmm4 - VFMADD231PD_ %xmm2,%xmm0,%xmm5 - VFMADD231PD_ %xmm3,%xmm0,%xmm6 + VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) vmovups -14 * SIZE(AO), %xmm0 - VFMADD231PD_ %xmm1,%xmm0,%xmm7 - VFMADD231PD_ %xmm2,%xmm0,%xmm8 - VFMADD231PD_ %xmm3,%xmm0,%xmm9 + VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) vmovups -12 * SIZE(AO), %xmm0 - VFMADD231PD_ %xmm1,%xmm0,%xmm10 - VFMADD231PD_ %xmm2,%xmm0,%xmm11 - VFMADD231PD_ %xmm3,%xmm0,%xmm12 + VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) vmovups -10 * SIZE(AO), %xmm0 - VFMADD231PD_ %xmm1,%xmm0,%xmm13 + VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) vmovddup 0 * SIZE(BO), %xmm1 - VFMADD231PD_ %xmm2,%xmm0,%xmm14 + VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) vmovddup 1 * SIZE(BO), %xmm2 - VFMADD231PD_ %xmm3,%xmm0,%xmm15 + VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) .endm .macro KERNEL8x3_M6 vmovups -8 * SIZE(AO), %xmm0 prefetcht0 A_PR1+64(AO) vmovddup 2 * SIZE(BO), %xmm3 - VFMADD231PD_ %xmm1,%xmm0,%xmm4 - VFMADD231PD_ %xmm2,%xmm0,%xmm5 - VFMADD231PD_ %xmm3,%xmm0,%xmm6 + VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm5 
) + VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) vmovups -6 * SIZE(AO), %xmm0 - VFMADD231PD_ %xmm1,%xmm0,%xmm7 - VFMADD231PD_ %xmm2,%xmm0,%xmm8 - VFMADD231PD_ %xmm3,%xmm0,%xmm9 + VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) vmovups -4 * SIZE(AO), %xmm0 - VFMADD231PD_ %xmm1,%xmm0,%xmm10 - VFMADD231PD_ %xmm2,%xmm0,%xmm11 - VFMADD231PD_ %xmm3,%xmm0,%xmm12 + VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) vmovups -2 * SIZE(AO), %xmm0 - VFMADD231PD_ %xmm1,%xmm0,%xmm13 + VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) vmovddup 3 * SIZE(BO), %xmm1 - VFMADD231PD_ %xmm2,%xmm0,%xmm14 + VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) vmovddup 4 * SIZE(BO), %xmm2 - VFMADD231PD_ %xmm3,%xmm0,%xmm15 + VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) .endm @@ -359,46 +351,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups 0 * SIZE(AO), %xmm0 prefetcht0 A_PR1+128(AO) vmovddup 5 * SIZE(BO), %xmm3 - VFMADD231PD_ %xmm1,%xmm0,%xmm4 - VFMADD231PD_ %xmm2,%xmm0,%xmm5 - VFMADD231PD_ %xmm3,%xmm0,%xmm6 + VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) vmovups 2 * SIZE(AO), %xmm0 - VFMADD231PD_ %xmm1,%xmm0,%xmm7 - VFMADD231PD_ %xmm2,%xmm0,%xmm8 - VFMADD231PD_ %xmm3,%xmm0,%xmm9 + VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) vmovups 4 * SIZE(AO), %xmm0 - VFMADD231PD_ %xmm1,%xmm0,%xmm10 - VFMADD231PD_ %xmm2,%xmm0,%xmm11 - VFMADD231PD_ %xmm3,%xmm0,%xmm12 + VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) vmovups 6 * SIZE(AO), %xmm0 - VFMADD231PD_ %xmm1,%xmm0,%xmm13 + VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) vmovddup 6 * SIZE(BO), %xmm1 - VFMADD231PD_ %xmm2,%xmm0,%xmm14 + VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) vmovddup 7 * SIZE(BO), %xmm2 - VFMADD231PD_ %xmm3,%xmm0,%xmm15 + VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) .endm .macro KERNEL8x3_M8 vmovups 8 * SIZE(AO), %xmm0 prefetcht0 A_PR1+192(AO) vmovddup 8 * SIZE(BO), %xmm3 - VFMADD231PD_ %xmm1,%xmm0,%xmm4 - VFMADD231PD_ %xmm2,%xmm0,%xmm5 - VFMADD231PD_ %xmm3,%xmm0,%xmm6 + VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) vmovups 10 * SIZE(AO), %xmm0 - VFMADD231PD_ %xmm1,%xmm0,%xmm7 - VFMADD231PD_ %xmm2,%xmm0,%xmm8 - VFMADD231PD_ %xmm3,%xmm0,%xmm9 + VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) vmovups 12 * SIZE(AO), %xmm0 - VFMADD231PD_ %xmm1,%xmm0,%xmm10 - VFMADD231PD_ %xmm2,%xmm0,%xmm11 - VFMADD231PD_ %xmm3,%xmm0,%xmm12 + VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) vmovups 14 * SIZE(AO), %xmm0 - VFMADD231PD_ %xmm1,%xmm0,%xmm13 + VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) vmovddup 9 * SIZE(BO), %xmm1 - VFMADD231PD_ %xmm2,%xmm0,%xmm14 + VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) vmovddup 10 * SIZE(BO), %xmm2 - VFMADD231PD_ %xmm3,%xmm0,%xmm15 + VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) vmovddup 11 * SIZE(BO), %xmm3 addq $32 * SIZE, AO addq $24 * SIZE, BO @@ -409,47 +401,47 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
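For orientation while reading these hunks: the two instruction families behind the macro take their operands differently (AT&T syntax, destination last). With illustrative registers, both of the lines below compute xmm4 += xmm0 * xmm1:

	vfmaddpd	%xmm4, %xmm1, %xmm0, %xmm4	/* FMA4 (Bulldozer): explicit addend operand */
	vfmadd231pd	%xmm0, %xmm1, %xmm4		/* FMA3: the destination doubles as the addend */

This is why the BULLDOZER variant of the define repeats y0 as a fourth operand while the generic variant merely reorders the three arguments.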
vmovups 8 * SIZE(AO), %xmm0 prefetcht0 A_PR1+192(AO) vmovddup 8 * SIZE(BO), %xmm3 - VFMADD231PD_ %xmm1,%xmm0,%xmm4 - VFMADD231PD_ %xmm2,%xmm0,%xmm5 - VFMADD231PD_ %xmm3,%xmm0,%xmm6 + VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) vmovups 10 * SIZE(AO), %xmm0 - VFMADD231PD_ %xmm1,%xmm0,%xmm7 - VFMADD231PD_ %xmm2,%xmm0,%xmm8 - VFMADD231PD_ %xmm3,%xmm0,%xmm9 + VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) vmovups 12 * SIZE(AO), %xmm0 - VFMADD231PD_ %xmm1,%xmm0,%xmm10 - VFMADD231PD_ %xmm2,%xmm0,%xmm11 - VFMADD231PD_ %xmm3,%xmm0,%xmm12 + VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) vmovups 14 * SIZE(AO), %xmm0 - VFMADD231PD_ %xmm1,%xmm0,%xmm13 + VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) addq $32*SIZE, AO - VFMADD231PD_ %xmm2,%xmm0,%xmm14 + VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) addq $21*SIZE, BO - VFMADD231PD_ %xmm3,%xmm0,%xmm15 + VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) .endm .macro KERNEL8x3_SUBN vmovddup -12 * SIZE(BO), %xmm1 vmovups -16 * SIZE(AO), %xmm0 - VFMADD231PD_ %xmm1,%xmm0,%xmm4 + VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) vmovddup -11 * SIZE(BO), %xmm2 - VFMADD231PD_ %xmm2,%xmm0,%xmm5 + VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) vmovddup -10 * SIZE(BO), %xmm3 - VFMADD231PD_ %xmm3,%xmm0,%xmm6 + VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) vmovups -14 * SIZE(AO), %xmm0 - VFMADD231PD_ %xmm1,%xmm0,%xmm7 - VFMADD231PD_ %xmm2,%xmm0,%xmm8 - VFMADD231PD_ %xmm3,%xmm0,%xmm9 + VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) vmovups -12 * SIZE(AO), %xmm0 - VFMADD231PD_ %xmm1,%xmm0,%xmm10 - VFMADD231PD_ %xmm2,%xmm0,%xmm11 - VFMADD231PD_ %xmm3,%xmm0,%xmm12 + VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) vmovups -10 * SIZE(AO), %xmm0 - VFMADD231PD_ %xmm1,%xmm0,%xmm13 + VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) addq $3*SIZE, BO - VFMADD231PD_ %xmm2,%xmm0,%xmm14 + VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) addq $8*SIZE, AO - VFMADD231PD_ %xmm3,%xmm0,%xmm15 + VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) .endm .macro SAVE8x3 From f1db3862111dfe0f6cd757f74803008dd535a90d Mon Sep 17 00:00:00 2001 From: wernsaar Date: Wed, 13 Nov 2013 17:59:11 +0100 Subject: [PATCH 053/127] changes for compatibility with Pathscale compiler --- common_x86.h | 20 +++- common_x86_64.h | 16 ++- kernel/x86_64/cgemm_kernel_8x2_haswell.S | 138 +++++++++------------- kernel/x86_64/sgemm_kernel_16x4_haswell.S | 116 +++++++++--------- kernel/x86_64/zgemm_kernel_4x2_haswell.S | 126 ++++++++------------ 5 files changed, 178 insertions(+), 238 deletions(-) diff --git a/common_x86.h b/common_x86.h index 49e6be29e..8245f7078 100644 --- a/common_x86.h +++ b/common_x86.h @@ -171,11 +171,6 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ #define MMXSTORE movd #endif -#if defined(SANDYBRIDGE) || defined(HASWELL) -//Enable some optimazation for nehalem. -#define NEHALEM_OPTIMIZATION -#endif - #if defined(PILEDRIVER) || defined(BULLDOZER) //Enable some optimazation for barcelona. 
#define BARCELONA_OPTIMIZATION @@ -306,12 +301,25 @@ REALNAME: #define PROFCODE #endif + +#if defined(C_PATHSCALE) || defined(OS_DARWIN) + +#define EPILOGUE \ + .size REALNAME, .-REALNAME; \ + .section .note.GNU-stack,"",@progbits + +#else + #define EPILOGUE \ - .size REALNAME, .-REALNAME; \ + .size REALNAME, .-REALNAME; \ .section .note.GNU-stack,"",%progbits #endif + + +#endif + #ifdef XDOUBLE #define FLD fldt #define FST fstpt diff --git a/common_x86_64.h b/common_x86_64.h index 8e9d79443..4fe23448f 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -218,12 +218,6 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ #ifdef ASSEMBLER -#if defined(SANDYBRIDGE) || defined(HASWELL) -//Enable some optimazation for nehalem. -#define NEHALEM_OPTIMIZATION -#endif - - #if defined(PILEDRIVER) || defined(BULLDOZER) //Enable some optimazation for barcelona. #define BARCELONA_OPTIMIZATION @@ -378,10 +372,20 @@ REALNAME: #define PROFCODE #endif +#if defined(C_PATHSCALE) || defined(OS_DARWIN) + +#define EPILOGUE \ + .size REALNAME, .-REALNAME; \ + .section .note.GNU-stack,"",@progbits + +#else + #define EPILOGUE \ .size REALNAME, .-REALNAME; \ .section .note.GNU-stack,"",%progbits +#endif + #endif diff --git a/kernel/x86_64/cgemm_kernel_8x2_haswell.S b/kernel/x86_64/cgemm_kernel_8x2_haswell.S index 9729e6d70..e4aba23e4 100644 --- a/kernel/x86_64/cgemm_kernel_8x2_haswell.S +++ b/kernel/x86_64/cgemm_kernel_8x2_haswell.S @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ /********************************************************************* -* 2013/10/28 Saar +* 2013/11/13 Saar * BLASTEST : OK * CTEST : OK * TEST : OK @@ -138,43 +138,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) -.macro VFMADDPS_R y0,y1,y2 - vfmaddps \y0,\y1,\y2,\y0 -.endm +#define VFMADDPS_R( y0,y1,y2 ) vfmaddps y0,y1,y2,y0 -.macro VFMADDPS_I y0,y1,y2 - vfmaddps \y0,\y1,\y2,\y0 -.endm +#define VFMADDPS_I( y0,y1,y2 ) vfmaddps y0,y1,y2,y0 #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) -.macro VFMADDPS_R y0,y1,y2 - vfnmaddps \y0,\y1,\y2,\y0 -.endm +#define VFMADDPS_R( y0,y1,y2 ) vfnmaddps y0,y1,y2,y0 -.macro VFMADDPS_I y0,y1,y2 - vfmaddps \y0,\y1,\y2,\y0 -.endm +#define VFMADDPS_I( y0,y1,y2 ) vfmaddps y0,y1,y2,y0 #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) -.macro VFMADDPS_R y0,y1,y2 - vfmaddps \y0,\y1,\y2,\y0 -.endm +#define VFMADDPS_R( y0,y1,y2 ) vfmaddps y0,y1,y2,y0 -.macro VFMADDPS_I y0,y1,y2 - vfnmaddps \y0,\y1,\y2,\y0 -.endm +#define VFMADDPS_I( y0,y1,y2 ) vfnmaddps y0,y1,y2,y0 #else -.macro VFMADDPS_R y0,y1,y2 - vfnmaddps \y0,\y1,\y2,\y0 -.endm +#define VFMADDPS_R( y0,y1,y2 ) vfnmaddps y0,y1,y2,y0 -.macro VFMADDPS_I y0,y1,y2 - vfnmaddps \y0,\y1,\y2,\y0 -.endm +#define VFMADDPS_I( y0,y1,y2 ) vfnmaddps y0,y1,y2,y0 #endif @@ -182,43 +166,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
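The four branches above encode the conjugation cases of the complex kernel: with the real and imaginary parts of B broadcast separately, multiplying by conj(b) only flips the sign of the imaginary-part accumulation, and conjugating A flips the real-part one instead. Loosely (the exact recombination happens later, in the SAVE macros, via shuffles of the interleaved A data), the NR/NC/TR/TC pair

#define VFMADDPS_R( y0,y1,y2 ) vfmaddps  y0,y1,y2,y0
#define VFMADDPS_I( y0,y1,y2 ) vfnmaddps y0,y1,y2,y0

accumulates the a * b_r terms with a plus sign and the a * b_i terms with a minus sign, which is the sign pattern of multiplying by a conjugated B.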
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) -.macro VFMADDPS_R y0,y1,y2 - vfmadd231ps \y1,\y2,\y0 -.endm +#define VFMADDPS_R( y0,y1,y2 ) vfmadd231ps y1,y2,y0 -.macro VFMADDPS_I y0,y1,y2 - vfmadd231ps \y1,\y2,\y0 -.endm +#define VFMADDPS_I( y0,y1,y2 ) vfmadd231ps y1,y2,y0 #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) -.macro VFMADDPS_R y0,y1,y2 - vfnmadd231ps \y1,\y2,\y0 -.endm +#define VFMADDPS_R( y0,y1,y2 ) vfnmadd231ps y1,y2,y0 -.macro VFMADDPS_I y0,y1,y2 - vfmadd231ps \y1,\y2,\y0 -.endm +#define VFMADDPS_I( y0,y1,y2 ) vfmadd231ps y1,y2,y0 #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) -.macro VFMADDPS_R y0,y1,y2 - vfmadd231ps \y1,\y2,\y0 -.endm +#define VFMADDPS_R( y0,y1,y2 ) vfmadd231ps y1,y2,y0 -.macro VFMADDPS_I y0,y1,y2 - vfnmadd231ps \y1,\y2,\y0 -.endm +#define VFMADDPS_I( y0,y1,y2 ) vfnmadd231ps y1,y2,y0 #else -.macro VFMADDPS_R y0,y1,y2 - vfnmadd231ps \y1,\y2,\y0 -.endm +#define VFMADDPS_R( y0,y1,y2 ) vfnmadd231ps y1,y2,y0 -.macro VFMADDPS_I y0,y1,y2 - vfnmadd231ps \y1,\y2,\y0 -.endm +#define VFMADDPS_I( y0,y1,y2 ) vfnmadd231ps y1,y2,y0 #endif @@ -234,18 +202,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 vbroadcastss -8 * SIZE(BO, BI, SIZE), %ymm4 - VFMADDPS_R %ymm8,%ymm4,%ymm0 + VFMADDPS_R( %ymm8,%ymm4,%ymm0 ) vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 - VFMADDPS_R %ymm12,%ymm4,%ymm1 + VFMADDPS_R( %ymm12,%ymm4,%ymm1 ) vbroadcastss -7 * SIZE(BO, BI, SIZE), %ymm5 - VFMADDPS_I %ymm9,%ymm5,%ymm0 - VFMADDPS_I %ymm13,%ymm5,%ymm1 + VFMADDPS_I( %ymm9,%ymm5,%ymm0 ) + VFMADDPS_I( %ymm13,%ymm5,%ymm1 ) vbroadcastss -6 * SIZE(BO, BI, SIZE), %ymm6 - VFMADDPS_R %ymm10,%ymm6,%ymm0 - VFMADDPS_R %ymm14,%ymm6,%ymm1 + VFMADDPS_R( %ymm10,%ymm6,%ymm0 ) + VFMADDPS_R( %ymm14,%ymm6,%ymm1 ) vbroadcastss -5 * SIZE(BO, BI, SIZE), %ymm7 - VFMADDPS_I %ymm11,%ymm7,%ymm0 - VFMADDPS_I %ymm15,%ymm7,%ymm1 + VFMADDPS_I( %ymm11,%ymm7,%ymm0 ) + VFMADDPS_I( %ymm15,%ymm7,%ymm1 ) addq $4 , BI addq $16, %rax .endm @@ -338,18 +306,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x2_SUB vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 - VFMADDPS_R %xmm8,%xmm4,%xmm0 + VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 - VFMADDPS_R %xmm12,%xmm4,%xmm1 + VFMADDPS_R( %xmm12,%xmm4,%xmm1 ) vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPS_I %xmm9,%xmm5,%xmm0 - VFMADDPS_I %xmm13,%xmm5,%xmm1 + VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) + VFMADDPS_I( %xmm13,%xmm5,%xmm1 ) vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 - VFMADDPS_R %xmm10,%xmm6,%xmm0 - VFMADDPS_R %xmm14,%xmm6,%xmm1 + VFMADDPS_R( %xmm10,%xmm6,%xmm0 ) + VFMADDPS_R( %xmm14,%xmm6,%xmm1 ) vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 - VFMADDPS_I %xmm11,%xmm7,%xmm0 - VFMADDPS_I %xmm15,%xmm7,%xmm1 + VFMADDPS_I( %xmm11,%xmm7,%xmm0 ) + VFMADDPS_I( %xmm15,%xmm7,%xmm1 ) addq $4, BI addq $8, %rax .endm @@ -437,13 +405,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL2x2_SUB vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 - VFMADDPS_R %xmm8,%xmm4,%xmm0 + VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPS_I %xmm9,%xmm5,%xmm0 + VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 - VFMADDPS_R %xmm10,%xmm6,%xmm0 + VFMADDPS_R( %xmm10,%xmm6,%xmm0 ) vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 - VFMADDPS_I %xmm11,%xmm7,%xmm0 + VFMADDPS_I( %xmm11,%xmm7,%xmm0 ) addq $4, BI addq $4, %rax .endm @@ -509,13 +477,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL1x2_SUB vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 - VFMADDPS_R %xmm8,%xmm4,%xmm0 + VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPS_I %xmm9,%xmm5,%xmm0 + VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 - VFMADDPS_R %xmm10,%xmm6,%xmm0 + VFMADDPS_R( %xmm10,%xmm6,%xmm0 ) vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 - VFMADDPS_I %xmm11,%xmm7,%xmm0 + VFMADDPS_I( %xmm11,%xmm7,%xmm0 ) addq $4, BI addq $2, %rax .endm @@ -583,11 +551,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm4 - VFMADDPS_R %ymm8,%ymm4,%ymm0 - VFMADDPS_R %ymm12,%ymm4,%ymm1 + VFMADDPS_R( %ymm8,%ymm4,%ymm0 ) + VFMADDPS_R( %ymm12,%ymm4,%ymm1 ) vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm5 - VFMADDPS_I %ymm9,%ymm5,%ymm0 - VFMADDPS_I %ymm13,%ymm5,%ymm1 + VFMADDPS_I( %ymm9,%ymm5,%ymm0 ) + VFMADDPS_I( %ymm13,%ymm5,%ymm1 ) addq $2 , BI addq $16, %rax .endm @@ -654,12 +622,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x1_SUB vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 - VFMADDPS_R %xmm8,%xmm4,%xmm0 + VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 - VFMADDPS_R %xmm12,%xmm4,%xmm1 + VFMADDPS_R( %xmm12,%xmm4,%xmm1 ) vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPS_I %xmm9,%xmm5,%xmm0 - VFMADDPS_I %xmm13,%xmm5,%xmm1 + VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) + VFMADDPS_I( %xmm13,%xmm5,%xmm1 ) addq $2, BI addq $8, %rax .endm @@ -723,9 +691,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x1_SUB vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 - VFMADDPS_R %xmm8,%xmm4,%xmm0 + VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPS_I %xmm9,%xmm5,%xmm0 + VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) addq $2, BI addq $4, %rax .endm @@ -778,9 +746,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL1x1_SUB vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 - VFMADDPS_R %xmm8,%xmm4,%xmm0 + VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPS_I %xmm9,%xmm5,%xmm0 + VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) addq $2, BI addq $2, %rax .endm diff --git a/kernel/x86_64/sgemm_kernel_16x4_haswell.S b/kernel/x86_64/sgemm_kernel_16x4_haswell.S index 78adbafbb..2f1434ffa 100644 --- a/kernel/x86_64/sgemm_kernel_16x4_haswell.S +++ b/kernel/x86_64/sgemm_kernel_16x4_haswell.S @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
**********************************************************************************/ /********************************************************************* -* 2013/10/28 Saar +* 2013/11/13 Saar * BLASTEST : OK * CTEST : OK * TEST : OK @@ -131,23 +131,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(BULLDOZER) -.macro VFMADD231PS_ y0,y1,y2 - vfmaddps \y0,\y1,\y2,\y0 -.endm +#define VFMADD231PS_( y0,y1,y2 ) vfmaddps y0,y1,y2,y0 -.macro VFMADD231SS_ x0,x1,x2 - vfmaddss \x0,\x1,\x2,\x0 -.endm +#define VFMADD231SS_( x0,x1,x2 ) vfmaddss x0,x1,x2,x0 #else -.macro VFMADD231PS_ y0,y1,y2 - vfmadd231ps \y1,\y2,\y0 -.endm +#define VFMADD231PS_( y0,y1,y2 ) vfmadd231ps y1,y2,y0 -.macro VFMADD231SS_ x0,x1,x2 - vfmadd231ss \x1,\x2,\x0 -.endm +#define VFMADD231SS_( x0,x1,x2 ) vfmadd231ss x1,x2,x0 #endif @@ -164,16 +156,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PS_ %ymm4,%ymm2,%ymm0 - VFMADD231PS_ %ymm5,%ymm2,%ymm1 - VFMADD231PS_ %ymm6,%ymm3,%ymm0 - VFMADD231PS_ %ymm7,%ymm3,%ymm1 + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm5,%ymm2,%ymm1 ) + VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) + VFMADD231PS_( %ymm7,%ymm3,%ymm1 ) vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2 vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PS_ %ymm8,%ymm2,%ymm0 - VFMADD231PS_ %ymm9,%ymm2,%ymm1 - VFMADD231PS_ %ymm10,%ymm3,%ymm0 - VFMADD231PS_ %ymm11,%ymm3,%ymm1 + VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm9,%ymm2,%ymm1 ) + VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) + VFMADD231PS_( %ymm11,%ymm3,%ymm1 ) addq $4 , BI addq $16, %rax .endm @@ -235,12 +227,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PS_ %ymm4,%ymm2,%ymm0 - VFMADD231PS_ %ymm6,%ymm3,%ymm0 + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2 vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PS_ %ymm8,%ymm2,%ymm0 - VFMADD231PS_ %ymm10,%ymm3,%ymm0 + VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) addq $4 , BI addq $8 , %rax .endm @@ -279,12 +271,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231PS_ %xmm4,%xmm2,%xmm0 - VFMADD231PS_ %xmm6,%xmm3,%xmm0 + VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231PS_( %xmm6,%xmm3,%xmm0 ) vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231PS_ %xmm8,%xmm2,%xmm0 - VFMADD231PS_ %xmm10,%xmm3,%xmm0 + VFMADD231PS_( %xmm8,%xmm2,%xmm0 ) + VFMADD231PS_( %xmm10,%xmm3,%xmm0 ) addq $4 , BI addq $4 , %rax .endm @@ -323,16 +315,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
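Seen in one piece, a single 16x4 micro-step is two 8-float loads of A played against four broadcast elements of B. A sketch of its first half, with the same registers and addressing as KERNEL16x4_SUB above:

	vmovups		-16 * SIZE(AO, %rax, SIZE), %ymm0	/* A, lanes 0-7   */
	vmovups		 -8 * SIZE(AO, %rax, SIZE), %ymm1	/* A, lanes 8-15  */
	vbroadcastss	 -4 * SIZE(BO, BI, SIZE), %ymm2		/* B, column 0    */
	VFMADD231PS_( %ymm4,%ymm2,%ymm0 )			/* column 0, lanes 0-7  */
	VFMADD231PS_( %ymm5,%ymm2,%ymm1 )			/* column 0, lanes 8-15 */

Three further broadcasts feed the remaining accumulator pairs, so the full 16x4 tile lives in the eight registers ymm4 through ymm11.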
vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SS_ %xmm4,%xmm2,%xmm0 - VFMADD231SS_ %xmm5,%xmm2,%xmm1 - VFMADD231SS_ %xmm6,%xmm3,%xmm0 - VFMADD231SS_ %xmm7,%xmm3,%xmm1 + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) + VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) + VFMADD231SS_( %xmm7,%xmm3,%xmm1 ) vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SS_ %xmm8,%xmm2,%xmm0 - VFMADD231SS_ %xmm9,%xmm2,%xmm1 - VFMADD231SS_ %xmm10,%xmm3,%xmm0 - VFMADD231SS_ %xmm11,%xmm3,%xmm1 + VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm9,%xmm2,%xmm1 ) + VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) + VFMADD231SS_( %xmm11,%xmm3,%xmm1 ) addq $4 , BI addq $2, %rax .endm @@ -388,12 +380,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SS_ %xmm4,%xmm2,%xmm0 - VFMADD231SS_ %xmm6,%xmm3,%xmm0 + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SS_ %xmm8,%xmm2,%xmm0 - VFMADD231SS_ %xmm10,%xmm3,%xmm0 + VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) addq $4 , BI addq $1, %rax .endm @@ -436,10 +428,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PS_ %ymm4,%ymm2,%ymm0 - VFMADD231PS_ %ymm5,%ymm2,%ymm1 - VFMADD231PS_ %ymm6,%ymm3,%ymm0 - VFMADD231PS_ %ymm7,%ymm3,%ymm1 + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm5,%ymm2,%ymm1 ) + VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) + VFMADD231PS_( %ymm7,%ymm3,%ymm1 ) addq $2 , BI addq $16, %rax .endm @@ -480,8 +472,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PS_ %ymm4,%ymm2,%ymm0 - VFMADD231PS_ %ymm6,%ymm3,%ymm0 + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) addq $2 , BI addq $8 , %rax .endm @@ -513,8 +505,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231PS_ %xmm4,%xmm2,%xmm0 - VFMADD231PS_ %xmm6,%xmm3,%xmm0 + VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231PS_( %xmm6,%xmm3,%xmm0 ) addq $2 , BI addq $4 , %rax .endm @@ -546,10 +538,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SS_ %xmm4,%xmm2,%xmm0 - VFMADD231SS_ %xmm5,%xmm2,%xmm1 - VFMADD231SS_ %xmm6,%xmm3,%xmm0 - VFMADD231SS_ %xmm7,%xmm3,%xmm1 + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) + VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) + VFMADD231SS_( %xmm7,%xmm3,%xmm1 ) addq $2 , BI addq $2, %rax .endm @@ -589,8 +581,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SS_ %xmm4,%xmm2,%xmm0 - VFMADD231SS_ %xmm6,%xmm3,%xmm0 + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) addq $2 , BI addq $1, %rax .endm @@ -625,8 +617,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PS_ %ymm4,%ymm2,%ymm0 - VFMADD231PS_ %ymm5,%ymm2,%ymm1 + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm5,%ymm2,%ymm1 ) addq $1 , BI addq $16, %rax .endm @@ -656,7 +648,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL8x1_SUB vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PS_ %ymm4,%ymm2,%ymm0 + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) addq $1 , BI addq $8 , %rax .endm @@ -684,7 +676,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x1_SUB vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231PS_ %xmm4,%xmm2,%xmm0 + VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) addq $1 , BI addq $4 , %rax .endm @@ -712,8 +704,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SS_ %xmm4,%xmm2,%xmm0 - VFMADD231SS_ %xmm5,%xmm2,%xmm1 + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) addq $1 , BI addq $2, %rax .endm @@ -743,7 +735,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL1x1_SUB vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SS_ %xmm4,%xmm2,%xmm0 + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) addq $1 , BI addq $1, %rax .endm diff --git a/kernel/x86_64/zgemm_kernel_4x2_haswell.S b/kernel/x86_64/zgemm_kernel_4x2_haswell.S index 949f90bea..1e6278466 100644 --- a/kernel/x86_64/zgemm_kernel_4x2_haswell.S +++ b/kernel/x86_64/zgemm_kernel_4x2_haswell.S @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ /******************************************************************************** -* 2013/10/28 Saar +* 2013/11/13 Saar * BLASTEST : OK * CTEST : OK * TEST : OK @@ -137,43 +137,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) -.macro VFMADDPD_R y0,y1,y2 - vfmaddpd \y0,\y1,\y2,\y0 -.endm +#define VFMADDPD_R( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0 -.macro VFMADDPD_I y0,y1,y2 - vfmaddpd \y0,\y1,\y2,\y0 -.endm +#define VFMADDPD_I( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0 #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) -.macro VFMADDPD_R y0,y1,y2 - vfnmaddpd \y0,\y1,\y2,\y0 -.endm +#define VFMADDPD_R( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0 -.macro VFMADDPD_I y0,y1,y2 - vfmaddpd \y0,\y1,\y2,\y0 -.endm +#define VFMADDPD_I( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0 #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) -.macro VFMADDPD_R y0,y1,y2 - vfmaddpd \y0,\y1,\y2,\y0 -.endm +#define VFMADDPD_R( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0 -.macro VFMADDPD_I y0,y1,y2 - vfnmaddpd \y0,\y1,\y2,\y0 -.endm +#define VFMADDPD_I( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0 #else -.macro VFMADDPD_R y0,y1,y2 - vfnmaddpd \y0,\y1,\y2,\y0 -.endm +#define VFMADDPD_R( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0 -.macro VFMADDPD_I y0,y1,y2 - vfnmaddpd \y0,\y1,\y2,\y0 -.endm +#define VFMADDPD_I( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0 #endif @@ -181,43 +165,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) -.macro VFMADDPD_R y0,y1,y2 - vfmadd231pd \y1,\y2,\y0 -.endm +#define VFMADDPD_R( y0,y1,y2 ) vfmadd231pd y1,y2,y0 -.macro VFMADDPD_I y0,y1,y2 - vfmadd231pd \y1,\y2,\y0 -.endm +#define VFMADDPD_I( y0,y1,y2 ) vfmadd231pd y1,y2,y0 #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) -.macro VFMADDPD_R y0,y1,y2 - vfnmadd231pd \y1,\y2,\y0 -.endm +#define VFMADDPD_R( y0,y1,y2 ) vfnmadd231pd y1,y2,y0 -.macro VFMADDPD_I y0,y1,y2 - vfmadd231pd \y1,\y2,\y0 -.endm +#define VFMADDPD_I( y0,y1,y2 ) vfmadd231pd y1,y2,y0 #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) -.macro VFMADDPD_R y0,y1,y2 - vfmadd231pd \y1,\y2,\y0 -.endm +#define VFMADDPD_R( y0,y1,y2 ) vfmadd231pd y1,y2,y0 -.macro VFMADDPD_I y0,y1,y2 - vfnmadd231pd \y1,\y2,\y0 -.endm +#define VFMADDPD_I( y0,y1,y2 ) vfnmadd231pd y1,y2,y0 #else -.macro VFMADDPD_R y0,y1,y2 - vfnmadd231pd \y1,\y2,\y0 -.endm +#define VFMADDPD_R( y0,y1,y2 ) vfnmadd231pd y1,y2,y0 -.macro VFMADDPD_I y0,y1,y2 - vfnmadd231pd \y1,\y2,\y0 -.endm +#define VFMADDPD_I( y0,y1,y2 ) vfnmadd231pd y1,y2,y0 #endif @@ -233,16 +201,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vbroadcastsd -8 * SIZE(BO, BI, SIZE), %ymm4 vbroadcastsd -7 * SIZE(BO, BI, SIZE), %ymm5 - VFMADDPD_R %ymm8 ,%ymm4,%ymm0 - VFMADDPD_R %ymm12,%ymm4,%ymm1 + VFMADDPD_R( %ymm8 ,%ymm4,%ymm0 ) + VFMADDPD_R( %ymm12,%ymm4,%ymm1 ) vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm6 - VFMADDPD_I %ymm9 ,%ymm5,%ymm0 - VFMADDPD_I %ymm13,%ymm5,%ymm1 + VFMADDPD_I( %ymm9 ,%ymm5,%ymm0 ) + VFMADDPD_I( %ymm13,%ymm5,%ymm1 ) vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm7 - VFMADDPD_R %ymm10,%ymm6,%ymm0 - VFMADDPD_R %ymm14,%ymm6,%ymm1 - VFMADDPD_I %ymm11,%ymm7,%ymm0 - VFMADDPD_I %ymm15,%ymm7,%ymm1 + VFMADDPD_R( %ymm10,%ymm6,%ymm0 ) + VFMADDPD_R( %ymm14,%ymm6,%ymm1 ) + VFMADDPD_I( %ymm11,%ymm7,%ymm0 ) + VFMADDPD_I( %ymm15,%ymm7,%ymm1 ) addq $4, BI addq $8, %rax @@ -337,17 +305,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
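The double-complex kernels follow the same scheme, with vmovddup (and vbroadcastsd on the ymm paths) doing the splatting. A simplified sketch of one step, reduced to a single A vector, with addressing as in the xmm-sized SUB macros below:

	vmovups		-8 * SIZE(AO, %rax, SIZE), %xmm0	/* one complex double of A: { a_r, a_i } */
	vmovddup	-8 * SIZE(BO, BI, SIZE), %xmm4		/* { b_r, b_r } */
	vmovddup	-7 * SIZE(BO, BI, SIZE), %xmm5		/* { b_i, b_i } */
	VFMADDPD_R( %xmm8,%xmm4,%xmm0 )				/* accumulate the a * b_r terms */
	VFMADDPD_I( %xmm9,%xmm5,%xmm0 )				/* accumulate the a * b_i terms, sign per case */

The conjugation cases pick their signs through the VFMADDPD_R/VFMADDPD_I defines above, just as in the single-precision kernel.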
vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 - VFMADDPD_R %xmm8,%xmm4,%xmm0 - VFMADDPD_R %xmm12,%xmm4,%xmm1 + VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) + VFMADDPD_R( %xmm12,%xmm4,%xmm1 ) vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPD_I %xmm9,%xmm5,%xmm0 - VFMADDPD_I %xmm13,%xmm5,%xmm1 + VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) + VFMADDPD_I( %xmm13,%xmm5,%xmm1 ) vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 - VFMADDPD_R %xmm10,%xmm6,%xmm0 - VFMADDPD_R %xmm14,%xmm6,%xmm1 + VFMADDPD_R( %xmm10,%xmm6,%xmm0 ) + VFMADDPD_R( %xmm14,%xmm6,%xmm1 ) vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 - VFMADDPD_I %xmm11,%xmm7,%xmm0 - VFMADDPD_I %xmm15,%xmm7,%xmm1 + VFMADDPD_I( %xmm11,%xmm7,%xmm0 ) + VFMADDPD_I( %xmm15,%xmm7,%xmm1 ) addq $4, BI addq $4, %rax .endm @@ -441,12 +409,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPD_R %xmm8,%xmm4,%xmm0 - VFMADDPD_I %xmm9,%xmm5,%xmm0 + VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) + VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 - VFMADDPD_R %xmm10,%xmm6,%xmm0 - VFMADDPD_I %xmm11,%xmm7,%xmm0 + VFMADDPD_R( %xmm10,%xmm6,%xmm0 ) + VFMADDPD_I( %xmm11,%xmm7,%xmm0 ) addq $4, BI addq $2, %rax .endm @@ -513,10 +481,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups -4 * SIZE(AO, %rax, SIZE), %ymm1 vbroadcastsd -4 * SIZE(BO, BI, SIZE) , %ymm4 vbroadcastsd -3 * SIZE(BO, BI, SIZE) , %ymm5 - VFMADDPD_R %ymm8 ,%ymm4,%ymm0 - VFMADDPD_R %ymm12,%ymm4,%ymm1 - VFMADDPD_I %ymm9 ,%ymm5,%ymm0 - VFMADDPD_I %ymm13,%ymm5,%ymm1 + VFMADDPD_R( %ymm8 ,%ymm4,%ymm0 ) + VFMADDPD_R( %ymm12,%ymm4,%ymm1 ) + VFMADDPD_I( %ymm9 ,%ymm5,%ymm0 ) + VFMADDPD_I( %ymm13,%ymm5,%ymm1 ) addq $2, BI addq $8, %rax @@ -585,12 +553,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x1_SUB vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 - VFMADDPD_R %xmm8,%xmm4,%xmm0 + VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 - VFMADDPD_R %xmm12,%xmm4,%xmm1 + VFMADDPD_R( %xmm12,%xmm4,%xmm1 ) vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPD_I %xmm9,%xmm5,%xmm0 - VFMADDPD_I %xmm13,%xmm5,%xmm1 + VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) + VFMADDPD_I( %xmm13,%xmm5,%xmm1 ) addq $2, BI addq $4, %rax .endm @@ -655,9 +623,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL1x1_SUB
 vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0
 vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4
- VFMADDPD_R %xmm8,%xmm4,%xmm0
+ VFMADDPD_R( %xmm8,%xmm4,%xmm0 )
 vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5
- VFMADDPD_I %xmm9,%xmm5,%xmm0
+ VFMADDPD_I( %xmm9,%xmm5,%xmm0 )
 addq $2, BI
 addq $2, %rax
.endm

From 6e679266f8c0ec59cae54ffc86520d4e0c9e316f Mon Sep 17 00:00:00 2001
From: wernsaar
Date: Wed, 13 Nov 2013 18:32:42 +0100
Subject: [PATCH 054/127] changes for compatibility with Pathscale compiler

---
 common_x86.h    | 15 ++++++++++++++-
 common_x86_64.h | 10 ++++++++++
 2 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/common_x86.h b/common_x86.h
index 48517d900..8245f7078 100644
--- a/common_x86.h
+++ b/common_x86.h
@@ -301,12 +301,25 @@ REALNAME:
 #define PROFCODE
 #endif
+
+#if defined(C_PATHSCALE) || defined(OS_DARWIN)
+
+#define EPILOGUE \
+	.size REALNAME, .-REALNAME; \
+	.section .note.GNU-stack,"",@progbits
+
+#else
+
 #define EPILOGUE \
-	.size REALNAME, .-REALNAME; \
+	.size REALNAME, .-REALNAME; \
 	.section .note.GNU-stack,"",%progbits
 #endif
+
+
+#endif
+
 #ifdef XDOUBLE
 #define FLD fldt
 #define FST fstpt

diff --git a/common_x86_64.h b/common_x86_64.h
index 188903848..4fe23448f 100644
--- a/common_x86_64.h
+++ b/common_x86_64.h
@@ -372,10 +372,20 @@ REALNAME:
 #define PROFCODE
 #endif
+#if defined(C_PATHSCALE) || defined(OS_DARWIN)
+
+#define EPILOGUE \
+	.size REALNAME, .-REALNAME; \
+	.section .note.GNU-stack,"",@progbits
+
+#else
+
 #define EPILOGUE \
 	.size REALNAME, .-REALNAME; \
 	.section .note.GNU-stack,"",%progbits
+#endif
+
 #endif

From 6f4a0ebe38c1772e946d874e00a1b3ce3f83cb6f Mon Sep 17 00:00:00 2001
From: wernsaar
Date: Thu, 14 Nov 2013 13:52:47 +0100
Subject: [PATCH 055/127] added max- and min-kernels for all precisions

---
 kernel/arm/KERNEL.ARMV7  |  48 ++--
 kernel/arm/iamax_vfpv3.S | 478 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 502 insertions(+), 24 deletions(-)
 create mode 100644 kernel/arm/iamax_vfpv3.S

diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7
index 10db665b2..1a8c6a3e5 100644
--- a/kernel/arm/KERNEL.ARMV7
+++ b/kernel/arm/KERNEL.ARMV7
@@ -1,34 +1,34 @@
-SAMAXKERNEL = ../arm/amax.c
-DAMAXKERNEL = ../arm/amax.c
-CAMAXKERNEL = ../arm/zamax.c
-ZAMAXKERNEL = ../arm/zamax.c
+SAMAXKERNEL = iamax_vfpv3.S
+DAMAXKERNEL = iamax_vfpv3.S
+CAMAXKERNEL = iamax_vfpv3.S
+ZAMAXKERNEL = iamax_vfpv3.S

-SAMINKERNEL = ../arm/amin.c
-DAMINKERNEL = ../arm/amin.c
-CAMINKERNEL = ../arm/zamin.c
-ZAMINKERNEL = ../arm/zamin.c
+SAMINKERNEL = iamax_vfpv3.S
+DAMINKERNEL = iamax_vfpv3.S
+CAMINKERNEL = iamax_vfpv3.S
+ZAMINKERNEL = iamax_vfpv3.S

-SMAXKERNEL = ../arm/max.c
-DMAXKERNEL = ../arm/max.c
+SMAXKERNEL = iamax_vfpv3.S
+DMAXKERNEL = iamax_vfpv3.S

-SMINKERNEL = ../arm/min.c
-DMINKERNEL = ../arm/min.c
+SMINKERNEL = iamax_vfpv3.S
+DMINKERNEL = iamax_vfpv3.S

-ISAMAXKERNEL = ../arm/iamax.c
-IDAMAXKERNEL = ../arm/iamax.c
-ICAMAXKERNEL = ../arm/izamax.c
-IZAMAXKERNEL = ../arm/izamax.c
+ISAMAXKERNEL = iamax_vfpv3.S
+IDAMAXKERNEL = iamax_vfpv3.S
+ICAMAXKERNEL = iamax_vfpv3.S
+IZAMAXKERNEL = iamax_vfpv3.S

-ISAMINKERNEL = ../arm/iamin.c
-IDAMINKERNEL = ../arm/iamin.c
-ICAMINKERNEL = ../arm/izamin.c
-IZAMINKERNEL = ../arm/izamin.c
+ISAMINKERNEL = iamax_vfpv3.S
+IDAMINKERNEL = iamax_vfpv3.S
+ICAMINKERNEL = iamax_vfpv3.S
+IZAMINKERNEL = iamax_vfpv3.S

-ISMAXKERNEL = ../arm/imax.c
-IDMAXKERNEL = ../arm/imax.c
+ISMAXKERNEL = iamax_vfpv3.S
+IDMAXKERNEL = iamax_vfpv3.S

-ISMINKERNEL = ../arm/imin.c
-IDMINKERNEL = ../arm/imin.c
+ISMINKERNEL = iamax_vfpv3.S
+IDMINKERNEL = 
iamax_vfpv3.S SSWAPKERNEL = ../arm/swap.c DSWAPKERNEL = ../arm/swap.c diff --git a/kernel/arm/iamax_vfpv3.S b/kernel/arm/iamax_vfpv3.S new file mode 100644 index 000000000..1d7344898 --- /dev/null +++ b/kernel/arm/iamax_vfpv3.S @@ -0,0 +1,478 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/14 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define N r0 +#define X r1 +#define INC_X r2 +#define INDEX r3 +#define Z r4 + +#define I r12 + +#define X_PRE 512 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +#if defined(USE_ABS) + +#if defined(DOUBLE) + +#define VABS(x0,x1) vabs.f64 x0, x1 + +#else + +#define VABS(x0,x1) vabs.f32 x0, x1 + +#endif + +#else + +#define VABS(x0,x1) nop + +#endif + +/*****************************************************************************************/ + +#if defined(USE_MIN) + +#define MOVCOND movlt + +#if defined(DOUBLE) + +#define VMOVCOND vmovlt.f64 + +#else + +#define VMOVCOND vmovlt.f32 + +#endif + +#else + +#define MOVCOND movgt + +#if defined(DOUBLE) + +#define VMOVCOND vmovgt.f64 + +#else + +#define VMOVCOND vmovgt.f32 + +#endif + + +#endif + + +/*****************************************************************************************/ + + + +#if !defined(COMPLEX) + +#if defined(DOUBLE) + +.macro INIT_F + + fldmiad X!, { d0 } + VABS( d0, d0 ) + mov Z, #1 + mov INDEX, Z + +.endm + +.macro KERNEL_F1 + + fldmiad X!, { d4 } + add Z, Z, #1 + VABS( d4, d4 ) + vcmpe.f64 d4, d0 + vmrs APSR_nzcv, fpscr + VMOVCOND d0, d4 + MOVCOND INDEX, Z + +.endm + +.macro INIT_S + + fldmiad X, { d0 } + VABS( d0, d0 ) + mov Z, #1 + mov INDEX, Z + add X, X, INC_X + +.endm + + +.macro KERNEL_S1 + + fldmiad X, { d4 } + add Z, Z, #1 + VABS( d4, d4 ) + vcmpe.f64 d4, d0 + vmrs APSR_nzcv, fpscr + VMOVCOND d0, d4 + MOVCOND INDEX, Z + add X, X, INC_X + +.endm + +#else + +.macro INIT_F + + fldmias X!, { s0 } + VABS( s0, s0 ) + mov Z, #1 + mov INDEX, Z + +.endm + +.macro KERNEL_F1 + + fldmias X!, { s4 } + add Z, Z, #1 + VABS( s4, s4 ) + vcmpe.f32 s4, s0 + vmrs APSR_nzcv, fpscr + VMOVCOND s0, s4 + MOVCOND INDEX, Z + +.endm + +.macro INIT_S + + fldmias X, { s0 } + VABS( s0, s0 ) + mov Z, #1 + mov INDEX, Z + add X, X, INC_X + +.endm + + +.macro KERNEL_S1 + + fldmias X, { s4 } + add Z, Z, #1 + VABS( s4, s4 ) + vcmpe.f32 s4, s0 + vmrs APSR_nzcv, fpscr + VMOVCOND s0, s4 + MOVCOND INDEX, Z + add X, X, INC_X + +.endm + + + + +#endif + +#else + +#if defined(DOUBLE) + +.macro INIT_F + + fldmiad X!, { d0 -d1 } + vabs.f64 d0, d0 + vabs.f64 d1, d1 + vadd.f64 d0 , d0, d1 + mov Z, #1 + mov INDEX, Z + +.endm + + +.macro KERNEL_F1 + + fldmiad X!, { d4 - d5 } + add Z, Z, #1 + vabs.f64 d4, d4 + vabs.f64 d5, d5 + vadd.f64 d4 , d4, d5 + vcmpe.f64 d4, d0 + vmrs APSR_nzcv, fpscr + VMOVCOND d0, d4 + MOVCOND INDEX, Z + +.endm + +.macro INIT_S + + fldmiad X, { d0 -d1 } + vabs.f64 d0, d0 + vabs.f64 d1, d1 + vadd.f64 d0 , d0, d1 + mov Z, #1 + mov INDEX, Z + add X, X, INC_X + +.endm + + + +.macro KERNEL_S1 + + fldmiad X, { d4 - d5 } + add Z, Z, #1 + vabs.f64 d4, d4 + vabs.f64 d5, d5 + vadd.f64 d4 , d4, d5 + vcmpe.f64 d4, d0 + vmrs APSR_nzcv, fpscr + VMOVCOND d0, d4 + MOVCOND INDEX, Z + add X, X, INC_X + +.endm + +#else + +.macro INIT_F + + fldmias X!, { s0 -s1 } + vabs.f32 s0, s0 + vabs.f32 s1, s1 + vadd.f32 s0 , s0, s1 + mov Z, #1 + mov INDEX, Z + +.endm + + +.macro KERNEL_F1 + + fldmias X!, { s4 - s5 } + add Z, 
Z, #1 + vabs.f32 s4, s4 + vabs.f32 s5, s5 + vadd.f32 s4 , s4, s5 + vcmpe.f32 s4, s0 + vmrs APSR_nzcv, fpscr + VMOVCOND s0, s4 + MOVCOND INDEX, Z + +.endm + +.macro INIT_S + + fldmias X, { s0 -s1 } + vabs.f32 s0, s0 + vabs.f32 s1, s1 + vadd.f32 s0 , s0, s1 + mov Z, #1 + mov INDEX, Z + add X, X, INC_X + +.endm + + + +.macro KERNEL_S1 + + fldmias X, { s4 - s5 } + add Z, Z, #1 + vabs.f32 s4, s4 + vabs.f32 s5, s5 + vadd.f32 s4 , s4, s5 + vcmpe.f32 s4, s0 + vmrs APSR_nzcv, fpscr + VMOVCOND s0, s4 + MOVCOND INDEX, Z + add X, X, INC_X + +.endm + + + + +#endif + +#endif + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + push {r4} + +#if defined(DOUBLE) + vsub.f64 d0 , d0 , d0 +#else + vsub.f32 s0 , s0 , s0 +#endif + mov INDEX, #0 + + cmp N, #0 + ble iamax_kernel_L999 + + cmp INC_X, #0 + beq iamax_kernel_L999 + + + cmp INC_X, #1 + bne iamax_kernel_S_BEGIN + + +iamax_kernel_F_BEGIN: + + INIT_F + + subs N, N , #1 + ble iamax_kernel_L999 + + asrs I, N, #2 // I = N / 4 + ble iamax_kernel_F1 + + .align 5 + +iamax_kernel_F4: + + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 +#if defined(COMPLEX) && defined(DOUBLE) + pld [ X, #X_PRE ] +#endif + KERNEL_F1 + KERNEL_F1 + + subs I, I, #1 + ble iamax_kernel_F1 + + +#if defined(COMPLEX) || defined(DOUBLE) + pld [ X, #X_PRE ] +#endif + KERNEL_F1 + KERNEL_F1 +#if defined(COMPLEX) && defined(DOUBLE) + pld [ X, #X_PRE ] +#endif + KERNEL_F1 + KERNEL_F1 + + subs I, I, #1 + bne iamax_kernel_F4 + +iamax_kernel_F1: + + ands I, N, #3 + ble iamax_kernel_L999 + +iamax_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne iamax_kernel_F10 + + b iamax_kernel_L999 + +iamax_kernel_S_BEGIN: + +#if defined(COMPLEX) + +#if defined(DOUBLE) + lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 +#else + lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 +#endif + +#else + +#if defined(DOUBLE) + lsl INC_X, INC_X, #3 // INC_X * SIZE +#else + lsl INC_X, INC_X, #2 // INC_X * SIZE +#endif + +#endif + + INIT_S + + subs N, N , #1 + ble iamax_kernel_L999 + + asrs I, N, #2 // I = N / 4 + ble iamax_kernel_S1 + + .align 5 + +iamax_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne iamax_kernel_S4 + +iamax_kernel_S1: + + ands I, N, #3 + ble iamax_kernel_L999 + +iamax_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne iamax_kernel_S10 + + +iamax_kernel_L999: + + mov r0, INDEX // set return value + + pop {r4} + bx lr + + EPILOGUE + From 3dabd7e6e60161635e25985ad62d595e57d4ce00 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 14 Nov 2013 19:06:19 +0100 Subject: [PATCH 056/127] added swap-kernel for all precisions --- kernel/arm/KERNEL.ARMV7 | 8 +- kernel/arm/swap_vfpv3.S | 354 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 358 insertions(+), 4 deletions(-) create mode 100644 kernel/arm/swap_vfpv3.S diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index 1a8c6a3e5..8118c5330 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -30,10 +30,10 @@ IDMAXKERNEL = iamax_vfpv3.S ISMINKERNEL = iamax_vfpv3.S IDMINKERNEL = iamax_vfpv3.S -SSWAPKERNEL = ../arm/swap.c -DSWAPKERNEL = ../arm/swap.c -CSWAPKERNEL = ../arm/zswap.c -ZSWAPKERNEL = ../arm/zswap.c +SSWAPKERNEL = swap_vfpv3.S +DSWAPKERNEL = swap_vfpv3.S +CSWAPKERNEL = swap_vfpv3.S +ZSWAPKERNEL = swap_vfpv3.S SASUMKERNEL = asum_vfpv3.S DASUMKERNEL = asum_vfpv3.S diff --git a/kernel/arm/swap_vfpv3.S b/kernel/arm/swap_vfpv3.S new 
file mode 100644 index 000000000..352875188 --- /dev/null +++ b/kernel/arm/swap_vfpv3.S @@ -0,0 +1,354 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/14 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_INC_X [fp, #0 ] +#define OLD_Y [fp, #4 ] +#define OLD_INC_Y [fp, #8 ] + + +#define N r0 +#define Y r1 +#define INC_X r2 +#define X r3 +#define INC_Y r4 + +#define I r12 + +#define X_PRE 512 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +/*****************************************************************************************/ + + + +#if !defined(COMPLEX) + +#if defined(DOUBLE) + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + pld [ Y, #X_PRE ] + fldmiad X, { d0 - d3 } + fldmiad Y, { d4 - d7 } + fstmiad Y!, { d0 - d3 } + fstmiad X!, { d4 - d7} + +.endm + + +.macro KERNEL_F1 + + fldmiad X, { d0 } + fldmiad Y, { d4 } + fstmiad Y!, { d0 } + fstmiad X!, { d4 } + +.endm + +.macro KERNEL_S1 + + fldmiad X, { d0 } + fldmiad Y, { d4 } + fstmiad Y, { d0 } + fstmiad X, { d4 } + add X, X, INC_X + add Y, Y, INC_Y + +.endm + +#else + +.macro KERNEL_F4 + + fldmias X, { s0 - s3 } + fldmias Y, { s4 - s7 } + fstmias Y!, { s0 - s3 } + fstmias X!, { s4 - s7} + +.endm + + +.macro KERNEL_F1 + + fldmias X, { s0 } + fldmias Y, { s4 } + fstmias Y!, { s0 } + fstmias X!, { s4 } + +.endm + +.macro KERNEL_S1 + + fldmias X, { s0 } + fldmias Y, { s4 } + fstmias Y, { s0 } + fstmias X, { s4 } + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + +#endif + +#else + +#if defined(DOUBLE) + +.macro KERNEL_F4 + + pld [ X, 
#X_PRE ] + pld [ Y, #X_PRE ] + fldmiad X, { d0 - d3 } + fldmiad Y, { d4 - d7 } + fstmiad Y!, { d0 - d3 } + fstmiad X!, { d4 - d7} + + pld [ X, #X_PRE ] + pld [ Y, #X_PRE ] + fldmiad X, { d0 - d3 } + fldmiad Y, { d4 - d7 } + fstmiad Y!, { d0 - d3 } + fstmiad X!, { d4 - d7} + +.endm + +.macro KERNEL_F1 + + fldmiad X, { d0 - d1 } + fldmiad Y, { d4 - d5 } + fstmiad Y!, { d0 - d1 } + fstmiad X!, { d4 - d5 } + +.endm + +.macro KERNEL_S1 + + fldmiad X, { d0 - d1 } + fldmiad Y, { d4 - d5 } + fstmiad Y, { d0 - d1 } + fstmiad X, { d4 - d5 } + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + +#else + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + pld [ Y, #X_PRE ] + fldmias X, { s0 - s3 } + fldmias Y, { s4 - s7 } + fstmias Y!, { s0 - s3 } + fstmias X!, { s4 - s7} + + fldmias X, { s0 - s3 } + fldmias Y, { s4 - s7 } + fstmias Y!, { s0 - s3 } + fstmias X!, { s4 - s7} + +.endm + +.macro KERNEL_F1 + + fldmias X, { s0 - s1 } + fldmias Y, { s4 - s5 } + fstmias Y!, { s0 - s1 } + fstmias X!, { s4 - s5 } + +.endm + +.macro KERNEL_S1 + + fldmias X, { s0 - s1 } + fldmias Y, { s4 - s5 } + fstmias Y, { s0 - s1 } + fstmias X, { s4 - s5 } + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + + +#endif + +#endif + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + push {r4 , fp} + add fp, sp, #8 + + ldr INC_X , OLD_INC_X + ldr Y, OLD_Y + ldr INC_Y , OLD_INC_Y + + + cmp N, #0 + ble swap_kernel_L999 + + cmp INC_X, #0 + beq swap_kernel_L999 + + cmp INC_Y, #0 + beq swap_kernel_L999 + + cmp INC_X, #1 + bne swap_kernel_S_BEGIN + + cmp INC_Y, #1 + bne swap_kernel_S_BEGIN + + +swap_kernel_F_BEGIN: + + + asrs I, N, #2 // I = N / 4 + ble swap_kernel_F1 + + .align 5 + +swap_kernel_F4: + +#if !defined(COMPLEX) && !defined(DOUBLE) + pld [ X, #X_PRE ] + pld [ Y, #X_PRE ] +#endif + + KERNEL_F4 + + subs I, I, #1 + ble swap_kernel_F1 + + KERNEL_F4 + + subs I, I, #1 + bne swap_kernel_F4 + +swap_kernel_F1: + + ands I, N, #3 + ble swap_kernel_L999 + +swap_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne swap_kernel_F10 + + b swap_kernel_L999 + +swap_kernel_S_BEGIN: + +#if defined(COMPLEX) + +#if defined(DOUBLE) + lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 + lsl INC_Y, INC_Y, #4 // INC_Y * SIZE * 2 +#else + lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 + lsl INC_Y, INC_Y, #3 // INC_Y * SIZE * 2 +#endif + +#else + +#if defined(DOUBLE) + lsl INC_X, INC_X, #3 // INC_X * SIZE + lsl INC_Y, INC_Y, #3 // INC_Y * SIZE +#else + lsl INC_X, INC_X, #2 // INC_X * SIZE + lsl INC_Y, INC_Y, #2 // INC_Y * SIZE +#endif + +#endif + + + asrs I, N, #2 // I = N / 4 + ble swap_kernel_S1 + + .align 5 + +swap_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne swap_kernel_S4 + +swap_kernel_S1: + + ands I, N, #3 + ble swap_kernel_L999 + +swap_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne swap_kernel_S10 + + +swap_kernel_L999: + + mov r0, #0 // set return value + + sub sp, fp, #8 + pop {r4,fp} + bx lr + + EPILOGUE + From f1b452e160c9954d40b50e00962db311bccace3c Mon Sep 17 00:00:00 2001 From: wernsaar Date: Fri, 15 Nov 2013 11:56:43 +0100 Subject: [PATCH 057/127] added scal kernel for all precisions --- kernel/arm/KERNEL.ARMV7 | 8 +- kernel/arm/scal_vfpv3.S | 376 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 380 insertions(+), 4 deletions(-) create mode 100644 kernel/arm/scal_vfpv3.S diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index 
8118c5330..4e1939574 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -65,10 +65,10 @@ DROTKERNEL = ../arm/rot.c CROTKERNEL = ../arm/zrot.c ZROTKERNEL = ../arm/zrot.c -SSCALKERNEL = ../arm/scal.c -DSCALKERNEL = ../arm/scal.c -CSCALKERNEL = ../arm/zscal.c -ZSCALKERNEL = ../arm/zscal.c +SSCALKERNEL = scal_vfpv3.S +DSCALKERNEL = scal_vfpv3.S +CSCALKERNEL = scal_vfpv3.S +ZSCALKERNEL = scal_vfpv3.S SGEMVNKERNEL = gemv_n.c DGEMVNKERNEL = gemv_n.c diff --git a/kernel/arm/scal_vfpv3.S b/kernel/arm/scal_vfpv3.S new file mode 100644 index 000000000..a04b7241e --- /dev/null +++ b/kernel/arm/scal_vfpv3.S @@ -0,0 +1,376 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/15 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_INC_X [sp, #0 ] + + +#define N r0 +#define INC_X r1 +#define X r3 + +#define I r12 + +#define X_PRE 512 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +/*****************************************************************************************/ + + + +#if !defined(COMPLEX) + +#if defined(DOUBLE) + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + fldmiad X, { d4 - d7 } + vmul.f64 d4, d4, d0 + vmul.f64 d5, d5, d0 + vmul.f64 d6, d6, d0 + fstmiad X!, { d4 - d5 } + vmul.f64 d7, d7, d0 + fstmiad X!, { d6 - d7 } + +.endm + + +.macro KERNEL_F1 + + fldmiad X, { d4 } + vmul.f64 d4, d4, d0 + fstmiad X!, { d4 } + +.endm + +.macro KERNEL_S1 + + fldmiad X, { d4 } + vmul.f64 d4, d4, d0 + fstmiad X, { d4 } + add X, X, INC_X + +.endm + +#else + +.macro KERNEL_F4 + + fldmias X, { s4 - s7 } + vmul.f32 s4, s4, s0 + vmul.f32 s5, s5, s0 + vmul.f32 s6, s6, s0 + fstmias X!, { s4 - s5 } + vmul.f32 s7, s7, s0 + fstmias X!, { s6 - s7 } + +.endm + + +.macro KERNEL_F1 + + fldmias X, { s4 } + vmul.f32 s4, s4, s0 + fstmias X!, { s4 } + +.endm + +.macro KERNEL_S1 + + fldmias X, { s4 } + vmul.f32 s4, s4, s0 + fstmias X, { s4 } + add X, X, INC_X + +.endm + + + +#endif + +#else + +#if defined(DOUBLE) + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + + fldmiad X, { d4 - d5 } + vmul.f64 d2, d0, d4 + fnmacd d2, d1, d5 + vmul.f64 d3, d0, d5 + fmacd d3, d1, d4 + fstmiad X!, { d2 - d3 } + + fldmiad X, { d4 - d5 } + vmul.f64 d2, d0, d4 + fnmacd d2, d1, d5 + vmul.f64 d3, d0, d5 + fmacd d3, d1, d4 + fstmiad X!, { d2 - d3 } + + pld [ X, #X_PRE ] + + fldmiad X, { d4 - d5 } + vmul.f64 d2, d0, d4 + fnmacd d2, d1, d5 + vmul.f64 d3, d0, d5 + fmacd d3, d1, d4 + fstmiad X!, { d2 - d3 } + + fldmiad X, { d4 - d5 } + vmul.f64 d2, d0, d4 + fnmacd d2, d1, d5 + vmul.f64 d3, d0, d5 + fmacd d3, d1, d4 + fstmiad X!, { d2 - d3 } + +.endm + + +.macro KERNEL_F1 + + fldmiad X, { d4 - d5 } + vmul.f64 d2, d0, d4 + fnmacd d2, d1, d5 + vmul.f64 d3, d0, d5 + fmacd d3, d1, d4 + fstmiad X!, { d2 - d3 } + +.endm + +.macro KERNEL_S1 + + fldmiad X, { d4 - d5 } + vmul.f64 d2, d0, d4 + fnmacd d2, d1, d5 + vmul.f64 d3, d0, d5 + fmacd d3, d1, d4 + fstmiad X, { d2 - d3 } + add X, X, INC_X + +.endm + + +#else + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + + fldmias X, { s4 - s5 } + vmul.f32 s2, s0, s4 + fnmacs s2, s1, s5 + vmul.f32 s3, s0, s5 + fmacs s3, s1, s4 + fstmias X!, { s2 - s3 } + + fldmias X, { s4 - s5 } + vmul.f32 s2, s0, s4 + fnmacs s2, s1, s5 + vmul.f32 s3, s0, s5 + fmacs s3, s1, s4 + fstmias X!, { s2 - s3 } + + fldmias X, { s4 - s5 } + vmul.f32 s2, s0, s4 + fnmacs s2, s1, s5 + vmul.f32 s3, s0, s5 + fmacs s3, s1, s4 + fstmias X!, { s2 - s3 } + + fldmias X, { s4 - s5 } + vmul.f32 s2, s0, s4 + fnmacs s2, s1, s5 + vmul.f32 s3, s0, s5 + fmacs s3, s1, s4 + fstmias X!, { s2 - s3 } + +.endm + + +.macro KERNEL_F1 + + fldmias X, { s4 - s5 } + vmul.f32 s2, s0, s4 + fnmacs s2, s1, s5 + vmul.f32 s3, s0, s5 + fmacs s3, s1, s4 + fstmias X!, { s2 - s3 } + +.endm + +.macro KERNEL_S1 + + fldmias X, { s4 - s5 } + vmul.f32 s2, s0, s4 + 
fnmacs s2, s1, s5 + vmul.f32 s3, s0, s5 + fmacs s3, s1, s4 + fstmias X, { s2 - s3 } + add X, X, INC_X + +.endm + + + +#endif + +#endif + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + ldr INC_X , OLD_INC_X + + cmp N, #0 + ble scal_kernel_L999 + + cmp INC_X, #0 + ble scal_kernel_L999 + + cmp INC_X, #1 + bne scal_kernel_S_BEGIN + + +scal_kernel_F_BEGIN: + + + asrs I, N, #2 // I = N / 4 + ble scal_kernel_F1 + + .align 5 + +scal_kernel_F4: + +#if !defined(COMPLEX) && !defined(DOUBLE) + pld [ X, #X_PRE ] +#endif + + KERNEL_F4 + + subs I, I, #1 + ble scal_kernel_F1 + + KERNEL_F4 + + subs I, I, #1 + bne scal_kernel_F4 + +scal_kernel_F1: + + ands I, N, #3 + ble scal_kernel_L999 + +scal_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne scal_kernel_F10 + + b scal_kernel_L999 + +scal_kernel_S_BEGIN: + +#if defined(COMPLEX) + +#if defined(DOUBLE) + lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 +#else + lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 +#endif + +#else + +#if defined(DOUBLE) + lsl INC_X, INC_X, #3 // INC_X * SIZE +#else + lsl INC_X, INC_X, #2 // INC_X * SIZE +#endif + +#endif + + + asrs I, N, #2 // I = N / 4 + ble scal_kernel_S1 + + .align 5 + +scal_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne scal_kernel_S4 + +scal_kernel_S1: + + ands I, N, #3 + ble scal_kernel_L999 + +scal_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne scal_kernel_S10 + + +scal_kernel_L999: + + mov r0, #0 // set return value + + bx lr + + EPILOGUE + From 23dd474cd080f5e1beda6925f184eda9ed133764 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Fri, 15 Nov 2013 14:08:57 +0100 Subject: [PATCH 058/127] added rot kernel for all precisions --- kernel/arm/KERNEL.ARMV7 | 8 +- kernel/arm/rot_vfpv3.S | 584 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 588 insertions(+), 4 deletions(-) create mode 100644 kernel/arm/rot_vfpv3.S diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index 4e1939574..d349e2aa5 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -60,10 +60,10 @@ DNRM2KERNEL = ../arm/nrm2.c CNRM2KERNEL = ../arm/znrm2.c ZNRM2KERNEL = ../arm/znrm2.c -SROTKERNEL = ../arm/rot.c -DROTKERNEL = ../arm/rot.c -CROTKERNEL = ../arm/zrot.c -ZROTKERNEL = ../arm/zrot.c +SROTKERNEL = rot_vfpv3.S +DROTKERNEL = rot_vfpv3.S +CROTKERNEL = rot_vfpv3.S +ZROTKERNEL = rot_vfpv3.S SSCALKERNEL = scal_vfpv3.S DSCALKERNEL = scal_vfpv3.S diff --git a/kernel/arm/rot_vfpv3.S b/kernel/arm/rot_vfpv3.S new file mode 100644 index 000000000..663ecdf81 --- /dev/null +++ b/kernel/arm/rot_vfpv3.S @@ -0,0 +1,584 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/15 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_INC_Y [fp, #0 ] + + +#define N r0 +#define X r1 +#define INC_X r2 +#define Y r3 +#define INC_Y r4 + +#define I r12 + +#define X_PRE 512 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +/*****************************************************************************************/ + + + +#if !defined(COMPLEX) + +#if defined(DOUBLE) + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + pld [ Y, #X_PRE ] + + fldmiad X, { d4 } + fldmiad Y, { d5 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d5 + vmul.f64 d3 , d0, d5 + fnmacd d3 , d1, d4 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + + fldmiad X, { d4 } + fldmiad Y, { d5 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d5 + vmul.f64 d3 , d0, d5 + fnmacd d3 , d1, d4 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + + fldmiad X, { d4 } + fldmiad Y, { d5 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d5 + vmul.f64 d3 , d0, d5 + fnmacd d3 , d1, d4 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + + fldmiad X, { d4 } + fldmiad Y, { d5 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d5 + vmul.f64 d3 , d0, d5 + fnmacd d3 , d1, d4 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + +.endm + + +.macro KERNEL_F1 + + fldmiad X, { d4 } + fldmiad Y, { d5 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d5 + vmul.f64 d3 , d0, d5 + fnmacd d3 , d1, d4 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + +.endm + +.macro KERNEL_S1 + + fldmiad X, { d4 } + fldmiad Y, { d5 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d5 + vmul.f64 d3 , d0, d5 + fnmacd d3 , d1, d4 + fstmiad X, { d2 } + fstmiad Y, { d3 } + + add X, X, INC_X + add Y, Y, INC_Y + +.endm + +#else + +.macro KERNEL_F4 + + fldmias X, { s4 } + fldmias Y, { s5 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s5 + vmul.f32 s3 , s0, s5 + fnmacs s3 , s1, s4 + fstmias X!, { s2 } + fstmias Y!, { s3 } + + fldmias X, { s4 } + fldmias Y, { s5 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s5 + vmul.f32 s3 , s0, s5 + fnmacs s3 , s1, s4 + fstmias X!, { s2 } + fstmias Y!, { s3 } + + fldmias X, { s4 } + fldmias Y, { s5 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s5 + vmul.f32 s3 , s0, s5 + fnmacs s3 , s1, s4 + fstmias X!, 
{ s2 } + fstmias Y!, { s3 } + + fldmias X, { s4 } + fldmias Y, { s5 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s5 + vmul.f32 s3 , s0, s5 + fnmacs s3 , s1, s4 + fstmias X!, { s2 } + fstmias Y!, { s3 } + +.endm + + +.macro KERNEL_F1 + + fldmias X, { s4 } + fldmias Y, { s5 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s5 + vmul.f32 s3 , s0, s5 + fnmacs s3 , s1, s4 + fstmias X!, { s2 } + fstmias Y!, { s3 } + +.endm + +.macro KERNEL_S1 + + fldmias X, { s4 } + fldmias Y, { s5 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s5 + vmul.f32 s3 , s0, s5 + fnmacs s3 , s1, s4 + fstmias X, { s2 } + fstmias Y, { s3 } + + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + + +#endif + +#else + +#if defined(DOUBLE) + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + pld [ Y, #X_PRE ] + + fldmiad X, { d4 - d5 } + fldmiad Y, { d6 - d7 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d6 + vmul.f64 d3 , d0, d6 + fnmacd d3 , d1, d4 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + vmul.f64 d2 , d0, d5 + fmacd d2 , d1, d7 + vmul.f64 d3 , d0, d7 + fnmacd d3 , d1, d5 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + + fldmiad X, { d4 - d5 } + fldmiad Y, { d6 - d7 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d6 + vmul.f64 d3 , d0, d6 + fnmacd d3 , d1, d4 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + vmul.f64 d2 , d0, d5 + fmacd d2 , d1, d7 + vmul.f64 d3 , d0, d7 + fnmacd d3 , d1, d5 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + + pld [ X, #X_PRE ] + pld [ Y, #X_PRE ] + + fldmiad X, { d4 - d5 } + fldmiad Y, { d6 - d7 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d6 + vmul.f64 d3 , d0, d6 + fnmacd d3 , d1, d4 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + vmul.f64 d2 , d0, d5 + fmacd d2 , d1, d7 + vmul.f64 d3 , d0, d7 + fnmacd d3 , d1, d5 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + + fldmiad X, { d4 - d5 } + fldmiad Y, { d6 - d7 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d6 + vmul.f64 d3 , d0, d6 + fnmacd d3 , d1, d4 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + vmul.f64 d2 , d0, d5 + fmacd d2 , d1, d7 + vmul.f64 d3 , d0, d7 + fnmacd d3 , d1, d5 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + +.endm + + +.macro KERNEL_F1 + + fldmiad X, { d4 - d5 } + fldmiad Y, { d6 - d7 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d6 + vmul.f64 d3 , d0, d6 + fnmacd d3 , d1, d4 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + vmul.f64 d2 , d0, d5 + fmacd d2 , d1, d7 + vmul.f64 d3 , d0, d7 + fnmacd d3 , d1, d5 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + + +.endm + +.macro KERNEL_S1 + + fldmiad X, { d4 - d5 } + fldmiad Y, { d6 - d7 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d6 + vmul.f64 d3 , d0, d6 + fnmacd d3 , d1, d4 + vstr d2 , [ X, #0 ] + vstr d3 , [ Y, #0 ] + vmul.f64 d2 , d0, d5 + fmacd d2 , d1, d7 + vmul.f64 d3 , d0, d7 + fnmacd d3 , d1, d5 + vstr d2 , [ X, #8 ] + vstr d3 , [ Y, #8 ] + + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + + +#else + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + pld [ Y, #X_PRE ] + + fldmias X, { s4 - s5 } + fldmias Y, { s6 - s7 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s6 + vmul.f32 s3 , s0, s6 + fnmacs s3 , s1, s4 + fstmias X!, { s2 } + fstmias Y!, { s3 } + vmul.f32 s2 , s0, s5 + fmacs s2 , s1, s7 + vmul.f32 s3 , s0, s7 + fnmacs s3 , s1, s5 + fstmias X!, { s2 } + fstmias Y!, { s3 } + + fldmias X, { s4 - s5 } + fldmias Y, { s6 - s7 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s6 + vmul.f32 s3 , s0, s6 + fnmacs s3 , s1, s4 + fstmias X!, { s2 } + fstmias Y!, { s3 } + vmul.f32 s2 , s0, s5 + fmacs s2 , s1, s7 + vmul.f32 s3 , s0, s7 + fnmacs s3 , s1, s5 + fstmias X!, { s2 } + fstmias Y!, { s3 } + + pld [ X, #X_PRE ] + pld [ Y, #X_PRE ] + + fldmias X, { s4 - s5 } + fldmias Y, { s6 - 
s7 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s6 + vmul.f32 s3 , s0, s6 + fnmacs s3 , s1, s4 + fstmias X!, { s2 } + fstmias Y!, { s3 } + vmul.f32 s2 , s0, s5 + fmacs s2 , s1, s7 + vmul.f32 s3 , s0, s7 + fnmacs s3 , s1, s5 + fstmias X!, { s2 } + fstmias Y!, { s3 } + + fldmias X, { s4 - s5 } + fldmias Y, { s6 - s7 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s6 + vmul.f32 s3 , s0, s6 + fnmacs s3 , s1, s4 + fstmias X!, { s2 } + fstmias Y!, { s3 } + vmul.f32 s2 , s0, s5 + fmacs s2 , s1, s7 + vmul.f32 s3 , s0, s7 + fnmacs s3 , s1, s5 + fstmias X!, { s2 } + fstmias Y!, { s3 } + +.endm + + +.macro KERNEL_F1 + + fldmias X, { s4 - s5 } + fldmias Y, { s6 - s7 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s6 + vmul.f32 s3 , s0, s6 + fnmacs s3 , s1, s4 + fstmias X!, { s2 } + fstmias Y!, { s3 } + vmul.f32 s2 , s0, s5 + fmacs s2 , s1, s7 + vmul.f32 s3 , s0, s7 + fnmacs s3 , s1, s5 + fstmias X!, { s2 } + fstmias Y!, { s3 } + + +.endm + +.macro KERNEL_S1 + + fldmias X, { s4 - s5 } + fldmias Y, { s6 - s7 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s6 + vmul.f32 s3 , s0, s6 + fnmacs s3 , s1, s4 + vstr s2 , [ X, #0 ] + vstr s3 , [ Y, #0 ] + vmul.f32 s2 , s0, s5 + fmacs s2 , s1, s7 + vmul.f32 s3 , s0, s7 + fnmacs s3 , s1, s5 + vstr s2 , [ X, #4 ] + vstr s3 , [ Y, #4 ] + + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + +#endif + +#endif + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + push {r4 , fp} + add fp, sp, #8 + + ldr INC_Y , OLD_INC_Y + + + cmp N, #0 + ble rot_kernel_L999 + + cmp INC_X, #0 + beq rot_kernel_L999 + + cmp INC_Y, #0 + beq rot_kernel_L999 + + cmp INC_X, #1 + bne rot_kernel_S_BEGIN + + cmp INC_Y, #1 + bne rot_kernel_S_BEGIN + + +rot_kernel_F_BEGIN: + + + asrs I, N, #2 // I = N / 4 + ble rot_kernel_F1 + + .align 5 + +rot_kernel_F4: + +#if !defined(COMPLEX) && !defined(DOUBLE) + pld [ X, #X_PRE ] + pld [ Y, #X_PRE ] +#endif + + KERNEL_F4 + + subs I, I, #1 + ble rot_kernel_F1 + + KERNEL_F4 + + subs I, I, #1 + bne rot_kernel_F4 + +rot_kernel_F1: + + ands I, N, #3 + ble rot_kernel_L999 + +rot_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne rot_kernel_F10 + + b rot_kernel_L999 + +rot_kernel_S_BEGIN: + +#if defined(COMPLEX) + +#if defined(DOUBLE) + lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 + lsl INC_Y, INC_Y, #4 // INC_Y * SIZE * 2 +#else + lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 + lsl INC_Y, INC_Y, #3 // INC_Y * SIZE * 2 +#endif + +#else + +#if defined(DOUBLE) + lsl INC_X, INC_X, #3 // INC_X * SIZE + lsl INC_Y, INC_Y, #3 // INC_Y * SIZE +#else + lsl INC_X, INC_X, #2 // INC_X * SIZE + lsl INC_Y, INC_Y, #2 // INC_Y * SIZE +#endif + +#endif + + + asrs I, N, #2 // I = N / 4 + ble rot_kernel_S1 + + .align 5 + +rot_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne rot_kernel_S4 + +rot_kernel_S1: + + ands I, N, #3 + ble rot_kernel_L999 + +rot_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne rot_kernel_S10 + + +rot_kernel_L999: + + mov r0, #0 // set return value + + sub sp, fp, #8 + pop {r4,fp} + bx lr + + EPILOGUE + From f27cabfd082d2f4a98d2a555ea27863b33ae8999 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 16 Nov 2013 16:17:17 +0100 Subject: [PATCH 059/127] added nrm2 kernel for all precisions --- kernel/arm/KERNEL.ARMV7 | 8 +- kernel/arm/nrm2_vfpv3.S | 508 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 512 insertions(+), 4 deletions(-) create mode 100644 kernel/arm/nrm2_vfpv3.S 
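The nrm2 kernels below keep the running norm as scale*sqrt(ssq), the overflow-safe update also used by LAPACK's xLASSQ; the comments in the assembly spell out both branches. A minimal C restatement of the real, unit-stride case (function and variable names are illustrative only, not part of the patch):

#include <math.h>

/* reference for the scale/ssq update in nrm2_vfpv3.S */
double nrm2_ref(int n, const double *x)
{
    double scale = 0.0, ssq = 1.0;     /* d0/d1 init in the PROLOGUE  */
    for (int i = 0; i < n; i++) {
        double ax = fabs(x[i]);
        if (ax == 0.0)                 /* beq KERNEL_*_NEXT           */
            continue;
        if (scale >= ax) {
            double t = ax / scale;     /* vdivge                      */
            ssq += t * t;              /* vmlage                      */
        } else {
            double t = scale / ax;     /* vdiv                        */
            ssq = 1.0 + ssq * t * t;   /* vmul, vmul, vadd            */
            scale = ax;                /* vmov: new largest magnitude */
        }
    }
    return scale * sqrt(ssq);          /* vsqrt/vmul at the L999 exit */
}

The complex kernels apply the same update to the real and imaginary parts in turn.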
diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index d349e2aa5..c5aeef560 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -55,10 +55,10 @@ DDOTKERNEL = ddot_vfpv3.S CDOTKERNEL = cdot_vfpv3.S ZDOTKERNEL = zdot_vfpv3.S -SNRM2KERNEL = ../arm/nrm2.c -DNRM2KERNEL = ../arm/nrm2.c -CNRM2KERNEL = ../arm/znrm2.c -ZNRM2KERNEL = ../arm/znrm2.c +SNRM2KERNEL = nrm2_vfpv3.S +DNRM2KERNEL = nrm2_vfpv3.S +CNRM2KERNEL = nrm2_vfpv3.S +ZNRM2KERNEL = nrm2_vfpv3.S SROTKERNEL = rot_vfpv3.S DROTKERNEL = rot_vfpv3.S diff --git a/kernel/arm/nrm2_vfpv3.S b/kernel/arm/nrm2_vfpv3.S new file mode 100644 index 000000000..b56f8b038 --- /dev/null +++ b/kernel/arm/nrm2_vfpv3.S @@ -0,0 +1,508 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/16 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define N r0 +#define X r1 +#define INC_X r2 + +#define I r12 + +#define X_PRE 512 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + + +#if !defined(COMPLEX) + +#if defined(DOUBLE) + + +.macro KERNEL_F1 + + fldmiad X!, { d4 } + vcmpe.f64 d4, d6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_F1_NEXT_\@ + vabs.f64 d4, d4 + vcmpe.f64 d0, d4 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f64 d2 , d4, d0 // scale >= x ? 
x / scale + vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_F1_NEXT_\@ + vdiv.f64 d2 , d0, d4 // scale / x + vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) + vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f64 d0 , d4 // scale = x + +KERNEL_F1_NEXT_\@: + +.endm + +.macro KERNEL_F8 + + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + +.endm + +.macro KERNEL_S1 + + fldmiad X, { d4 } + vcmpe.f64 d4, d6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_S1_NEXT + vabs.f64 d4, d4 + vcmpe.f64 d0, d4 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale + vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_S1_NEXT + vdiv.f64 d2 , d0, d4 // scale / x + vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) + vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f64 d0 , d4 // scale = x + +KERNEL_S1_NEXT: + + add X, X, INC_X + +.endm + +#else + +.macro KERNEL_F1 + + fldmias X!, { s4 } + vcmpe.f32 s4, s6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_F1_NEXT_\@ + vabs.f32 s4, s4 + vcmpe.f32 s0, s4 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale + vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_F1_NEXT_\@ + vdiv.f32 s2 , s0, s4 // scale / x + vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) + vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f32 s0 , s4 // scale = x + +KERNEL_F1_NEXT_\@: + +.endm + +.macro KERNEL_F8 + + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + +.endm + +.macro KERNEL_S1 + + fldmias X, { s4 } + vcmpe.f32 s4, s6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_S1_NEXT + vabs.f32 s4, s4 + vcmpe.f32 s0, s4 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale + vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_S1_NEXT + vdiv.f32 s2 , s0, s4 // scale / x + vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) + vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f32 s0 , s4 // scale = x + +KERNEL_S1_NEXT: + + add X, X, INC_X + +.endm + + + + +#endif + +#else + +#if defined(DOUBLE) + +.macro KERNEL_F1 + + fldmiad X!, { d4 - d5 } + + vcmpe.f64 d4, d6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_F1_NEXT_\@ + vabs.f64 d4, d4 + vcmpe.f64 d0, d4 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale + vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_F1_NEXT_\@ + vdiv.f64 d2 , d0, d4 // scale / x + vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) + vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f64 d0 , d4 // scale = x + +KERNEL_F1_NEXT_\@: + + vcmpe.f64 d5, d6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_F1_END_\@ + vabs.f64 d5, d5 + vcmpe.f64 d0, d5 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f64 d2 , d5, d0 // scale >= x ? 
x / scale + vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_F1_END_\@ + vdiv.f64 d2 , d0, d5 // scale / x + vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) + vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f64 d0 , d5 // scale = x + +KERNEL_F1_END_\@: + + +.endm + +.macro KERNEL_F8 + + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + +.endm + +.macro KERNEL_S1 + + fldmiad X, { d4 - d5 } + + vcmpe.f64 d4, d6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_S1_NEXT_\@ + vabs.f64 d4, d4 + vcmpe.f64 d0, d4 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale + vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_S1_NEXT_\@ + vdiv.f64 d2 , d0, d4 // scale / x + vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) + vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f64 d0 , d4 // scale = x + +KERNEL_S1_NEXT_\@: + + vcmpe.f64 d5, d6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_S1_END_\@ + vabs.f64 d5, d5 + vcmpe.f64 d0, d5 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f64 d2 , d5, d0 // scale >= x ? x / scale + vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_S1_END_\@ + vdiv.f64 d2 , d0, d5 // scale / x + vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) + vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f64 d0 , d5 // scale = x + +KERNEL_S1_END_\@: + + add X, X, INC_X + +.endm + + +#else + +.macro KERNEL_F1 + + fldmias X!, { s4 - s5 } + + vcmpe.f32 s4, s6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_F1_NEXT_\@ + vabs.f32 s4, s4 + vcmpe.f32 s0, s4 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale + vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_F1_NEXT_\@ + vdiv.f32 s2 , s0, s4 // scale / x + vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) + vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f32 s0 , s4 // scale = x + +KERNEL_F1_NEXT_\@: + + vcmpe.f32 s5, s6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_F1_END_\@ + vabs.f32 s5, s5 + vcmpe.f32 s0, s5 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f32 s2 , s5, s0 // scale >= x ? x / scale + vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_F1_END_\@ + vdiv.f32 s2 , s0, s5 // scale / x + vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) + vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f32 s0 , s5 // scale = x + +KERNEL_F1_END_\@: + + +.endm + +.macro KERNEL_F8 + + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + +.endm + +.macro KERNEL_S1 + + fldmias X, { s4 - s5 } + + vcmpe.f32 s4, s6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_S1_NEXT_\@ + vabs.f32 s4, s4 + vcmpe.f32 s0, s4 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f32 s2 , s4, s0 // scale >= x ? 
x / scale + vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_S1_NEXT_\@ + vdiv.f32 s2 , s0, s4 // scale / x + vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) + vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f32 s0 , s4 // scale = x + +KERNEL_S1_NEXT_\@: + + vcmpe.f32 s5, s6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_S1_END_\@ + vabs.f32 s5, s5 + vcmpe.f32 s0, s5 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f32 s2 , s5, s0 // scale >= x ? x / scale + vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_S1_END_\@ + vdiv.f32 s2 , s0, s5 // scale / x + vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) + vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f32 s0 , s5 // scale = x + +KERNEL_S1_END_\@: + + add X, X, INC_X + +.endm + + + +#endif + +#endif + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + +#if defined(DOUBLE) + vsub.f64 d0 , d0 , d0 // scale=0.0 + vmov.f64 d1 , #1.0 // ssq=1.0 + vmov.f64 d7 , d1 // value 1.0 + vmov.f64 d6 , d0 // value 0.0 +#else + vsub.f32 s0 , s0 , s0 // scale=0.0 + vmov.f32 s1 , #1.0 // ssq=1.0 + vmov.f32 s7 , s1 // value 1.0 + vmov.f32 s6 , s0 // value 0.0 +#endif + + + + cmp N, #0 + ble nrm2_kernel_L999 + + cmp INC_X, #0 + beq nrm2_kernel_L999 + + + cmp INC_X, #1 + bne nrm2_kernel_S_BEGIN + + +nrm2_kernel_F_BEGIN: + + asrs I, N, #3 // I = N / 8 + ble nrm2_kernel_F1 + +nrm2_kernel_F8: + + KERNEL_F8 + + subs I, I, #1 + bne nrm2_kernel_F8 + +nrm2_kernel_F1: + + ands I, N, #7 + ble nrm2_kernel_L999 + + +nrm2_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne nrm2_kernel_F10 + + b nrm2_kernel_L999 + +nrm2_kernel_S_BEGIN: + +#if defined(COMPLEX) + +#if defined(DOUBLE) + lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 +#else + lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 +#endif + +#else + +#if defined(DOUBLE) + lsl INC_X, INC_X, #3 // INC_X * SIZE +#else + lsl INC_X, INC_X, #2 // INC_X * SIZE +#endif + +#endif + + + +nrm2_kernel_S1: + + mov I, N + + .align 5 + +nrm2_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne nrm2_kernel_S10 + + +nrm2_kernel_L999: + +#if defined(DOUBLE) + vsqrt.f64 d1, d1 + vmul.f64 d0, d0, d1 +#else + vsqrt.f32 s1, s1 + vmul.f32 s0, s0, s1 +#endif + + bx lr + + EPILOGUE + From 86283c0be1e5292cbe656826e453223bf751be15 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Tue, 19 Nov 2013 09:55:54 +0100 Subject: [PATCH 060/127] added gemv_t kernel for single and double precision --- kernel/arm/KERNEL.ARMV7 | 4 +- kernel/arm/gemv_t_vfpv3.S | 732 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 734 insertions(+), 2 deletions(-) create mode 100644 kernel/arm/gemv_t_vfpv3.S diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index c5aeef560..d3ddfac10 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -75,8 +75,8 @@ DGEMVNKERNEL = gemv_n.c CGEMVNKERNEL = zgemv_n.c ZGEMVNKERNEL = zgemv_n.c -SGEMVTKERNEL = gemv_t.c -DGEMVTKERNEL = gemv_t.c +SGEMVTKERNEL = gemv_t_vfpv3.S +DGEMVTKERNEL = gemv_t_vfpv3.S CGEMVTKERNEL = zgemv_t.c ZGEMVTKERNEL = zgemv_t.c diff --git a/kernel/arm/gemv_t_vfpv3.S b/kernel/arm/gemv_t_vfpv3.S new file mode 100644 index 000000000..7ae5799bc --- /dev/null +++ 
b/kernel/arm/gemv_t_vfpv3.S @@ -0,0 +1,732 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/18 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_LDA [fp, #0 ] +#define X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] +#define OLD_A r3 +#define OLD_N r1 + +#define M r0 +#define AO1 r1 +#define J r2 + +#define AO2 r4 +#define XO r5 +#define YO r6 +#define LDA r7 +#define INC_X r8 +#define INC_Y r9 + +#define I r12 + +#define N [fp, #-252 ] +#define A [fp, #-256 ] + + +#define X_PRE 512 +#define A_PRE 512 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + + +#if defined(DOUBLE) + +.macro INIT_F2 + + vsub.f64 d4 , d4 , d4 + vsub.f64 d5 , d5 , d5 + +.endm + +.macro KERNEL_F2X4 + + pld [ XO , #X_PRE ] + fldmiad XO! , { d28 - d31 } + pld [ AO1 , #A_PRE ] + fldmiad AO1!, { d8 - d9 } + pld [ AO2 , #A_PRE ] + fldmiad AO2!, { d16 - d17 } + vmla.f64 d4 , d28 , d8 + vmla.f64 d5 , d28 , d16 + fldmiad AO1!, { d10 - d11 } + vmla.f64 d4 , d29 , d9 + vmla.f64 d5 , d29 , d17 + fldmiad AO2!, { d18 - d19 } + vmla.f64 d4 , d30, d10 + vmla.f64 d5 , d30, d18 + vmla.f64 d4 , d31, d11 + vmla.f64 d5 , d31, d19 + +.endm + + +.macro KERNEL_F2X1 + + fldmiad XO! 
, { d2 } + fldmiad AO1!, { d8 } + fldmiad AO2!, { d16 } + vmla.f64 d4 , d2 , d8 + vmla.f64 d5 , d2 , d16 + +.endm + +.macro SAVE_F2 + + fldmiad YO, { d24 - d25 } + vmla.f64 d24, d0, d4 + vmla.f64 d25, d0, d5 + fstmiad YO!, { d24 - d25 } + +.endm + +.macro INIT_S2 + + vsub.f64 d4 , d4 , d4 + vsub.f64 d5 , d5 , d5 + +.endm + +.macro KERNEL_S2X4 + + pld [ AO1 , #A_PRE ] + fldmiad XO , { d28 } + add XO, XO, INC_X + fldmiad AO1!, { d8 - d9 } + pld [ AO2 , #A_PRE ] + fldmiad AO2!, { d16 - d17 } + vmla.f64 d4 , d28 , d8 + fldmiad XO , { d29 } + add XO, XO, INC_X + vmla.f64 d5 , d28 , d16 + fldmiad AO1!, { d10 - d11 } + vmla.f64 d4 , d29 , d9 + fldmiad XO , { d30 } + add XO, XO, INC_X + vmla.f64 d5 , d29 , d17 + fldmiad AO2!, { d18 - d19 } + vmla.f64 d4 , d30, d10 + fldmiad XO , { d31 } + add XO, XO, INC_X + vmla.f64 d5 , d30, d18 + vmla.f64 d4 , d31, d11 + vmla.f64 d5 , d31, d19 + +.endm + + +.macro KERNEL_S2X1 + + fldmiad XO , { d2 } + fldmiad AO1!, { d8 } + add XO, XO, INC_X + fldmiad AO2!, { d16 } + vmla.f64 d4 , d2 , d8 + vmla.f64 d5 , d2 , d16 + +.endm + +.macro SAVE_S2 + + fldmiad YO, { d24 } + vmla.f64 d24, d0, d4 + fstmiad YO, { d24 } + add YO, YO, INC_Y + + fldmiad YO, { d24 } + vmla.f64 d24, d0, d5 + fstmiad YO, { d24 } + add YO, YO, INC_Y + +.endm + +.macro INIT_F1 + + vsub.f64 d4 , d4 , d4 + +.endm + +.macro KERNEL_F1X4 + + pld [ XO , #X_PRE ] + fldmiad XO! , { d28 - d31 } + pld [ AO1 , #A_PRE ] + fldmiad AO1!, { d8 - d9 } + vmla.f64 d4 , d28 , d8 + fldmiad AO1!, { d10 - d11 } + vmla.f64 d4 , d29 , d9 + vmla.f64 d4 , d30, d10 + vmla.f64 d4 , d31, d11 + +.endm + + +.macro KERNEL_F1X1 + + fldmiad XO! , { d2 } + fldmiad AO1!, { d8 } + vmla.f64 d4 , d2 , d8 + +.endm + +.macro SAVE_F1 + + fldmiad YO, { d24 } + vmla.f64 d24, d0, d4 + fstmiad YO!, { d24 } + +.endm + +.macro INIT_S1 + + vsub.f64 d4 , d4 , d4 + +.endm + +.macro KERNEL_S1X4 + + pld [ AO1 , #A_PRE ] + fldmiad XO , { d28 } + add XO, XO, INC_X + fldmiad AO1!, { d8 - d9 } + vmla.f64 d4 , d28 , d8 + fldmiad XO , { d29 } + add XO, XO, INC_X + fldmiad AO1!, { d10 - d11 } + vmla.f64 d4 , d29 , d9 + fldmiad XO , { d30 } + add XO, XO, INC_X + vmla.f64 d4 , d30, d10 + fldmiad XO , { d31 } + add XO, XO, INC_X + vmla.f64 d4 , d31, d11 + +.endm + + +.macro KERNEL_S1X1 + + fldmiad XO , { d2 } + fldmiad AO1!, { d8 } + add XO, XO, INC_X + vmla.f64 d4 , d2 , d8 + +.endm + +.macro SAVE_S1 + + fldmiad YO, { d24 } + vmla.f64 d24, d0, d4 + fstmiad YO, { d24 } + add YO, YO, INC_Y + +.endm + + +#else /************************* SINGLE PRECISION *****************************************/ + +.macro INIT_F2 + + vsub.f32 s4 , s4 , s4 + vsub.f32 s5 , s5 , s5 + +.endm + +.macro KERNEL_F2X4 + + fldmias XO! , { s28 - s31 } + fldmias AO1!, { s8 - s9 } + fldmias AO2!, { s16 - s17 } + vmla.f32 s4 , s28 , s8 + vmla.f32 s5 , s28 , s16 + fldmias AO1!, { s10 - s11 } + vmla.f32 s4 , s29 , s9 + vmla.f32 s5 , s29 , s17 + fldmias AO2!, { s18 - s19 } + vmla.f32 s4 , s30, s10 + vmla.f32 s5 , s30, s18 + vmla.f32 s4 , s31, s11 + vmla.f32 s5 , s31, s19 + +.endm + + +.macro KERNEL_F2X1 + + fldmias XO! 
, { s2 } + fldmias AO1!, { s8 } + fldmias AO2!, { s16 } + vmla.f32 s4 , s2 , s8 + vmla.f32 s5 , s2 , s16 + +.endm + +.macro SAVE_F2 + + fldmias YO, { s24 - s25 } + vmla.f32 s24, s0, s4 + vmla.f32 s25, s0, s5 + fstmias YO!, { s24 - s25 } + +.endm + +.macro INIT_S2 + + vsub.f32 s4 , s4 , s4 + vsub.f32 s5 , s5 , s5 + +.endm + +.macro KERNEL_S2X4 + + fldmias XO , { s28 } + add XO, XO, INC_X + fldmias AO1!, { s8 - s9 } + fldmias AO2!, { s16 - s17 } + vmla.f32 s4 , s28 , s8 + fldmias XO , { s29 } + add XO, XO, INC_X + vmla.f32 s5 , s28 , s16 + fldmias AO1!, { s10 - s11 } + vmla.f32 s4 , s29 , s9 + fldmias XO , { s30 } + add XO, XO, INC_X + vmla.f32 s5 , s29 , s17 + fldmias AO2!, { s18 - s19 } + vmla.f32 s4 , s30, s10 + fldmias XO , { s31 } + add XO, XO, INC_X + vmla.f32 s5 , s30, s18 + vmla.f32 s4 , s31, s11 + vmla.f32 s5 , s31, s19 + +.endm + + +.macro KERNEL_S2X1 + + fldmias XO , { s2 } + fldmias AO1!, { s8 } + add XO, XO, INC_X + fldmias AO2!, { s16 } + vmla.f32 s4 , s2 , s8 + vmla.f32 s5 , s2 , s16 + +.endm + +.macro SAVE_S2 + + fldmias YO, { s24 } + vmla.f32 s24, s0, s4 + fstmias YO, { s24 } + add YO, YO, INC_Y + + fldmias YO, { s24 } + vmla.f32 s24, s0, s5 + fstmias YO, { s24 } + add YO, YO, INC_Y + +.endm + +.macro INIT_F1 + + vsub.f32 s4 , s4 , s4 + +.endm + +.macro KERNEL_F1X4 + + fldmias XO! , { s28 - s31 } + fldmias AO1!, { s8 - s9 } + vmla.f32 s4 , s28 , s8 + fldmias AO1!, { s10 - s11 } + vmla.f32 s4 , s29 , s9 + vmla.f32 s4 , s30, s10 + vmla.f32 s4 , s31, s11 + +.endm + + +.macro KERNEL_F1X1 + + fldmias XO! , { s2 } + fldmias AO1!, { s8 } + vmla.f32 s4 , s2 , s8 + +.endm + +.macro SAVE_F1 + + fldmias YO, { s24 } + vmla.f32 s24, s0, s4 + fstmias YO!, { s24 } + +.endm + +.macro INIT_S1 + + vsub.f32 s4 , s4 , s4 + +.endm + +.macro KERNEL_S1X4 + + fldmias XO , { s28 } + add XO, XO, INC_X + fldmias AO1!, { s8 - s9 } + vmla.f32 s4 , s28 , s8 + fldmias XO , { s29 } + add XO, XO, INC_X + fldmias AO1!, { s10 - s11 } + vmla.f32 s4 , s29 , s9 + fldmias XO , { s30 } + add XO, XO, INC_X + vmla.f32 s4 , s30, s10 + fldmias XO , { s31 } + add XO, XO, INC_X + vmla.f32 s4 , s31, s11 + +.endm + + +.macro KERNEL_S1X1 + + fldmias XO , { s2 } + fldmias AO1!, { s8 } + add XO, XO, INC_X + vmla.f32 s4 , s2 , s8 + +.endm + +.macro SAVE_S1 + + fldmias YO, { s24 } + vmla.f32 s24, s0, s4 + fstmias YO, { s24 } + add YO, YO, INC_Y + +.endm + + +#endif + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + push {r4 - r9 , fp} + add fp, sp, #28 + sub sp, sp, #STACKSIZE // reserve stack + + sub r12, fp, #192 + +#if defined(DOUBLE) + vstm r12, { d8 - d15 } // store floating point registers +#else + vstm r12, { s8 - s31 } // store floating point registers +#endif + + cmp M, #0 + ble gemvt_kernel_L999 + + cmp OLD_N, #0 + ble gemvt_kernel_L999 + + str OLD_A, A + str OLD_N, N + + ldr INC_X , OLD_INC_X + ldr INC_Y , OLD_INC_Y + + cmp INC_X, #0 + beq gemvt_kernel_L999 + + cmp INC_Y, #0 + beq gemvt_kernel_L999 + + ldr LDA, OLD_LDA + + +#if defined(DOUBLE) + lsl LDA, LDA, #3 // LDA * SIZE +#else + lsl LDA, LDA, #2 // LDA * SIZE +#endif + + cmp INC_X, #1 + bne gemvt_kernel_S2_BEGIN + + cmp INC_Y, #1 + bne gemvt_kernel_S2_BEGIN + + +gemvt_kernel_F2_BEGIN: + + ldr YO , Y + + ldr J, N + asrs J, J, #1 // J = N / 2 + ble gemvt_kernel_F1_BEGIN + +gemvt_kernel_F2X4: + + ldr AO1, A + add AO2, AO1, LDA + add r3 , AO2, LDA + str r3 , A + + ldr XO , X + + 
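+ // two columns of A per outer pass (AO2 = AO1 + LDA); INIT_F2 clears the two dot-product accumulators before the M/4-unrolled loop below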
INIT_F2 + + asrs I, M, #2 // I = M / 4 + ble gemvt_kernel_F2X1 + + +gemvt_kernel_F2X4_10: + + KERNEL_F2X4 + + subs I, I, #1 + bne gemvt_kernel_F2X4_10 + + +gemvt_kernel_F2X1: + + ands I, M , #3 + ble gemvt_kernel_F2_END + +gemvt_kernel_F2X1_10: + + KERNEL_F2X1 + + subs I, I, #1 + bne gemvt_kernel_F2X1_10 + + +gemvt_kernel_F2_END: + + SAVE_F2 + + subs J , J , #1 + bne gemvt_kernel_F2X4 + + +gemvt_kernel_F1_BEGIN: + + ldr J, N + ands J, J, #1 + ble gemvt_kernel_L999 + +gemvt_kernel_F1X4: + + ldr AO1, A + + ldr XO , X + + INIT_F1 + + asrs I, M, #2 // I = M / 4 + ble gemvt_kernel_F1X1 + + +gemvt_kernel_F1X4_10: + + KERNEL_F1X4 + + subs I, I, #1 + bne gemvt_kernel_F1X4_10 + + +gemvt_kernel_F1X1: + + ands I, M , #3 + ble gemvt_kernel_F1_END + +gemvt_kernel_F1X1_10: + + KERNEL_F1X1 + + subs I, I, #1 + bne gemvt_kernel_F1X1_10 + + +gemvt_kernel_F1_END: + + SAVE_F1 + + b gemvt_kernel_L999 + + + +/*************************************************************************************************************/ + +gemvt_kernel_S2_BEGIN: + +#if defined(DOUBLE) + lsl INC_X, INC_X, #3 // INC_X * SIZE + lsl INC_Y, INC_Y, #3 // INC_Y * SIZE +#else + lsl INC_X, INC_X, #2 // INC_X * SIZE + lsl INC_Y, INC_Y, #2 // INC_Y * SIZE +#endif + + ldr YO , Y + + ldr J, N + asrs J, J, #1 // J = N / 2 + ble gemvt_kernel_S1_BEGIN + +gemvt_kernel_S2X4: + + ldr AO1, A + add AO2, AO1, LDA + add r3 , AO2, LDA + str r3 , A + + ldr XO , X + + INIT_S2 + + asrs I, M, #2 // I = M / 4 + ble gemvt_kernel_S2X1 + + +gemvt_kernel_S2X4_10: + + KERNEL_S2X4 + + subs I, I, #1 + bne gemvt_kernel_S2X4_10 + + +gemvt_kernel_S2X1: + + ands I, M , #3 + ble gemvt_kernel_S2_END + +gemvt_kernel_S2X1_10: + + KERNEL_S2X1 + + subs I, I, #1 + bne gemvt_kernel_S2X1_10 + + +gemvt_kernel_S2_END: + + SAVE_S2 + + subs J , J , #1 + bne gemvt_kernel_S2X4 + + +gemvt_kernel_S1_BEGIN: + + ldr J, N + ands J, J, #1 + ble gemvt_kernel_L999 + +gemvt_kernel_S1X4: + + ldr AO1, A + + ldr XO , X + + INIT_S1 + + asrs I, M, #2 // I = M / 4 + ble gemvt_kernel_S1X1 + + +gemvt_kernel_S1X4_10: + + KERNEL_S1X4 + + subs I, I, #1 + bne gemvt_kernel_S1X4_10 + + +gemvt_kernel_S1X1: + + ands I, M , #3 + ble gemvt_kernel_S1_END + +gemvt_kernel_S1X1_10: + + KERNEL_S1X1 + + subs I, I, #1 + bne gemvt_kernel_S1X1_10 + + +gemvt_kernel_S1_END: + + SAVE_S1 + + + +/*************************************************************************************************************/ + +gemvt_kernel_L999: + + sub r3, fp, #192 + +#if defined(DOUBLE) + vldm r3, { d8 - d15 } // restore floating point registers +#else + vldm r3, { s8 - s31 } // restore floating point registers +#endif + + mov r0, #0 // set return value + + sub sp, fp, #28 + pop {r4 -r9 ,fp} + bx lr + + EPILOGUE + From bf04544902fdd6588fa6e43a772f03c24e96069e Mon Sep 17 00:00:00 2001 From: wernsaar Date: Tue, 19 Nov 2013 15:07:20 +0100 Subject: [PATCH 061/127] added gemv_n kernel for single and double precision --- kernel/arm/KERNEL.ARMV7 | 4 +- kernel/arm/gemv_n_vfpv3.S | 781 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 783 insertions(+), 2 deletions(-) create mode 100644 kernel/arm/gemv_n_vfpv3.S diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index d3ddfac10..8d6acbe98 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -70,8 +70,8 @@ DSCALKERNEL = scal_vfpv3.S CSCALKERNEL = scal_vfpv3.S ZSCALKERNEL = scal_vfpv3.S -SGEMVNKERNEL = gemv_n.c -DGEMVNKERNEL = gemv_n.c +SGEMVNKERNEL = gemv_n_vfpv3.S +DGEMVNKERNEL = gemv_n_vfpv3.S CGEMVNKERNEL = zgemv_n.c ZGEMVNKERNEL = zgemv_n.c diff --git 
a/kernel/arm/gemv_n_vfpv3.S b/kernel/arm/gemv_n_vfpv3.S new file mode 100644 index 000000000..e031c331e --- /dev/null +++ b/kernel/arm/gemv_n_vfpv3.S @@ -0,0 +1,781 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/19 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_LDA [fp, #0 ] +#define X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] +#define OLD_A r3 +#define OLD_M r0 + +#define AO1 r0 +#define N r1 +#define J r2 + +#define AO2 r4 +#define XO r5 +#define YO r6 +#define LDA r7 +#define INC_X r8 +#define INC_Y r9 + +#define I r12 + +#define M [fp, #-252 ] +#define A [fp, #-256 ] + + +#define X_PRE 64 +#define Y_PRE 0 +#define A_PRE 0 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + + +#if defined(DOUBLE) + +.macro INIT_F8 + + pld [ YO , #Y_PRE ] + pld [ YO , #Y_PRE+32 ] + + vsub.f64 d24 , d24 , d24 + vmov.f64 d25 , d24 + vmov.f64 d26 , d24 + vmov.f64 d27 , d24 + vmov.f64 d28 , d24 + vmov.f64 d29 , d24 + vmov.f64 d30 , d24 + vmov.f64 d31 , d24 + +.endm + +.macro KERNEL_F8X8 + + pld [ XO , #X_PRE ] + KERNEL_F8X1 + KERNEL_F8X1 + KERNEL_F8X1 + KERNEL_F8X1 + + pld [ XO , #X_PRE ] + KERNEL_F8X1 + KERNEL_F8X1 + KERNEL_F8X1 + KERNEL_F8X1 + +.endm + + +.macro KERNEL_F8X1 + + fldmiad XO! 
, { d4 } + fldmiad AO1 , { d8 - d15 } + + vmla.f64 d24 , d4 , d8 + pld [ AO2 , #A_PRE ] + vmla.f64 d25 , d4 , d9 + pld [ AO2 , #A_PRE+32 ] + vmla.f64 d26 , d4 , d10 + vmla.f64 d27 , d4 , d11 + vmla.f64 d28 , d4 , d12 + vmla.f64 d29 , d4 , d13 + add AO1, AO1, LDA + vmla.f64 d30 , d4 , d14 + add AO2, AO2, LDA + vmla.f64 d31 , d4 , d15 + +.endm + +.macro SAVE_F8 + + fldmiad YO, { d16 - d23 } + + vmla.f64 d16, d0, d24 + vmla.f64 d17, d0, d25 + vmla.f64 d18, d0, d26 + vmla.f64 d19, d0, d27 + vmla.f64 d20, d0, d28 + vmla.f64 d21, d0, d29 + vmla.f64 d22, d0, d30 + vmla.f64 d23, d0, d31 + + fstmiad YO!, { d16 - d23 } + +.endm + + +.macro INIT_F1 + + vsub.f64 d24 , d24 , d24 + +.endm + + + +.macro KERNEL_F1X1 + + fldmiad XO! , { d4 } + fldmiad AO1 , { d8 } + vmla.f64 d24 , d4 , d8 + add AO1, AO1, LDA + +.endm + +.macro SAVE_F1 + + fldmiad YO, { d16 } + vmla.f64 d16, d0, d24 + fstmiad YO!, { d16 } + +.endm + +/*********************************************************************************************/ + + +.macro INIT_S8 + + vsub.f64 d24 , d24 , d24 + vmov.f64 d25 , d24 + vmov.f64 d26 , d24 + vmov.f64 d27 , d24 + vmov.f64 d28 , d24 + vmov.f64 d29 , d24 + vmov.f64 d30 , d24 + vmov.f64 d31 , d24 + +.endm + +.macro KERNEL_S8X8 + + KERNEL_S8X1 + KERNEL_S8X1 + KERNEL_S8X1 + KERNEL_S8X1 + + KERNEL_S8X1 + KERNEL_S8X1 + KERNEL_S8X1 + KERNEL_S8X1 + +.endm + + +.macro KERNEL_S8X1 + + pld [ AO2 , #A_PRE ] + pld [ AO2 , #A_PRE+32 ] + fldmiad XO , { d4 } + fldmiad AO1 , { d8 - d15 } + + vmla.f64 d24 , d4 , d8 + vmla.f64 d25 , d4 , d9 + vmla.f64 d26 , d4 , d10 + vmla.f64 d27 , d4 , d11 + vmla.f64 d28 , d4 , d12 + vmla.f64 d29 , d4 , d13 + vmla.f64 d30 , d4 , d14 + vmla.f64 d31 , d4 , d15 + add AO1, AO1, LDA + add AO2, AO2, LDA + add XO, XO, INC_X + +.endm + +.macro SAVE_S8 + + fldmiad YO, { d16 } + vmla.f64 d16, d0, d24 + fstmiad YO, { d16 } + add YO, YO, INC_Y + + fldmiad YO, { d17 } + vmla.f64 d17, d0, d25 + fstmiad YO, { d17 } + add YO, YO, INC_Y + + fldmiad YO, { d18 } + vmla.f64 d18, d0, d26 + fstmiad YO, { d18 } + add YO, YO, INC_Y + + fldmiad YO, { d19 } + vmla.f64 d19, d0, d27 + fstmiad YO, { d19 } + add YO, YO, INC_Y + + fldmiad YO, { d20 } + vmla.f64 d20, d0, d28 + fstmiad YO, { d20 } + add YO, YO, INC_Y + + fldmiad YO, { d21 } + vmla.f64 d21, d0, d29 + fstmiad YO, { d21 } + add YO, YO, INC_Y + + fldmiad YO, { d22 } + vmla.f64 d22, d0, d30 + fstmiad YO, { d22 } + add YO, YO, INC_Y + + fldmiad YO, { d23 } + vmla.f64 d23, d0, d31 + fstmiad YO, { d23 } + add YO, YO, INC_Y + +.endm + + +.macro INIT_S1 + + vsub.f64 d24 , d24 , d24 + +.endm + + + +.macro KERNEL_S1X1 + + fldmiad XO , { d4 } + fldmiad AO1 , { d8 } + vmla.f64 d24 , d4 , d8 + add AO1, AO1, LDA + add XO, XO, INC_X + +.endm + +.macro SAVE_S1 + + fldmiad YO, { d16 } + vmla.f64 d16, d0, d24 + fstmiad YO, { d16 } + add YO, YO, INC_Y + +.endm + + + +#else /************************* SINGLE PRECISION *****************************************/ + +.macro INIT_F8 + + pld [ YO , #Y_PRE ] + + vsub.f32 s24 , s24 , s24 + vmov.f32 s25 , s24 + vmov.f32 s26 , s24 + vmov.f32 s27 , s24 + vmov.f32 s28 , s24 + vmov.f32 s29 , s24 + vmov.f32 s30 , s24 + vmov.f32 s31 , s24 + +.endm + +.macro KERNEL_F8X8 + + pld [ XO , #X_PRE ] + KERNEL_F8X1 + KERNEL_F8X1 + KERNEL_F8X1 + KERNEL_F8X1 + + KERNEL_F8X1 + KERNEL_F8X1 + KERNEL_F8X1 + KERNEL_F8X1 + +.endm + + +.macro KERNEL_F8X1 + + pld [ AO2 , #A_PRE ] + fldmias XO! 
, { s4 } + fldmias AO1 , { s8 - s15 } + + vmla.f32 s24 , s4 , s8 + vmla.f32 s25 , s4 , s9 + vmla.f32 s26 , s4 , s10 + vmla.f32 s27 , s4 , s11 + vmla.f32 s28 , s4 , s12 + vmla.f32 s29 , s4 , s13 + vmla.f32 s30 , s4 , s14 + vmla.f32 s31 , s4 , s15 + add AO1, AO1, LDA + add AO2, AO2, LDA + +.endm + +.macro SAVE_F8 + + fldmias YO, { s16 - s23 } + + vmla.f32 s16, s0, s24 + vmla.f32 s17, s0, s25 + vmla.f32 s18, s0, s26 + vmla.f32 s19, s0, s27 + vmla.f32 s20, s0, s28 + vmla.f32 s21, s0, s29 + vmla.f32 s22, s0, s30 + vmla.f32 s23, s0, s31 + + fstmias YO!, { s16 - s23 } + +.endm + + +.macro INIT_F1 + + vsub.f32 s24 , s24 , s24 + +.endm + + + +.macro KERNEL_F1X1 + + fldmias XO! , { s4 } + fldmias AO1 , { s8 } + vmla.f32 s24 , s4 , s8 + add AO1, AO1, LDA + +.endm + +.macro SAVE_F1 + + fldmias YO, { s16 } + vmla.f32 s16, s0, s24 + fstmias YO!, { s16 } + +.endm + +/*********************************************************************************************/ + + +.macro INIT_S8 + + vsub.f32 s24 , s24 , s24 + vmov.f32 s25 , s24 + vmov.f32 s26 , s24 + vmov.f32 s27 , s24 + vmov.f32 s28 , s24 + vmov.f32 s29 , s24 + vmov.f32 s30 , s24 + vmov.f32 s31 , s24 + +.endm + +.macro KERNEL_S8X8 + + KERNEL_S8X1 + KERNEL_S8X1 + KERNEL_S8X1 + KERNEL_S8X1 + + KERNEL_S8X1 + KERNEL_S8X1 + KERNEL_S8X1 + KERNEL_S8X1 + +.endm + + +.macro KERNEL_S8X1 + + pld [ AO2 , #A_PRE ] + fldmias XO , { s4 } + fldmias AO1 , { s8 - s15 } + + vmla.f32 s24 , s4 , s8 + vmla.f32 s25 , s4 , s9 + vmla.f32 s26 , s4 , s10 + vmla.f32 s27 , s4 , s11 + vmla.f32 s28 , s4 , s12 + vmla.f32 s29 , s4 , s13 + vmla.f32 s30 , s4 , s14 + vmla.f32 s31 , s4 , s15 + add AO1, AO1, LDA + add AO2, AO2, LDA + add XO, XO, INC_X + +.endm + +.macro SAVE_S8 + + fldmias YO, { s16 } + vmla.f32 s16, s0, s24 + fstmias YO, { s16 } + add YO, YO, INC_Y + + fldmias YO, { s17 } + vmla.f32 s17, s0, s25 + fstmias YO, { s17 } + add YO, YO, INC_Y + + fldmias YO, { s18 } + vmla.f32 s18, s0, s26 + fstmias YO, { s18 } + add YO, YO, INC_Y + + fldmias YO, { s19 } + vmla.f32 s19, s0, s27 + fstmias YO, { s19 } + add YO, YO, INC_Y + + fldmias YO, { s20 } + vmla.f32 s20, s0, s28 + fstmias YO, { s20 } + add YO, YO, INC_Y + + fldmias YO, { s21 } + vmla.f32 s21, s0, s29 + fstmias YO, { s21 } + add YO, YO, INC_Y + + fldmias YO, { s22 } + vmla.f32 s22, s0, s30 + fstmias YO, { s22 } + add YO, YO, INC_Y + + fldmias YO, { s23 } + vmla.f32 s23, s0, s31 + fstmias YO, { s23 } + add YO, YO, INC_Y + +.endm + + +.macro INIT_S1 + + vsub.f32 s24 , s24 , s24 + +.endm + + + +.macro KERNEL_S1X1 + + fldmias XO , { s4 } + fldmias AO1 , { s8 } + vmla.f32 s24 , s4 , s8 + add AO1, AO1, LDA + add XO, XO, INC_X + +.endm + +.macro SAVE_S1 + + fldmias YO, { s16 } + vmla.f32 s16, s0, s24 + fstmias YO, { s16 } + add YO, YO, INC_Y + +.endm + + + + +#endif + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + push {r4 - r9 , fp} + add fp, sp, #28 + sub sp, sp, #STACKSIZE // reserve stack + + sub r12, fp, #192 + +#if defined(DOUBLE) + vstm r12, { d8 - d15 } // store floating point registers +#else + vstm r12, { s8 - s31 } // store floating point registers +#endif + + cmp OLD_M, #0 + ble gemvn_kernel_L999 + + cmp N, #0 + ble gemvn_kernel_L999 + + str OLD_A, A + str OLD_M, M + + ldr INC_X , OLD_INC_X + ldr INC_Y , OLD_INC_Y + + cmp INC_X, #0 + beq gemvn_kernel_L999 + + cmp INC_Y, #0 + beq gemvn_kernel_L999 + + ldr LDA, OLD_LDA + + +#if 
defined(DOUBLE) + lsl LDA, LDA, #3 // LDA * SIZE +#else + lsl LDA, LDA, #2 // LDA * SIZE +#endif + + cmp INC_X, #1 + bne gemvn_kernel_S8_BEGIN + + cmp INC_Y, #1 + bne gemvn_kernel_S8_BEGIN + + +gemvn_kernel_F8_BEGIN: + + ldr YO , Y + + ldr I, M + asrs I, I, #3 // I = M / 8 + ble gemvn_kernel_F1_BEGIN + +gemvn_kernel_F8X8: + + ldr AO1, A + add AO2, AO1, LDA + add r3 , AO1, #8*SIZE + str r3 , A + + ldr XO , X + + INIT_F8 + + asrs J, N, #3 // J = N / 8 + ble gemvn_kernel_F8X1 + + +gemvn_kernel_F8X8_10: + + KERNEL_F8X8 + + subs J, J, #1 + bne gemvn_kernel_F8X8_10 + + +gemvn_kernel_F8X1: + + ands J, N , #7 + ble gemvn_kernel_F8_END + +gemvn_kernel_F8X1_10: + + KERNEL_F8X1 + + subs J, J, #1 + bne gemvn_kernel_F8X1_10 + + +gemvn_kernel_F8_END: + + SAVE_F8 + + subs I , I , #1 + bne gemvn_kernel_F8X8 + + +gemvn_kernel_F1_BEGIN: + + ldr I, M + ands I, I , #7 + ble gemvn_kernel_L999 + +gemvn_kernel_F1X1: + + ldr AO1, A + add r3, AO1, #SIZE + str r3, A + + ldr XO , X + + INIT_F1 + + mov J, N + + +gemvn_kernel_F1X1_10: + + KERNEL_F1X1 + + subs J, J, #1 + bne gemvn_kernel_F1X1_10 + + +gemvn_kernel_F1_END: + + SAVE_F1 + + subs I , I , #1 + bne gemvn_kernel_F1X1 + + b gemvn_kernel_L999 + + + +/*************************************************************************************************************/ + +gemvn_kernel_S8_BEGIN: + +#if defined(DOUBLE) + lsl INC_X, INC_X, #3 // INC_X * SIZE + lsl INC_Y, INC_Y, #3 // INC_Y * SIZE +#else + lsl INC_X, INC_X, #2 // INC_X * SIZE + lsl INC_Y, INC_Y, #2 // INC_Y * SIZE +#endif + + ldr YO , Y + + ldr I, M + asrs I, I, #3 // I = M / 8 + ble gemvn_kernel_S1_BEGIN + +gemvn_kernel_S8X8: + + ldr AO1, A + add AO2, AO1, LDA + add r3 , AO1, #8*SIZE + str r3 , A + + ldr XO , X + + INIT_S8 + + asrs J, N, #3 // J = N / 8 + ble gemvn_kernel_S8X1 + + +gemvn_kernel_S8X8_10: + + KERNEL_S8X8 + + subs J, J, #1 + bne gemvn_kernel_S8X8_10 + + +gemvn_kernel_S8X1: + + ands J, N , #7 + ble gemvn_kernel_S8_END + +gemvn_kernel_S8X1_10: + + KERNEL_S8X1 + + subs J, J, #1 + bne gemvn_kernel_S8X1_10 + + +gemvn_kernel_S8_END: + + SAVE_S8 + + subs I , I , #1 + bne gemvn_kernel_S8X8 + + +gemvn_kernel_S1_BEGIN: + + ldr I, M + ands I, I , #7 + ble gemvn_kernel_L999 + +gemvn_kernel_S1X1: + + ldr AO1, A + add r3, AO1, #SIZE + str r3, A + + ldr XO , X + + INIT_S1 + + mov J, N + + +gemvn_kernel_S1X1_10: + + KERNEL_S1X1 + + subs J, J, #1 + bne gemvn_kernel_S1X1_10 + + +gemvn_kernel_S1_END: + + SAVE_S1 + + subs I , I , #1 + bne gemvn_kernel_S1X1 + + +/*************************************************************************************************************/ + +gemvn_kernel_L999: + + sub r3, fp, #192 + +#if defined(DOUBLE) + vldm r3, { d8 - d15 } // restore floating point registers +#else + vldm r3, { s8 - s31 } // restore floating point registers +#endif + + mov r0, #0 // set return value + + sub sp, fp, #28 + pop {r4 -r9 ,fp} + bx lr + + EPILOGUE + From 410afda9b44e1920bf3f7b6bdcd852da0f4c4da9 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 21 Nov 2013 20:18:51 +0100 Subject: [PATCH 062/127] added cpu detection and target ARMV6, used in raspberry pi --- Makefile.arm | 9 +- cpuid_arm.c | 262 ++++++++++++++++++++++++++++++++++++++++ getarch.c | 39 +++++- kernel/arm/KERNEL.ARMV6 | 134 ++++++++++++++++++++ param.h | 40 ++++++ 5 files changed, 477 insertions(+), 7 deletions(-) create mode 100644 cpuid_arm.c create mode 100644 kernel/arm/KERNEL.ARMV6 diff --git a/Makefile.arm b/Makefile.arm index 6cdeb2f75..8502d5286 100644 --- a/Makefile.arm +++ b/Makefile.arm @@ -1,7 +1,12 @@ ifeq ($(CORE), ARMV7) 
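# ARMV7 builds keep VFPv3 (now with -march=armv7-a); the ARMV6 branch added below targets plain VFP, e.g. for the Raspberry Pi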
-CCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -FCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard +CCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a +FCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a +endif + +ifeq ($(CORE), ARMV6) +CCOMMON_OPT += -marm -mfpu=vfp -mfloat-abi=hard -march=armv6 +FCOMMON_OPT += -marm -mfpu=vfp -mfloat-abi=hard -march=armv6 endif diff --git a/cpuid_arm.c b/cpuid_arm.c new file mode 100644 index 000000000..efd1369b4 --- /dev/null +++ b/cpuid_arm.c @@ -0,0 +1,262 @@ +/************************************************************************** + Copyright (c) 2013, The OpenBLAS Project + All rights reserved. + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/
+
+#include <stdio.h>
+#include <string.h>
+
+#define CPU_UNKNOWN 0
+#define CPU_ARMV6 1
+#define CPU_ARMV7 2
+#define CPU_CORTEXA15 3
+
+static char *cpuname[] = {
+  "UNKNOWN",
+  "ARMV6",
+  "ARMV7",
+  "CORTEXA15"
+};
+
+
+int get_feature(char *search)
+{
+
+#ifdef linux
+  FILE *infile;
+  char buffer[2048], *p,*t;
+  p = (char *) NULL ;
+
+  infile = fopen("/proc/cpuinfo", "r");
+
+  while (fgets(buffer, sizeof(buffer), infile))
+  {
+
+    if (!strncmp("Features", buffer, 8))
+    {
+      p = strchr(buffer, ':') + 2;
+      break;
+    }
+  }
+
+  fclose(infile);
+
+
+  if( p == NULL ) return 0;
+
+  t = strtok(p," ");
+  while( (t = strtok(NULL," ")) != NULL )
+  {
+    if (!strcmp(t, search)) { return(1); }
+  }
+
+#endif
+  return(0);
+}
+
+
+int detect(void)
+{
+
+#ifdef linux
+
+  FILE *infile;
+  char buffer[512], *p;
+  p = (char *) NULL ;
+
+  infile = fopen("/proc/cpuinfo", "r");
+
+  while (fgets(buffer, sizeof(buffer), infile))
+  {
+
+    if (!strncmp("model name", buffer, 10))
+    {
+      p = strchr(buffer, ':') + 2;
+      break;
+    }
+  }
+
+  fclose(infile);
+
+  if(p != NULL)
+  {
+
+    if (strstr(p, "ARMv7"))
+    {
+      if ( get_feature("vfpv4"))
+        return CPU_ARMV7;
+
+      if ( get_feature("vfpv3"))
+        return CPU_ARMV7;
+
+      if ( get_feature("vfp"))
+        return CPU_ARMV6;
+
+
+    }
+
+    if (strstr(p, "ARMv6"))
+    {
+      if ( get_feature("vfp"))
+        return CPU_ARMV6;
+    }
+
+
+  }
+#endif
+
+  return CPU_UNKNOWN;
+}
+
+char *get_corename(void)
+{
+  return cpuname[detect()];
+}
+
+void get_architecture(void)
+{
+  printf("ARM");
+}
+
+void get_subarchitecture(void)
+{
+  int d = detect();
+  switch (d)
+  {
+
+    case CPU_ARMV7:
+      printf("ARMV7");
+      break;
+
+    case CPU_ARMV6:
+      printf("ARMV6");
+      break;
+
+    default:
+      printf("UNKNOWN");
+      break;
+  }
+}
+
+void get_subdirname(void)
+{
+  printf("arm");
+}
+
+void get_cpuconfig(void)
+{
+
+  int d = detect();
+  switch (d)
+  {
+
+    case CPU_ARMV7:
+      printf("#define ARMV7\n");
+      printf("#define HAVE_VFP\n");
+      printf("#define HAVE_VFPV3\n");
+      if ( get_feature("neon"))  printf("#define HAVE_NEON\n");
+      if ( get_feature("vfpv4")) printf("#define HAVE_VFPV4\n");
+      printf("#define L1_DATA_SIZE 65536\n");
+      printf("#define L1_DATA_LINESIZE 32\n");
+      printf("#define L2_SIZE 512488\n");
+      printf("#define L2_LINESIZE 32\n");
+      printf("#define DTB_DEFAULT_ENTRIES 64\n");
+      printf("#define DTB_SIZE 4096\n");
+      printf("#define L2_ASSOCIATIVE 4\n");
+      break;
+
+    case CPU_ARMV6:
+      printf("#define ARMV6\n");
+      printf("#define HAVE_VFP\n");
+      printf("#define L1_DATA_SIZE 65536\n");
+      printf("#define L1_DATA_LINESIZE 32\n");
+      printf("#define L2_SIZE 512488\n");
+      printf("#define L2_LINESIZE 32\n");
+      printf("#define DTB_DEFAULT_ENTRIES 64\n");
+      printf("#define DTB_SIZE 4096\n");
+      printf("#define L2_ASSOCIATIVE 4\n");
+      break;
+
+  }
+}
+
+
+void get_libname(void)
+{
+
+  int d = detect();
+  switch (d)
+  {
+
+    case CPU_ARMV7:
+      printf("armv7\n");
+      break;
+
+    case CPU_ARMV6:
+      printf("armv6\n");
+      break;
+
+  }
+}
+
+
+void get_features(void)
+{
+
+#ifdef linux
+  FILE *infile;
+  char buffer[2048], *p,*t;
+  p = (char *) NULL ;
+
+  infile = fopen("/proc/cpuinfo", "r");
+
+  while (fgets(buffer, sizeof(buffer), infile))
+  {
+
+    if (!strncmp("Features", buffer, 8))
+    {
+      p = strchr(buffer, ':') + 2;
+      break;
+    }
+  }
+
+  fclose(infile);
+
+
+  if( p == NULL ) return;
+
+  t = strtok(p," ");
+  while( (t = strtok(NULL," ")) != NULL )
+  {
+    if (!strcmp(t, "vfp"))   { printf("HAVE_VFP=1\n");   continue; }
+    if (!strcmp(t, "vfpv3")) { printf("HAVE_VFPV3=1\n"); continue; }
+    if (!strcmp(t, "vfpv4")) { printf("HAVE_VFPV4=1\n"); continue; }
+    if (!strcmp(t, "neon"))  { printf("HAVE_NEON=1\n");  continue; }
+  }
+
+#endif
+  return;
+}
+
+
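With cpuid_arm.c in place, core detection is a two-step string match over /proc/cpuinfo: detect() keys on the "model name" line to pick the architecture level, and get_feature() scans the "Features" line for individual capabilities such as vfp, vfpv3, vfpv4 and neon. A minimal harness for trying this out on a Linux/ARM host is sketched below; it uses only entry points defined above, and the file name and build command in the comments are illustrative assumptions, not part of the patch.

// cpuid_test.c -- illustrative harness only, not part of the patch.
// Assumed build: gcc -o cpuid_test cpuid_test.c cpuid_arm.c
#include <stdio.h>

extern int detect(void);          // returns one of the CPU_* codes above
extern char *get_corename(void);  // maps the code to "ARMV6", "ARMV7", ...
extern void get_cpuconfig(void);  // prints the #defines destined for config.h

int main(void)
{
        printf("detected: %s (code %d)\n", get_corename(), detect());
        get_cpuconfig();
        return 0;
}

On a Raspberry Pi 1 this should report ARMV6; on a vfpv3-capable Cortex-A core it should report ARMV7, matching the fallthrough logic in detect().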
diff --git a/getarch.c b/getarch.c
index 3264a76f6..4407e3d9b 100644
--- a/getarch.c
+++ b/getarch.c
@@ -687,23 +687,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define ARCHCONFIG "-DARMV7 " \
   "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
   "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \
-  "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 "
+  "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " \
+  "-DHAVE_VFPV3 -DHAVE_VFP"
 #define LIBNAME "armv7"
 #define CORENAME "ARMV7"
 #else
 #endif
+#ifdef FORCE_ARMV6
+#define FORCE
+#define ARCHITECTURE "ARM"
+#define SUBARCHITECTURE "ARMV6"
+#define SUBDIRNAME "arm"
+#define ARCHCONFIG "-DARMV6 " \
+  "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
+  "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \
+  "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " \
+  "-DHAVE_VFP"
+#define LIBNAME "armv6"
+#define CORENAME "ARMV6"
+#else
+#endif
+
+
 #ifndef FORCE
 #if defined(__powerpc__) || defined(__powerpc) || defined(powerpc) || \
-    defined(__PPC__) || defined(PPC) || defined(_POWER) || defined(__POWERPC__)
+    defined(__PPC__) || defined(PPC) || defined(_POWER) || defined(__POWERPC__)
+
 #ifndef POWER
 #define POWER
 #endif
 #define OPENBLAS_SUPPORTED
 #endif
+
 #if defined(__i386__) || (__x86_64__)
 #include "cpuid_x86.c"
 #define OPENBLAS_SUPPORTED
@@ -734,12 +753,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define OPENBLAS_SUPPORTED
 #endif
+#ifdef __arm__
+#include "cpuid_arm.c"
+#define OPENBLAS_SUPPORTED
+#endif
+
+
 #ifndef OPENBLAS_SUPPORTED
 #error "This arch/CPU is not supported by OpenBLAS."
 #endif
-#else
-
 #endif
 
 static int get_num_cores(void) {
@@ -788,7 +811,7 @@ int main(int argc, char *argv[]){
 #ifdef FORCE
   printf("CORE=%s\n", CORENAME);
 #else
-#if defined(__i386__) || defined(__x86_64__) || defined(POWER) || defined(__mips__)
+#if defined(__i386__) || defined(__x86_64__) || defined(POWER) || defined(__mips__) || defined(__arm__)
   printf("CORE=%s\n", get_corename());
 #endif
 #endif
@@ -803,6 +826,12 @@ int main(int argc, char *argv[]){
   printf("NUM_CORES=%d\n", get_num_cores());
 
+#if defined(__arm__) && !defined(FORCE)
+  get_features();
+#endif
+
+
+
 #if defined(__i386__) || (__x86_64__)
 #ifndef FORCE
   get_sse();
diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6
new file mode 100644
index 000000000..e379347c0
--- /dev/null
+++ b/kernel/arm/KERNEL.ARMV6
@@ -0,0 +1,134 @@
+SAMAXKERNEL = amax.c
+DAMAXKERNEL = amax.c
+CAMAXKERNEL = zamax.c
+ZAMAXKERNEL = zamax.c
+
+SAMINKERNEL = amin.c
+DAMINKERNEL = amin.c
+CAMINKERNEL = zamin.c
+ZAMINKERNEL = zamin.c
+
+SMAXKERNEL = max.c
+DMAXKERNEL = max.c
+
+SMINKERNEL = min.c
+DMINKERNEL = min.c
+
+ISAMAXKERNEL = iamax.c
+IDAMAXKERNEL = iamax.c
+ICAMAXKERNEL = izamax.c
+IZAMAXKERNEL = izamax.c
+
+ISAMINKERNEL = iamin.c
+IDAMINKERNEL = iamin.c
+ICAMINKERNEL = izamin.c
+IZAMINKERNEL = izamin.c
+
+ISMAXKERNEL = imax.c
+IDMAXKERNEL = imax.c
+
+ISMINKERNEL = imin.c
+IDMINKERNEL = imin.c
+
+SASUMKERNEL = asum.c
+DASUMKERNEL = asum.c
+CASUMKERNEL = zasum.c
+ZASUMKERNEL = zasum.c
+
+SAXPYKERNEL = axpy.c
+DAXPYKERNEL = axpy.c
+CAXPYKERNEL = zaxpy.c
+ZAXPYKERNEL = zaxpy.c
+
+SCOPYKERNEL = copy.c
+DCOPYKERNEL = copy.c
+CCOPYKERNEL = zcopy.c
+ZCOPYKERNEL = zcopy.c
+
+SDOTKERNEL = dot.c
+DDOTKERNEL = dot.c
+CDOTKERNEL = zdot.c
+ZDOTKERNEL = zdot.c
+
+SNRM2KERNEL = nrm2.c
+DNRM2KERNEL = nrm2.c
+CNRM2KERNEL = znrm2.c
+ZNRM2KERNEL = 
znrm2.c + +SROTKERNEL = rot.c +DROTKERNEL = rot.c +CROTKERNEL = zrot.c +ZROTKERNEL = zrot.c + +SSCALKERNEL = scal.c +DSCALKERNEL = scal.c +CSCALKERNEL = zscal.c +ZSCALKERNEL = zscal.c + +SSWAPKERNEL = swap.c +DSWAPKERNEL = swap.c +CSWAPKERNEL = zswap.c +ZSWAPKERNEL = zswap.c + +SGEMVNKERNEL = gemv_n.c +DGEMVNKERNEL = gemv_n.c +CGEMVNKERNEL = zgemv_n.c +ZGEMVNKERNEL = zgemv_n.c + +SGEMVTKERNEL = gemv_t.c +DGEMVTKERNEL = gemv_t.c +CGEMVTKERNEL = zgemv_t.c +ZGEMVTKERNEL = zgemv_t.c + +STRMMKERNEL = ../generic/trmmkernel_2x2.c +DTRMMKERNEL = ../generic/trmmkernel_2x2.c +CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c + +SGEMMKERNEL = ../generic/gemmkernel_2x2.c +SGEMMONCOPY = ../generic/gemm_ncopy_2.c +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +DGEMMKERNEL = ../generic/gemmkernel_2x2.c +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + + + + diff --git a/param.h b/param.h index ab0ed91b7..7bb27f3ab 100644 --- a/param.h +++ b/param.h @@ -1831,6 +1831,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
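The param.h hunk that follows gives ARMV6 its GEMM tuning block: the *_UNROLL_M/N pairs fix the 2x2 register micro-kernel shape used by the generic C kernels selected above, and the *_DEFAULT_P/Q/R values set the cache-blocking panel sizes. As a rough cross-check (the authoritative use of P, Q and R lives in the level-3 driver, so treat this as an approximation), the packed panel of A holds about P*Q elements, which for both the SGEMM and DGEMM values below works out to roughly 120 KB, comfortably inside the ~512 KB L2 this target assumes:

// Back-of-the-envelope check of the blocking factors added below --
// an illustrative sketch only, not part of the patch.
#include <stdio.h>

int main(void)
{
        long sp = 128, sq = 240;   // SGEMM_DEFAULT_P / _Q below
        long dp = 128, dq = 120;   // DGEMM_DEFAULT_P / _Q below

        // A is packed into P x Q panels of SIZE-byte elements.
        printf("SGEMM packed A panel: %ld KB\n", sp * sq * 4 / 1024);
        printf("DGEMM packed A panel: %ld KB\n", dp * dq * 8 / 1024);
        return 0;
}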
+#define SYMV_P 16 +#endif + + +#if defined(ARMV6) +#define SNUMOPT 2 +#define DNUMOPT 2 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 2 +#define SGEMM_DEFAULT_UNROLL_N 2 + +#define DGEMM_DEFAULT_UNROLL_M 2 +#define DGEMM_DEFAULT_UNROLL_N 2 + +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 2 + +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_P 96 +#define ZGEMM_DEFAULT_P 64 + +#define SGEMM_DEFAULT_Q 240 +#define DGEMM_DEFAULT_Q 120 +#define CGEMM_DEFAULT_Q 120 +#define ZGEMM_DEFAULT_Q 120 + +#define SGEMM_DEFAULT_R 12288 +#define DGEMM_DEFAULT_R 8192 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + + #define SYMV_P 16 #endif From f1be3a168a4d54d9e6cd40a19587aa0bca9913c1 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 21 Nov 2013 20:48:57 +0100 Subject: [PATCH 063/127] renamed some BLAS kernels, which are compatible to ARMV6 --- kernel/arm/KERNEL.ARMV6 | 8 ++++---- kernel/arm/KERNEL.ARMV7 | 8 ++++---- kernel/arm/{ccopy_vfpv3.S => ccopy_vfp.S} | 0 kernel/arm/{dcopy_vfpv3.S => dcopy_vfp.S} | 0 kernel/arm/{scopy_vfpv3.S => scopy_vfp.S} | 0 kernel/arm/{zcopy_vfpv3.S => zcopy_vfp.S} | 0 6 files changed, 8 insertions(+), 8 deletions(-) rename kernel/arm/{ccopy_vfpv3.S => ccopy_vfp.S} (100%) rename kernel/arm/{dcopy_vfpv3.S => dcopy_vfp.S} (100%) rename kernel/arm/{scopy_vfpv3.S => scopy_vfp.S} (100%) rename kernel/arm/{zcopy_vfpv3.S => zcopy_vfp.S} (100%) diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index e379347c0..f745724c1 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -40,10 +40,10 @@ DAXPYKERNEL = axpy.c CAXPYKERNEL = zaxpy.c ZAXPYKERNEL = zaxpy.c -SCOPYKERNEL = copy.c -DCOPYKERNEL = copy.c -CCOPYKERNEL = zcopy.c -ZCOPYKERNEL = zcopy.c +SCOPYKERNEL = scopy_vfp.S +DCOPYKERNEL = dcopy_vfp.S +CCOPYKERNEL = ccopy_vfp.S +ZCOPYKERNEL = zcopy_vfp.S SDOTKERNEL = dot.c DDOTKERNEL = dot.c diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index 8d6acbe98..48f0a72a9 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -45,10 +45,10 @@ DAXPYKERNEL = ../arm/axpy.c CAXPYKERNEL = ../arm/zaxpy.c ZAXPYKERNEL = ../arm/zaxpy.c -SCOPYKERNEL = scopy_vfpv3.S -DCOPYKERNEL = dcopy_vfpv3.S -CCOPYKERNEL = ccopy_vfpv3.S -ZCOPYKERNEL = zcopy_vfpv3.S +SCOPYKERNEL = scopy_vfp.S +DCOPYKERNEL = dcopy_vfp.S +CCOPYKERNEL = ccopy_vfp.S +ZCOPYKERNEL = zcopy_vfp.S SDOTKERNEL = sdot_vfpv3.S DDOTKERNEL = ddot_vfpv3.S diff --git a/kernel/arm/ccopy_vfpv3.S b/kernel/arm/ccopy_vfp.S similarity index 100% rename from kernel/arm/ccopy_vfpv3.S rename to kernel/arm/ccopy_vfp.S diff --git a/kernel/arm/dcopy_vfpv3.S b/kernel/arm/dcopy_vfp.S similarity index 100% rename from kernel/arm/dcopy_vfpv3.S rename to kernel/arm/dcopy_vfp.S diff --git a/kernel/arm/scopy_vfpv3.S b/kernel/arm/scopy_vfp.S similarity index 100% rename from kernel/arm/scopy_vfpv3.S rename to kernel/arm/scopy_vfp.S diff --git a/kernel/arm/zcopy_vfpv3.S b/kernel/arm/zcopy_vfp.S similarity index 100% rename from kernel/arm/zcopy_vfpv3.S rename to kernel/arm/zcopy_vfp.S From 29a005c63572779a7a7ef81d6909f6177f8da5b0 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 21 Nov 2013 21:12:33 +0100 Subject: [PATCH 064/127] renamed iamax assembler kernel --- kernel/arm/KERNEL.ARMV6 | 48 +++++++++++------------ kernel/arm/KERNEL.ARMV7 | 48 +++++++++++------------ kernel/arm/{iamax_vfpv3.S 
=> iamax_vfp.S} | 0 3 files changed, 48 insertions(+), 48 deletions(-) rename kernel/arm/{iamax_vfpv3.S => iamax_vfp.S} (100%) diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index f745724c1..a7bddbd82 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -1,34 +1,34 @@ -SAMAXKERNEL = amax.c -DAMAXKERNEL = amax.c -CAMAXKERNEL = zamax.c -ZAMAXKERNEL = zamax.c +SAMAXKERNEL = iamax_vfp.S +DAMAXKERNEL = iamax_vfp.S +CAMAXKERNEL = iamax_vfp.S +ZAMAXKERNEL = iamax_vfp.S -SAMINKERNEL = amin.c -DAMINKERNEL = amin.c -CAMINKERNEL = zamin.c -ZAMINKERNEL = zamin.c +SAMINKERNEL = iamax_vfp.S +DAMINKERNEL = iamax_vfp.S +CAMINKERNEL = iamax_vfp.S +ZAMINKERNEL = iamax_vfp.S -SMAXKERNEL = max.c -DMAXKERNEL = max.c +SMAXKERNEL = iamax_vfp.S +DMAXKERNEL = iamax_vfp.S -SMINKERNEL = min.c -DMINKERNEL = min.c +SMINKERNEL = iamax_vfp.S +DMINKERNEL = iamax_vfp.S -ISAMAXKERNEL = iamax.c -IDAMAXKERNEL = iamax.c -ICAMAXKERNEL = izamax.c -IZAMAXKERNEL = izamax.c +ISAMAXKERNEL = iamax_vfp.S +IDAMAXKERNEL = iamax_vfp.S +ICAMAXKERNEL = iamax_vfp.S +IZAMAXKERNEL = iamax_vfp.S -ISAMINKERNEL = iamin.c -IDAMINKERNEL = iamin.c -ICAMINKERNEL = izamin.c -IZAMINKERNEL = izamin.c +ISAMINKERNEL = iamax_vfp.S +IDAMINKERNEL = iamax_vfp.S +ICAMINKERNEL = iamax_vfp.S +IZAMINKERNEL = iamax_vfp.S -ISMAXKERNEL = imax.c -IDMAXKERNEL = imax.c +ISMAXKERNEL = iamax_vfp.S +IDMAXKERNEL = iamax_vfp.S -ISMINKERNEL = imin.c -IDMINKERNEL = imin.c +ISMINKERNEL = iamax_vfp.S +IDMINKERNEL = iamax_vfp.S SASUMKERNEL = asum.c DASUMKERNEL = asum.c diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index 48f0a72a9..dafc4c32a 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -1,34 +1,34 @@ -SAMAXKERNEL = iamax_vfpv3.S -DAMAXKERNEL = iamax_vfpv3.S -CAMAXKERNEL = iamax_vfpv3.S -ZAMAXKERNEL = iamax_vfpv3.S +SAMAXKERNEL = iamax_vfp.S +DAMAXKERNEL = iamax_vfp.S +CAMAXKERNEL = iamax_vfp.S +ZAMAXKERNEL = iamax_vfp.S -SAMINKERNEL = iamax_vfpv3.S -DAMINKERNEL = iamax_vfpv3.S -CAMINKERNEL = iamax_vfpv3.S -ZAMINKERNEL = iamax_vfpv3.S +SAMINKERNEL = iamax_vfp.S +DAMINKERNEL = iamax_vfp.S +CAMINKERNEL = iamax_vfp.S +ZAMINKERNEL = iamax_vfp.S -SMAXKERNEL = iamax_vfpv3.S -DMAXKERNEL = iamax_vfpv3.S +SMAXKERNEL = iamax_vfp.S +DMAXKERNEL = iamax_vfp.S -SMINKERNEL = iamax_vfpv3.S -DMINKERNEL = iamax_vfpv3.S +SMINKERNEL = iamax_vfp.S +DMINKERNEL = iamax_vfp.S -ISAMAXKERNEL = iamax_vfpv3.S -IDAMAXKERNEL = iamax_vfpv3.S -ICAMAXKERNEL = iamax_vfpv3.S -IZAMAXKERNEL = iamax_vfpv3.S +ISAMAXKERNEL = iamax_vfp.S +IDAMAXKERNEL = iamax_vfp.S +ICAMAXKERNEL = iamax_vfp.S +IZAMAXKERNEL = iamax_vfp.S -ISAMINKERNEL = iamax_vfpv3.S -IDAMINKERNEL = iamax_vfpv3.S -ICAMINKERNEL = iamax_vfpv3.S -IZAMINKERNEL = iamax_vfpv3.S +ISAMINKERNEL = iamax_vfp.S +IDAMINKERNEL = iamax_vfp.S +ICAMINKERNEL = iamax_vfp.S +IZAMINKERNEL = iamax_vfp.S -ISMAXKERNEL = iamax_vfpv3.S -IDMAXKERNEL = iamax_vfpv3.S +ISMAXKERNEL = iamax_vfp.S +IDMAXKERNEL = iamax_vfp.S -ISMINKERNEL = iamax_vfpv3.S -IDMINKERNEL = iamax_vfpv3.S +ISMINKERNEL = iamax_vfp.S +IDMINKERNEL = iamax_vfp.S SSWAPKERNEL = swap_vfpv3.S DSWAPKERNEL = swap_vfpv3.S diff --git a/kernel/arm/iamax_vfpv3.S b/kernel/arm/iamax_vfp.S similarity index 100% rename from kernel/arm/iamax_vfpv3.S rename to kernel/arm/iamax_vfp.S From 5bf7cf8d670593487ea0915ca02be2815d35ba47 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 21 Nov 2013 22:03:36 +0100 Subject: [PATCH 065/127] renamed scal_vfpv3.S to scal_vfp.S --- kernel/arm/KERNEL.ARMV6 | 8 ++++---- kernel/arm/KERNEL.ARMV7 | 8 ++++---- 
kernel/arm/{scal_vfpv3.S => scal_vfp.S} | 0 3 files changed, 8 insertions(+), 8 deletions(-) rename kernel/arm/{scal_vfpv3.S => scal_vfp.S} (100%) diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index a7bddbd82..8f3533544 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -60,10 +60,10 @@ DROTKERNEL = rot.c CROTKERNEL = zrot.c ZROTKERNEL = zrot.c -SSCALKERNEL = scal.c -DSCALKERNEL = scal.c -CSCALKERNEL = zscal.c -ZSCALKERNEL = zscal.c +SSCALKERNEL = scal_vfp.S +DSCALKERNEL = scal_vfp.S +CSCALKERNEL = scal_vfp.S +ZSCALKERNEL = scal_vfp.S SSWAPKERNEL = swap.c DSWAPKERNEL = swap.c diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index dafc4c32a..a36cbc1ee 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -65,10 +65,10 @@ DROTKERNEL = rot_vfpv3.S CROTKERNEL = rot_vfpv3.S ZROTKERNEL = rot_vfpv3.S -SSCALKERNEL = scal_vfpv3.S -DSCALKERNEL = scal_vfpv3.S -CSCALKERNEL = scal_vfpv3.S -ZSCALKERNEL = scal_vfpv3.S +SSCALKERNEL = scal_vfp.S +DSCALKERNEL = scal_vfp.S +CSCALKERNEL = scal_vfp.S +ZSCALKERNEL = scal_vfp.S SGEMVNKERNEL = gemv_n_vfpv3.S DGEMVNKERNEL = gemv_n_vfpv3.S diff --git a/kernel/arm/scal_vfpv3.S b/kernel/arm/scal_vfp.S similarity index 100% rename from kernel/arm/scal_vfpv3.S rename to kernel/arm/scal_vfp.S From 8565afb3c2ac144307f3e67b91889d3d8aa5f1a6 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 21 Nov 2013 22:26:27 +0100 Subject: [PATCH 066/127] renamed asum_vfpv3.S to asum_vfp.S --- kernel/arm/KERNEL.ARMV6 | 8 ++++---- kernel/arm/KERNEL.ARMV7 | 8 ++++---- kernel/arm/{asum_vfpv3.S => asum_vfp.S} | 0 3 files changed, 8 insertions(+), 8 deletions(-) rename kernel/arm/{asum_vfpv3.S => asum_vfp.S} (100%) diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 8f3533544..b35eeb28a 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -30,10 +30,10 @@ IDMAXKERNEL = iamax_vfp.S ISMINKERNEL = iamax_vfp.S IDMINKERNEL = iamax_vfp.S -SASUMKERNEL = asum.c -DASUMKERNEL = asum.c -CASUMKERNEL = zasum.c -ZASUMKERNEL = zasum.c +SASUMKERNEL = asum_vfp.S +DASUMKERNEL = asum_vfp.S +CASUMKERNEL = asum_vfp.S +ZASUMKERNEL = asum_vfp.S SAXPYKERNEL = axpy.c DAXPYKERNEL = axpy.c diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index a36cbc1ee..4a652d2db 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -35,10 +35,10 @@ DSWAPKERNEL = swap_vfpv3.S CSWAPKERNEL = swap_vfpv3.S ZSWAPKERNEL = swap_vfpv3.S -SASUMKERNEL = asum_vfpv3.S -DASUMKERNEL = asum_vfpv3.S -CASUMKERNEL = asum_vfpv3.S -ZASUMKERNEL = asum_vfpv3.S +SASUMKERNEL = asum_vfp.S +DASUMKERNEL = asum_vfp.S +CASUMKERNEL = asum_vfp.S +ZASUMKERNEL = asum_vfp.S SAXPYKERNEL = ../arm/axpy.c DAXPYKERNEL = ../arm/axpy.c diff --git a/kernel/arm/asum_vfpv3.S b/kernel/arm/asum_vfp.S similarity index 100% rename from kernel/arm/asum_vfpv3.S rename to kernel/arm/asum_vfp.S From cd93cae5a7a71edf3d5e0ac13e05bdf77a4e5a60 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 21 Nov 2013 22:49:28 +0100 Subject: [PATCH 067/127] renamed rot_vfpv3.S to rot_vfp.S --- kernel/arm/KERNEL.ARMV6 | 8 +- kernel/arm/KERNEL.ARMV7 | 8 +- kernel/arm/rot_vfp.S | 584 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 592 insertions(+), 8 deletions(-) create mode 100644 kernel/arm/rot_vfp.S diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index b35eeb28a..fd294a937 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -55,10 +55,10 @@ DNRM2KERNEL = nrm2.c CNRM2KERNEL = znrm2.c ZNRM2KERNEL = znrm2.c -SROTKERNEL = rot.c 
-DROTKERNEL = rot.c -CROTKERNEL = zrot.c -ZROTKERNEL = zrot.c +SROTKERNEL = rot_vfp.S +DROTKERNEL = rot_vfp.S +CROTKERNEL = rot_vfp.S +ZROTKERNEL = rot_vfp.S SSCALKERNEL = scal_vfp.S DSCALKERNEL = scal_vfp.S diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index 4a652d2db..b03c709e0 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -60,10 +60,10 @@ DNRM2KERNEL = nrm2_vfpv3.S CNRM2KERNEL = nrm2_vfpv3.S ZNRM2KERNEL = nrm2_vfpv3.S -SROTKERNEL = rot_vfpv3.S -DROTKERNEL = rot_vfpv3.S -CROTKERNEL = rot_vfpv3.S -ZROTKERNEL = rot_vfpv3.S +SROTKERNEL = rot_vfp.S +DROTKERNEL = rot_vfp.S +CROTKERNEL = rot_vfp.S +ZROTKERNEL = rot_vfp.S SSCALKERNEL = scal_vfp.S DSCALKERNEL = scal_vfp.S diff --git a/kernel/arm/rot_vfp.S b/kernel/arm/rot_vfp.S new file mode 100644 index 000000000..663ecdf81 --- /dev/null +++ b/kernel/arm/rot_vfp.S @@ -0,0 +1,584 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
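The new rot_vfp.S below implements the BLAS plane rotation: with c and s arriving in d0/d1 (s0/s1 in single precision) under the hard-float ABI, every KERNEL_F1 or KERNEL_S1 step computes x_i = c*x_i + s*y_i and y_i = c*y_i - s*x_i via the vmul, fmacd and fnmacd sequences. A unit-stride C reference, with rot_ref and the sample values as illustrative assumptions, is sketched here:

// Reference semantics of the rot kernel -- illustrative sketch only.
#include <stdio.h>

// x[i] = c*x[i] + s*y[i]; y[i] = c*y[i] - s*x[i]  (BLAS drot)
static void rot_ref(int n, double *x, double *y, double c, double s)
{
        for (int i = 0; i < n; i++) {
                double xi = x[i], yi = y[i];
                x[i] = c * xi + s * yi;   // vmul.f64 + fmacd in the macros
                y[i] = c * yi - s * xi;   // vmul.f64 + fnmacd in the macros
        }
}

int main(void)
{
        double x[1] = { 3.0 }, y[1] = { 4.0 };
        rot_ref(1, x, y, 0.6, 0.8);            // c,s of a Givens rotation of (3,4)
        printf("x0=%g y0=%g\n", x[0], y[0]);   // 5 and 0: (3,4) rotated onto x
        return 0;
}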
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/15 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_INC_Y [fp, #0 ] + + +#define N r0 +#define X r1 +#define INC_X r2 +#define Y r3 +#define INC_Y r4 + +#define I r12 + +#define X_PRE 512 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +/*****************************************************************************************/ + + + +#if !defined(COMPLEX) + +#if defined(DOUBLE) + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + pld [ Y, #X_PRE ] + + fldmiad X, { d4 } + fldmiad Y, { d5 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d5 + vmul.f64 d3 , d0, d5 + fnmacd d3 , d1, d4 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + + fldmiad X, { d4 } + fldmiad Y, { d5 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d5 + vmul.f64 d3 , d0, d5 + fnmacd d3 , d1, d4 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + + fldmiad X, { d4 } + fldmiad Y, { d5 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d5 + vmul.f64 d3 , d0, d5 + fnmacd d3 , d1, d4 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + + fldmiad X, { d4 } + fldmiad Y, { d5 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d5 + vmul.f64 d3 , d0, d5 + fnmacd d3 , d1, d4 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + +.endm + + +.macro KERNEL_F1 + + fldmiad X, { d4 } + fldmiad Y, { d5 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d5 + vmul.f64 d3 , d0, d5 + fnmacd d3 , d1, d4 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + +.endm + +.macro KERNEL_S1 + + fldmiad X, { d4 } + fldmiad Y, { d5 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d5 + vmul.f64 d3 , d0, d5 + fnmacd d3 , d1, d4 + fstmiad X, { d2 } + fstmiad Y, { d3 } + + add X, X, INC_X + add Y, Y, INC_Y + +.endm + +#else + +.macro KERNEL_F4 + + fldmias X, { s4 } + fldmias Y, { s5 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s5 + vmul.f32 s3 , s0, s5 + fnmacs s3 , s1, s4 + fstmias X!, { s2 } + fstmias Y!, { s3 } + + fldmias X, { s4 } + fldmias Y, { s5 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s5 + vmul.f32 s3 , s0, s5 + fnmacs s3 , s1, s4 + fstmias X!, { s2 } + fstmias Y!, { s3 } + + fldmias X, { s4 } + fldmias Y, { s5 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s5 + vmul.f32 s3 , s0, s5 + fnmacs s3 , s1, s4 + fstmias X!, { s2 } + fstmias Y!, { s3 } + + fldmias X, { s4 } + fldmias Y, { s5 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s5 + vmul.f32 s3 , s0, s5 + fnmacs s3 , s1, s4 + fstmias X!, { s2 } + fstmias Y!, { s3 } + +.endm + + +.macro KERNEL_F1 + + fldmias X, { s4 } + fldmias Y, { s5 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s5 + vmul.f32 s3 , s0, s5 + fnmacs s3 , s1, s4 + fstmias X!, { s2 } + fstmias Y!, { s3 } + +.endm + +.macro KERNEL_S1 + + fldmias X, { s4 } + fldmias Y, { s5 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s5 + vmul.f32 s3 , s0, s5 + fnmacs s3 , s1, s4 + fstmias X, { s2 } + fstmias Y, { s3 } + + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + + +#endif + +#else + +#if defined(DOUBLE) + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + pld [ Y, #X_PRE ] + + fldmiad X, { d4 - d5 } + fldmiad Y, { d6 - d7 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d6 + vmul.f64 d3 , d0, d6 + fnmacd d3 , d1, d4 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + vmul.f64 d2 , d0, 
d5 + fmacd d2 , d1, d7 + vmul.f64 d3 , d0, d7 + fnmacd d3 , d1, d5 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + + fldmiad X, { d4 - d5 } + fldmiad Y, { d6 - d7 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d6 + vmul.f64 d3 , d0, d6 + fnmacd d3 , d1, d4 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + vmul.f64 d2 , d0, d5 + fmacd d2 , d1, d7 + vmul.f64 d3 , d0, d7 + fnmacd d3 , d1, d5 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + + pld [ X, #X_PRE ] + pld [ Y, #X_PRE ] + + fldmiad X, { d4 - d5 } + fldmiad Y, { d6 - d7 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d6 + vmul.f64 d3 , d0, d6 + fnmacd d3 , d1, d4 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + vmul.f64 d2 , d0, d5 + fmacd d2 , d1, d7 + vmul.f64 d3 , d0, d7 + fnmacd d3 , d1, d5 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + + fldmiad X, { d4 - d5 } + fldmiad Y, { d6 - d7 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d6 + vmul.f64 d3 , d0, d6 + fnmacd d3 , d1, d4 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + vmul.f64 d2 , d0, d5 + fmacd d2 , d1, d7 + vmul.f64 d3 , d0, d7 + fnmacd d3 , d1, d5 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + +.endm + + +.macro KERNEL_F1 + + fldmiad X, { d4 - d5 } + fldmiad Y, { d6 - d7 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d6 + vmul.f64 d3 , d0, d6 + fnmacd d3 , d1, d4 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + vmul.f64 d2 , d0, d5 + fmacd d2 , d1, d7 + vmul.f64 d3 , d0, d7 + fnmacd d3 , d1, d5 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + + +.endm + +.macro KERNEL_S1 + + fldmiad X, { d4 - d5 } + fldmiad Y, { d6 - d7 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d6 + vmul.f64 d3 , d0, d6 + fnmacd d3 , d1, d4 + vstr d2 , [ X, #0 ] + vstr d3 , [ Y, #0 ] + vmul.f64 d2 , d0, d5 + fmacd d2 , d1, d7 + vmul.f64 d3 , d0, d7 + fnmacd d3 , d1, d5 + vstr d2 , [ X, #8 ] + vstr d3 , [ Y, #8 ] + + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + + +#else + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + pld [ Y, #X_PRE ] + + fldmias X, { s4 - s5 } + fldmias Y, { s6 - s7 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s6 + vmul.f32 s3 , s0, s6 + fnmacs s3 , s1, s4 + fstmias X!, { s2 } + fstmias Y!, { s3 } + vmul.f32 s2 , s0, s5 + fmacs s2 , s1, s7 + vmul.f32 s3 , s0, s7 + fnmacs s3 , s1, s5 + fstmias X!, { s2 } + fstmias Y!, { s3 } + + fldmias X, { s4 - s5 } + fldmias Y, { s6 - s7 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s6 + vmul.f32 s3 , s0, s6 + fnmacs s3 , s1, s4 + fstmias X!, { s2 } + fstmias Y!, { s3 } + vmul.f32 s2 , s0, s5 + fmacs s2 , s1, s7 + vmul.f32 s3 , s0, s7 + fnmacs s3 , s1, s5 + fstmias X!, { s2 } + fstmias Y!, { s3 } + + pld [ X, #X_PRE ] + pld [ Y, #X_PRE ] + + fldmias X, { s4 - s5 } + fldmias Y, { s6 - s7 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s6 + vmul.f32 s3 , s0, s6 + fnmacs s3 , s1, s4 + fstmias X!, { s2 } + fstmias Y!, { s3 } + vmul.f32 s2 , s0, s5 + fmacs s2 , s1, s7 + vmul.f32 s3 , s0, s7 + fnmacs s3 , s1, s5 + fstmias X!, { s2 } + fstmias Y!, { s3 } + + fldmias X, { s4 - s5 } + fldmias Y, { s6 - s7 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s6 + vmul.f32 s3 , s0, s6 + fnmacs s3 , s1, s4 + fstmias X!, { s2 } + fstmias Y!, { s3 } + vmul.f32 s2 , s0, s5 + fmacs s2 , s1, s7 + vmul.f32 s3 , s0, s7 + fnmacs s3 , s1, s5 + fstmias X!, { s2 } + fstmias Y!, { s3 } + +.endm + + +.macro KERNEL_F1 + + fldmias X, { s4 - s5 } + fldmias Y, { s6 - s7 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s6 + vmul.f32 s3 , s0, s6 + fnmacs s3 , s1, s4 + fstmias X!, { s2 } + fstmias Y!, { s3 } + vmul.f32 s2 , s0, s5 + fmacs s2 , s1, s7 + vmul.f32 s3 , s0, s7 + fnmacs s3 , s1, s5 + fstmias X!, { s2 } + fstmias Y!, { s3 } + + +.endm + +.macro KERNEL_S1 + + fldmias 
X, { s4 - s5 } + fldmias Y, { s6 - s7 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s6 + vmul.f32 s3 , s0, s6 + fnmacs s3 , s1, s4 + vstr s2 , [ X, #0 ] + vstr s3 , [ Y, #0 ] + vmul.f32 s2 , s0, s5 + fmacs s2 , s1, s7 + vmul.f32 s3 , s0, s7 + fnmacs s3 , s1, s5 + vstr s2 , [ X, #4 ] + vstr s3 , [ Y, #4 ] + + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + +#endif + +#endif + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + push {r4 , fp} + add fp, sp, #8 + + ldr INC_Y , OLD_INC_Y + + + cmp N, #0 + ble rot_kernel_L999 + + cmp INC_X, #0 + beq rot_kernel_L999 + + cmp INC_Y, #0 + beq rot_kernel_L999 + + cmp INC_X, #1 + bne rot_kernel_S_BEGIN + + cmp INC_Y, #1 + bne rot_kernel_S_BEGIN + + +rot_kernel_F_BEGIN: + + + asrs I, N, #2 // I = N / 4 + ble rot_kernel_F1 + + .align 5 + +rot_kernel_F4: + +#if !defined(COMPLEX) && !defined(DOUBLE) + pld [ X, #X_PRE ] + pld [ Y, #X_PRE ] +#endif + + KERNEL_F4 + + subs I, I, #1 + ble rot_kernel_F1 + + KERNEL_F4 + + subs I, I, #1 + bne rot_kernel_F4 + +rot_kernel_F1: + + ands I, N, #3 + ble rot_kernel_L999 + +rot_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne rot_kernel_F10 + + b rot_kernel_L999 + +rot_kernel_S_BEGIN: + +#if defined(COMPLEX) + +#if defined(DOUBLE) + lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 + lsl INC_Y, INC_Y, #4 // INC_Y * SIZE * 2 +#else + lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 + lsl INC_Y, INC_Y, #3 // INC_Y * SIZE * 2 +#endif + +#else + +#if defined(DOUBLE) + lsl INC_X, INC_X, #3 // INC_X * SIZE + lsl INC_Y, INC_Y, #3 // INC_Y * SIZE +#else + lsl INC_X, INC_X, #2 // INC_X * SIZE + lsl INC_Y, INC_Y, #2 // INC_Y * SIZE +#endif + +#endif + + + asrs I, N, #2 // I = N / 4 + ble rot_kernel_S1 + + .align 5 + +rot_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne rot_kernel_S4 + +rot_kernel_S1: + + ands I, N, #3 + ble rot_kernel_L999 + +rot_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne rot_kernel_S10 + + +rot_kernel_L999: + + mov r0, #0 // set return value + + sub sp, fp, #8 + pop {r4,fp} + bx lr + + EPILOGUE + From 440db4cddae9007eed93276d135e2d73fa959793 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 21 Nov 2013 22:52:24 +0100 Subject: [PATCH 068/127] delete rot_vfpv3.S --- kernel/arm/rot_vfpv3.S | 584 ----------------------------------------- 1 file changed, 584 deletions(-) delete mode 100644 kernel/arm/rot_vfpv3.S diff --git a/kernel/arm/rot_vfpv3.S b/kernel/arm/rot_vfpv3.S deleted file mode 100644 index 663ecdf81..000000000 --- a/kernel/arm/rot_vfpv3.S +++ /dev/null @@ -1,584 +0,0 @@ -/*************************************************************************** -Copyright (c) 2013, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. 
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -/************************************************************************************** -* 2013/11/15 Saar -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* -**************************************************************************************/ - -#define ASSEMBLER -#include "common.h" - -#define STACKSIZE 256 - -#define OLD_INC_Y [fp, #0 ] - - -#define N r0 -#define X r1 -#define INC_X r2 -#define Y r3 -#define INC_Y r4 - -#define I r12 - -#define X_PRE 512 - -/************************************************************************************** -* Macro definitions -**************************************************************************************/ - -/*****************************************************************************************/ - - - -#if !defined(COMPLEX) - -#if defined(DOUBLE) - -.macro KERNEL_F4 - - pld [ X, #X_PRE ] - pld [ Y, #X_PRE ] - - fldmiad X, { d4 } - fldmiad Y, { d5 } - vmul.f64 d2 , d0, d4 - fmacd d2 , d1, d5 - vmul.f64 d3 , d0, d5 - fnmacd d3 , d1, d4 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } - - fldmiad X, { d4 } - fldmiad Y, { d5 } - vmul.f64 d2 , d0, d4 - fmacd d2 , d1, d5 - vmul.f64 d3 , d0, d5 - fnmacd d3 , d1, d4 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } - - fldmiad X, { d4 } - fldmiad Y, { d5 } - vmul.f64 d2 , d0, d4 - fmacd d2 , d1, d5 - vmul.f64 d3 , d0, d5 - fnmacd d3 , d1, d4 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } - - fldmiad X, { d4 } - fldmiad Y, { d5 } - vmul.f64 d2 , d0, d4 - fmacd d2 , d1, d5 - vmul.f64 d3 , d0, d5 - fnmacd d3 , d1, d4 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } - -.endm - - -.macro KERNEL_F1 - - fldmiad X, { d4 } - fldmiad Y, { d5 } - vmul.f64 d2 , d0, d4 - fmacd d2 , d1, d5 - vmul.f64 d3 , d0, d5 - fnmacd d3 , d1, d4 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } - -.endm - -.macro KERNEL_S1 - - fldmiad X, { d4 } - fldmiad Y, { d5 } - vmul.f64 d2 , d0, d4 - fmacd d2 , d1, d5 - vmul.f64 d3 , d0, d5 - fnmacd d3 , d1, d4 - fstmiad X, { d2 } - fstmiad Y, { d3 } - - add X, X, INC_X - add Y, Y, INC_Y - -.endm - -#else - -.macro KERNEL_F4 - - fldmias X, { s4 } - fldmias Y, { s5 } - vmul.f32 s2 , s0, s4 - fmacs s2 , s1, s5 - vmul.f32 s3 , s0, s5 - fnmacs s3 , s1, s4 - fstmias X!, { s2 } - fstmias Y!, { s3 } - - fldmias X, { s4 } - fldmias Y, { s5 } - vmul.f32 s2 , s0, s4 - fmacs s2 , s1, s5 - vmul.f32 s3 , s0, s5 - fnmacs s3 , s1, s4 - fstmias X!, { s2 } - fstmias Y!, { s3 } - - fldmias X, { s4 } - fldmias Y, { s5 } - vmul.f32 s2 , s0, s4 - fmacs s2 , s1, s5 - vmul.f32 s3 , s0, s5 - fnmacs s3 , s1, s4 - fstmias X!, { s2 } - fstmias Y!, { s3 } - - fldmias X, { s4 } - fldmias Y, { s5 } - vmul.f32 s2 , s0, s4 - fmacs s2 , s1, s5 - vmul.f32 s3 , s0, s5 - fnmacs s3 , s1, s4 - fstmias X!, { s2 } - fstmias 
Y!, { s3 } - -.endm - - -.macro KERNEL_F1 - - fldmias X, { s4 } - fldmias Y, { s5 } - vmul.f32 s2 , s0, s4 - fmacs s2 , s1, s5 - vmul.f32 s3 , s0, s5 - fnmacs s3 , s1, s4 - fstmias X!, { s2 } - fstmias Y!, { s3 } - -.endm - -.macro KERNEL_S1 - - fldmias X, { s4 } - fldmias Y, { s5 } - vmul.f32 s2 , s0, s4 - fmacs s2 , s1, s5 - vmul.f32 s3 , s0, s5 - fnmacs s3 , s1, s4 - fstmias X, { s2 } - fstmias Y, { s3 } - - add X, X, INC_X - add Y, Y, INC_Y - -.endm - - - -#endif - -#else - -#if defined(DOUBLE) - -.macro KERNEL_F4 - - pld [ X, #X_PRE ] - pld [ Y, #X_PRE ] - - fldmiad X, { d4 - d5 } - fldmiad Y, { d6 - d7 } - vmul.f64 d2 , d0, d4 - fmacd d2 , d1, d6 - vmul.f64 d3 , d0, d6 - fnmacd d3 , d1, d4 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } - vmul.f64 d2 , d0, d5 - fmacd d2 , d1, d7 - vmul.f64 d3 , d0, d7 - fnmacd d3 , d1, d5 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } - - fldmiad X, { d4 - d5 } - fldmiad Y, { d6 - d7 } - vmul.f64 d2 , d0, d4 - fmacd d2 , d1, d6 - vmul.f64 d3 , d0, d6 - fnmacd d3 , d1, d4 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } - vmul.f64 d2 , d0, d5 - fmacd d2 , d1, d7 - vmul.f64 d3 , d0, d7 - fnmacd d3 , d1, d5 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } - - pld [ X, #X_PRE ] - pld [ Y, #X_PRE ] - - fldmiad X, { d4 - d5 } - fldmiad Y, { d6 - d7 } - vmul.f64 d2 , d0, d4 - fmacd d2 , d1, d6 - vmul.f64 d3 , d0, d6 - fnmacd d3 , d1, d4 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } - vmul.f64 d2 , d0, d5 - fmacd d2 , d1, d7 - vmul.f64 d3 , d0, d7 - fnmacd d3 , d1, d5 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } - - fldmiad X, { d4 - d5 } - fldmiad Y, { d6 - d7 } - vmul.f64 d2 , d0, d4 - fmacd d2 , d1, d6 - vmul.f64 d3 , d0, d6 - fnmacd d3 , d1, d4 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } - vmul.f64 d2 , d0, d5 - fmacd d2 , d1, d7 - vmul.f64 d3 , d0, d7 - fnmacd d3 , d1, d5 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } - -.endm - - -.macro KERNEL_F1 - - fldmiad X, { d4 - d5 } - fldmiad Y, { d6 - d7 } - vmul.f64 d2 , d0, d4 - fmacd d2 , d1, d6 - vmul.f64 d3 , d0, d6 - fnmacd d3 , d1, d4 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } - vmul.f64 d2 , d0, d5 - fmacd d2 , d1, d7 - vmul.f64 d3 , d0, d7 - fnmacd d3 , d1, d5 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } - - -.endm - -.macro KERNEL_S1 - - fldmiad X, { d4 - d5 } - fldmiad Y, { d6 - d7 } - vmul.f64 d2 , d0, d4 - fmacd d2 , d1, d6 - vmul.f64 d3 , d0, d6 - fnmacd d3 , d1, d4 - vstr d2 , [ X, #0 ] - vstr d3 , [ Y, #0 ] - vmul.f64 d2 , d0, d5 - fmacd d2 , d1, d7 - vmul.f64 d3 , d0, d7 - fnmacd d3 , d1, d5 - vstr d2 , [ X, #8 ] - vstr d3 , [ Y, #8 ] - - add X, X, INC_X - add Y, Y, INC_Y - -.endm - - - -#else - -.macro KERNEL_F4 - - pld [ X, #X_PRE ] - pld [ Y, #X_PRE ] - - fldmias X, { s4 - s5 } - fldmias Y, { s6 - s7 } - vmul.f32 s2 , s0, s4 - fmacs s2 , s1, s6 - vmul.f32 s3 , s0, s6 - fnmacs s3 , s1, s4 - fstmias X!, { s2 } - fstmias Y!, { s3 } - vmul.f32 s2 , s0, s5 - fmacs s2 , s1, s7 - vmul.f32 s3 , s0, s7 - fnmacs s3 , s1, s5 - fstmias X!, { s2 } - fstmias Y!, { s3 } - - fldmias X, { s4 - s5 } - fldmias Y, { s6 - s7 } - vmul.f32 s2 , s0, s4 - fmacs s2 , s1, s6 - vmul.f32 s3 , s0, s6 - fnmacs s3 , s1, s4 - fstmias X!, { s2 } - fstmias Y!, { s3 } - vmul.f32 s2 , s0, s5 - fmacs s2 , s1, s7 - vmul.f32 s3 , s0, s7 - fnmacs s3 , s1, s5 - fstmias X!, { s2 } - fstmias Y!, { s3 } - - pld [ X, #X_PRE ] - pld [ Y, #X_PRE ] - - fldmias X, { s4 - s5 } - fldmias Y, { s6 - s7 } - vmul.f32 s2 , s0, s4 - fmacs s2 , s1, s6 - vmul.f32 s3 , s0, s6 - fnmacs s3 , s1, s4 - fstmias X!, { s2 } - fstmias Y!, { s3 } - vmul.f32 s2 , s0, s5 - fmacs s2 , s1, s7 - vmul.f32 
s3 , s0, s7 - fnmacs s3 , s1, s5 - fstmias X!, { s2 } - fstmias Y!, { s3 } - - fldmias X, { s4 - s5 } - fldmias Y, { s6 - s7 } - vmul.f32 s2 , s0, s4 - fmacs s2 , s1, s6 - vmul.f32 s3 , s0, s6 - fnmacs s3 , s1, s4 - fstmias X!, { s2 } - fstmias Y!, { s3 } - vmul.f32 s2 , s0, s5 - fmacs s2 , s1, s7 - vmul.f32 s3 , s0, s7 - fnmacs s3 , s1, s5 - fstmias X!, { s2 } - fstmias Y!, { s3 } - -.endm - - -.macro KERNEL_F1 - - fldmias X, { s4 - s5 } - fldmias Y, { s6 - s7 } - vmul.f32 s2 , s0, s4 - fmacs s2 , s1, s6 - vmul.f32 s3 , s0, s6 - fnmacs s3 , s1, s4 - fstmias X!, { s2 } - fstmias Y!, { s3 } - vmul.f32 s2 , s0, s5 - fmacs s2 , s1, s7 - vmul.f32 s3 , s0, s7 - fnmacs s3 , s1, s5 - fstmias X!, { s2 } - fstmias Y!, { s3 } - - -.endm - -.macro KERNEL_S1 - - fldmias X, { s4 - s5 } - fldmias Y, { s6 - s7 } - vmul.f32 s2 , s0, s4 - fmacs s2 , s1, s6 - vmul.f32 s3 , s0, s6 - fnmacs s3 , s1, s4 - vstr s2 , [ X, #0 ] - vstr s3 , [ Y, #0 ] - vmul.f32 s2 , s0, s5 - fmacs s2 , s1, s7 - vmul.f32 s3 , s0, s7 - fnmacs s3 , s1, s5 - vstr s2 , [ X, #4 ] - vstr s3 , [ Y, #4 ] - - add X, X, INC_X - add Y, Y, INC_Y - -.endm - - -#endif - -#endif - -/************************************************************************************** -* End of macro definitions -**************************************************************************************/ - - PROLOGUE - - .align 5 - push {r4 , fp} - add fp, sp, #8 - - ldr INC_Y , OLD_INC_Y - - - cmp N, #0 - ble rot_kernel_L999 - - cmp INC_X, #0 - beq rot_kernel_L999 - - cmp INC_Y, #0 - beq rot_kernel_L999 - - cmp INC_X, #1 - bne rot_kernel_S_BEGIN - - cmp INC_Y, #1 - bne rot_kernel_S_BEGIN - - -rot_kernel_F_BEGIN: - - - asrs I, N, #2 // I = N / 4 - ble rot_kernel_F1 - - .align 5 - -rot_kernel_F4: - -#if !defined(COMPLEX) && !defined(DOUBLE) - pld [ X, #X_PRE ] - pld [ Y, #X_PRE ] -#endif - - KERNEL_F4 - - subs I, I, #1 - ble rot_kernel_F1 - - KERNEL_F4 - - subs I, I, #1 - bne rot_kernel_F4 - -rot_kernel_F1: - - ands I, N, #3 - ble rot_kernel_L999 - -rot_kernel_F10: - - KERNEL_F1 - - subs I, I, #1 - bne rot_kernel_F10 - - b rot_kernel_L999 - -rot_kernel_S_BEGIN: - -#if defined(COMPLEX) - -#if defined(DOUBLE) - lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 - lsl INC_Y, INC_Y, #4 // INC_Y * SIZE * 2 -#else - lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 - lsl INC_Y, INC_Y, #3 // INC_Y * SIZE * 2 -#endif - -#else - -#if defined(DOUBLE) - lsl INC_X, INC_X, #3 // INC_X * SIZE - lsl INC_Y, INC_Y, #3 // INC_Y * SIZE -#else - lsl INC_X, INC_X, #2 // INC_X * SIZE - lsl INC_Y, INC_Y, #2 // INC_Y * SIZE -#endif - -#endif - - - asrs I, N, #2 // I = N / 4 - ble rot_kernel_S1 - - .align 5 - -rot_kernel_S4: - - KERNEL_S1 - KERNEL_S1 - KERNEL_S1 - KERNEL_S1 - - subs I, I, #1 - bne rot_kernel_S4 - -rot_kernel_S1: - - ands I, N, #3 - ble rot_kernel_L999 - -rot_kernel_S10: - - KERNEL_S1 - - subs I, I, #1 - bne rot_kernel_S10 - - -rot_kernel_L999: - - mov r0, #0 // set return value - - sub sp, fp, #8 - pop {r4,fp} - bx lr - - EPILOGUE - From 9adf87495ef165b6019d94f2381bf7e077aa5a26 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 21 Nov 2013 23:07:51 +0100 Subject: [PATCH 069/127] renamed some dot kernels --- kernel/arm/KERNEL.ARMV6 | 7 ++++--- kernel/arm/KERNEL.ARMV7 | 6 +++--- kernel/arm/{cdot_vfpv3.S => cdot_vfp.S} | 0 kernel/arm/{ddot_vfpv3.S => ddot_vfp.S} | 0 kernel/arm/{zdot_vfpv3.S => zdot_vfp.S} | 0 5 files changed, 7 insertions(+), 6 deletions(-) rename kernel/arm/{cdot_vfpv3.S => cdot_vfp.S} (100%) rename kernel/arm/{ddot_vfpv3.S => ddot_vfp.S} (100%) rename kernel/arm/{zdot_vfpv3.S => 
zdot_vfp.S} (100%) diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index fd294a937..9e3483e3c 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -46,9 +46,10 @@ CCOPYKERNEL = ccopy_vfp.S ZCOPYKERNEL = zcopy_vfp.S SDOTKERNEL = dot.c -DDOTKERNEL = dot.c -CDOTKERNEL = zdot.c -ZDOTKERNEL = zdot.c +DDOTKERNEL = ddot_vfp.S +CDOTKERNEL = cdot_vfp.S +ZDOTKERNEL = zdot_vfp.S + SNRM2KERNEL = nrm2.c DNRM2KERNEL = nrm2.c diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index b03c709e0..4a409e777 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -51,9 +51,9 @@ CCOPYKERNEL = ccopy_vfp.S ZCOPYKERNEL = zcopy_vfp.S SDOTKERNEL = sdot_vfpv3.S -DDOTKERNEL = ddot_vfpv3.S -CDOTKERNEL = cdot_vfpv3.S -ZDOTKERNEL = zdot_vfpv3.S +DDOTKERNEL = ddot_vfp.S +CDOTKERNEL = cdot_vfp.S +ZDOTKERNEL = zdot_vfp.S SNRM2KERNEL = nrm2_vfpv3.S DNRM2KERNEL = nrm2_vfpv3.S diff --git a/kernel/arm/cdot_vfpv3.S b/kernel/arm/cdot_vfp.S similarity index 100% rename from kernel/arm/cdot_vfpv3.S rename to kernel/arm/cdot_vfp.S diff --git a/kernel/arm/ddot_vfpv3.S b/kernel/arm/ddot_vfp.S similarity index 100% rename from kernel/arm/ddot_vfpv3.S rename to kernel/arm/ddot_vfp.S diff --git a/kernel/arm/zdot_vfpv3.S b/kernel/arm/zdot_vfp.S similarity index 100% rename from kernel/arm/zdot_vfpv3.S rename to kernel/arm/zdot_vfp.S From 19cd5c64a20da8e33c95d94d690f3afa1c254ee1 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 21 Nov 2013 23:19:32 +0100 Subject: [PATCH 070/127] renamed swap_vfpv3.S to swap_vfp.S --- kernel/arm/KERNEL.ARMV6 | 8 ++++---- kernel/arm/KERNEL.ARMV7 | 8 ++++---- kernel/arm/{swap_vfpv3.S => swap_vfp.S} | 0 3 files changed, 8 insertions(+), 8 deletions(-) rename kernel/arm/{swap_vfpv3.S => swap_vfp.S} (100%) diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 9e3483e3c..59604031c 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -66,10 +66,10 @@ DSCALKERNEL = scal_vfp.S CSCALKERNEL = scal_vfp.S ZSCALKERNEL = scal_vfp.S -SSWAPKERNEL = swap.c -DSWAPKERNEL = swap.c -CSWAPKERNEL = zswap.c -ZSWAPKERNEL = zswap.c +SSWAPKERNEL = swap_vfp.S +DSWAPKERNEL = swap_vfp.S +CSWAPKERNEL = swap_vfp.S +ZSWAPKERNEL = swap_vfp.S SGEMVNKERNEL = gemv_n.c DGEMVNKERNEL = gemv_n.c diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index 4a409e777..bbd7e89dd 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -30,10 +30,10 @@ IDMAXKERNEL = iamax_vfp.S ISMINKERNEL = iamax_vfp.S IDMINKERNEL = iamax_vfp.S -SSWAPKERNEL = swap_vfpv3.S -DSWAPKERNEL = swap_vfpv3.S -CSWAPKERNEL = swap_vfpv3.S -ZSWAPKERNEL = swap_vfpv3.S +SSWAPKERNEL = swap_vfp.S +DSWAPKERNEL = swap_vfp.S +CSWAPKERNEL = swap_vfp.S +ZSWAPKERNEL = swap_vfp.S SASUMKERNEL = asum_vfp.S DASUMKERNEL = asum_vfp.S diff --git a/kernel/arm/swap_vfpv3.S b/kernel/arm/swap_vfp.S similarity index 100% rename from kernel/arm/swap_vfpv3.S rename to kernel/arm/swap_vfp.S From dbae93110baff205499853733e8fd47d1643c765 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 21 Nov 2013 23:34:51 +0100 Subject: [PATCH 071/127] added sdot_vfp.S --- kernel/arm/KERNEL.ARMV6 | 2 +- kernel/arm/sdot_vfp.S | 347 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 348 insertions(+), 1 deletion(-) create mode 100644 kernel/arm/sdot_vfp.S diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 59604031c..75f7a9491 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -45,7 +45,7 @@ DCOPYKERNEL = dcopy_vfp.S CCOPYKERNEL = ccopy_vfp.S ZCOPYKERNEL = zcopy_vfp.S 
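The hunk below retires the generic C dot kernels on ARMV6 in favour of the shared VFP assembly files. For orientation, the complex variants come in two flavours: an unconjugated product (cdotu/zdotu) and one that conjugates x (cdotc/zdotc). A compact C reference is sketched here, with cdot_ref and the test vectors as illustrative names only; the assembly kernel selects between the two flavours with conditional assembly, which this excerpt does not show.

// Reference semantics of a complex dot kernel -- illustrative sketch only.
#include <complex.h>
#include <stdio.h>

static float complex cdot_ref(int n, const float complex *x,
                              const float complex *y, int conj_x)
{
        float complex acc = 0.0f;
        for (int i = 0; i < n; i++)
                acc += (conj_x ? conjf(x[i]) : x[i]) * y[i];
        return acc;
}

int main(void)
{
        float complex x[2] = { 1.0f + 2.0f * I, 3.0f - 1.0f * I };
        float complex y[2] = { 2.0f - 1.0f * I, 1.0f + 1.0f * I };
        float complex u = cdot_ref(2, x, y, 0);   // cdotu flavour
        float complex c = cdot_ref(2, x, y, 1);   // cdotc flavour
        printf("cdotu = %f%+fi\n", crealf(u), cimagf(u));
        printf("cdotc = %f%+fi\n", crealf(c), cimagf(c));
        return 0;
}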
-SDOTKERNEL = dot.c +SDOTKERNEL = sdot_vfp.S DDOTKERNEL = ddot_vfp.S CDOTKERNEL = cdot_vfp.S ZDOTKERNEL = zdot_vfp.S diff --git a/kernel/arm/sdot_vfp.S b/kernel/arm/sdot_vfp.S new file mode 100644 index 000000000..2d1909201 --- /dev/null +++ b/kernel/arm/sdot_vfp.S @@ -0,0 +1,347 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/11 Saar +* BLASTEST : OK +* CTEST : OK (no test for dsdot) +* TEST : OK (no test for dsdot) +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define N r0 +#define X r1 +#define INC_X r2 +#define OLD_Y r3 + + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define OLD_INC_Y [fp, #4 ] + +#define I r5 +#define Y r6 +#define INC_Y r7 + +#define X_PRE 512 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +#if defined(DSDOT) + +.macro KERNEL_F4 + + fldmias X!, { s14 } + fldmias Y!, { s15 } + vmul.f32 s15, s14, s15 + vcvt.f64.f32 d4, s15 + vadd.f64 d0 , d0, d4 + + fldmias X!, { s14 } + fldmias Y!, { s15 } + vmul.f32 s15, s14, s15 + vcvt.f64.f32 d4, s15 + vadd.f64 d0 , d0, d4 + + fldmias X!, { s14 } + fldmias Y!, { s15 } + vmul.f32 s15, s14, s15 + vcvt.f64.f32 d4, s15 + vadd.f64 d0 , d0, d4 + + fldmias X!, { s14 } + fldmias Y!, { s15 } + vmul.f32 s15, s14, s15 + vcvt.f64.f32 d4, s15 + vadd.f64 d0 , d0, d4 + +.endm + +.macro KERNEL_F1 + + fldmias X!, { s14 } + fldmias Y!, { s15 } + vmul.f32 s15, s14, s15 + vcvt.f64.f32 d4, s15 + vadd.f64 d0 , d0, d4 + +.endm + + +.macro KERNEL_S4 + + nop + + fldmias X, { s14 } + fldmias Y, { s15 } + vmul.f32 s15, s14, s15 + vcvt.f64.f32 d4, s15 + vadd.f64 d0 , d0, d4 + add X, X, INC_X + add Y, Y, INC_Y + + fldmias X, { s14 } + fldmias Y, { s15 } + vmul.f32 s15, s14, s15 + vcvt.f64.f32 d4, s15 + vadd.f64 d0 , d0, d4 + add X, X, INC_X + add Y, Y, INC_Y + + fldmias X, { s14 } + fldmias Y, { s15 } + vmul.f32 s15, s14, s15 + vcvt.f64.f32 d4, s15 + vadd.f64 d0 , d0, d4 + add X, X, INC_X + add Y, Y, INC_Y + + fldmias X, { s14 } + fldmias Y, { s15 } + vmul.f32 s15, s14, s15 + vcvt.f64.f32 d4, s15 + vadd.f64 d0 , d0, d4 + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + +.macro KERNEL_S1 + + fldmias X, { s14 } + fldmias Y, { s15 } + vmul.f32 s15, s14, s15 + vcvt.f64.f32 d4, s15 + vadd.f64 d0 , d0, d4 + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + + +#else + +.macro KERNEL_F4 + + fldmias X!, { s8 - s9 } + fldmias Y!, { s4 - s5} + fmacs s0 , s4, s8 + fldmias X!, { s10 - s11 } + fmacs s1 , s5, s9 + fldmias Y!, { s6 - s7 } + fmacs s0 , s6, s10 + fmacs s1 , s7, s11 + +.endm + +.macro KERNEL_F1 + + fldmias X!, { s4 } + fldmias Y!, { s8 } + fmacs s0 , s4, s8 + +.endm + + +.macro KERNEL_S4 + + nop + fldmias X, { s4 } + fldmias Y, { s8 } + add X, X, INC_X + add Y, Y, INC_Y + fmacs s0 , s4, s8 + + fldmias X, { s5 } + fldmias Y, { s9 } + add X, X, INC_X + add Y, Y, INC_Y + fmacs s1 , s5, s9 + + fldmias X, { s6 } + fldmias Y, { s10 } + add X, X, INC_X + add Y, Y, INC_Y + fmacs s0 , s6, s10 + + fldmias X, { s7 } + fldmias Y, { s11 } + add X, X, INC_X + add Y, Y, INC_Y + fmacs s1 , s7, s11 + +.endm + + +.macro KERNEL_S1 + + fldmias X, { s4 } + fldmias Y, { s8 } + add X, X, INC_X + fmacs s0 , s4, s8 + add Y, Y, INC_Y + +.endm + +#endif + +/************************************************************************************** +* End of macro definitions 
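sdot_vfp.S folds two entry points into one source: the plain single-precision SDOT, which keeps two float accumulators (s0/s1) that are summed at the end, and the DSDOT/SDSDOT path, where each float product is widened with vcvt.f64.f32 and accumulated in a double (d0). A C model of the two behaviours, with illustrative names and test data, is sketched below; on typical IEEE-754 hardware it prints 0 for the float path and 2 for the widened path, which is exactly the rounding loss the DSDOT variant avoids.

// Reference model of the two variants implemented by the macros above --
// an illustrative sketch only, not part of the patch.
#include <stdio.h>

static float sdot_ref(int n, const float *x, const float *y)
{
        // Two partial sums mirror the s0/s1 accumulator pair in KERNEL_F4.
        float s0 = 0.0f, s1 = 0.0f;
        int i;
        for (i = 0; i + 1 < n; i += 2) {
                s0 += x[i] * y[i];
                s1 += x[i + 1] * y[i + 1];
        }
        if (i < n) s0 += x[i] * y[i];
        return s0 + s1;
}

static double dsdot_ref(int n, const float *x, const float *y)
{
        // DSDOT path: each single-precision product is widened to double
        // before the add, as the vcvt.f64.f32 in the macros does.
        double d0 = 0.0;
        for (int i = 0; i < n; i++)
                d0 += (double)(x[i] * y[i]);
        return d0;
}

int main(void)
{
        float x[4] = { 1e8f, -1e8f, 1.0f, 1.0f };
        float y[4] = { 1.0f, 1.0f, 1.0f, 1.0f };
        printf("sdot  = %f\n", sdot_ref(4, x, y));    // 0: the 1.0 terms are lost
        printf("dsdot = %f\n", dsdot_ref(4, x, y));   // 2: kept by double accumulation
        return 0;
}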
+**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + sub r4, fp, #128 + vstm r4, { s8 - s15 } // store floating point registers + + mov Y, OLD_Y + ldr INC_Y, OLD_INC_Y + +#if defined(DSDOT) + + vsub.f64 d0 , d0 , d0 + vsub.f64 d1 , d1 , d1 + +#else + + vsub.f32 s0 , s0 , s0 + vsub.f32 s1 , s1 , s1 + +#endif + + cmp N, #0 + ble sdot_kernel_L999 + + cmp INC_X, #0 + beq sdot_kernel_L999 + + cmp INC_Y, #0 + beq sdot_kernel_L999 + + cmp INC_X, #1 + bne sdot_kernel_S_BEGIN + + cmp INC_Y, #1 + bne sdot_kernel_S_BEGIN + +sdot_kernel_F_BEGIN: + + asrs I, N, #2 // I = N / 4 + ble sdot_kernel_F1 + +sdot_kernel_F4: + + KERNEL_F4 + + subs I, I, #1 + bne sdot_kernel_F4 + +sdot_kernel_F1: + + ands I, N, #3 + ble sdot_kernel_L999 + +sdot_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne sdot_kernel_F10 + + b sdot_kernel_L999 + +sdot_kernel_S_BEGIN: + + lsl INC_X, INC_X, #2 // INC_X * SIZE + lsl INC_Y, INC_Y, #2 // INC_Y * SIZE + + asrs I, N, #2 // I = N / 4 + ble sdot_kernel_S1 + +sdot_kernel_S4: + + KERNEL_S4 + + subs I, I, #1 + bne sdot_kernel_S4 + +sdot_kernel_S1: + + ands I, N, #3 + ble sdot_kernel_L999 + +sdot_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne sdot_kernel_S10 + + + + + + +sdot_kernel_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s15} // restore floating point registers + +#if defined(DSDOT) + + vadd.f64 d0 , d0, d1 // set return value + +#else + + vadd.f32 s0 , s0, s1 // set return value + +#endif + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + From 9f0a3a35b3d1f4dce709bbc1aca348abe26880f0 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 21 Nov 2013 23:42:54 +0100 Subject: [PATCH 072/127] removed obsolete file sdot_vfpv3.S --- kernel/arm/sdot_vfpv3.S | 347 ---------------------------------------- 1 file changed, 347 deletions(-) delete mode 100644 kernel/arm/sdot_vfpv3.S diff --git a/kernel/arm/sdot_vfpv3.S b/kernel/arm/sdot_vfpv3.S deleted file mode 100644 index 794e07317..000000000 --- a/kernel/arm/sdot_vfpv3.S +++ /dev/null @@ -1,347 +0,0 @@ -/*************************************************************************** -Copyright (c) 2013, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -/************************************************************************************** -* 2013/11/11 Saar -* BLASTEST : OK -* CTEST : OK (no test for dsdot) -* TEST : OK (no test for dsdot) -* -**************************************************************************************/ - -#define ASSEMBLER -#include "common.h" - -#define STACKSIZE 256 - -#define N r0 -#define X r1 -#define INC_X r2 -#define OLD_Y r3 - - -/****************************************************** -* [fp, #-128] - [fp, #-64] is reserved -* for store and restore of floating point -* registers -*******************************************************/ - -#define OLD_INC_Y [fp, #4 ] - -#define I r5 -#define Y r6 -#define INC_Y r7 - -#define X_PRE 512 - -/************************************************************************************** -* Macro definitions -**************************************************************************************/ - -#if defined(DSDOT) - -.macro KERNEL_F4 - - fldmias X!, { s14 } - fldmias Y!, { s15 } - vmul.f32 s15, s14, s15 - vcvt.f64.f32 d16, s15 - vadd.f64 d0 , d0, d16 - - fldmias X!, { s14 } - fldmias Y!, { s15 } - vmul.f32 s15, s14, s15 - vcvt.f64.f32 d16, s15 - vadd.f64 d0 , d0, d16 - - fldmias X!, { s14 } - fldmias Y!, { s15 } - vmul.f32 s15, s14, s15 - vcvt.f64.f32 d16, s15 - vadd.f64 d0 , d0, d16 - - fldmias X!, { s14 } - fldmias Y!, { s15 } - vmul.f32 s15, s14, s15 - vcvt.f64.f32 d16, s15 - vadd.f64 d0 , d0, d16 - -.endm - -.macro KERNEL_F1 - - fldmias X!, { s14 } - fldmias Y!, { s15 } - vmul.f32 s15, s14, s15 - vcvt.f64.f32 d16, s15 - vadd.f64 d0 , d0, d16 - -.endm - - -.macro KERNEL_S4 - - nop - - fldmias X, { s14 } - fldmias Y, { s15 } - vmul.f32 s15, s14, s15 - vcvt.f64.f32 d16, s15 - vadd.f64 d0 , d0, d16 - add X, X, INC_X - add Y, Y, INC_Y - - fldmias X, { s14 } - fldmias Y, { s15 } - vmul.f32 s15, s14, s15 - vcvt.f64.f32 d16, s15 - vadd.f64 d0 , d0, d16 - add X, X, INC_X - add Y, Y, INC_Y - - fldmias X, { s14 } - fldmias Y, { s15 } - vmul.f32 s15, s14, s15 - vcvt.f64.f32 d16, s15 - vadd.f64 d0 , d0, d16 - add X, X, INC_X - add Y, Y, INC_Y - - fldmias X, { s14 } - fldmias Y, { s15 } - vmul.f32 s15, s14, s15 - vcvt.f64.f32 d16, s15 - vadd.f64 d0 , d0, d16 - add X, X, INC_X - add Y, Y, INC_Y - -.endm - - -.macro KERNEL_S1 - - fldmias X, { s14 } - fldmias Y, { s15 } - vmul.f32 s15, s14, s15 - vcvt.f64.f32 d16, s15 - vadd.f64 d0 , d0, d16 - add X, X, INC_X - add Y, Y, INC_Y - -.endm - - - -#else - -.macro KERNEL_F4 - - fldmias X!, { s8 - s9 } - fldmias Y!, { s4 - s5} - fmacs s0 , s4, s8 - fldmias X!, { s10 - s11 } - fmacs s1 , s5, s9 - fldmias Y!, { s6 - s7 } - fmacs s0 , s6, s10 - fmacs s1 , s7, s11 - -.endm - -.macro KERNEL_F1 - - fldmias X!, { s4 } - fldmias Y!, { s8 } - fmacs s0 , s4, s8 - -.endm - - -.macro KERNEL_S4 - - nop - fldmias X, { s4 } - fldmias Y, { s8 } - add X, X, INC_X - add Y, Y, INC_Y - fmacs s0 , s4, s8 - - fldmias X, { s5 } - fldmias Y, { s9 } - add X, X, 
INC_X - add Y, Y, INC_Y - fmacs s1 , s5, s9 - - fldmias X, { s6 } - fldmias Y, { s10 } - add X, X, INC_X - add Y, Y, INC_Y - fmacs s0 , s6, s10 - - fldmias X, { s7 } - fldmias Y, { s11 } - add X, X, INC_X - add Y, Y, INC_Y - fmacs s1 , s7, s11 - -.endm - - -.macro KERNEL_S1 - - fldmias X, { s4 } - fldmias Y, { s8 } - add X, X, INC_X - fmacs s0 , s4, s8 - add Y, Y, INC_Y - -.endm - -#endif - -/************************************************************************************** -* End of macro definitions -**************************************************************************************/ - - PROLOGUE - - .align 5 - - push {r4 - r9, fp} - add fp, sp, #24 - sub sp, sp, #STACKSIZE // reserve stack - - sub r4, fp, #128 - vstm r4, { s8 - s15 } // store floating point registers - - mov Y, OLD_Y - ldr INC_Y, OLD_INC_Y - -#if defined(DSDOT) - - vsub.f64 d0 , d0 , d0 - vsub.f64 d1 , d1 , d1 - -#else - - vsub.f32 s0 , s0 , s0 - vsub.f32 s1 , s1 , s1 - -#endif - - cmp N, #0 - ble sdot_kernel_L999 - - cmp INC_X, #0 - beq sdot_kernel_L999 - - cmp INC_Y, #0 - beq sdot_kernel_L999 - - cmp INC_X, #1 - bne sdot_kernel_S_BEGIN - - cmp INC_Y, #1 - bne sdot_kernel_S_BEGIN - -sdot_kernel_F_BEGIN: - - asrs I, N, #2 // I = N / 4 - ble sdot_kernel_F1 - -sdot_kernel_F4: - - KERNEL_F4 - - subs I, I, #1 - bne sdot_kernel_F4 - -sdot_kernel_F1: - - ands I, N, #3 - ble sdot_kernel_L999 - -sdot_kernel_F10: - - KERNEL_F1 - - subs I, I, #1 - bne sdot_kernel_F10 - - b sdot_kernel_L999 - -sdot_kernel_S_BEGIN: - - lsl INC_X, INC_X, #2 // INC_X * SIZE - lsl INC_Y, INC_Y, #2 // INC_Y * SIZE - - asrs I, N, #2 // I = N / 4 - ble sdot_kernel_S1 - -sdot_kernel_S4: - - KERNEL_S4 - - subs I, I, #1 - bne sdot_kernel_S4 - -sdot_kernel_S1: - - ands I, N, #3 - ble sdot_kernel_L999 - -sdot_kernel_S10: - - KERNEL_S1 - - subs I, I, #1 - bne sdot_kernel_S10 - - - - - - -sdot_kernel_L999: - - sub r3, fp, #128 - vldm r3, { s8 - s15} // restore floating point registers - -#if defined(DSDOT) - - vadd.f64 d0 , d0, d1 // set return value - -#else - - vadd.f32 s0 , s0, s1 // set return value - -#endif - sub sp, fp, #24 - pop {r4 - r9, fp} - bx lr - - EPILOGUE - From 7f210587f0e3cf3d25a346d94041b68fb39be40f Mon Sep 17 00:00:00 2001 From: wernsaar Date: Fri, 22 Nov 2013 00:20:25 +0100 Subject: [PATCH 073/127] renamed some ncopy and tcopy files --- kernel/arm/KERNEL.ARMV6 | 18 +++++++++--------- kernel/arm/KERNEL.ARMV7 | 18 +++++++++--------- ...emm_ncopy_2_vfpv3.S => cgemm_ncopy_2_vfp.S} | 0 ...emm_tcopy_2_vfpv3.S => cgemm_tcopy_2_vfp.S} | 0 ...emm_ncopy_4_vfpv3.S => dgemm_ncopy_4_vfp.S} | 0 ...emm_tcopy_4_vfpv3.S => dgemm_tcopy_4_vfp.S} | 0 ...emm_ncopy_4_vfpv3.S => sgemm_ncopy_4_vfp.S} | 0 ...emm_tcopy_4_vfpv3.S => sgemm_tcopy_4_vfp.S} | 0 ...emm_ncopy_2_vfpv3.S => zgemm_ncopy_2_vfp.S} | 0 ...emm_tcopy_2_vfpv3.S => zgemm_tcopy_2_vfp.S} | 0 10 files changed, 18 insertions(+), 18 deletions(-) rename kernel/arm/{cgemm_ncopy_2_vfpv3.S => cgemm_ncopy_2_vfp.S} (100%) rename kernel/arm/{cgemm_tcopy_2_vfpv3.S => cgemm_tcopy_2_vfp.S} (100%) rename kernel/arm/{dgemm_ncopy_4_vfpv3.S => dgemm_ncopy_4_vfp.S} (100%) rename kernel/arm/{dgemm_tcopy_4_vfpv3.S => dgemm_tcopy_4_vfp.S} (100%) rename kernel/arm/{sgemm_ncopy_4_vfpv3.S => sgemm_ncopy_4_vfp.S} (100%) rename kernel/arm/{sgemm_tcopy_4_vfpv3.S => sgemm_tcopy_4_vfp.S} (100%) rename kernel/arm/{zgemm_ncopy_2_vfpv3.S => zgemm_ncopy_2_vfp.S} (100%) rename kernel/arm/{zgemm_tcopy_2_vfpv3.S => zgemm_tcopy_2_vfp.S} (100%) diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 
75f7a9491..17a4f9257 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -92,23 +92,23 @@ SGEMMOTCOPY = ../generic/gemm_tcopy_2.c SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o -DGEMMKERNEL = ../generic/gemmkernel_2x2.c +DGEMMKERNEL = ../generic/gemmkernel_2x2.c DGEMMONCOPY = ../generic/gemm_ncopy_2.c DGEMMOTCOPY = ../generic/gemm_tcopy_2.c DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o CGEMMKERNEL = ../generic/zgemmkernel_2x2.c -CGEMMONCOPY = ../generic/zgemm_ncopy_2.c -CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -CGEMMONCOPYOBJ = cgemm_oncopy.o -CGEMMOTCOPYOBJ = cgemm_otcopy.o +CGEMMONCOPY = cgemm_ncopy_2_vfp.S +CGEMMOTCOPY = cgemm_tcopy_2_vfp.S +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c -ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -ZGEMMONCOPYOBJ = zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o +ZGEMMONCOPY = zgemm_ncopy_2_vfp.S +ZGEMMOTCOPY = zgemm_tcopy_2_vfp.S +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index bbd7e89dd..07d38d189 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -50,7 +50,7 @@ DCOPYKERNEL = dcopy_vfp.S CCOPYKERNEL = ccopy_vfp.S ZCOPYKERNEL = zcopy_vfp.S -SDOTKERNEL = sdot_vfpv3.S +SDOTKERNEL = sdot_vfp.S DDOTKERNEL = ddot_vfp.S CDOTKERNEL = cdot_vfp.S ZDOTKERNEL = zdot_vfp.S @@ -89,8 +89,8 @@ ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S SGEMMKERNEL = sgemm_kernel_4x4_vfpv3.S SGEMMINCOPY = SGEMMITCOPY = -SGEMMONCOPY = sgemm_ncopy_4_vfpv3.S -SGEMMOTCOPY = sgemm_tcopy_4_vfpv3.S +SGEMMONCOPY = sgemm_ncopy_4_vfp.S +SGEMMOTCOPY = sgemm_tcopy_4_vfp.S SGEMMINCOPYOBJ = SGEMMITCOPYOBJ = SGEMMONCOPYOBJ = sgemm_oncopy.o @@ -99,22 +99,22 @@ SGEMMOTCOPYOBJ = sgemm_otcopy.o DGEMMKERNEL = dgemm_kernel_4x4_vfpv3.S DGEMMINCOPY = DGEMMITCOPY = -DGEMMONCOPY = dgemm_ncopy_4_vfpv3.S -DGEMMOTCOPY = dgemm_tcopy_4_vfpv3.S +DGEMMONCOPY = dgemm_ncopy_4_vfp.S +DGEMMOTCOPY = dgemm_tcopy_4_vfp.S DGEMMINCOPYOBJ = DGEMMITCOPYOBJ = DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o CGEMMKERNEL = cgemm_kernel_2x2_vfpv3.S -CGEMMONCOPY = cgemm_ncopy_2_vfpv3.S -CGEMMOTCOPY = cgemm_tcopy_2_vfpv3.S +CGEMMONCOPY = cgemm_ncopy_2_vfp.S +CGEMMOTCOPY = cgemm_tcopy_2_vfp.S CGEMMONCOPYOBJ = cgemm_oncopy.o CGEMMOTCOPYOBJ = cgemm_otcopy.o ZGEMMKERNEL = zgemm_kernel_2x2_vfpv3.S -ZGEMMONCOPY = zgemm_ncopy_2_vfpv3.S -ZGEMMOTCOPY = zgemm_tcopy_2_vfpv3.S +ZGEMMONCOPY = zgemm_ncopy_2_vfp.S +ZGEMMOTCOPY = zgemm_tcopy_2_vfp.S ZGEMMONCOPYOBJ = zgemm_oncopy.o ZGEMMOTCOPYOBJ = zgemm_otcopy.o diff --git a/kernel/arm/cgemm_ncopy_2_vfpv3.S b/kernel/arm/cgemm_ncopy_2_vfp.S similarity index 100% rename from kernel/arm/cgemm_ncopy_2_vfpv3.S rename to kernel/arm/cgemm_ncopy_2_vfp.S diff --git a/kernel/arm/cgemm_tcopy_2_vfpv3.S b/kernel/arm/cgemm_tcopy_2_vfp.S similarity index 100% rename from kernel/arm/cgemm_tcopy_2_vfpv3.S rename to kernel/arm/cgemm_tcopy_2_vfp.S diff --git a/kernel/arm/dgemm_ncopy_4_vfpv3.S b/kernel/arm/dgemm_ncopy_4_vfp.S similarity index 100% rename from kernel/arm/dgemm_ncopy_4_vfpv3.S rename to kernel/arm/dgemm_ncopy_4_vfp.S diff --git a/kernel/arm/dgemm_tcopy_4_vfpv3.S b/kernel/arm/dgemm_tcopy_4_vfp.S similarity index 100% rename from kernel/arm/dgemm_tcopy_4_vfpv3.S rename to kernel/arm/dgemm_tcopy_4_vfp.S diff --git a/kernel/arm/sgemm_ncopy_4_vfpv3.S 
b/kernel/arm/sgemm_ncopy_4_vfp.S similarity index 100% rename from kernel/arm/sgemm_ncopy_4_vfpv3.S rename to kernel/arm/sgemm_ncopy_4_vfp.S diff --git a/kernel/arm/sgemm_tcopy_4_vfpv3.S b/kernel/arm/sgemm_tcopy_4_vfp.S similarity index 100% rename from kernel/arm/sgemm_tcopy_4_vfpv3.S rename to kernel/arm/sgemm_tcopy_4_vfp.S diff --git a/kernel/arm/zgemm_ncopy_2_vfpv3.S b/kernel/arm/zgemm_ncopy_2_vfp.S similarity index 100% rename from kernel/arm/zgemm_ncopy_2_vfpv3.S rename to kernel/arm/zgemm_ncopy_2_vfp.S diff --git a/kernel/arm/zgemm_tcopy_2_vfpv3.S b/kernel/arm/zgemm_tcopy_2_vfp.S similarity index 100% rename from kernel/arm/zgemm_tcopy_2_vfpv3.S rename to kernel/arm/zgemm_tcopy_2_vfp.S From 9a0f9789296e764c5fe92c614c7cc9d655061bb3 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Fri, 22 Nov 2013 17:21:10 +0100 Subject: [PATCH 074/127] added nrm2 kernel for ARMV6 --- kernel/arm/KERNEL.ARMV6 | 9 +- kernel/arm/nrm2_vfp.S | 565 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 569 insertions(+), 5 deletions(-) create mode 100644 kernel/arm/nrm2_vfp.S diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 17a4f9257..03eff6ec5 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -50,11 +50,10 @@ DDOTKERNEL = ddot_vfp.S CDOTKERNEL = cdot_vfp.S ZDOTKERNEL = zdot_vfp.S - -SNRM2KERNEL = nrm2.c -DNRM2KERNEL = nrm2.c -CNRM2KERNEL = znrm2.c -ZNRM2KERNEL = znrm2.c +SNRM2KERNEL = nrm2_vfp.S +DNRM2KERNEL = nrm2_vfp.S +CNRM2KERNEL = nrm2_vfp.S +ZNRM2KERNEL = nrm2_vfp.S SROTKERNEL = rot_vfp.S DROTKERNEL = rot_vfp.S diff --git a/kernel/arm/nrm2_vfp.S b/kernel/arm/nrm2_vfp.S new file mode 100644 index 000000000..4c62917b9 --- /dev/null +++ b/kernel/arm/nrm2_vfp.S @@ -0,0 +1,565 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
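
Note that all four NRM2 entries in KERNEL.ARMV6 now point at this single source file; the build compiles it once per function with different preprocessor defines, and the macro section below branches on them. A summary sketch of that dispatch (not code from the patch):

#if !defined(COMPLEX) && !defined(DOUBLE)
    /* snrm2: fldmias, vcmpe.f32, ... */
#elif !defined(COMPLEX) && defined(DOUBLE)
    /* dnrm2: fldmiad, vcmpe.f64, ... */
#elif defined(COMPLEX) && !defined(DOUBLE)
    /* cnrm2: two f32 updates per element */
#else
    /* znrm2: two f64 updates per element */
#endif
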
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/22 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define N r0 +#define X r1 +#define INC_X r2 + +#define I r12 + +#define X_PRE 512 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + + +#if !defined(COMPLEX) + +#if defined(DOUBLE) + + +.macro KERNEL_F1 + + fldmiad X!, { d4 } + vcmpe.f64 d4, d6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_F1_NEXT_\@ + vabs.f64 d4, d4 + vcmpe.f64 d0, d4 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale + vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_F1_NEXT_\@ + vdiv.f64 d2 , d0, d4 // scale / x + vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) + vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f64 d0 , d4 // scale = x + +KERNEL_F1_NEXT_\@: + +.endm + +.macro KERNEL_F8 + + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + +.endm + +.macro KERNEL_S1 + + fldmiad X, { d4 } + vcmpe.f64 d4, d6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_S1_NEXT + vabs.f64 d4, d4 + vcmpe.f64 d0, d4 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale + vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_S1_NEXT + vdiv.f64 d2 , d0, d4 // scale / x + vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) + vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f64 d0 , d4 // scale = x + +KERNEL_S1_NEXT: + + add X, X, INC_X + +.endm + +#else + +.macro KERNEL_F1 + + fldmias X!, { s4 } + vcmpe.f32 s4, s6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_F1_NEXT_\@ + vabs.f32 s4, s4 + vcmpe.f32 s0, s4 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale + vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_F1_NEXT_\@ + vdiv.f32 s2 , s0, s4 // scale / x + vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) + vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f32 s0 , s4 // scale = x + +KERNEL_F1_NEXT_\@: + +.endm + +.macro KERNEL_F8 + + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + +.endm + +.macro KERNEL_S1 + + fldmias X, { s4 } + vcmpe.f32 s4, s6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_S1_NEXT + vabs.f32 s4, s4 + vcmpe.f32 s0, s4 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f32 s2 , s4, s0 // scale >= x ? 
x / scale + vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_S1_NEXT + vdiv.f32 s2 , s0, s4 // scale / x + vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) + vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f32 s0 , s4 // scale = x + +KERNEL_S1_NEXT: + + add X, X, INC_X + +.endm + + + + +#endif + +#else + +#if defined(DOUBLE) + +.macro KERNEL_F1 + + fldmiad X!, { d4 - d5 } + + vcmpe.f64 d4, d6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_F1_NEXT_\@ + vabs.f64 d4, d4 + vcmpe.f64 d0, d4 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale + vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_F1_NEXT_\@ + vdiv.f64 d2 , d0, d4 // scale / x + vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) + vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f64 d0 , d4 // scale = x + +KERNEL_F1_NEXT_\@: + + vcmpe.f64 d5, d6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_F1_END_\@ + vabs.f64 d5, d5 + vcmpe.f64 d0, d5 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f64 d2 , d5, d0 // scale >= x ? x / scale + vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_F1_END_\@ + vdiv.f64 d2 , d0, d5 // scale / x + vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) + vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f64 d0 , d5 // scale = x + +KERNEL_F1_END_\@: + + +.endm + +.macro KERNEL_F8 + + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + +.endm + +.macro KERNEL_S1 + + fldmiad X, { d4 - d5 } + + vcmpe.f64 d4, d6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_S1_NEXT_\@ + vabs.f64 d4, d4 + vcmpe.f64 d0, d4 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale + vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_S1_NEXT_\@ + vdiv.f64 d2 , d0, d4 // scale / x + vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) + vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f64 d0 , d4 // scale = x + +KERNEL_S1_NEXT_\@: + + vcmpe.f64 d5, d6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_S1_END_\@ + vabs.f64 d5, d5 + vcmpe.f64 d0, d5 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f64 d2 , d5, d0 // scale >= x ? x / scale + vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_S1_END_\@ + vdiv.f64 d2 , d0, d5 // scale / x + vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) + vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f64 d0 , d5 // scale = x + +KERNEL_S1_END_\@: + + add X, X, INC_X + +.endm + + +#else + +.macro KERNEL_F1 + + fldmias X!, { s4 - s5 } + + vcmpe.f32 s4, s6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_F1_NEXT_\@ + vabs.f32 s4, s4 + vcmpe.f32 s0, s4 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f32 s2 , s4, s0 // scale >= x ? 
x / scale + vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_F1_NEXT_\@ + vdiv.f32 s2 , s0, s4 // scale / x + vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) + vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f32 s0 , s4 // scale = x + +KERNEL_F1_NEXT_\@: + + vcmpe.f32 s5, s6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_F1_END_\@ + vabs.f32 s5, s5 + vcmpe.f32 s0, s5 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f32 s2 , s5, s0 // scale >= x ? x / scale + vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_F1_END_\@ + vdiv.f32 s2 , s0, s5 // scale / x + vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) + vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f32 s0 , s5 // scale = x + +KERNEL_F1_END_\@: + + +.endm + +.macro KERNEL_F8 + + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + +.endm + +.macro KERNEL_S1 + + fldmias X, { s4 - s5 } + + vcmpe.f32 s4, s6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_S1_NEXT_\@ + vabs.f32 s4, s4 + vcmpe.f32 s0, s4 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale + vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_S1_NEXT_\@ + vdiv.f32 s2 , s0, s4 // scale / x + vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) + vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f32 s0 , s4 // scale = x + +KERNEL_S1_NEXT_\@: + + vcmpe.f32 s5, s6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_S1_END_\@ + vabs.f32 s5, s5 + vcmpe.f32 s0, s5 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f32 s2 , s5, s0 // scale >= x ? 
x / scale + vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_S1_END_\@ + vdiv.f32 s2 , s0, s5 // scale / x + vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) + vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f32 s0 , s5 // scale = x + +KERNEL_S1_END_\@: + + add X, X, INC_X + +.endm + + + +#endif + +#endif + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + b nrm2_begin + + +#if defined(COMPLEX) + +#if defined(DOUBLE) + +znrm2_one: + .word 0x00000000 + .word 0x3ff00000 + +#else + +cnrm2_one: + .word 0x3f800000 + +#endif + +#else + +#if defined(DOUBLE) + +dnrm2_one: + .word 0x00000000 + .word 0x3ff00000 + +#else + +snrm2_one: + .word 0x3f800000 + +#endif + +#endif + + + .align 5 + + +nrm2_begin: + +#if defined(COMPLEX) + +#if defined(DOUBLE) + vsub.f64 d0 , d0 , d0 // scale=0.0 + vldr.64 d1 , znrm2_one // ssq=1.0 + vmov.f64 d7 , d1 // value 1.0 + vmov.f64 d6 , d0 // value 0.0 +#else + vsub.f32 s0 , s0 , s0 // scale=0.0 + vldr.32 s1 , cnrm2_one // ssq=1.0 + vmov.f32 s7 , s1 // value 1.0 + vmov.f32 s6 , s0 // value 0.0 +#endif + +#else + +#if defined(DOUBLE) + vsub.f64 d0 , d0 , d0 // scale=0.0 + vldr.64 d1 , dnrm2_one // ssq=1.0 + vmov.f64 d7 , d1 // value 1.0 + vmov.f64 d6 , d0 // value 0.0 +#else + vsub.f32 s0 , s0 , s0 // scale=0.0 + vldr.32 s1 , snrm2_one // ssq=1.0 + vmov.f32 s7 , s1 // value 1.0 + vmov.f32 s6 , s0 // value 0.0 +#endif + + +#endif + + + cmp N, #0 + ble nrm2_kernel_L999 + + cmp INC_X, #0 + beq nrm2_kernel_L999 + + + cmp INC_X, #1 + bne nrm2_kernel_S_BEGIN + + +nrm2_kernel_F_BEGIN: + + asrs I, N, #3 // I = N / 8 + ble nrm2_kernel_F1 + +nrm2_kernel_F8: + + KERNEL_F8 + + subs I, I, #1 + bne nrm2_kernel_F8 + +nrm2_kernel_F1: + + ands I, N, #7 + ble nrm2_kernel_L999 + + +nrm2_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne nrm2_kernel_F10 + + b nrm2_kernel_L999 + +nrm2_kernel_S_BEGIN: + +#if defined(COMPLEX) + +#if defined(DOUBLE) + lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 +#else + lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 +#endif + +#else + +#if defined(DOUBLE) + lsl INC_X, INC_X, #3 // INC_X * SIZE +#else + lsl INC_X, INC_X, #2 // INC_X * SIZE +#endif + +#endif + + + +nrm2_kernel_S1: + + mov I, N + + .align 5 + +nrm2_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne nrm2_kernel_S10 + + +nrm2_kernel_L999: + +#if defined(DOUBLE) + vsqrt.f64 d1, d1 + vmul.f64 d0, d0, d1 +#else + vsqrt.f32 s1, s1 + vmul.f32 s0, s0, s1 +#endif + + bx lr + + EPILOGUE + From 33d3ab6e098e618a92a315e09a7a6e22a213ef87 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 23 Nov 2013 12:35:31 +0100 Subject: [PATCH 075/127] small optimizations for zgemv kernels --- kernel/arm/zgemv_n.c | 62 +++++++++++++++++++++++++++++++++----------- kernel/arm/zgemv_t.c | 53 +++++++++++++++++++++---------------- 2 files changed, 78 insertions(+), 37 deletions(-) diff --git a/kernel/arm/zgemv_n.c b/kernel/arm/zgemv_n.c index 5f00c34f6..dc2ffa0d2 100644 --- a/kernel/arm/zgemv_n.c +++ b/kernel/arm/zgemv_n.c @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
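
The nrm2 kernel above keeps the running maximum in d0/s0 ("scale") and the scaled sum of squares in d1/s1 ("ssq", started at 1.0), with 0.0 held in d6/s6 and 1.0 in d7/s7, exactly as the inline comments describe; nrm2_kernel_L999 then returns scale * sqrt(ssq). A minimal C model of the same update for the real double case (a sketch, not code from the patch; the complex variants apply the identical update to the real and imaginary parts in turn):

#include <math.h>
#include <stddef.h>

double nrm2_ref(size_t n, const double *x, ptrdiff_t inc_x)
{
    double scale = 0.0, ssq = 1.0;
    for (size_t i = 0; i < n; i++) {
        double ax = fabs(x[i * inc_x]);
        if (ax == 0.0)
            continue;                       /* beq KERNEL_*_NEXT   */
        if (scale >= ax) {
            double t = ax / scale;          /* vdivge              */
            ssq += t * t;                   /* vmlage              */
        } else {
            double t = scale / ax;          /* vdiv                */
            ssq = 1.0 + ssq * t * t;        /* vmul, vmul, vadd    */
            scale = ax;                     /* vmov                */
        }
    }
    return scale * sqrt(ssq);               /* nrm2_kernel_L999    */
}
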
*****************************************************************************/ /************************************************************************************** - * * 2013/09/15 Saar + * * 2013/11/23 Saar * * BLASTEST float : OK * * BLASTEST double : OK * CTEST : OK @@ -48,20 +48,17 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, BLASLONG lda2; BLASLONG i2; - if( alpha_r == 0.0 && alpha_i == 0.0 ) return(0); - lda2 = 2*lda; - inc_x2 = 2 * inc_x; - inc_y2 = 2 * inc_y; - ix = 0; a_ptr = a; -#if !defined(CONJ) - for (j=0; j Date: Sat, 23 Nov 2013 14:35:19 +0100 Subject: [PATCH 076/127] fixed bug in SAVE macros, that are not found by any test routine --- kernel/arm/dgemm_kernel_4x4_vfpv3.S | 4 ++-- kernel/arm/dtrmm_kernel_4x4_vfpv3.S | 4 ++-- kernel/arm/sgemm_kernel_4x4_vfpv3.S | 4 ++-- kernel/arm/strmm_kernel_4x4_vfpv3.S | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/arm/dgemm_kernel_4x4_vfpv3.S b/kernel/arm/dgemm_kernel_4x4_vfpv3.S index ed7f611f1..3b6af19a3 100644 --- a/kernel/arm/dgemm_kernel_4x4_vfpv3.S +++ b/kernel/arm/dgemm_kernel_4x4_vfpv3.S @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2013/10/11 Saar +* 2013/11/23 Saar * BLASTEST : OK * CTEST : OK * TEST : OK @@ -714,7 +714,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmacd d12, d0 , d20 fstd d12, [CO2] - add CO1, CO1, #16 + add CO1, CO1, #8 .endm diff --git a/kernel/arm/dtrmm_kernel_4x4_vfpv3.S b/kernel/arm/dtrmm_kernel_4x4_vfpv3.S index a80177f8b..0f8a9291a 100644 --- a/kernel/arm/dtrmm_kernel_4x4_vfpv3.S +++ b/kernel/arm/dtrmm_kernel_4x4_vfpv3.S @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2013/10/05 Saar +* 2013/11/23 Saar * BLASTEST : OK * CTEST : OK * TEST : OK @@ -690,7 +690,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmuld d12, d0 , d20 fstd d12, [CO2] - add CO1, CO1, #16 + add CO1, CO1, #8 .endm diff --git a/kernel/arm/sgemm_kernel_4x4_vfpv3.S b/kernel/arm/sgemm_kernel_4x4_vfpv3.S index 4031c28db..38dc4d3ea 100644 --- a/kernel/arm/sgemm_kernel_4x4_vfpv3.S +++ b/kernel/arm/sgemm_kernel_4x4_vfpv3.S @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2013/11/05 Saar +* 2013/11/23 Saar * BLASTEST : OK * CTEST : OK * TEST : OK @@ -687,7 +687,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmacs s12, s0 , s20 fsts s12, [CO2] - add CO1, CO1, #8 + add CO1, CO1, #4 .endm diff --git a/kernel/arm/strmm_kernel_4x4_vfpv3.S b/kernel/arm/strmm_kernel_4x4_vfpv3.S index 15c866856..3a0c8af87 100644 --- a/kernel/arm/strmm_kernel_4x4_vfpv3.S +++ b/kernel/arm/strmm_kernel_4x4_vfpv3.S @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
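
The hunks above (and the matching strmm hunk just below) all fix the same slip: in the one-element tail of the two-column SAVE path — by the look of the stores, the SAVE1x2-style macro writing through CO1 and CO2 — the CO1 pointer was advanced by two elements instead of one. An illustrative model of the corrected bookkeeping (not code from the patch):

#include <stddef.h>

/* After storing an m_block-wide strip, the column pointer moves by
   m_block elements; here m_block == 1, i.e. 8 bytes for double and
   4 for float, where the old code used 16 and 8 respectively. */
static double *advance_after_save(double *co1, ptrdiff_t m_block)
{
    return co1 + m_block;   /* C pointer arithmetic scales by sizeof */
}
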
*****************************************************************************/ /************************************************************************************** -* 2013/10/14 Saar +* 2013/11/23 Saar * BLASTEST : OK * CTEST : OK * TEST : OK @@ -627,7 +627,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmuls s12, s0 , s20 fsts s12, [CO2] - add CO1, CO1, #8 + add CO1, CO1, #4 .endm From 8776a73773a0cb3a0daf993120a5081cc5cac42c Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 23 Nov 2013 16:24:52 +0100 Subject: [PATCH 077/127] added optimized dgemm and dtrmm kernel for ARMV6 --- Makefile.rule | 2 +- kernel/arm/KERNEL.ARMV6 | 8 +- kernel/arm/dgemm_kernel_4x2_vfp.S | 797 +++++++++++++++++++++ kernel/arm/dtrmm_kernel_4x2_vfp.S | 1083 +++++++++++++++++++++++++++++ param.h | 2 +- 5 files changed, 1888 insertions(+), 4 deletions(-) create mode 100644 kernel/arm/dgemm_kernel_4x2_vfp.S create mode 100644 kernel/arm/dtrmm_kernel_4x2_vfp.S diff --git a/Makefile.rule b/Makefile.rule index 534f4d1a2..c8433288b 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -12,7 +12,7 @@ VERSION = 0.2.8 # You can specify the target architecture, otherwise it's # automatically detected. -TARGET = ARMV7 +TARGET = ARMV6 # If you want to support multiple architecture in one binary # DYNAMIC_ARCH = 1 diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 03eff6ec5..7a58fecd3 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -81,7 +81,7 @@ CGEMVTKERNEL = zgemv_t.c ZGEMVTKERNEL = zgemv_t.c STRMMKERNEL = ../generic/trmmkernel_2x2.c -DTRMMKERNEL = ../generic/trmmkernel_2x2.c +DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c @@ -91,7 +91,11 @@ SGEMMOTCOPY = ../generic/gemm_tcopy_2.c SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o -DGEMMKERNEL = ../generic/gemmkernel_2x2.c +DGEMMKERNEL = dgemm_kernel_4x2_vfp.S +DGEMMINCOPY = ../generic/gemm_ncopy_4.c +DGEMMITCOPY = ../generic/gemm_tcopy_4.c +DGEMMINCOPYOBJ = dgemm_incopy.o +DGEMMITCOPYOBJ = dgemm_itcopy.o DGEMMONCOPY = ../generic/gemm_ncopy_2.c DGEMMOTCOPY = ../generic/gemm_tcopy_2.c DGEMMONCOPYOBJ = dgemm_oncopy.o diff --git a/kernel/arm/dgemm_kernel_4x2_vfp.S b/kernel/arm/dgemm_kernel_4x2_vfp.S new file mode 100644 index 000000000..56fd81513 --- /dev/null +++ b/kernel/arm/dgemm_kernel_4x2_vfp.S @@ -0,0 +1,797 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
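
The KERNEL.ARMV6 hunk above pairs the new 4x2 kernel with gemm_ncopy_4/gemm_tcopy_4 for A while keeping the 2-wide copies for B. A sketch of the packed-A panel layout this implies, inferred from how KERNEL4x2_SUB below consumes four consecutive doubles of AO and two of BO per k step (illustrative, not code from the patch):

static void pack_a_panel_4(long k, const double *a, long lda, double *ao)
{
    for (long l = 0; l < k; l++)       /* one k step per panel column */
        for (int i = 0; i < 4; i++)
            *ao++ = a[i + l * lda];    /* rows i..i+3 of column l     */
}
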
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/23 Saar +* BLASTEST : xOK +* CTEST : xOK +* TEST : xOK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA d0 + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] +#define A [fp, #-268 ] + +#define ALPHA [fp, #-280] + +#define B [fp, #4 ] +#define C [fp, #8 ] +#define OLD_LDC [fp, #12 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 + +#define A_PRE 96 +#define B_PRE 96 +#define C_PRE 64 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT4x2 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9, d8 + vmov.f64 d10, d8 + vmov.f64 d11, d8 + vmov.f64 d12, d8 + vmov.f64 d13, d8 + vmov.f64 d14, d8 + vmov.f64 d15, d8 + +.endm + + + +.macro KERNEL4x2_SUB + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fmacd d8 , d0, d4 + fmacd d9 , d1, d4 + fmacd d10 , d2, d4 + fmacd d11 , d3, d4 + + fmacd d12 , d0, d5 + fmacd d13 , d1, d5 + fmacd d14 , d2, d5 + fmacd d15 , d3, d5 + + add AO , AO, #32 + add BO , BO, #16 + +.endm + +.macro SAVE4x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + fldd d0, ALPHA + + fldd d4 , [CO1] + fldd d5 , [CO1, #8 ] + fldd d6 , [CO1, #16 ] + fldd d7 , [CO1, #24 ] + + fmacd d4 , d0 , d8 + fmacd d5 , d0 , d9 + fmacd d6 , d0 , d10 + fmacd d7 , d0 , d11 + + fstd d4 , [CO1] + fstd d5 , [CO1, #8 ] + fstd d6 , [CO1, #16 ] + fstd d7 , [CO1, #24 ] + + fldd d4 , [CO2] + fldd d5 , [CO2, #8 ] + fldd d6 , [CO2, #16 ] + fldd d7 , [CO2, #24 ] + + fmacd d4 , d0 , d12 + fmacd d5 , d0 , d13 + fmacd d6 , d0 , d14 + fmacd d7 , d0 , d15 + + fstd d4 , [CO2] + fstd d5 , [CO2, #8 ] + fstd d6 , [CO2, #16 ] + fstd d7 , [CO2, #24 ] + + add CO1, CO1, #32 + +.endm + + +/******************************************************************************/ + +.macro INIT2x2 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9, d8 + vmov.f64 d12, d8 + vmov.f64 d13, d8 + +.endm + +.macro KERNEL2x2_SUB + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fmacd d8 , d0, d4 + fmacd d9 , d1, d4 + + fmacd d12 , d0, d5 + fmacd d13 , d1, d5 + + add AO , AO, #16 + add BO , BO, #16 + +.endm + +.macro SAVE2x2 + + ldr r3 , LDC + add CO2 , 
CO1, r3 + + fldd d0, ALPHA + + fldd d4 , [CO1] + fldd d5 , [CO1, #8 ] + + fmacd d4 , d0 , d8 + fmacd d5 , d0 , d9 + + fstd d4 , [CO1] + fstd d5 , [CO1, #8 ] + + fldd d4 , [CO2] + fldd d5 , [CO2, #8 ] + + fmacd d4 , d0 , d12 + fmacd d5 , d0 , d13 + + fstd d4 , [CO2] + fstd d5 , [CO2, #8 ] + + add CO1, CO1, #16 + +.endm + + +/******************************************************************************/ + +.macro INIT1x2 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d12, d8 + +.endm + +.macro KERNEL1x2_SUB + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fldd d0 , [ AO ] + + fmacd d8 , d0, d4 + + fmacd d12 , d0, d5 + + add AO , AO, #8 + add BO , BO, #16 + +.endm + +.macro SAVE1x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + fldd d0, ALPHA + + fldd d4 , [CO1] + + fmacd d4 , d0 , d8 + + fstd d4 , [CO1] + + fldd d4 , [CO2] + + fmacd d4 , d0 , d12 + + fstd d4 , [CO2] + + add CO1, CO1, #8 + +.endm + + + +/******************************************************************************/ + +.macro INIT4x1 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9, d8 + vmov.f64 d10, d8 + vmov.f64 d11, d8 + +.endm + + + +.macro KERNEL4x1_SUB + + fldd d4 , [ BO ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fmacd d8 , d0, d4 + fmacd d9 , d1, d4 + fmacd d10 , d2, d4 + fmacd d11 , d3, d4 + + add AO , AO, #32 + add BO , BO, #8 + +.endm + +.macro SAVE4x1 + + fldd d0, ALPHA + + fldd d4 , [CO1] + fldd d5 , [CO1, #8 ] + fldd d6 , [CO1, #16 ] + fldd d7 , [CO1, #24 ] + + fmacd d4 , d0 , d8 + fmacd d5 , d0 , d9 + fmacd d6 , d0 , d10 + fmacd d7 , d0 , d11 + + fstd d4 , [CO1] + fstd d5 , [CO1, #8 ] + fstd d6 , [CO1, #16 ] + fstd d7 , [CO1, #24 ] + + add CO1, CO1, #32 + +.endm + +/******************************************************************************/ + +.macro INIT2x1 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9 , d8 + +.endm + +.macro KERNEL2x1_SUB + + fldd d4 , [ BO ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fmacd d8 , d0, d4 + fmacd d9 , d1, d4 + + add AO , AO, #16 + add BO , BO, #8 + +.endm + +.macro SAVE2x1 + + fldd d0, ALPHA + + fldd d4 , [CO1] + fldd d5 , [CO1, #8 ] + + fmacd d4 , d0 , d8 + fmacd d5 , d0 , d9 + + fstd d4 , [CO1] + fstd d5 , [CO1, #8 ] + + add CO1, CO1, #16 + +.endm + + +/******************************************************************************/ + +.macro INIT1x1 + + vsub.f64 d8 , d8 , d8 + +.endm + +.macro KERNEL1x1_SUB + + fldd d4 , [ BO ] + + fldd d0 , [ AO ] + + fmacd d8 , d0, d4 + + add AO , AO, #8 + add BO , BO, #8 + +.endm + +.macro SAVE1x1 + + fldd d0, ALPHA + + fldd d4 , [CO1] + + fmacd d4 , d0 , d8 + + fstd d4 , [CO1] + + add CO1, CO1, #8 + +.endm + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA, ALPHA + + sub r3, fp, #128 + vstm r3, { d8 - d15} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #3 // ldc = ldc * 8 + str r3, LDC + + ldr K1, K + ldr BC, B + + ldr J, N + asrs J, J, #1 // J = J / 2 + ble dgemm_kernel_L1_BEGIN + + +/*********************************************************************************************/ + +dgemm_kernel_L2_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + 
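
In C terms, the macro trio above behaves as follows: INIT4x2 zeroes the eight accumulators d8-d15, each KERNEL4x2_SUB adds one rank-1 update of a 4x2 block, and SAVE4x2 folds the result into two columns of C scaled by alpha. A reference sketch (illustrative, not code from the patch); the labels below unroll the k loop by eight with a scalar remainder loop:

static void dgemm_4x2_ref(long k, double alpha,
                          const double *ao, const double *bo,
                          double *co1, double *co2)
{
    double acc[2][4] = {{0.0}};                      /* INIT4x2       */
    for (long l = 0; l < k; l++)                     /* KERNEL4x2_SUB */
        for (int i = 0; i < 4; i++) {
            acc[0][i] += ao[4 * l + i] * bo[2 * l];      /* d8..d11  */
            acc[1][i] += ao[4 * l + i] * bo[2 * l + 1];  /* d12..d15 */
        }
    for (int i = 0; i < 4; i++) {                    /* SAVE4x2       */
        co1[i] += alpha * acc[0][i];
        co2[i] += alpha * acc[1][i];
    }
}
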
+dgemm_kernel_L2_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble dgemm_kernel_L2_M2_BEGIN + +dgemm_kernel_L2_M4_20: + + INIT4x2 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble dgemm_kernel_L2_M4_40 + .align 5 + +dgemm_kernel_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs L, L, #1 + bgt dgemm_kernel_L2_M4_22 + + +dgemm_kernel_L2_M4_40: + + ands L , K1, #7 // L = L % 8 + ble dgemm_kernel_L2_M4_100 + +dgemm_kernel_L2_M4_42: + + KERNEL4x2_SUB + + subs L, L, #1 + bgt dgemm_kernel_L2_M4_42 + +dgemm_kernel_L2_M4_100: + + SAVE4x2 + +dgemm_kernel_L2_M4_END: + + subs I, I, #1 + bgt dgemm_kernel_L2_M4_20 + + +dgemm_kernel_L2_M2_BEGIN: + + ldr I, M + tst I , #3 + ble dgemm_kernel_L2_END + + tst I, #2 // I = I / 2 + ble dgemm_kernel_L2_M1_BEGIN + +dgemm_kernel_L2_M2_20: + + INIT2x2 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble dgemm_kernel_L2_M2_40 + +dgemm_kernel_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs L, L, #1 + bgt dgemm_kernel_L2_M2_22 + + +dgemm_kernel_L2_M2_40: + + ands L , K1, #7 // L = L % 8 + ble dgemm_kernel_L2_M2_100 + +dgemm_kernel_L2_M2_42: + + KERNEL2x2_SUB + + subs L, L, #1 + bgt dgemm_kernel_L2_M2_42 + +dgemm_kernel_L2_M2_100: + + SAVE2x2 + +dgemm_kernel_L2_M2_END: + + +dgemm_kernel_L2_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble dgemm_kernel_L2_END + +dgemm_kernel_L2_M1_20: + + INIT1x2 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble dgemm_kernel_L2_M1_40 + +dgemm_kernel_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs L, L, #1 + bgt dgemm_kernel_L2_M1_22 + + +dgemm_kernel_L2_M1_40: + + ands L , K1, #7 // L = L % 8 + ble dgemm_kernel_L2_M1_100 + +dgemm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs L, L, #1 + bgt dgemm_kernel_L2_M1_42 + +dgemm_kernel_L2_M1_100: + + SAVE1x2 + + +dgemm_kernel_L2_END: + + mov r3, BC + mov r4, K1 + lsl r4, r4, #4 // k * 2 * 8 + add r3, r3, r4 // B = B + K * 2 * 8 + mov BC, r3 + + subs J , #1 // j-- + bgt dgemm_kernel_L2_BEGIN + +/*********************************************************************************************/ + +dgemm_kernel_L1_BEGIN: + + ldr J , N + tst J , #1 + ble dgemm_kernel_L999 + + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + + + +dgemm_kernel_L1_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble dgemm_kernel_L1_M2_BEGIN + +dgemm_kernel_L1_M4_20: + + INIT4x1 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble dgemm_kernel_L1_M4_40 + .align 5 + +dgemm_kernel_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs L, L, #1 + bgt dgemm_kernel_L1_M4_22 + + +dgemm_kernel_L1_M4_40: + + ands L , K1, #7 // L = L % 8 + ble dgemm_kernel_L1_M4_100 + +dgemm_kernel_L1_M4_42: + + KERNEL4x1_SUB + + subs L, L, #1 + bgt dgemm_kernel_L1_M4_42 + +dgemm_kernel_L1_M4_100: + + SAVE4x1 + +dgemm_kernel_L1_M4_END: + + subs I, I, #1 + bgt dgemm_kernel_L1_M4_20 + + +dgemm_kernel_L1_M2_BEGIN: + + ldr I, M + tst I , #3 + ble dgemm_kernel_L1_END + + tst I, #2 // I = I / 2 + ble dgemm_kernel_L1_M1_BEGIN + +dgemm_kernel_L1_M2_20: + + INIT2x1 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble dgemm_kernel_L1_M2_40 + +dgemm_kernel_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + 
KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs L, L, #1 + bgt dgemm_kernel_L1_M2_22 + + +dgemm_kernel_L1_M2_40: + + ands L , K1, #7 // L = L % 8 + ble dgemm_kernel_L1_M2_100 + +dgemm_kernel_L1_M2_42: + + KERNEL2x1_SUB + + subs L, L, #1 + bgt dgemm_kernel_L1_M2_42 + +dgemm_kernel_L1_M2_100: + + SAVE2x1 + +dgemm_kernel_L1_M2_END: + + +dgemm_kernel_L1_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble dgemm_kernel_L1_END + +dgemm_kernel_L1_M1_20: + + INIT1x1 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble dgemm_kernel_L1_M1_40 + +dgemm_kernel_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs L, L, #1 + bgt dgemm_kernel_L1_M1_22 + + +dgemm_kernel_L1_M1_40: + + ands L , K1, #7 // L = L % 8 + ble dgemm_kernel_L1_M1_100 + +dgemm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs L, L, #1 + bgt dgemm_kernel_L1_M1_42 + +dgemm_kernel_L1_M1_100: + + SAVE1x1 + + +dgemm_kernel_L1_END: + + +dgemm_kernel_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/dtrmm_kernel_4x2_vfp.S b/kernel/arm/dtrmm_kernel_4x2_vfp.S new file mode 100644 index 000000000..55a017a97 --- /dev/null +++ b/kernel/arm/dtrmm_kernel_4x2_vfp.S @@ -0,0 +1,1083 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
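
Unlike the gemm kernel above, the trmm kernel introduced here writes C with fmuld (pure C = alpha*A*B, without reading the old C) and restricts each block's k range using the KK/KKK bookkeeping visible below. A model of that arithmetic (illustrative, not code from the patch; mr = 4 and nr = 2 for this kernel):

/* Number of k iterations for one block, mirroring the
   "sub L, L, r3" / "add L, L, #4" / "add L, L, #2" paths. */
static int trmm_block_k(int k, int kk, int left, int transa)
{
    if ((left && !transa) || (!left && transa))
        return k - kk;          /* K - KK             */
    return kk + (left ? 4 : 2); /* KK + mr or KK + nr */
}
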
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/23 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 252 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA d0 + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define KK [fp, #-240 ] +#define KKK [fp, #-244] +#define C [fp, #-248 ] +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] +#define A [fp, #-268 ] + +#define ALPHA [fp, #-276 ] + +#define B [fp, #4 ] +#define OLD_C [fp, #8 ] +#define OLD_LDC [fp, #12 ] +#define OFFSET [fp, #16 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 + +#define A_PRE 64 +#define B_PRE 64 +#define C_PRE 64 + + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT4x2 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9, d8 + vmov.f64 d10, d8 + vmov.f64 d11, d8 + vmov.f64 d12, d8 + vmov.f64 d13, d8 + vmov.f64 d14, d8 + vmov.f64 d15, d8 + +.endm + + + +.macro KERNEL4x2_SUB + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fmacd d8 , d0, d4 + fmacd d9 , d1, d4 + fmacd d10 , d2, d4 + fmacd d11 , d3, d4 + + fmacd d12 , d0, d5 + fmacd d13 , d1, d5 + fmacd d14 , d2, d5 + fmacd d15 , d3, d5 + + add AO , AO, #32 + add BO , BO, #16 + +.endm + +.macro SAVE4x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + fldd d0, ALPHA + + fmuld d4 , d0 , d8 + fmuld d5 , d0 , d9 + fmuld d6 , d0 , d10 + fmuld d7 , d0 , d11 + + fstd d4 , [CO1] + fstd d5 , [CO1, #8 ] + fstd d6 , [CO1, #16 ] + fstd d7 , [CO1, #24 ] + + fmuld d4 , d0 , d12 + fmuld d5 , d0 , d13 + fmuld d6 , d0 , d14 + fmuld d7 , d0 , d15 + + fstd d4 , [CO2] + fstd d5 , [CO2, #8 ] + fstd d6 , [CO2, #16 ] + fstd d7 , [CO2, #24 ] + + add CO1, CO1, #32 + +.endm + + +/******************************************************************************/ + +.macro INIT2x2 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9, d8 + vmov.f64 d12, d8 + vmov.f64 d13, d8 + +.endm + +.macro KERNEL2x2_SUB + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fmacd d8 , d0, d4 + fmacd d9 , d1, d4 + + fmacd d12 , d0, d5 + fmacd d13 , d1, d5 + + add AO , AO, #16 + add BO , BO, #16 + +.endm + +.macro SAVE2x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + fldd d0, ALPHA + + + fmuld d4 , d0 , d8 + fmuld d5 , d0 , d9 + + fstd d4 , [CO1] + fstd d5 , [CO1, #8 ] + + fmuld d4 , d0 , d12 + fmuld d5 , d0 , d13 + + fstd d4 , [CO2] + fstd d5 , [CO2, #8 ] + + add CO1, CO1, #16 + +.endm + + +/******************************************************************************/ + +.macro INIT1x2 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d12, d8 + +.endm + +.macro KERNEL1x2_SUB + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fldd d0 , [ AO ] + + fmacd d8 , d0, d4 + + fmacd d12 , d0, d5 + + add AO , AO, #8 + add BO , BO, #16 + +.endm + +.macro SAVE1x2 + + 
ldr r3 , LDC + add CO2 , CO1, r3 + + fldd d0, ALPHA + + + fmuld d4 , d0 , d8 + + fstd d4 , [CO1] + + + fmuld d4 , d0 , d12 + + fstd d4 , [CO2] + + add CO1, CO1, #8 + +.endm + + + +/******************************************************************************/ + +.macro INIT4x1 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9, d8 + vmov.f64 d10, d8 + vmov.f64 d11, d8 + +.endm + + + +.macro KERNEL4x1_SUB + + fldd d4 , [ BO ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fmacd d8 , d0, d4 + fmacd d9 , d1, d4 + fmacd d10 , d2, d4 + fmacd d11 , d3, d4 + + add AO , AO, #32 + add BO , BO, #8 + +.endm + +.macro SAVE4x1 + + fldd d0, ALPHA + + fmuld d4 , d0 , d8 + fmuld d5 , d0 , d9 + fmuld d6 , d0 , d10 + fmuld d7 , d0 , d11 + + fstd d4 , [CO1] + fstd d5 , [CO1, #8 ] + fstd d6 , [CO1, #16 ] + fstd d7 , [CO1, #24 ] + + add CO1, CO1, #32 + +.endm + +/******************************************************************************/ + +.macro INIT2x1 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9 , d8 + +.endm + +.macro KERNEL2x1_SUB + + fldd d4 , [ BO ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fmacd d8 , d0, d4 + fmacd d9 , d1, d4 + + add AO , AO, #16 + add BO , BO, #8 + +.endm + +.macro SAVE2x1 + + fldd d0, ALPHA + + fmuld d4 , d0 , d8 + fmuld d5 , d0 , d9 + + fstd d4 , [CO1] + fstd d5 , [CO1, #8 ] + + add CO1, CO1, #16 + +.endm + + +/******************************************************************************/ + +.macro INIT1x1 + + vsub.f64 d8 , d8 , d8 + +.endm + +.macro KERNEL1x1_SUB + + fldd d4 , [ BO ] + + fldd d0 , [ AO ] + + fmacd d8 , d0, d4 + + add AO , AO, #8 + add BO , BO, #8 + +.endm + +.macro SAVE1x1 + + fldd d0, ALPHA + + fmuld d4 , d0 , d8 + + fstd d4 , [CO1] + + add CO1, CO1, #8 + +.endm + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA, ALPHA + + sub r3, fp, #128 + vstm r3, { d8 - d15} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #3 // ldc = ldc * 8 + str r3, LDC + + ldr r3, OLD_C + str r3, C + + ldr BC, B + + ldr r3, OFFSET +#ifndef LEFT + neg r3 , r3 +#endif + str r3 , KK + + ldr J, N + asrs J, J, #1 // J = J / 2 + ble _L1_BEGIN + +_L2_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + ldr AO, A // AO = A + + +_L2_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble _L2_M2_BEGIN + +_L2_M4_20: + + INIT4x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 2 double values + add BO , BO , r4 + lsls r4 , r3 , #5 // 4 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #4 // number of values in AO +#else + add L , L , #2 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L2_M4_40 + .align 5 + +_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + 
KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs L, L, #1 + bgt _L2_M4_22 + + +_L2_M4_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M4_100 + +_L2_M4_42: + + KERNEL4x2_SUB + + subs L, L, #1 + bgt _L2_M4_42 + +_L2_M4_100: + + SAVE4x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 2 double values + add BO , BO , r4 + lsls r4 , r3 , #5 // 4 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #4 // number of values in AO + str r3 , KK +#endif + + + +_L2_M4_END: + + subs I, I, #1 + bgt _L2_M4_20 + + +_L2_M2_BEGIN: + + ldr I, M + tst I , #3 + ble _L2_END + + tst I, #2 // I = I / 2 + ble _L2_M1_BEGIN + +_L2_M2_20: + + INIT2x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 2 double values + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #2 // number of values in AO +#else + add L , L , #2 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L2_M2_40 + +_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs L, L, #1 + bgt _L2_M2_22 + + +_L2_M2_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M2_100 + +_L2_M2_42: + + KERNEL2x2_SUB + + subs L, L, #1 + bgt _L2_M2_42 + +_L2_M2_100: + + SAVE2x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 2 double values + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + + +_L2_M2_END: + + +_L2_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble _L2_END + +_L2_M1_20: + + INIT1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 2 double values + add BO , BO , r4 + lsls r4 , r3 , #3 // 1 double value + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #1 // number of values in AO +#else + add L , L , #2 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L2_M1_40 + +_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_22 + + +_L2_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M1_100 + +_L2_M1_42: + + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_42 + +_L2_M1_100: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 2 double values + add BO , BO , r4 + lsls r4 , r3 , #3 // 1 double value + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #1 // number 
of values in AO + str r3 , KK +#endif + + + + +_L2_END: + + mov r3, BC + ldr r4, K + lsl r4, r4, #4 // k * 2 * 8 + add r3, r3, r4 // B = B + K * 2 * 8 + mov BC, r3 + +#if !defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in BO + str r3 , KK +#endif + + + subs J , #1 // j-- + bgt _L2_BEGIN + +/*********************************************************************************************/ + +_L1_BEGIN: + + ldr J , N + tst J , #1 + ble _L999 + + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + ldr AO, A // AO = A + //pld [AO , #A_PRE-96] + //pld [AO , #A_PRE-64] + //pld [AO , #A_PRE-32] + + + +_L1_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble _L1_M2_BEGIN + +_L1_M4_20: + + INIT4x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 1 double value + add BO , BO , r4 + lsls r4 , r3 , #5 // 4 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #4 // number of values in AO +#else + add L , L , #1 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L1_M4_40 + .align 5 + +_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs L, L, #1 + bgt _L1_M4_22 + + +_L1_M4_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M4_100 + +_L1_M4_42: + + KERNEL4x1_SUB + + subs L, L, #1 + bgt _L1_M4_42 + +_L1_M4_100: + + SAVE4x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #3 // 1 double value + add BO , BO , r4 + lsls r4 , r3 , #5 // 4 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #4 // number of values in AO + str r3 , KK +#endif + + + +_L1_M4_END: + + subs I, I, #1 + bgt _L1_M4_20 + + +_L1_M2_BEGIN: + + ldr I, M + tst I , #3 + ble _L1_END + + tst I, #2 // I = I / 2 + ble _L1_M1_BEGIN + +_L1_M2_20: + + INIT2x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 1 double value + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #2 // number of values in AO +#else + add L , L , #1 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L1_M2_40 + +_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs L, L, #1 + bgt _L1_M2_22 + + +_L1_M2_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M2_100 + +_L1_M2_42: + + KERNEL2x1_SUB + + subs L, L, #1 + bgt _L1_M2_42 + +_L1_M2_100: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #3 // 1 double value + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 
double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + + +_L1_M2_END: + + +_L1_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble _L1_END + +_L1_M1_20: + + INIT1x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 1 double value + add BO , BO , r4 + lsls r4 , r3 , #3 // 1 double value + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #1 // number of values in AO +#else + add L , L , #1 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L1_M1_40 + +_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_22 + + +_L1_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M1_100 + +_L1_M1_42: + + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_42 + +_L1_M1_100: + + SAVE1x1 + + +_L1_END: + + +_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/param.h b/param.h index 7bb27f3ab..568605bb8 100644 --- a/param.h +++ b/param.h @@ -1846,7 +1846,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SGEMM_DEFAULT_UNROLL_M 2 #define SGEMM_DEFAULT_UNROLL_N 2 -#define DGEMM_DEFAULT_UNROLL_M 2 +#define DGEMM_DEFAULT_UNROLL_M 4 #define DGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_M 2 From 29a3196f56a6ad465d097580eb6c0c5556adbbcc Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 23 Nov 2013 18:09:41 +0100 Subject: [PATCH 078/127] added optimized sgemm and strmm kernel for ARMV6 --- kernel/arm/KERNEL.ARMV6 | 8 +- kernel/arm/sgemm_kernel_4x2_vfp.S | 798 +++++++++++++++++++++ kernel/arm/strmm_kernel_4x2_vfp.S | 1083 +++++++++++++++++++++++++++++ param.h | 2 +- 4 files changed, 1888 insertions(+), 3 deletions(-) create mode 100644 kernel/arm/sgemm_kernel_4x2_vfp.S create mode 100644 kernel/arm/strmm_kernel_4x2_vfp.S diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 7a58fecd3..b192b20da 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -80,12 +80,16 @@ DGEMVTKERNEL = gemv_t.c CGEMVTKERNEL = zgemv_t.c ZGEMVTKERNEL = zgemv_t.c -STRMMKERNEL = ../generic/trmmkernel_2x2.c +STRMMKERNEL = strmm_kernel_4x2_vfp.S DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c -SGEMMKERNEL = ../generic/gemmkernel_2x2.c +SGEMMKERNEL = sgemm_kernel_4x2_vfp.S +SGEMMINCOPY = ../generic/gemm_ncopy_4.c +SGEMMITCOPY = ../generic/gemm_tcopy_4.c +SGEMMINCOPYOBJ = sgemm_incopy.o +SGEMMITCOPYOBJ = sgemm_itcopy.o SGEMMONCOPY = ../generic/gemm_ncopy_2.c SGEMMOTCOPY = ../generic/gemm_tcopy_2.c SGEMMONCOPYOBJ = sgemm_oncopy.o diff --git a/kernel/arm/sgemm_kernel_4x2_vfp.S b/kernel/arm/sgemm_kernel_4x2_vfp.S new file mode 100644 index 000000000..3e20f86f0 --- /dev/null +++ b/kernel/arm/sgemm_kernel_4x2_vfp.S @@ -0,0 +1,798 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/23 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA s0 + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] +#define A [fp, #-268 ] + +#define ALPHA [fp, #-280] + +#define B [fp, #4 ] +#define C [fp, #8 ] +#define OLD_LDC [fp, #12 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 + +#define A_PRE 96 +#define B_PRE 96 +#define C_PRE 64 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT4x2 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9, s8 + vmov.f32 s10, s8 + vmov.f32 s11, s8 + vmov.f32 s12, s8 + vmov.f32 s13, s8 + vmov.f32 s14, s8 + vmov.f32 s15, s8 + +.endm + + + +.macro KERNEL4x2_SUB + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + fmacs s8 , s0, s4 + fmacs s9 , s1, s4 + fmacs s10 , s2, s4 + fmacs s11 , s3, s4 + + fmacs s12 , s0, s5 + fmacs s13 , s1, s5 + fmacs s14 , s2, s5 + fmacs s15 , s3, s5 + + add AO , AO, #16 + add BO , BO, #8 + +.endm + +.macro SAVE4x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + flds s0, ALPHA + + flds s4 , [CO1] + flds s5 , [CO1, #4 ] + flds s6 , [CO1, #8 ] + flds s7 , [CO1, #12 ] + + fmacs s4 , s0 , s8 + fmacs s5 , s0 , s9 + fmacs 
s6 , s0 , s10 + fmacs s7 , s0 , s11 + + fsts s4 , [CO1] + fsts s5 , [CO1, #4 ] + fsts s6 , [CO1, #8 ] + fsts s7 , [CO1, #12 ] + + flds s4 , [CO2] + flds s5 , [CO2, #4 ] + flds s6 , [CO2, #8 ] + flds s7 , [CO2, #12 ] + + fmacs s4 , s0 , s12 + fmacs s5 , s0 , s13 + fmacs s6 , s0 , s14 + fmacs s7 , s0 , s15 + + fsts s4 , [CO2] + fsts s5 , [CO2, #4 ] + fsts s6 , [CO2, #8 ] + fsts s7 , [CO2, #12 ] + + add CO1, CO1, #16 + +.endm + + +/******************************************************************************/ + +.macro INIT2x2 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9, s8 + vmov.f32 s12, s8 + vmov.f32 s13, s8 + +.endm + +.macro KERNEL2x2_SUB + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + fmacs s8 , s0, s4 + fmacs s9 , s1, s4 + + fmacs s12 , s0, s5 + fmacs s13 , s1, s5 + + add AO , AO, #8 + add BO , BO, #8 + +.endm + +.macro SAVE2x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + flds s0, ALPHA + + flds s4 , [CO1] + flds s5 , [CO1, #4 ] + + fmacs s4 , s0 , s8 + fmacs s5 , s0 , s9 + + fsts s4 , [CO1] + fsts s5 , [CO1, #4 ] + + flds s4 , [CO2] + flds s5 , [CO2, #4 ] + + fmacs s4 , s0 , s12 + fmacs s5 , s0 , s13 + + fsts s4 , [CO2] + fsts s5 , [CO2, #4 ] + + add CO1, CO1, #8 + +.endm + + +/******************************************************************************/ + +.macro INIT1x2 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s12, s8 + +.endm + +.macro KERNEL1x2_SUB + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + flds s0 , [ AO ] + + fmacs s8 , s0, s4 + + fmacs s12 , s0, s5 + + add AO , AO, #4 + add BO , BO, #8 + +.endm + +.macro SAVE1x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + flds s0, ALPHA + + flds s4 , [CO1] + + fmacs s4 , s0 , s8 + + fsts s4 , [CO1] + + flds s4 , [CO2] + + fmacs s4 , s0 , s12 + + fsts s4 , [CO2] + + add CO1, CO1, #4 + +.endm + + + +/******************************************************************************/ + +.macro INIT4x1 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9, s8 + vmov.f32 s10, s8 + vmov.f32 s11, s8 + +.endm + + + +.macro KERNEL4x1_SUB + + flds s4 , [ BO ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + fmacs s8 , s0, s4 + fmacs s9 , s1, s4 + fmacs s10 , s2, s4 + fmacs s11 , s3, s4 + + add AO , AO, #16 + add BO , BO, #4 + +.endm + +.macro SAVE4x1 + + flds s0, ALPHA + + flds s4 , [CO1] + flds s5 , [CO1, #4 ] + flds s6 , [CO1, #8 ] + flds s7 , [CO1, #12 ] + + fmacs s4 , s0 , s8 + fmacs s5 , s0 , s9 + fmacs s6 , s0 , s10 + fmacs s7 , s0 , s11 + + fsts s4 , [CO1] + fsts s5 , [CO1, #4 ] + fsts s6 , [CO1, #8 ] + fsts s7 , [CO1, #12 ] + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + +.macro INIT2x1 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9 , s8 + +.endm + +.macro KERNEL2x1_SUB + + flds s4 , [ BO ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + fmacs s8 , s0, s4 + fmacs s9 , s1, s4 + + add AO , AO, #8 + add BO , BO, #4 + +.endm + +.macro SAVE2x1 + + flds s0, ALPHA + + flds s4 , [CO1] + flds s5 , [CO1, #4 ] + + fmacs s4 , s0 , s8 + fmacs s5 , s0 , s9 + + fsts s4 , [CO1] + fsts s5 , [CO1, #4 ] + + add CO1, CO1, #8 + +.endm + + +/******************************************************************************/ + +.macro INIT1x1 + + vsub.f32 s8 , s8 , s8 + +.endm + +.macro KERNEL1x1_SUB + + flds s4 , [ BO ] + + flds s0 , [ AO ] + + fmacs s8 , s0, s4 + + add AO , AO, #4 + add BO , BO, #4 + +.endm + +.macro SAVE1x1 + + flds s0, ALPHA + + flds s4 , [CO1] + + fmacs s4 , s0 , s8 + + fsts s4 , [CO1] + + add CO1, CO1, #4 + +.endm + + 
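+/**************************************************************************************
+* For reference, the KERNEL4x2_SUB / SAVE4x2 pair above implements
+* C := alpha*A*B + C on a 4x2 tile, with the eight accumulators kept in
+* s8 - s15. The C sketch below models the same computation; the helper
+* name and the packed-buffer arguments are illustrative assumptions only,
+* not part of this file or of the build:
+*
+*   static void kernel4x2(int K, float alpha, const float *a, const float *b,
+*                         float *c0, float *c1) {
+*       float acc[8] = {0};                    // accumulators s8 - s15
+*       for (int k = 0; k < K; k++) {          // one KERNEL4x2_SUB per k
+*           for (int i = 0; i < 4; i++) {
+*               acc[i]     += a[i] * b[0];     // fmacs s8-s11 , sA, s4
+*               acc[4 + i] += a[i] * b[1];     // fmacs s12-s15, sA, s5
+*           }
+*           a += 4;                            // add AO, AO, #16
+*           b += 2;                            // add BO, BO, #8
+*       }
+*       for (int i = 0; i < 4; i++) {          // SAVE4x2: C += alpha * acc
+*           c0[i] += alpha * acc[i];           // column j   of C
+*           c1[i] += alpha * acc[4 + i];       // column j+1 of C
+*       }
+*   }
+**************************************************************************************/
+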
+/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA, ALPHA + + sub r3, fp, #128 + vstm r3, { s8 - s15} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #2 // ldc = ldc * 4 + str r3, LDC + + ldr K1, K + ldr BC, B + + ldr J, N + asrs J, J, #1 // J = J / 2 + ble sgemm_kernel_L1_BEGIN + + +/*********************************************************************************************/ + +sgemm_kernel_L2_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + +sgemm_kernel_L2_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble sgemm_kernel_L2_M2_BEGIN + +sgemm_kernel_L2_M4_20: + + INIT4x2 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble sgemm_kernel_L2_M4_40 + .align 5 + +sgemm_kernel_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs L, L, #1 + bgt sgemm_kernel_L2_M4_22 + + +sgemm_kernel_L2_M4_40: + + ands L , K1, #7 // L = L % 8 + ble sgemm_kernel_L2_M4_100 + +sgemm_kernel_L2_M4_42: + + KERNEL4x2_SUB + + subs L, L, #1 + bgt sgemm_kernel_L2_M4_42 + +sgemm_kernel_L2_M4_100: + + SAVE4x2 + +sgemm_kernel_L2_M4_END: + + subs I, I, #1 + bgt sgemm_kernel_L2_M4_20 + + +sgemm_kernel_L2_M2_BEGIN: + + ldr I, M + tst I , #3 + ble sgemm_kernel_L2_END + + tst I, #2 // I = I / 2 + ble sgemm_kernel_L2_M1_BEGIN + +sgemm_kernel_L2_M2_20: + + INIT2x2 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble sgemm_kernel_L2_M2_40 + +sgemm_kernel_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs L, L, #1 + bgt sgemm_kernel_L2_M2_22 + + +sgemm_kernel_L2_M2_40: + + ands L , K1, #7 // L = L % 8 + ble sgemm_kernel_L2_M2_100 + +sgemm_kernel_L2_M2_42: + + KERNEL2x2_SUB + + subs L, L, #1 + bgt sgemm_kernel_L2_M2_42 + +sgemm_kernel_L2_M2_100: + + SAVE2x2 + +sgemm_kernel_L2_M2_END: + + +sgemm_kernel_L2_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble sgemm_kernel_L2_END + +sgemm_kernel_L2_M1_20: + + INIT1x2 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble sgemm_kernel_L2_M1_40 + +sgemm_kernel_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs L, L, #1 + bgt sgemm_kernel_L2_M1_22 + + +sgemm_kernel_L2_M1_40: + + ands L , K1, #7 // L = L % 8 + ble sgemm_kernel_L2_M1_100 + +sgemm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs L, L, #1 + bgt sgemm_kernel_L2_M1_42 + +sgemm_kernel_L2_M1_100: + + SAVE1x2 + + +sgemm_kernel_L2_END: + + mov r3, BC + mov r4, K1 + lsl r4, r4, #3 // k * 2 * 4 + add r3, r3, r4 // B = B + K * 2 * 4 + mov BC, r3 + + subs J , #1 // j-- + bgt sgemm_kernel_L2_BEGIN + +/*********************************************************************************************/ + +sgemm_kernel_L1_BEGIN: + + ldr J , N + tst J , #1 + ble sgemm_kernel_L999 + + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + + + +sgemm_kernel_L1_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble sgemm_kernel_L1_M2_BEGIN + +sgemm_kernel_L1_M4_20: + + 
INIT4x1 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble sgemm_kernel_L1_M4_40 + .align 5 + +sgemm_kernel_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs L, L, #1 + bgt sgemm_kernel_L1_M4_22 + + +sgemm_kernel_L1_M4_40: + + ands L , K1, #7 // L = L % 8 + ble sgemm_kernel_L1_M4_100 + +sgemm_kernel_L1_M4_42: + + KERNEL4x1_SUB + + subs L, L, #1 + bgt sgemm_kernel_L1_M4_42 + +sgemm_kernel_L1_M4_100: + + SAVE4x1 + +sgemm_kernel_L1_M4_END: + + subs I, I, #1 + bgt sgemm_kernel_L1_M4_20 + + +sgemm_kernel_L1_M2_BEGIN: + + ldr I, M + tst I , #3 + ble sgemm_kernel_L1_END + + tst I, #2 // I = I / 2 + ble sgemm_kernel_L1_M1_BEGIN + +sgemm_kernel_L1_M2_20: + + INIT2x1 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble sgemm_kernel_L1_M2_40 + +sgemm_kernel_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs L, L, #1 + bgt sgemm_kernel_L1_M2_22 + + +sgemm_kernel_L1_M2_40: + + ands L , K1, #7 // L = L % 8 + ble sgemm_kernel_L1_M2_100 + +sgemm_kernel_L1_M2_42: + + KERNEL2x1_SUB + + subs L, L, #1 + bgt sgemm_kernel_L1_M2_42 + +sgemm_kernel_L1_M2_100: + + SAVE2x1 + +sgemm_kernel_L1_M2_END: + + +sgemm_kernel_L1_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble sgemm_kernel_L1_END + +sgemm_kernel_L1_M1_20: + + INIT1x1 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble sgemm_kernel_L1_M1_40 + +sgemm_kernel_L1_M1_22: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs L, L, #1 + bgt sgemm_kernel_L1_M1_22 + + +sgemm_kernel_L1_M1_40: + + ands L , K1, #7 // L = L % 8 + ble sgemm_kernel_L1_M1_100 + +sgemm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs L, L, #1 + bgt sgemm_kernel_L1_M1_42 + +sgemm_kernel_L1_M1_100: + + SAVE1x1 + + +sgemm_kernel_L1_END: + + +sgemm_kernel_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s15} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/strmm_kernel_4x2_vfp.S b/kernel/arm/strmm_kernel_4x2_vfp.S new file mode 100644 index 000000000..5394a6444 --- /dev/null +++ b/kernel/arm/strmm_kernel_4x2_vfp.S @@ -0,0 +1,1083 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/23 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 252 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA s0 + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define KK [fp, #-240 ] +#define KKK [fp, #-244] +#define C [fp, #-248 ] +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] +#define A [fp, #-268 ] + +#define ALPHA [fp, #-276 ] + +#define B [fp, #4 ] +#define OLD_C [fp, #8 ] +#define OLD_LDC [fp, #12 ] +#define OFFSET [fp, #16 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 + +#define A_PRE 64 +#define B_PRE 64 +#define C_PRE 64 + + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT4x2 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9, s8 + vmov.f32 s10, s8 + vmov.f32 s11, s8 + vmov.f32 s12, s8 + vmov.f32 s13, s8 + vmov.f32 s14, s8 + vmov.f32 s15, s8 + +.endm + + + +.macro KERNEL4x2_SUB + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + fmacs s8 , s0, s4 + fmacs s9 , s1, s4 + fmacs s10 , s2, s4 + fmacs s11 , s3, s4 + + fmacs s12 , s0, s5 + fmacs s13 , s1, s5 + fmacs s14 , s2, s5 + fmacs s15 , s3, s5 + + add AO , AO, #16 + add BO , BO, #8 + +.endm + +.macro SAVE4x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + flds s0, ALPHA + + fmuls s4 , s0 , s8 + fmuls s5 , s0 , s9 + fmuls s6 , s0 , s10 + fmuls s7 , s0 , s11 + + fsts s4 , [CO1] + fsts s5 , [CO1, #4 ] + fsts s6 , [CO1, #8 ] + fsts s7 , [CO1, #12 ] + + fmuls s4 , s0 , s12 + fmuls s5 , s0 , s13 + fmuls s6 , s0 , s14 + fmuls s7 , s0 , s15 + + fsts s4 , [CO2] + fsts s5 , [CO2, #4 ] + fsts s6 , [CO2, #8 ] + fsts s7 , [CO2, #12 ] + + add CO1, CO1, #16 + +.endm + + +/******************************************************************************/ + +.macro INIT2x2 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9, s8 + vmov.f32 s12, s8 + vmov.f32 s13, s8 + +.endm + +.macro KERNEL2x2_SUB + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + fmacs s8 , s0, s4 + fmacs s9 , s1, s4 + + fmacs s12 , s0, s5 + fmacs s13 , s1, s5 + + add AO , AO, #8 + add BO , BO, #8 + +.endm + +.macro SAVE2x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + flds s0, ALPHA + + + fmuls s4 , s0 , s8 + fmuls s5 , s0 , s9 + + fsts 
s4 , [CO1] + fsts s5 , [CO1, #4 ] + + fmuls s4 , s0 , s12 + fmuls s5 , s0 , s13 + + fsts s4 , [CO2] + fsts s5 , [CO2, #4 ] + + add CO1, CO1, #8 + +.endm + + +/******************************************************************************/ + +.macro INIT1x2 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s12, s8 + +.endm + +.macro KERNEL1x2_SUB + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + flds s0 , [ AO ] + + fmacs s8 , s0, s4 + + fmacs s12 , s0, s5 + + add AO , AO, #4 + add BO , BO, #8 + +.endm + +.macro SAVE1x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + flds s0, ALPHA + + + fmuls s4 , s0 , s8 + + fsts s4 , [CO1] + + + fmuls s4 , s0 , s12 + + fsts s4 , [CO2] + + add CO1, CO1, #4 + +.endm + + + +/******************************************************************************/ + +.macro INIT4x1 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9, s8 + vmov.f32 s10, s8 + vmov.f32 s11, s8 + +.endm + + + +.macro KERNEL4x1_SUB + + flds s4 , [ BO ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + fmacs s8 , s0, s4 + fmacs s9 , s1, s4 + fmacs s10 , s2, s4 + fmacs s11 , s3, s4 + + add AO , AO, #16 + add BO , BO, #4 + +.endm + +.macro SAVE4x1 + + flds s0, ALPHA + + fmuls s4 , s0 , s8 + fmuls s5 , s0 , s9 + fmuls s6 , s0 , s10 + fmuls s7 , s0 , s11 + + fsts s4 , [CO1] + fsts s5 , [CO1, #4 ] + fsts s6 , [CO1, #8 ] + fsts s7 , [CO1, #12 ] + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + +.macro INIT2x1 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9 , s8 + +.endm + +.macro KERNEL2x1_SUB + + flds s4 , [ BO ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + fmacs s8 , s0, s4 + fmacs s9 , s1, s4 + + add AO , AO, #8 + add BO , BO, #4 + +.endm + +.macro SAVE2x1 + + flds s0, ALPHA + + fmuls s4 , s0 , s8 + fmuls s5 , s0 , s9 + + fsts s4 , [CO1] + fsts s5 , [CO1, #4 ] + + add CO1, CO1, #8 + +.endm + + +/******************************************************************************/ + +.macro INIT1x1 + + vsub.f32 s8 , s8 , s8 + +.endm + +.macro KERNEL1x1_SUB + + flds s4 , [ BO ] + + flds s0 , [ AO ] + + fmacs s8 , s0, s4 + + add AO , AO, #4 + add BO , BO, #4 + +.endm + +.macro SAVE1x1 + + flds s0, ALPHA + + fmuls s4 , s0 , s8 + + fsts s4 , [CO1] + + add CO1, CO1, #4 + +.endm + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA, ALPHA + + sub r3, fp, #128 + vstm r3, { s8 - s15} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #2 // ldc = ldc * 4 + str r3, LDC + + ldr r3, OLD_C + str r3, C + + ldr BC, B + + ldr r3, OFFSET +#ifndef LEFT + neg r3 , r3 +#endif + str r3 , KK + + ldr J, N + asrs J, J, #1 // J = J / 2 + ble _L1_BEGIN + +_L2_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + ldr AO, A // AO = A + + +_L2_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble _L2_M2_BEGIN + +_L2_M4_20: + + INIT4x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 2 float values + add BO , BO , r4 + lsls r4 , r3 , #4 // 4 float values + add 
AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #4 // number of values in AO +#else + add L , L , #2 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L2_M4_40 + .align 5 + +_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs L, L, #1 + bgt _L2_M4_22 + + +_L2_M4_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M4_100 + +_L2_M4_42: + + KERNEL4x2_SUB + + subs L, L, #1 + bgt _L2_M4_42 + +_L2_M4_100: + + SAVE4x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #3 // 2 float values + add BO , BO , r4 + lsls r4 , r3 , #4 // 4 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #4 // number of values in AO + str r3 , KK +#endif + + + +_L2_M4_END: + + subs I, I, #1 + bgt _L2_M4_20 + + +_L2_M2_BEGIN: + + ldr I, M + tst I , #3 + ble _L2_END + + tst I, #2 // I = I / 2 + ble _L2_M1_BEGIN + +_L2_M2_20: + + INIT2x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 2 float values + add BO , BO , r4 + lsls r4 , r3 , #3 // 2 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #2 // number of values in AO +#else + add L , L , #2 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L2_M2_40 + +_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs L, L, #1 + bgt _L2_M2_22 + + +_L2_M2_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M2_100 + +_L2_M2_42: + + KERNEL2x2_SUB + + subs L, L, #1 + bgt _L2_M2_42 + +_L2_M2_100: + + SAVE2x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #3 // 2 float values + add BO , BO , r4 + lsls r4 , r3 , #3 // 2 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + + +_L2_M2_END: + + +_L2_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble _L2_END + +_L2_M1_20: + + INIT1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 2 float values + add BO , BO , r4 + lsls r4 , r3 , #2 // 1 float value + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #1 // number of values in AO +#else + add L , L , #2 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L2_M1_40 + +_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs L, L, 
#1 + bgt _L2_M1_22 + + +_L2_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M1_100 + +_L2_M1_42: + + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_42 + +_L2_M1_100: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #3 // 2 float values + add BO , BO , r4 + lsls r4 , r3 , #2 // 1 float value + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #1 // number of values in AO + str r3 , KK +#endif + + + + +_L2_END: + + mov r3, BC + ldr r4, K + lsl r4, r4, #3 // k * 2 * 4 + add r3, r3, r4 // B = B + K * 2 * 4 + mov BC, r3 + +#if !defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in BO + str r3 , KK +#endif + + + subs J , #1 // j-- + bgt _L2_BEGIN + +/*********************************************************************************************/ + +_L1_BEGIN: + + ldr J , N + tst J , #1 + ble _L999 + + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + ldr AO, A // AO = A + //pld [AO , #A_PRE-96] + //pld [AO , #A_PRE-64] + //pld [AO , #A_PRE-32] + + + +_L1_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble _L1_M2_BEGIN + +_L1_M4_20: + + INIT4x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #2 // 1 float value + add BO , BO , r4 + lsls r4 , r3 , #4 // 4 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #4 // number of values in AO +#else + add L , L , #1 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L1_M4_40 + .align 5 + +_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs L, L, #1 + bgt _L1_M4_22 + + +_L1_M4_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M4_100 + +_L1_M4_42: + + KERNEL4x1_SUB + + subs L, L, #1 + bgt _L1_M4_42 + +_L1_M4_100: + + SAVE4x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #2 // 1 float value + add BO , BO , r4 + lsls r4 , r3 , #4 // 4 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #4 // number of values in AO + str r3 , KK +#endif + + + +_L1_M4_END: + + subs I, I, #1 + bgt _L1_M4_20 + + +_L1_M2_BEGIN: + + ldr I, M + tst I , #3 + ble _L1_END + + tst I, #2 // I = I / 2 + ble _L1_M1_BEGIN + +_L1_M2_20: + + INIT2x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #2 // 1 float value + add BO , BO , r4 + lsls r4 , r3 , #3 // 2 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #2 // number of values in AO +#else + add L , L , #1 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L1_M2_40 + +_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB 
+ KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs L, L, #1 + bgt _L1_M2_22 + + +_L1_M2_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M2_100 + +_L1_M2_42: + + KERNEL2x1_SUB + + subs L, L, #1 + bgt _L1_M2_42 + +_L1_M2_100: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #2 // 1 float value + add BO , BO , r4 + lsls r4 , r3 , #3 // 2 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + + +_L1_M2_END: + + +_L1_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble _L1_END + +_L1_M1_20: + + INIT1x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #2 // 1 float value + add BO , BO , r4 + lsls r4 , r3 , #2 // 1 float value + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #1 // number of values in AO +#else + add L , L , #1 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L1_M1_40 + +_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_22 + + +_L1_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M1_100 + +_L1_M1_42: + + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_42 + +_L1_M1_100: + + SAVE1x1 + + +_L1_END: + + +_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s15} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/param.h b/param.h index 568605bb8..ec1767d20 100644 --- a/param.h +++ b/param.h @@ -1843,7 +1843,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x03fffUL -#define SGEMM_DEFAULT_UNROLL_M 2 +#define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_N 2 #define DGEMM_DEFAULT_UNROLL_M 4 From 12e02a00e06b61e1825ac48d9071fce262042998 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 24 Nov 2013 08:46:47 +0100 Subject: [PATCH 079/127] added ncopy kernels for ARMV6 --- kernel/arm/KERNEL.ARMV6 | 14 +- kernel/arm/dgemm_ncopy_2_vfp.S | 225 +++++++++++++++++++++++++++++++++ kernel/arm/sgemm_ncopy_2_vfp.S | 225 +++++++++++++++++++++++++++++++++ 3 files changed, 457 insertions(+), 7 deletions(-) create mode 100644 kernel/arm/dgemm_ncopy_2_vfp.S create mode 100644 kernel/arm/sgemm_ncopy_2_vfp.S diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index b192b20da..1f2510bc3 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -85,22 +85,22 @@ DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c -SGEMMKERNEL = sgemm_kernel_4x2_vfp.S -SGEMMINCOPY = ../generic/gemm_ncopy_4.c -SGEMMITCOPY = ../generic/gemm_tcopy_4.c +SGEMMKERNEL = sgemm_kernel_4x2_vfp.S +SGEMMINCOPY = sgemm_ncopy_4_vfp.S +SGEMMITCOPY = sgemm_tcopy_4_vfp.S SGEMMINCOPYOBJ = sgemm_incopy.o SGEMMITCOPYOBJ = sgemm_itcopy.o -SGEMMONCOPY = ../generic/gemm_ncopy_2.c +SGEMMONCOPY = sgemm_ncopy_2_vfp.S SGEMMOTCOPY = ../generic/gemm_tcopy_2.c SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o DGEMMKERNEL = dgemm_kernel_4x2_vfp.S -DGEMMINCOPY = ../generic/gemm_ncopy_4.c -DGEMMITCOPY = ../generic/gemm_tcopy_4.c +DGEMMINCOPY = dgemm_ncopy_4_vfp.S +DGEMMITCOPY = dgemm_tcopy_4_vfp.S DGEMMINCOPYOBJ = dgemm_incopy.o DGEMMITCOPYOBJ = dgemm_itcopy.o -DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMONCOPY = dgemm_ncopy_2_vfp.S DGEMMOTCOPY = ../generic/gemm_tcopy_2.c DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o diff --git a/kernel/arm/dgemm_ncopy_2_vfp.S b/kernel/arm/dgemm_ncopy_2_vfp.S new file mode 100644 index 000000000..763c032e1 --- /dev/null +++ b/kernel/arm/dgemm_ncopy_2_vfp.S @@ -0,0 +1,225 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/24 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_A r2 +#define OLD_LDA r3 + +#define B [fp, #4 ] + +#define M r0 +#define N r1 +#define A r2 + +#define BO r5 + +#define AO1 r6 +#define AO2 r7 +#define LDA r8 + +#define I r3 +#define J r12 + +#define A_PRE 256 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro COPY2x2 + + fldd d0 , [ AO1, #0 ] + fldd d2 , [ AO1, #8 ] + + fldd d1 , [ AO2, #0 ] + fldd d3 , [ AO2, #8 ] + + add AO1, AO1, #16 + fstmiad BO!, { d0 - d3 } + add AO2, AO2, #16 + +.endm + + +.macro COPY1x2 + + fldd d0 , [ AO1, #0 ] + fldd d1 , [ AO2, #0 ] + add AO1, AO1, #8 + + fstmiad BO!, { d0 - d1 } + add AO2, AO2, #8 + +.endm + +.macro COPY2x1 + + fldd d0 , [ AO1, #0 ] + fldd d1 , [ AO1, #8 ] + + fstmiad BO!, { d0 - d1 } + add AO1, AO1, #16 + +.endm + + +.macro COPY1x1 + + fldd d0 , [ AO1, #0 ] + + fstmiad BO!, { d0 } + add AO1, AO1, #8 + +.endm + + + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + + lsl LDA, OLD_LDA, #3 // lda = lda * 8 + + ldr BO, B + + +/*********************************************************************************************/ + +dgemm_ncopy_L2_BEGIN: + + asrs J, N, #1 // J = N / 2 + ble dgemm_ncopy_L1_BEGIN + +dgemm_ncopy_L2_M2_BEGIN: + + mov AO1, A // AO1 = A + add AO2, AO1, LDA + add A , AO2, LDA // A = A + 2 * LDA + + asrs I, M, #1 // I = M / 2 + ble dgemm_ncopy_L2_M2_40 + +dgemm_ncopy_L2_M2_20: + + COPY2x2 + + subs I , I , #1 + bne dgemm_ncopy_L2_M2_20 + + +dgemm_ncopy_L2_M2_40: + + ands I, M , #1 + ble dgemm_ncopy_L2_M2_END + +dgemm_ncopy_L2_M2_60: + + COPY1x2 + + subs I , I , #1 + bne dgemm_ncopy_L2_M2_60 + + +dgemm_ncopy_L2_M2_END: + + subs J , J, #1 // j-- + bne dgemm_ncopy_L2_M2_BEGIN + +/*********************************************************************************************/ + +dgemm_ncopy_L1_BEGIN: + + tst N, #1 + ble dgemm_ncopy_L999 + + +dgemm_ncopy_L1_M2_BEGIN: + + mov AO1, A // AO1 = A + add A , AO1, LDA // A = A + 1 * LDA + + asrs I, M, #1 // I = M / 2 + ble dgemm_ncopy_L1_M2_40 + +dgemm_ncopy_L1_M2_20: + + COPY2x1 + + subs I , I , #1 + bne dgemm_ncopy_L1_M2_20 + + +dgemm_ncopy_L1_M2_40: + + ands I, M , #1 + ble dgemm_ncopy_L1_M2_END + +dgemm_ncopy_L1_M2_60: + + COPY1x1 + + subs I , I , #1 + bne dgemm_ncopy_L1_M2_60 + + +dgemm_ncopy_L1_M2_END: + + + 
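+/**************************************************************************************
+* For reference, the code above packs the M x N column-major matrix A
+* (leading dimension lda, counted in elements) into the contiguous buffer B,
+* interleaving each pair of columns element by element. A C sketch of the
+* equivalent loops; the helper name is an illustrative assumption only:
+*
+*   static void ncopy_2(long m, long n, const double *a, long lda, double *b) {
+*       long i, j;
+*       for (j = 0; j + 1 < n; j += 2, a += 2 * lda)   // column pairs
+*           for (i = 0; i < m; i++) {                  // COPY2x2 / COPY1x2
+*               *b++ = a[i];                           // column j
+*               *b++ = a[i + lda];                     // column j + 1
+*           }
+*       if (n & 1)                                     // odd trailing column
+*           for (i = 0; i < m; i++)                    // COPY2x1 / COPY1x1
+*               *b++ = a[i];
+*   }
+**************************************************************************************/
+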
+dgemm_ncopy_L999: + + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/sgemm_ncopy_2_vfp.S b/kernel/arm/sgemm_ncopy_2_vfp.S new file mode 100644 index 000000000..0546f1d69 --- /dev/null +++ b/kernel/arm/sgemm_ncopy_2_vfp.S @@ -0,0 +1,225 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/24 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_A r2 +#define OLD_LDA r3 + +#define B [fp, #4 ] + +#define M r0 +#define N r1 +#define A r2 + +#define BO r5 + +#define AO1 r6 +#define AO2 r7 +#define LDA r8 + +#define I r3 +#define J r12 + +#define A_PRE 256 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro COPY2x2 + + flds s0 , [ AO1, #0 ] + flds s2 , [ AO1, #4 ] + + flds s1 , [ AO2, #0 ] + flds s3 , [ AO2, #4 ] + + add AO1, AO1, #8 + fstmias BO!, { s0 - s3 } + add AO2, AO2, #8 + +.endm + + +.macro COPY1x2 + + flds s0 , [ AO1, #0 ] + flds s1 , [ AO2, #0 ] + add AO1, AO1, #4 + + fstmias BO!, { s0 - s1 } + add AO2, AO2, #4 + +.endm + +.macro COPY2x1 + + flds s0 , [ AO1, #0 ] + flds s1 , [ AO1, #4 ] + + fstmias BO!, { s0 - s1 } + add AO1, AO1, #8 + +.endm + + +.macro COPY1x1 + + flds s0 , [ AO1, #0 ] + + fstmias BO!, { s0 } + add AO1, AO1, #4 + +.endm + + + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + + lsl LDA, OLD_LDA, #2 // lda = lda * 4 + + ldr BO, B + + +/*********************************************************************************************/ + +sgemm_ncopy_L2_BEGIN: + + asrs J, N, #1 // J = N / 2 + ble sgemm_ncopy_L1_BEGIN + +sgemm_ncopy_L2_M2_BEGIN: + + mov AO1, A // AO1 = A + add AO2, AO1, LDA + add A , AO2, LDA // A = A + 2 * LDA + + asrs I, M, #1 // I = M / 2 + ble sgemm_ncopy_L2_M2_40 + +sgemm_ncopy_L2_M2_20: + + COPY2x2 + + subs I , I , #1 + bne sgemm_ncopy_L2_M2_20 + + +sgemm_ncopy_L2_M2_40: + + ands I, M , #1 + ble sgemm_ncopy_L2_M2_END + +sgemm_ncopy_L2_M2_60: + + COPY1x2 + + subs I , I , #1 + bne sgemm_ncopy_L2_M2_60 + + +sgemm_ncopy_L2_M2_END: + + subs J , J, #1 // j-- + bne sgemm_ncopy_L2_M2_BEGIN + +/*********************************************************************************************/ + +sgemm_ncopy_L1_BEGIN: + + tst N, #1 + ble sgemm_ncopy_L999 + + +sgemm_ncopy_L1_M2_BEGIN: + + mov AO1, A // AO1 = A + add A , AO1, LDA // A = A + 1 * LDA + + asrs I, M, #1 // I = M / 2 + ble sgemm_ncopy_L1_M2_40 + +sgemm_ncopy_L1_M2_20: + + COPY2x1 + + subs I , I , #1 + bne sgemm_ncopy_L1_M2_20 + + +sgemm_ncopy_L1_M2_40: + + ands I, M , #1 + ble sgemm_ncopy_L1_M2_END + +sgemm_ncopy_L1_M2_60: + + COPY1x1 + + subs I , I , #1 + bne sgemm_ncopy_L1_M2_60 + + +sgemm_ncopy_L1_M2_END: + + + +sgemm_ncopy_L999: + + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + From 25c605059328127d0e2612e94c0bbcb52a4bff3a Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 24 Nov 2013 12:03:28 +0100 Subject: [PATCH 080/127] add single and double precision gemv_n kernel for ARMV6 --- kernel/arm/KERNEL.ARMV6 | 4 +- kernel/arm/gemv_n_vfp.S | 675 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 677 insertions(+), 2 deletions(-) create mode 100644 kernel/arm/gemv_n_vfp.S diff 
--git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 1f2510bc3..9dad061d9 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -70,8 +70,8 @@ DSWAPKERNEL = swap_vfp.S CSWAPKERNEL = swap_vfp.S ZSWAPKERNEL = swap_vfp.S -SGEMVNKERNEL = gemv_n.c -DGEMVNKERNEL = gemv_n.c +SGEMVNKERNEL = gemv_n_vfp.S +DGEMVNKERNEL = gemv_n_vfp.S CGEMVNKERNEL = zgemv_n.c ZGEMVNKERNEL = zgemv_n.c diff --git a/kernel/arm/gemv_n_vfp.S b/kernel/arm/gemv_n_vfp.S new file mode 100644 index 000000000..47265994c --- /dev/null +++ b/kernel/arm/gemv_n_vfp.S @@ -0,0 +1,675 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/24 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_LDA [fp, #0 ] +#define X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] +#define OLD_A r3 +#define OLD_M r0 + +#define AO1 r0 +#define N r1 +#define J r2 + +#define AO2 r4 +#define XO r5 +#define YO r6 +#define LDA r7 +#define INC_X r8 +#define INC_Y r9 + +#define I r12 + +#define M [fp, #-252 ] +#define A [fp, #-256 ] + + +#define X_PRE 64 +#define Y_PRE 0 +#define A_PRE 0 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + + +#if defined(DOUBLE) + +.macro INIT_F4 + + pld [ YO , #Y_PRE ] + + vsub.f64 d12 , d12 , d12 + vmov.f64 d13 , d12 + vmov.f64 d14 , d12 + vmov.f64 d15 , d12 + +.endm + +.macro KERNEL_F4X4 + + pld [ XO , #X_PRE ] + KERNEL_F4X1 + KERNEL_F4X1 + KERNEL_F4X1 + KERNEL_F4X1 + +.endm + + +.macro KERNEL_F4X1 + + fldmiad XO! 
, { d2 } + fldmiad AO1 , { d8 - d11 } + + vmla.f64 d12 , d2 , d8 + pld [ AO2 , #A_PRE ] + vmla.f64 d13 , d2 , d9 + add AO1, AO1, LDA + vmla.f64 d14 , d2 , d10 + vmla.f64 d15 , d2 , d11 + add AO2, AO2, LDA + +.endm + +.macro SAVE_F4 + + fldmiad YO, { d4 - d7 } + + vmla.f64 d4 , d0, d12 + vmla.f64 d5 , d0, d13 + vmla.f64 d6 , d0, d14 + vmla.f64 d7 , d0, d15 + + fstmiad YO!, { d4 - d7 } + +.endm + + +.macro INIT_F1 + + vsub.f64 d12 , d12 , d12 + +.endm + + + +.macro KERNEL_F1X1 + + fldmiad XO! , { d2 } + fldmiad AO1 , { d8 } + vmla.f64 d12 , d2 , d8 + add AO1, AO1, LDA + +.endm + +.macro SAVE_F1 + + fldmiad YO, { d4 } + vmla.f64 d4, d0, d12 + fstmiad YO!, { d4 } + +.endm + +/*********************************************************************************************/ + +.macro INIT_S4 + + vsub.f64 d12 , d12 , d12 + vmov.f64 d13 , d12 + vmov.f64 d14 , d12 + vmov.f64 d15 , d12 + +.endm + +.macro KERNEL_S4X4 + + KERNEL_S4X1 + KERNEL_S4X1 + KERNEL_S4X1 + KERNEL_S4X1 + +.endm + + +.macro KERNEL_S4X1 + + pld [ AO2 , #A_PRE ] + fldmiad XO , { d2 } + fldmiad AO1 , { d8 - d11 } + + vmla.f64 d12 , d2 , d8 + add AO1, AO1, LDA + vmla.f64 d13 , d2 , d9 + add AO2, AO2, LDA + vmla.f64 d14 , d2 , d10 + vmla.f64 d15 , d2 , d11 + add XO, XO , INC_X + +.endm + +.macro SAVE_S4 + + fldmiad YO, { d4 } + vmla.f64 d4 , d0, d12 + fstmiad YO, { d4 } + add YO, YO, INC_Y + + fldmiad YO, { d5 } + vmla.f64 d5 , d0, d13 + fstmiad YO, { d5 } + add YO, YO, INC_Y + + fldmiad YO, { d4 } + vmla.f64 d4 , d0, d14 + fstmiad YO, { d4 } + add YO, YO, INC_Y + + fldmiad YO, { d5 } + vmla.f64 d5 , d0, d15 + fstmiad YO, { d5 } + add YO, YO, INC_Y + +.endm + + +.macro INIT_S1 + + vsub.f64 d12 , d12 , d12 + +.endm + + + +.macro KERNEL_S1X1 + + fldmiad XO , { d2 } + fldmiad AO1 , { d8 } + vmla.f64 d12 , d2 , d8 + add AO1, AO1, LDA + add XO, XO , INC_X + +.endm + +.macro SAVE_S1 + + fldmiad YO, { d4 } + vmla.f64 d4, d0, d12 + fstmiad YO , { d4 } + add YO, YO, INC_Y + +.endm + + + + +#else /************************* SINGLE PRECISION *****************************************/ + +.macro INIT_F4 + + pld [ YO , #Y_PRE ] + + vsub.f32 s12 , s12 , s12 + vmov.f32 s13 , s12 + vmov.f32 s14 , s12 + vmov.f32 s15 , s12 + +.endm + +.macro KERNEL_F4X4 + + pld [ XO , #X_PRE ] + KERNEL_F4X1 + KERNEL_F4X1 + KERNEL_F4X1 + KERNEL_F4X1 + +.endm + + +.macro KERNEL_F4X1 + + fldmias XO! , { s2 } + fldmias AO1 , { s8 - s11 } + + vmla.f32 s12 , s2 , s8 + vmla.f32 s13 , s2 , s9 + vmla.f32 s14 , s2 , s10 + vmla.f32 s15 , s2 , s11 + add AO1, AO1, LDA + add AO2, AO2, LDA + +.endm + +.macro SAVE_F4 + + fldmias YO, { s4 - s7 } + + vmla.f32 s4 , s0, s12 + vmla.f32 s5 , s0, s13 + vmla.f32 s6 , s0, s14 + vmla.f32 s7 , s0, s15 + + fstmias YO!, { s4 - s7 } + +.endm + + +.macro INIT_F1 + + vsub.f32 s12 , s12 , s12 + +.endm + + + +.macro KERNEL_F1X1 + + fldmias XO! 
, { s2 } + fldmias AO1 , { s8 } + vmla.f32 s12 , s2 , s8 + add AO1, AO1, LDA + +.endm + +.macro SAVE_F1 + + fldmias YO, { s4 } + vmla.f32 s4, s0, s12 + fstmias YO!, { s4 } + +.endm + +/*********************************************************************************************/ + +.macro INIT_S4 + + vsub.f32 s12 , s12 , s12 + vmov.f32 s13 , s12 + vmov.f32 s14 , s12 + vmov.f32 s15 , s12 + +.endm + +.macro KERNEL_S4X4 + + KERNEL_S4X1 + KERNEL_S4X1 + KERNEL_S4X1 + KERNEL_S4X1 + +.endm + + +.macro KERNEL_S4X1 + + pld [ AO2 , #A_PRE ] + fldmias XO , { s2 } + fldmias AO1 , { s8 - s11 } + + vmla.f32 s12 , s2 , s8 + vmla.f32 s13 , s2 , s9 + vmla.f32 s14 , s2 , s10 + vmla.f32 s15 , s2 , s11 + add AO1, AO1, LDA + add AO2, AO2, LDA + add XO, XO , INC_X + +.endm + +.macro SAVE_S4 + + fldmias YO, { s4 } + vmla.f32 s4 , s0, s12 + fstmias YO, { s4 } + add YO, YO, INC_Y + + fldmias YO, { s5 } + vmla.f32 s5 , s0, s13 + fstmias YO, { s5 } + add YO, YO, INC_Y + + fldmias YO, { s4 } + vmla.f32 s4 , s0, s14 + fstmias YO, { s4 } + add YO, YO, INC_Y + + fldmias YO, { s5 } + vmla.f32 s5 , s0, s15 + fstmias YO, { s5 } + add YO, YO, INC_Y + +.endm + + +.macro INIT_S1 + + vsub.f32 s12 , s12 , s12 + +.endm + + + +.macro KERNEL_S1X1 + + fldmias XO , { s2 } + fldmias AO1 , { s8 } + vmla.f32 s12 , s2 , s8 + add AO1, AO1, LDA + add XO, XO , INC_X + +.endm + +.macro SAVE_S1 + + fldmias YO, { s4 } + vmla.f32 s4, s0, s12 + fstmias YO , { s4 } + add YO, YO, INC_Y + +.endm + + + + +#endif + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + push {r4 - r9 , fp} + add fp, sp, #28 + sub sp, sp, #STACKSIZE // reserve stack + + sub r12, fp, #192 + +#if defined(DOUBLE) + vstm r12, { d8 - d15 } // store floating point registers +#else + vstm r12, { s8 - s15 } // store floating point registers +#endif + + cmp OLD_M, #0 + ble gemvn_kernel_L999 + + cmp N, #0 + ble gemvn_kernel_L999 + + str OLD_A, A + str OLD_M, M + + ldr INC_X , OLD_INC_X + ldr INC_Y , OLD_INC_Y + + cmp INC_X, #0 + beq gemvn_kernel_L999 + + cmp INC_Y, #0 + beq gemvn_kernel_L999 + + ldr LDA, OLD_LDA + + +#if defined(DOUBLE) + lsl LDA, LDA, #3 // LDA * SIZE +#else + lsl LDA, LDA, #2 // LDA * SIZE +#endif + + cmp INC_X, #1 + bne gemvn_kernel_S4_BEGIN + + cmp INC_Y, #1 + bne gemvn_kernel_S4_BEGIN + + +gemvn_kernel_F4_BEGIN: + + ldr YO , Y + + ldr I, M + asrs I, I, #2 // I = M / 4 + ble gemvn_kernel_F1_BEGIN + +gemvn_kernel_F4X4: + + ldr AO1, A + add AO2, AO1, LDA + add r3 , AO1, #4*SIZE + str r3 , A + + ldr XO , X + + INIT_F4 + + asrs J, N, #2 // J = N / 4 + ble gemvn_kernel_F4X1 + + +gemvn_kernel_F4X4_10: + + KERNEL_F4X4 + + subs J, J, #1 + bne gemvn_kernel_F4X4_10 + + +gemvn_kernel_F4X1: + + ands J, N , #3 + ble gemvn_kernel_F4_END + +gemvn_kernel_F4X1_10: + + KERNEL_F4X1 + + subs J, J, #1 + bne gemvn_kernel_F4X1_10 + + +gemvn_kernel_F4_END: + + SAVE_F4 + + subs I , I , #1 + bne gemvn_kernel_F4X4 + + +gemvn_kernel_F1_BEGIN: + + ldr I, M + ands I, I , #3 + ble gemvn_kernel_L999 + +gemvn_kernel_F1X1: + + ldr AO1, A + add r3, AO1, #SIZE + str r3, A + + ldr XO , X + + INIT_F1 + + mov J, N + + +gemvn_kernel_F1X1_10: + + KERNEL_F1X1 + + subs J, J, #1 + bne gemvn_kernel_F1X1_10 + + +gemvn_kernel_F1_END: + + SAVE_F1 + + subs I , I , #1 + bne gemvn_kernel_F1X1 + + b gemvn_kernel_L999 + + + 
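+/**************************************************************************************
+* For reference, both the F (unit stride) path above and the S (strided)
+* path below compute y := alpha*A*x + y for a column-major M x N matrix,
+* the F path handling four rows of y per pass. A C sketch of the update,
+* shown unblocked and for the DOUBLE case; the helper name and argument
+* order are illustrative assumptions only:
+*
+*   static void gemv_n(long m, long n, double alpha, const double *a,
+*                      long lda, const double *x, double *y) {
+*       long i, j;
+*       for (i = 0; i < m; i++) {            // rows; done 4 at a time above
+*           double acc = 0.0;                // INIT_F4 / INIT_F1
+*           for (j = 0; j < n; j++)          // KERNEL_F4X1 / KERNEL_F1X1
+*               acc += a[i + j * lda] * x[j];
+*           y[i] += alpha * acc;             // SAVE_F4 / SAVE_F1
+*       }
+*   }
+**************************************************************************************/
+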
+/*************************************************************************************************************/ + +gemvn_kernel_S4_BEGIN: + +#if defined(DOUBLE) + lsl INC_X, INC_X, #3 // INC_X * SIZE + lsl INC_Y, INC_Y, #3 // INC_Y * SIZE +#else + lsl INC_X, INC_X, #2 // INC_X * SIZE + lsl INC_Y, INC_Y, #2 // INC_Y * SIZE +#endif + + ldr YO , Y + + ldr I, M + asrs I, I, #2 // I = M / 4 + ble gemvn_kernel_S1_BEGIN + +gemvn_kernel_S4X4: + + ldr AO1, A + add AO2, AO1, LDA + add r3 , AO1, #4*SIZE + str r3 , A + + ldr XO , X + + INIT_S4 + + asrs J, N, #2 // J = N / 4 + ble gemvn_kernel_S4X1 + + +gemvn_kernel_S4X4_10: + + KERNEL_S4X4 + + subs J, J, #1 + bne gemvn_kernel_S4X4_10 + + +gemvn_kernel_S4X1: + + ands J, N , #3 + ble gemvn_kernel_S4_END + +gemvn_kernel_S4X1_10: + + KERNEL_S4X1 + + subs J, J, #1 + bne gemvn_kernel_S4X1_10 + + +gemvn_kernel_S4_END: + + SAVE_S4 + + subs I , I , #1 + bne gemvn_kernel_S4X4 + + +gemvn_kernel_S1_BEGIN: + + ldr I, M + ands I, I , #3 + ble gemvn_kernel_L999 + +gemvn_kernel_S1X1: + + ldr AO1, A + add r3, AO1, #SIZE + str r3, A + + ldr XO , X + + INIT_S1 + + mov J, N + + +gemvn_kernel_S1X1_10: + + KERNEL_S1X1 + + subs J, J, #1 + bne gemvn_kernel_S1X1_10 + + +gemvn_kernel_S1_END: + + SAVE_S1 + + subs I , I , #1 + bne gemvn_kernel_S1X1 + + +/*************************************************************************************************************/ + +gemvn_kernel_L999: + + sub r3, fp, #192 + +#if defined(DOUBLE) + vldm r3, { d8 - d15 } // restore floating point registers +#else + vldm r3, { s8 - s15 } // restore floating point registers +#endif + + mov r0, #0 // set return value + + sub sp, fp, #28 + pop {r4 -r9 ,fp} + bx lr + + EPILOGUE + From e25de3d182097290b456d0a8707e9cc66949f24d Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 24 Nov 2013 13:22:49 +0100 Subject: [PATCH 081/127] changed default optimization flag for ARM from -O2 to -O3 --- Makefile.system | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Makefile.system b/Makefile.system index e5358f65b..0f5e9c6d5 100644 --- a/Makefile.system +++ b/Makefile.system @@ -830,8 +830,12 @@ COMMON_OPT += -g endif ifndef COMMON_OPT +ifeq ($(ARCH), arm) +COMMON_OPT = -O3 +else COMMON_OPT = -O2 endif +endif override CFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) @@ -929,6 +933,10 @@ export HAVE_SSE4_2 export HAVE_SSE4A export HAVE_SSE5 export HAVE_AVX +export HAVE_VFP +export HAVE_VFPV3 +export HAVE_VFPV4 +export HAVE_NEON export KERNELDIR export FUNCTION_PROFILE export TARGET_CORE From fe5f46c3304946ac1c536d5edd593ebaf21f71d9 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 24 Nov 2013 15:47:00 +0100 Subject: [PATCH 082/127] added experimental support for ARMV8 --- Makefile.arm64 | 7 ++ Makefile.system | 8 ++ c_check | 2 + common.h | 6 +- common_arm64.h | 169 ++++++++++++++++++++++++++++++++++++ ctest.c | 3 + getarch.c | 16 ++++ kernel/Makefile.L3 | 4 + kernel/arm64/KERNEL | 46 ++++++++++ kernel/arm64/KERNEL.ARMV8 | 134 ++++++++++++++++++++++++++++ kernel/arm64/Makefile | 2 + lapack/laswp/arm64/Makefile | 33 +++++++ param.h | 40 +++++++++ 13 files changed, 469 insertions(+), 1 deletion(-) create mode 100644 Makefile.arm64 create mode 100644 common_arm64.h create mode 100644 kernel/arm64/KERNEL create mode 100644 kernel/arm64/KERNEL.ARMV8 create mode 100644 kernel/arm64/Makefile create mode 100644 lapack/laswp/arm64/Makefile diff --git a/Makefile.arm64 b/Makefile.arm64 new file mode 100644 index 000000000..a4f8bab6b --- /dev/null +++ b/Makefile.arm64 @@ -0,0 +1,7 @@ + +ifeq ($(CORE), ARMV8) 
+CCOMMON_OPT += -march=armv8-a +FCOMMON_OPT += -march=armv8-a +endif + + diff --git a/Makefile.system b/Makefile.system index 0f5e9c6d5..aceadf2b6 100644 --- a/Makefile.system +++ b/Makefile.system @@ -367,6 +367,14 @@ ifeq ($(ARCH), arm) NO_BINARY_MODE = 1 BINARY_DEFINED = 1 endif + +ifeq ($(ARCH), arm64) +NO_BINARY_MODE = 1 +BINARY_DEFINED = 1 +endif + + + # # C Compiler dependent settings # diff --git a/c_check b/c_check index c1cdd59c4..0828a5bba 100644 --- a/c_check +++ b/c_check @@ -64,6 +64,7 @@ $architecture = alpha if ($data =~ /ARCH_ALPHA/); $architecture = sparc if ($data =~ /ARCH_SPARC/); $architecture = ia64 if ($data =~ /ARCH_IA64/); $architecture = arm if ($data =~ /ARCH_ARM/); +$architecture = arm64 if ($data =~ /ARCH_ARM64/); $defined = 0; @@ -151,6 +152,7 @@ $architecture = alpha if ($data =~ /ARCH_ALPHA/); $architecture = sparc if ($data =~ /ARCH_SPARC/); $architecture = ia64 if ($data =~ /ARCH_IA64/); $architecture = arm if ($data =~ /ARCH_ARM/); +$architecture = arm64 if ($data =~ /ARCH_ARM64/); $binformat = bin32; $binformat = bin64 if ($data =~ /BINARY_64/); diff --git a/common.h b/common.h index a2775520f..310fcad93 100644 --- a/common.h +++ b/common.h @@ -311,7 +311,7 @@ typedef int blasint; #endif -#ifdef ARMV7 +#if defined(ARMV7) || defined(ARMV6) || defined(ARMV8) #define YIELDING asm volatile ("nop;nop;nop;nop;nop;nop;nop;nop; \n"); #endif @@ -375,6 +375,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246 #include "common_arm.h" #endif +#ifdef ARCH_ARM64 +#include "common_arm64.h" +#endif + #ifdef OS_LINUX #include "common_linux.h" #endif diff --git a/common_arm64.h b/common_arm64.h new file mode 100644 index 000000000..2da0d894c --- /dev/null +++ b/common_arm64.h @@ -0,0 +1,169 @@ +/***************************************************************************** +Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the ISCAS nor the names of its contributors may + be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +**********************************************************************************/ + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#ifndef COMMON_ARM64 +#define COMMON_ARM64 + +#define MB +#define WMB + +#define INLINE inline + +#define RETURN_BY_COMPLEX + +#ifndef ASSEMBLER + +static void __inline blas_lock(volatile BLASULONG *address){ +/* + int register ret; + + do { + while (*address) {YIELDING;}; + + __asm__ __volatile__( + "ldrex r2, [%1] \n\t" + "mov r2, #0 \n\t" + "strex r3, r2, [%1] \n\t" + "mov %0 , r3 \n\t" + : "=r"(ret), "=r"(address) + : "1"(address) + : "memory", "r2" , "r3" + + + ); + + } while (ret); +*/ +} + + +static inline unsigned long long rpcc(void){ + unsigned long long ret=0; + double v; + struct timeval tv; + gettimeofday(&tv,NULL); + v=(double) tv.tv_sec + (double) tv.tv_usec * 1e-6; + ret = (unsigned long long) ( v * 1000.0d ); + return ret; +} + +static inline int blas_quickdivide(blasint x, blasint y){ + return x / y; +} + +#if defined(DOUBLE) +#define GET_IMAGE(res) __asm__ __volatile__("vstr.f64 d1, %0" : "=m"(res) : : "memory") +#else +#define GET_IMAGE(res) __asm__ __volatile__("vstr.f32 s1, %0" : "=m"(res) : : "memory") +#endif + +#define GET_IMAGE_CANCEL + +#endif + + +#ifndef F_INTERFACE +#define REALNAME ASMNAME +#else +#define REALNAME ASMFNAME +#endif + +#if defined(ASSEMBLER) && !defined(NEEDPARAM) + +#define PROLOGUE \ + .arm ;\ + .global REALNAME ;\ + .func REALNAME ;\ +REALNAME: + +#define EPILOGUE + +#define PROFCODE + +#endif + + +#define SEEK_ADDRESS + +#ifndef PAGESIZE +#define PAGESIZE ( 4 << 10) +#endif +#define HUGE_PAGESIZE ( 4 << 20) + +#define BUFFER_SIZE (16 << 20) + + +#define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER) + +#ifndef MAP_ANONYMOUS +#define MAP_ANONYMOUS MAP_ANON +#endif + +#endif diff --git a/ctest.c b/ctest.c index 184416339..86dc226d4 100644 --- a/ctest.c +++ b/ctest.c @@ -129,4 +129,7 @@ BINARY_64 ARCH_ARM #endif +#if defined(__aarch64__) +ARCH_ARM64 +#endif diff --git a/getarch.c b/getarch.c index 4407e3d9b..7975c9468 100644 --- a/getarch.c +++ b/getarch.c @@ -709,6 +709,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#else #endif +#ifdef FORCE_ARMV8 +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "ARMV8" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DARMV8 " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " \ + "-DHAVE_VFP -DHAVE_VFPV3 -DHAVE_VFPV4" +#define LIBNAME "armv8" +#define CORENAME "ARMV8" +#else +#endif + + #ifndef FORCE diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index f543cd08d..b9b4bef1e 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -18,6 +18,10 @@ ifeq ($(ARCH), arm) USE_TRMM = 1 endif +ifeq ($(ARCH), arm64) +USE_TRMM = 1 +endif + ifeq ($(TARGET), LOONGSON3B) USE_TRMM = 1 endif diff --git a/kernel/arm64/KERNEL b/kernel/arm64/KERNEL new file mode 100644 index 000000000..aeccfbf4c --- /dev/null +++ b/kernel/arm64/KERNEL @@ -0,0 +1,46 @@ +ifndef SNRM2KERNEL +SNRM2KERNEL = nrm2.c +endif + +ifndef DNRM2KERNEL +DNRM2KERNEL = nrm2.c +endif + +ifndef CNRM2KERNEL +CNRM2KERNEL = znrm2.c +endif + +ifndef ZNRM2KERNEL +ZNRM2KERNEL = znrm2.c +endif + +ifndef SCABS_KERNEL +SCABS_KERNEL = ../generic/cabs.c +endif + +ifndef DCABS_KERNEL +DCABS_KERNEL = ../generic/cabs.c +endif + +ifndef QCABS_KERNEL +QCABS_KERNEL = ../generic/cabs.c +endif + +ifndef LSAME_KERNEL +LSAME_KERNEL = ../generic/lsame.c +endif + +ifndef SGEMM_BETA +SGEMM_BETA = ../generic/gemm_beta.c +endif +ifndef DGEMM_BETA +DGEMM_BETA = ../generic/gemm_beta.c +endif +ifndef CGEMM_BETA +CGEMM_BETA = ../generic/zgemm_beta.c +endif +ifndef ZGEMM_BETA +ZGEMM_BETA = ../generic/zgemm_beta.c +endif + + diff --git a/kernel/arm64/KERNEL.ARMV8 b/kernel/arm64/KERNEL.ARMV8 new file mode 100644 index 000000000..ecf278cf9 --- /dev/null +++ b/kernel/arm64/KERNEL.ARMV8 @@ -0,0 +1,134 @@ +SAMAXKERNEL = ../arm/amax.c +DAMAXKERNEL = ../arm/amax.c +CAMAXKERNEL = ../arm/zamax.c +ZAMAXKERNEL = ../arm/zamax.c + +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMAXKERNEL = ../arm/iamax.c +IDAMAXKERNEL = ../arm/iamax.c +ICAMAXKERNEL = ../arm/izamax.c +IZAMAXKERNEL = ../arm/izamax.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +SASUMKERNEL = ../arm/asum.c +DASUMKERNEL = ../arm/asum.c +CASUMKERNEL = ../arm/zasum.c +ZASUMKERNEL = ../arm/zasum.c + +SAXPYKERNEL = ../arm/axpy.c +DAXPYKERNEL = ../arm/axpy.c +CAXPYKERNEL = ../arm/zaxpy.c +ZAXPYKERNEL = ../arm/zaxpy.c + +SCOPYKERNEL = ../arm/copy.c +DCOPYKERNEL = ../arm/copy.c +CCOPYKERNEL = ../arm/zcopy.c +ZCOPYKERNEL = ../arm/zcopy.c + +SDOTKERNEL = ../arm/dot.c +DDOTKERNEL = ../arm/dot.c +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c + +SNRM2KERNEL = ../arm/nrm2.c +DNRM2KERNEL = ../arm/nrm2.c +CNRM2KERNEL = ../arm/znrm2.c +ZNRM2KERNEL = ../arm/znrm2.c + +SROTKERNEL = ../arm/rot.c +DROTKERNEL = ../arm/rot.c +CROTKERNEL = ../arm/zrot.c +ZROTKERNEL = ../arm/zrot.c + +SSCALKERNEL = ../arm/scal.c +DSCALKERNEL = ../arm/scal.c +CSCALKERNEL = ../arm/zscal.c +ZSCALKERNEL = ../arm/zscal.c + +SSWAPKERNEL = ../arm/swap.c +DSWAPKERNEL = ../arm/swap.c +CSWAPKERNEL = ../arm/zswap.c +ZSWAPKERNEL = ../arm/zswap.c + +SGEMVNKERNEL = ../arm/gemv_n.c +DGEMVNKERNEL = ../arm/gemv_n.c +CGEMVNKERNEL 
= ../arm/zgemv_n.c +ZGEMVNKERNEL = ../arm/zgemv_n.c + +SGEMVTKERNEL = ../arm/gemv_t.c +DGEMVTKERNEL = ../arm/gemv_t.c +CGEMVTKERNEL = ../arm/zgemv_t.c +ZGEMVTKERNEL = ../arm/zgemv_t.c + +STRMMKERNEL = ../generic/trmmkernel_2x2.c +DTRMMKERNEL = ../generic/trmmkernel_2x2.c +CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c + +SGEMMKERNEL = ../generic/gemmkernel_2x2.c +SGEMMONCOPY = ../generic/gemm_ncopy_2.c +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +DGEMMKERNEL = ../generic/gemmkernel_2x2.c +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + + + + diff --git a/kernel/arm64/Makefile b/kernel/arm64/Makefile new file mode 100644 index 000000000..efae70d7b --- /dev/null +++ b/kernel/arm64/Makefile @@ -0,0 +1,2 @@ +clean :: + diff --git a/lapack/laswp/arm64/Makefile b/lapack/laswp/arm64/Makefile new file mode 100644 index 000000000..434c82a84 --- /dev/null +++ b/lapack/laswp/arm64/Makefile @@ -0,0 +1,33 @@ +TOPDIR = ../../.. +include ../../../Makefile.system + +ifeq ($(CORE), CORE2) +LASWP = ../generic/laswp_k_2.c +ZLASWP = ../generic/zlaswp_k_2.c +endif + +ifeq ($(CORE), OPTERON) +LASWP = ../generic/laswp_k_1.c +ZLASWP = ../generic/zlaswp_k_1.c +endif + +ifeq ($(CORE), PRESCOTT) +LASWP = ../generic/laswp_k_1.c +ZLASWP = ../generic/zlaswp_k_1.c +endif + +ifeq ($(DYNAMIC_ARCH), 1) +LASWP = ../generic/laswp_k_4.c +ZLASWP = ../generic/zlaswp_k_4.c +endif + +ifndef LASWP +LASWP = ../generic/laswp_k.c +endif + +ifndef ZLASWP +ZLASWP = ../generic/zlaswp_k.c +endif + +include ../generic/Makefile + diff --git a/param.h b/param.h index ec1767d20..0628a1972 100644 --- a/param.h +++ b/param.h @@ -1874,6 +1874,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define SYMV_P 16 #endif +#if defined(ARMV8) +#define SNUMOPT 2 +#define DNUMOPT 2 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 2 +#define SGEMM_DEFAULT_UNROLL_N 2 + +#define DGEMM_DEFAULT_UNROLL_M 2 +#define DGEMM_DEFAULT_UNROLL_N 2 + +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 2 + +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_P 96 +#define ZGEMM_DEFAULT_P 64 + +#define SGEMM_DEFAULT_Q 240 +#define DGEMM_DEFAULT_Q 120 +#define CGEMM_DEFAULT_Q 120 +#define ZGEMM_DEFAULT_Q 120 + +#define SGEMM_DEFAULT_R 12288 +#define DGEMM_DEFAULT_R 8192 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + + +#define SYMV_P 16 +#endif + + #ifdef GENERIC From fd1d9fdb22955385c77a5743e5103543f5b43f96 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 24 Nov 2013 16:19:01 +0100 Subject: [PATCH 083/127] changed default optimization from -O2 to -O3 for ARMV8 --- Makefile.system | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/Makefile.system b/Makefile.system index aceadf2b6..5545de1b1 100644 --- a/Makefile.system +++ b/Makefile.system @@ -840,11 +840,19 @@ endif ifndef COMMON_OPT ifeq ($(ARCH), arm) COMMON_OPT = -O3 -else -COMMON_OPT = -O2 endif endif +ifndef COMMON_OPT +ifeq ($(ARCH), arm64) +COMMON_OPT = -O3 +endif +endif + +ifndef COMMON_OPT +COMMON_OPT = -O2 +endif + override CFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) override PFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF) From d2b20c5c51b1f174d40dd8879b8a7387ce5a5416 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Mon, 25 Nov 2013 12:25:58 +0100 Subject: [PATCH 084/127] add optimized axpy kernel --- kernel/arm/KERNEL.ARMV6 | 8 +- kernel/arm/KERNEL.ARMV7 | 8 +- kernel/arm/axpy_vfp.S | 503 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 511 insertions(+), 8 deletions(-) create mode 100644 kernel/arm/axpy_vfp.S diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 9dad061d9..79fb5f9c9 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -35,10 +35,10 @@ DASUMKERNEL = asum_vfp.S CASUMKERNEL = asum_vfp.S ZASUMKERNEL = asum_vfp.S -SAXPYKERNEL = axpy.c -DAXPYKERNEL = axpy.c -CAXPYKERNEL = zaxpy.c -ZAXPYKERNEL = zaxpy.c +SAXPYKERNEL = axpy_vfp.S +DAXPYKERNEL = axpy_vfp.S +CAXPYKERNEL = axpy_vfp.S +ZAXPYKERNEL = axpy_vfp.S SCOPYKERNEL = scopy_vfp.S DCOPYKERNEL = dcopy_vfp.S diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index 07d38d189..143327074 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -40,10 +40,10 @@ DASUMKERNEL = asum_vfp.S CASUMKERNEL = asum_vfp.S ZASUMKERNEL = asum_vfp.S -SAXPYKERNEL = ../arm/axpy.c -DAXPYKERNEL = ../arm/axpy.c -CAXPYKERNEL = ../arm/zaxpy.c -ZAXPYKERNEL = ../arm/zaxpy.c +SAXPYKERNEL = axpy_vfp.S +DAXPYKERNEL = axpy_vfp.S +CAXPYKERNEL = axpy_vfp.S +ZAXPYKERNEL = axpy_vfp.S SCOPYKERNEL = scopy_vfp.S DCOPYKERNEL = dcopy_vfp.S diff --git a/kernel/arm/axpy_vfp.S b/kernel/arm/axpy_vfp.S new file mode 100644 index 000000000..acc575707 --- /dev/null +++ b/kernel/arm/axpy_vfp.S @@ -0,0 +1,503 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/14 Saar +* BLASTEST : xOK +* CTEST : xOK +* TEST : xOK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_INC_X [fp, #0 ] +#define OLD_Y [fp, #4 ] +#define OLD_INC_Y [fp, #8 ] + + +#define N r0 +#define Y r1 +#define INC_X r2 +#define X r3 +#define INC_Y r4 + +#define I r12 + +#define X_PRE 512 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +/*****************************************************************************************/ + +#if !defined(CONJ) + +#if defined(DOUBLE) + +#define FMAC_R1 fmacd +#define FMAC_R2 fnmacd +#define FMAC_I1 fmacd +#define FMAC_I2 fmacd + +#else + +#define FMAC_R1 fmacs +#define FMAC_R2 fnmacs +#define FMAC_I1 fmacs +#define FMAC_I2 fmacs + +#endif + +#else // CONJ + +#if defined(DOUBLE) + +#define FMAC_R1 fmacd +#define FMAC_R2 fmacd +#define FMAC_I1 fnmacd +#define FMAC_I2 fmacd + +#else + +#define FMAC_R1 fmacs +#define FMAC_R2 fmacs +#define FMAC_I1 fnmacs +#define FMAC_I2 fmacs + +#endif + +#endif + + +#if !defined(COMPLEX) + +#if defined(DOUBLE) + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + fldmiad X!, { d4 - d7 } + pld [ Y, #X_PRE ] + fldmiad Y , { d8 - d11 } + fmacd d8 , d0, d4 + fstmiad Y!, { d8 } + fmacd d9 , d0, d5 + fstmiad Y!, { d9 } + fmacd d10, d0, d6 + fstmiad Y!, { d10 } + fmacd d11, d0, d7 + fstmiad Y!, { d11 } + + +.endm + + +.macro KERNEL_F1 + + fldmiad X!, { d4 } + fldmiad Y , { d8 } + fmacd d8 , d0, d4 + fstmiad Y!, { d8 } + +.endm + +.macro KERNEL_S1 + + fldmiad X , { d4 } + fldmiad Y , { d8 } + fmacd d8 , d0, d4 + fstmiad Y , { d8 } + add X, X, INC_X + add Y, Y, INC_Y + +.endm + +#else + +.macro KERNEL_F4 + + fldmias X!, { s4 - s7 } + fldmias Y , { s8 - s11 } + 
fmacs s8 , s0, s4 + fstmias Y!, { s8 } + fmacs s9 , s0, s5 + fstmias Y!, { s9 } + fmacs s10, s0, s6 + fstmias Y!, { s10 } + fmacs s11, s0, s7 + fstmias Y!, { s11 } + + +.endm + + +.macro KERNEL_F1 + + fldmias X!, { s4 } + fldmias Y , { s8 } + fmacs s8 , s0, s4 + fstmias Y!, { s8 } + +.endm + +.macro KERNEL_S1 + + fldmias X , { s4 } + fldmias Y , { s8 } + fmacs s8 , s0, s4 + fstmias Y , { s8 } + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + +#endif + +#else + +#if defined(DOUBLE) + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + fldmiad X!, { d4 - d7 } + pld [ Y, #X_PRE ] + fldmiad Y , { d8 - d11 } + + FMAC_R1 d8 , d0, d4 + FMAC_R2 d8 , d1, d5 + FMAC_I1 d9 , d0, d5 + FMAC_I2 d9 , d1, d4 + fstmiad Y!, { d8 } + fstmiad Y!, { d9 } + + FMAC_R1 d10, d0, d6 + FMAC_R2 d10, d1, d7 + FMAC_I1 d11, d0, d7 + FMAC_I2 d11, d1, d6 + fstmiad Y!, { d10 } + fstmiad Y!, { d11 } + + pld [ X, #X_PRE ] + fldmiad X!, { d4 - d7 } + pld [ Y, #X_PRE ] + fldmiad Y , { d8 - d11 } + + FMAC_R1 d8 , d0, d4 + FMAC_R2 d8 , d1, d5 + FMAC_I1 d9 , d0, d5 + FMAC_I2 d9 , d1, d4 + fstmiad Y!, { d8 } + fstmiad Y!, { d9 } + + FMAC_R1 d10, d0, d6 + FMAC_R2 d10, d1, d7 + FMAC_I1 d11, d0, d7 + FMAC_I2 d11, d1, d6 + fstmiad Y!, { d10 } + fstmiad Y!, { d11 } + + + + + +.endm + + +.macro KERNEL_F1 + + fldmiad X!, { d4 - d5 } + fldmiad Y , { d8 - d9 } + + FMAC_R1 d8 , d0, d4 + FMAC_R2 d8 , d1, d5 + FMAC_I1 d9 , d0, d5 + FMAC_I2 d9 , d1, d4 + fstmiad Y!, { d8 } + fstmiad Y!, { d9 } + + + +.endm + +.macro KERNEL_S1 + + fldmiad X , { d4 - d5 } + fldmiad Y , { d8 - d9 } + + FMAC_R1 d8 , d0, d4 + FMAC_R2 d8 , d1, d5 + FMAC_I1 d9 , d0, d5 + FMAC_I2 d9 , d1, d4 + fstmiad Y , { d8 - d9 } + + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + + +#else + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + fldmias X!, { s4 - s7 } + pld [ Y, #X_PRE ] + fldmias Y , { s8 - s11 } + + FMAC_R1 s8 , s0, s4 + FMAC_R2 s8 , s1, s5 + FMAC_I1 s9 , s0, s5 + FMAC_I2 s9 , s1, s4 + fstmias Y!, { s8 } + fstmias Y!, { s9 } + + FMAC_R1 s10, s0, s6 + FMAC_R2 s10, s1, s7 + FMAC_I1 s11, s0, s7 + FMAC_I2 s11, s1, s6 + fstmias Y!, { s10 } + fstmias Y!, { s11 } + + fldmias X!, { s4 - s7 } + fldmias Y , { s8 - s11 } + + FMAC_R1 s8 , s0, s4 + FMAC_R2 s8 , s1, s5 + FMAC_I1 s9 , s0, s5 + FMAC_I2 s9 , s1, s4 + fstmias Y!, { s8 } + fstmias Y!, { s9 } + + FMAC_R1 s10, s0, s6 + FMAC_R2 s10, s1, s7 + FMAC_I1 s11, s0, s7 + FMAC_I2 s11, s1, s6 + fstmias Y!, { s10 } + fstmias Y!, { s11 } + + + + + +.endm + + +.macro KERNEL_F1 + + fldmias X!, { s4 - s5 } + fldmias Y , { s8 - s9 } + + FMAC_R1 s8 , s0, s4 + FMAC_R2 s8 , s1, s5 + FMAC_I1 s9 , s0, s5 + FMAC_I2 s9 , s1, s4 + fstmias Y!, { s8 } + fstmias Y!, { s9 } + + + +.endm + +.macro KERNEL_S1 + + fldmias X , { s4 - s5 } + fldmias Y , { s8 - s9 } + + FMAC_R1 s8 , s0, s4 + FMAC_R2 s8 , s1, s5 + FMAC_I1 s9 , s0, s5 + FMAC_I2 s9 , s1, s4 + fstmias Y , { s8 - s9 } + + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + +#endif + +#endif + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + push {r4 , fp} + add fp, sp, #8 + sub sp, sp, #STACKSIZE // reserve stack + + ldr INC_X , OLD_INC_X + ldr Y, OLD_Y + ldr INC_Y , OLD_INC_Y + + sub r12, fp, #128 + +#if defined(DOUBLE) + vstm r12, { d8 - d15} // store floating point registers +#else + vstm r12, { s8 - s15} // store floating point registers +#endif + + cmp N, #0 + ble axpy_kernel_L999 + + cmp INC_X, #0 + beq axpy_kernel_L999 + + cmp 
INC_Y, #0 + beq axpy_kernel_L999 + + cmp INC_X, #1 + bne axpy_kernel_S_BEGIN + + cmp INC_Y, #1 + bne axpy_kernel_S_BEGIN + + +axpy_kernel_F_BEGIN: + + + asrs I, N, #2 // I = N / 4 + ble axpy_kernel_F1 + + .align 5 + +axpy_kernel_F4: + +#if !defined(COMPLEX) && !defined(DOUBLE) + pld [ X, #X_PRE ] + pld [ Y, #X_PRE ] +#endif + + KERNEL_F4 + + subs I, I, #1 + ble axpy_kernel_F1 + + KERNEL_F4 + + subs I, I, #1 + bne axpy_kernel_F4 + +axpy_kernel_F1: + + ands I, N, #3 + ble axpy_kernel_L999 + +axpy_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne axpy_kernel_F10 + + b axpy_kernel_L999 + +axpy_kernel_S_BEGIN: + +#if defined(COMPLEX) + +#if defined(DOUBLE) + lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 + lsl INC_Y, INC_Y, #4 // INC_Y * SIZE * 2 +#else + lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 + lsl INC_Y, INC_Y, #3 // INC_Y * SIZE * 2 +#endif + +#else + +#if defined(DOUBLE) + lsl INC_X, INC_X, #3 // INC_X * SIZE + lsl INC_Y, INC_Y, #3 // INC_Y * SIZE +#else + lsl INC_X, INC_X, #2 // INC_X * SIZE + lsl INC_Y, INC_Y, #2 // INC_Y * SIZE +#endif + +#endif + + + asrs I, N, #2 // I = N / 4 + ble axpy_kernel_S1 + + .align 5 + +axpy_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne axpy_kernel_S4 + +axpy_kernel_S1: + + ands I, N, #3 + ble axpy_kernel_L999 + +axpy_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne axpy_kernel_S10 + + +axpy_kernel_L999: + + sub r3, fp, #128 + +#if defined(DOUBLE) + vldm r3, { d8 - d15 } // restore floating point registers +#else + vldm r3, { s8 - s15 } // restore floating point registers +#endif + + mov r0, #0 // set return value + + sub sp, fp, #8 + pop {r4,fp} + bx lr + + EPILOGUE + From 36b0f7fe1dfc89e9d08a8afc508229833a1f11e9 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Mon, 25 Nov 2013 19:31:27 +0100 Subject: [PATCH 085/127] added optimized gemv_t kernel for ARMV6 --- kernel/arm/KERNEL.ARMV6 | 4 +- kernel/arm/gemv_t_vfp.S | 750 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 752 insertions(+), 2 deletions(-) create mode 100644 kernel/arm/gemv_t_vfp.S diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 79fb5f9c9..1ab2683cf 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -75,8 +75,8 @@ DGEMVNKERNEL = gemv_n_vfp.S CGEMVNKERNEL = zgemv_n.c ZGEMVNKERNEL = zgemv_n.c -SGEMVTKERNEL = gemv_t.c -DGEMVTKERNEL = gemv_t.c +SGEMVTKERNEL = gemv_t_vfp.S +DGEMVTKERNEL = gemv_t_vfp.S CGEMVTKERNEL = zgemv_t.c ZGEMVTKERNEL = zgemv_t.c diff --git a/kernel/arm/gemv_t_vfp.S b/kernel/arm/gemv_t_vfp.S new file mode 100644 index 000000000..6a56ae9d1 --- /dev/null +++ b/kernel/arm/gemv_t_vfp.S @@ -0,0 +1,750 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/25 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_LDA [fp, #0 ] +#define X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] +#define OLD_A r3 +#define OLD_N r1 + +#define M r0 +#define AO1 r1 +#define J r2 + +#define AO2 r4 +#define XO r5 +#define YO r6 +#define LDA r7 +#define INC_X r8 +#define INC_Y r9 + +#define I r12 + +#define N [fp, #-252 ] +#define A [fp, #-256 ] + + +#define X_PRE 512 +#define A_PRE 512 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + + +#if defined(DOUBLE) + +.macro INIT_F2 + + vsub.f64 d2 , d2 , d2 + vsub.f64 d3 , d3 , d3 + +.endm + +.macro KERNEL_F2X4 + + pld [ XO , #X_PRE ] + fldmiad XO! , { d12 - d15 } + pld [ AO1 , #A_PRE ] + fldmiad AO1!, { d8 - d9 } + pld [ AO2 , #A_PRE ] + fldmiad AO2!, { d4 - d5 } + fldmiad AO1!, { d10 - d11 } + fldmiad AO2!, { d6 - d7 } + + vmla.f64 d2 , d12 , d8 + vmla.f64 d3 , d12 , d4 + vmla.f64 d2 , d13 , d9 + vmla.f64 d3 , d13 , d5 + vmla.f64 d2 , d14, d10 + vmla.f64 d3 , d14, d6 + vmla.f64 d2 , d15, d11 + vmla.f64 d3 , d15, d7 + +.endm + +.macro KERNEL_F2X1 + + fldmiad XO! , { d1 } + fldmiad AO1!, { d8 } + fldmiad AO2!, { d4 } + vmla.f64 d2 , d1 , d8 + vmla.f64 d3 , d1 , d4 + +.endm + +.macro SAVE_F2 + + fldmiad YO, { d4 - d5 } + vmla.f64 d4, d0, d2 + vmla.f64 d5, d0, d3 + fstmiad YO!, { d4 - d5 } + +.endm + +.macro INIT_F1 + + vsub.f64 d2 , d2 , d2 + +.endm + +.macro KERNEL_F1X4 + + pld [ XO , #X_PRE ] + fldmiad XO! , { d12 - d15 } + pld [ AO1 , #A_PRE ] + fldmiad AO1!, { d8 - d9 } + fldmiad AO1!, { d10 - d11 } + vmla.f64 d2 , d12 , d8 + vmla.f64 d2 , d13 , d9 + vmla.f64 d2 , d14, d10 + vmla.f64 d2 , d15, d11 + +.endm + +.macro KERNEL_F1X1 + + fldmiad XO! 
, { d1 } + fldmiad AO1!, { d8 } + vmla.f64 d2 , d1 , d8 + +.endm + +.macro SAVE_F1 + + fldmiad YO, { d4 } + vmla.f64 d4, d0, d2 + fstmiad YO!, { d4 } + +.endm + + +.macro INIT_S2 + + vsub.f64 d2 , d2 , d2 + vsub.f64 d3 , d3 , d3 + +.endm + +.macro KERNEL_S2X4 + + fldmiad XO , { d12 } + add XO, XO, INC_X + + pld [ AO1 , #A_PRE ] + fldmiad AO1!, { d8 - d9 } + pld [ AO2 , #A_PRE ] + fldmiad AO2!, { d4 - d5 } + + fldmiad XO , { d13 } + add XO, XO, INC_X + fldmiad AO1!, { d10 - d11 } + fldmiad AO2!, { d6 - d7 } + + fldmiad XO , { d14 } + add XO, XO, INC_X + + fldmiad XO , { d15 } + add XO, XO, INC_X + + vmla.f64 d2 , d12 , d8 + vmla.f64 d3 , d12 , d4 + vmla.f64 d2 , d13 , d9 + vmla.f64 d3 , d13 , d5 + vmla.f64 d2 , d14, d10 + vmla.f64 d3 , d14, d6 + vmla.f64 d2 , d15, d11 + vmla.f64 d3 , d15, d7 + +.endm + +.macro KERNEL_S2X1 + + fldmiad XO , { d1 } + fldmiad AO1!, { d8 } + fldmiad AO2!, { d4 } + vmla.f64 d2 , d1 , d8 + add XO, XO, INC_X + vmla.f64 d3 , d1 , d4 + +.endm + +.macro SAVE_S2 + + fldmiad YO, { d4 } + vmla.f64 d4, d0, d2 + fstmiad YO, { d4 } + add YO, YO, INC_Y + + fldmiad YO, { d5 } + vmla.f64 d5, d0, d3 + fstmiad YO, { d5 } + add YO, YO, INC_Y + +.endm + +.macro INIT_S1 + + vsub.f64 d2 , d2 , d2 + +.endm + +.macro KERNEL_S1X4 + + fldmiad XO , { d12 } + add XO, XO, INC_X + + pld [ AO1 , #A_PRE ] + fldmiad AO1!, { d8 - d9 } + + fldmiad XO , { d13 } + add XO, XO, INC_X + fldmiad AO1!, { d10 - d11 } + + fldmiad XO , { d14 } + add XO, XO, INC_X + + fldmiad XO , { d15 } + add XO, XO, INC_X + + vmla.f64 d2 , d12 , d8 + vmla.f64 d2 , d13 , d9 + vmla.f64 d2 , d14, d10 + vmla.f64 d2 , d15, d11 + +.endm + +.macro KERNEL_S1X1 + + fldmiad XO , { d1 } + fldmiad AO1!, { d8 } + vmla.f64 d2 , d1 , d8 + add XO, XO, INC_X + +.endm + +.macro SAVE_S1 + + fldmiad YO, { d4 } + vmla.f64 d4, d0, d2 + fstmiad YO, { d4 } + add YO, YO, INC_Y + +.endm + + +#else /************************* SINGLE PRECISION *****************************************/ + +.macro INIT_F2 + + vsub.f32 s2 , s2 , s2 + vsub.f32 s3 , s3 , s3 + +.endm + +.macro KERNEL_F2X4 + + fldmias XO! , { s12 - s15 } + fldmias AO1!, { s8 - s9 } + fldmias AO2!, { s4 - s5 } + fldmias AO1!, { s10 - s11 } + fldmias AO2!, { s6 - s7 } + + vmla.f32 s2 , s12 , s8 + vmla.f32 s3 , s12 , s4 + vmla.f32 s2 , s13 , s9 + vmla.f32 s3 , s13 , s5 + vmla.f32 s2 , s14, s10 + vmla.f32 s3 , s14, s6 + vmla.f32 s2 , s15, s11 + vmla.f32 s3 , s15, s7 + +.endm + +.macro KERNEL_F2X1 + + fldmias XO! , { s1 } + fldmias AO1!, { s8 } + fldmias AO2!, { s4 } + vmla.f32 s2 , s1 , s8 + vmla.f32 s3 , s1 , s4 + +.endm + +.macro SAVE_F2 + + fldmias YO, { s4 - s5 } + vmla.f32 s4, s0, s2 + vmla.f32 s5, s0, s3 + fstmias YO!, { s4 - s5 } + +.endm + +.macro INIT_F1 + + vsub.f32 s2 , s2 , s2 + +.endm + +.macro KERNEL_F1X4 + + fldmias XO! , { s12 - s15 } + fldmias AO1!, { s8 - s9 } + fldmias AO1!, { s10 - s11 } + vmla.f32 s2 , s12 , s8 + vmla.f32 s2 , s13 , s9 + vmla.f32 s2 , s14, s10 + vmla.f32 s2 , s15, s11 + +.endm + +.macro KERNEL_F1X1 + + fldmias XO! 
, { s1 } + fldmias AO1!, { s8 } + vmla.f32 s2 , s1 , s8 + +.endm + +.macro SAVE_F1 + + fldmias YO, { s4 } + vmla.f32 s4, s0, s2 + fstmias YO!, { s4 } + +.endm + + +.macro INIT_S2 + + vsub.f32 s2 , s2 , s2 + vsub.f32 s3 , s3 , s3 + +.endm + +.macro KERNEL_S2X4 + + fldmias XO , { s12 } + add XO, XO, INC_X + + fldmias AO1!, { s8 - s9 } + fldmias AO2!, { s4 - s5 } + + fldmias XO , { s13 } + add XO, XO, INC_X + fldmias AO1!, { s10 - s11 } + fldmias AO2!, { s6 - s7 } + + fldmias XO , { s14 } + add XO, XO, INC_X + + fldmias XO , { s15 } + add XO, XO, INC_X + + vmla.f32 s2 , s12 , s8 + vmla.f32 s3 , s12 , s4 + vmla.f32 s2 , s13 , s9 + vmla.f32 s3 , s13 , s5 + vmla.f32 s2 , s14, s10 + vmla.f32 s3 , s14, s6 + vmla.f32 s2 , s15, s11 + vmla.f32 s3 , s15, s7 + +.endm + +.macro KERNEL_S2X1 + + fldmias XO , { s1 } + fldmias AO1!, { s8 } + fldmias AO2!, { s4 } + vmla.f32 s2 , s1 , s8 + add XO, XO, INC_X + vmla.f32 s3 , s1 , s4 + +.endm + +.macro SAVE_S2 + + fldmias YO, { s4 } + vmla.f32 s4, s0, s2 + fstmias YO, { s4 } + add YO, YO, INC_Y + + fldmias YO, { s5 } + vmla.f32 s5, s0, s3 + fstmias YO, { s5 } + add YO, YO, INC_Y + +.endm + +.macro INIT_S1 + + vsub.f32 s2 , s2 , s2 + +.endm + +.macro KERNEL_S1X4 + + fldmias XO , { s12 } + add XO, XO, INC_X + + pld [ AO1 , #A_PRE ] + fldmias AO1!, { s8 - s9 } + + fldmias XO , { s13 } + add XO, XO, INC_X + fldmias AO1!, { s10 - s11 } + + fldmias XO , { s14 } + add XO, XO, INC_X + + fldmias XO , { s15 } + add XO, XO, INC_X + + vmla.f32 s2 , s12 , s8 + vmla.f32 s2 , s13 , s9 + vmla.f32 s2 , s14, s10 + vmla.f32 s2 , s15, s11 + +.endm + +.macro KERNEL_S1X1 + + fldmias XO , { s1 } + fldmias AO1!, { s8 } + vmla.f32 s2 , s1 , s8 + add XO, XO, INC_X + +.endm + +.macro SAVE_S1 + + fldmias YO, { s4 } + vmla.f32 s4, s0, s2 + fstmias YO, { s4 } + add YO, YO, INC_Y + +.endm + + + +#endif + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + push {r4 - r9 , fp} + add fp, sp, #28 + sub sp, sp, #STACKSIZE // reserve stack + + sub r12, fp, #192 + +#if defined(DOUBLE) + vstm r12, { d8 - d15 } // store floating point registers +#else + vstm r12, { s8 - s15 } // store floating point registers +#endif + + cmp M, #0 + ble gemvt_kernel_L999 + + cmp OLD_N, #0 + ble gemvt_kernel_L999 + + str OLD_A, A + str OLD_N, N + + ldr INC_X , OLD_INC_X + ldr INC_Y , OLD_INC_Y + + cmp INC_X, #0 + beq gemvt_kernel_L999 + + cmp INC_Y, #0 + beq gemvt_kernel_L999 + + ldr LDA, OLD_LDA + + +#if defined(DOUBLE) + lsl LDA, LDA, #3 // LDA * SIZE +#else + lsl LDA, LDA, #2 // LDA * SIZE +#endif + + cmp INC_X, #1 + bne gemvt_kernel_S2_BEGIN + + cmp INC_Y, #1 + bne gemvt_kernel_S2_BEGIN + + +gemvt_kernel_F2_BEGIN: + + ldr YO , Y + + ldr J, N + asrs J, J, #1 // J = N / 2 + ble gemvt_kernel_F1_BEGIN + +gemvt_kernel_F2X4: + + ldr AO1, A + add AO2, AO1, LDA + add r3 , AO2, LDA + str r3 , A + + ldr XO , X + + INIT_F2 + + asrs I, M, #2 // I = M / 4 + ble gemvt_kernel_F2X1 + + +gemvt_kernel_F2X4_10: + + KERNEL_F2X4 + + subs I, I, #1 + bne gemvt_kernel_F2X4_10 + + +gemvt_kernel_F2X1: + + ands I, M , #3 + ble gemvt_kernel_F2_END + +gemvt_kernel_F2X1_10: + + KERNEL_F2X1 + + subs I, I, #1 + bne gemvt_kernel_F2X1_10 + + +gemvt_kernel_F2_END: + + SAVE_F2 + + subs J , J , #1 + bne gemvt_kernel_F2X4 + + +gemvt_kernel_F1_BEGIN: + + ldr J, N + ands J, J, #1 + ble gemvt_kernel_L999 + +gemvt_kernel_F1X4: + + ldr AO1, A + + ldr XO , X + + INIT_F1 + + asrs 
I, M, #2 // I = M / 4 + ble gemvt_kernel_F1X1 + + +gemvt_kernel_F1X4_10: + + KERNEL_F1X4 + + subs I, I, #1 + bne gemvt_kernel_F1X4_10 + + +gemvt_kernel_F1X1: + + ands I, M , #3 + ble gemvt_kernel_F1_END + +gemvt_kernel_F1X1_10: + + KERNEL_F1X1 + + subs I, I, #1 + bne gemvt_kernel_F1X1_10 + + +gemvt_kernel_F1_END: + + SAVE_F1 + + b gemvt_kernel_L999 + + + +/*************************************************************************************************************/ + +gemvt_kernel_S2_BEGIN: + +#if defined(DOUBLE) + lsl INC_X, INC_X, #3 // INC_X * SIZE + lsl INC_Y, INC_Y, #3 // INC_Y * SIZE +#else + lsl INC_X, INC_X, #2 // INC_X * SIZE + lsl INC_Y, INC_Y, #2 // INC_Y * SIZE +#endif + + ldr YO , Y + + ldr J, N + asrs J, J, #1 // J = N / 2 + ble gemvt_kernel_S1_BEGIN + +gemvt_kernel_S2X4: + + ldr AO1, A + add AO2, AO1, LDA + add r3 , AO2, LDA + str r3 , A + + ldr XO , X + + INIT_S2 + + asrs I, M, #2 // I = M / 4 + ble gemvt_kernel_S2X1 + + +gemvt_kernel_S2X4_10: + + KERNEL_S2X4 + + subs I, I, #1 + bne gemvt_kernel_S2X4_10 + + +gemvt_kernel_S2X1: + + ands I, M , #3 + ble gemvt_kernel_S2_END + +gemvt_kernel_S2X1_10: + + KERNEL_S2X1 + + subs I, I, #1 + bne gemvt_kernel_S2X1_10 + + +gemvt_kernel_S2_END: + + SAVE_S2 + + subs J , J , #1 + bne gemvt_kernel_S2X4 + + +gemvt_kernel_S1_BEGIN: + + ldr J, N + ands J, J, #1 + ble gemvt_kernel_L999 + +gemvt_kernel_S1X4: + + ldr AO1, A + + ldr XO , X + + INIT_S1 + + asrs I, M, #2 // I = M / 4 + ble gemvt_kernel_S1X1 + + +gemvt_kernel_S1X4_10: + + KERNEL_S1X4 + + subs I, I, #1 + bne gemvt_kernel_S1X4_10 + + +gemvt_kernel_S1X1: + + ands I, M , #3 + ble gemvt_kernel_S1_END + +gemvt_kernel_S1X1_10: + + KERNEL_S1X1 + + subs I, I, #1 + bne gemvt_kernel_S1X1_10 + + +gemvt_kernel_S1_END: + + SAVE_S1 + + + +/*************************************************************************************************************/ + +gemvt_kernel_L999: + + sub r3, fp, #192 + +#if defined(DOUBLE) + vldm r3, { d8 - d15 } // restore floating point registers +#else + vldm r3, { s8 - s15 } // restore floating point registers +#endif + + mov r0, #0 // set return value + + sub sp, fp, #28 + pop {r4 -r9 ,fp} + bx lr + + EPILOGUE + From 697e198e8a530be338b7114d2e24d2d1bb7e3392 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Wed, 27 Nov 2013 16:15:06 +0100 Subject: [PATCH 086/127] added zgemm_kernel for ARMV6 --- kernel/arm/KERNEL.ARMV6 | 2 +- kernel/arm/zgemm_kernel_2x2_vfp.S | 1311 +++++++++++++++++++++++++++++ 2 files changed, 1312 insertions(+), 1 deletion(-) create mode 100644 kernel/arm/zgemm_kernel_2x2_vfp.S diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 1ab2683cf..58e4d2702 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -111,7 +111,7 @@ CGEMMOTCOPY = cgemm_tcopy_2_vfp.S CGEMMONCOPYOBJ = cgemm_oncopy.o CGEMMOTCOPYOBJ = cgemm_otcopy.o -ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +ZGEMMKERNEL = zgemm_kernel_2x2_vfp.S ZGEMMONCOPY = zgemm_ncopy_2_vfp.S ZGEMMOTCOPY = zgemm_tcopy_2_vfp.S ZGEMMONCOPYOBJ = zgemm_oncopy.o diff --git a/kernel/arm/zgemm_kernel_2x2_vfp.S b/kernel/arm/zgemm_kernel_2x2_vfp.S new file mode 100644 index 000000000..7f7664981 --- /dev/null +++ b/kernel/arm/zgemm_kernel_2x2_vfp.S @@ -0,0 +1,1311 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/05 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* 2013/11/02 Saar +* UNROLL_N 2 +* UNROLL_M 2 +* ZGEMM_P 64 +* ZGEMM_Q 120 +* ZGEMM_R 4096 +* A_PRE 96 +* B_PRE 96 +* C_PRE 64 +* +* Performance on Odroid U2: +* +* 1 Core: 1.62 GFLOPS ATLAS: 1.39 GFLOPS +* 2 Cores: 3.20 GFLOPS ATLAS: 2.54 GFLOPS +* 3 Cores: 4.72 GFLOPS ATLAS: 3.76 GFLOPS +* 4 Cores: 5.93 GFLOPS ATLAS: 4.88 GFLOPS +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA_R d0 +#define OLD_ALPHA_I d1 + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define A [fp, #-248 ] +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] + +#define ALPHA_I [fp, #-272] +#define ALPHA_R [fp, #-280] + +#define B [fp, #4 ] +#define C [fp, #8 ] +#define OLD_LDC [fp, #12 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 + +#define A_PRE 96 +#define B_PRE 96 +#define C_PRE 64 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + + #define KMAC_R fnmacd + #define KMAC_I fmacd + + #define FMAC_R1 fmacd + #define FMAC_R2 fnmacd + #define FMAC_I1 fmacd + #define FMAC_I2 fmacd + +#elif defined(CN) || defined(CT) + + #define KMAC_R fmacd + #define KMAC_I fnmacd + + #define FMAC_R1 fmacd + #define FMAC_R2 fnmacd + #define FMAC_I1 fmacd + #define FMAC_I2 fmacd + +#elif defined(NC) || defined(TC) + + #define KMAC_R fmacd + #define KMAC_I fnmacd + + #define FMAC_R1 fmacd + #define FMAC_R2 fmacd + #define FMAC_I1 fnmacd + #define FMAC_I2 fmacd + +#else + + #define KMAC_R fnmacd + #define KMAC_I fmacd + + #define FMAC_R1 fmacd + #define FMAC_R2 fmacd + #define FMAC_I1 fnmacd + #define FMAC_I2 fmacd + 
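+/* Convention check (editor's summary, not from the original commit):
+ * for acc += a * b with complex a = (ar, ai) and b = (br, bi), the
+ * NN/NT/TN/TT case above needs
+ *
+ *     re += ar * br - ai * bi      ; fmacd, then KMAC_R = fnmacd
+ *     im += ar * bi + ai * br      ; fmacd, then KMAC_I = fmacd
+ *
+ * because fnmacd d, n, m computes d -= n * m.  The conjugated cases
+ * flip the sign of the ai term accordingly, and FMAC_R1/R2 and
+ * FMAC_I1/I2 apply the complex alpha in the same fashion inside the
+ * SAVE macros. */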
+#endif + + + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT2x2 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9 , d8 + vmov.f64 d10, d8 + vmov.f64 d11, d8 + vmov.f64 d12, d8 + vmov.f64 d13, d8 + vmov.f64 d14, d8 + vmov.f64 d15, d8 + +.endm + +.macro KERNEL2x2_I + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fldd d6 , [ BO, #16 ] + fldd d7 , [ BO, #24 ] + + fmuld d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmuld d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmuld d10 , d2, d4 + KMAC_R d10 , d3, d5 + fmuld d11 , d2, d5 + KMAC_I d11 , d3, d4 + + fmuld d12 , d0, d6 + KMAC_R d12 , d1, d7 + fmuld d13 , d0, d7 + KMAC_I d13 , d1, d6 + + fmuld d14 , d2, d6 + KMAC_R d14 , d3, d7 + fmuld d15 , d2, d7 + KMAC_I d15 , d3, d6 + + add BO , BO, #32 + add AO , AO, #32 + +.endm + + + +.macro KERNEL2x2_M1 + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + fldd d6 , [ BO, #16 ] + fldd d7 , [ BO, #24 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d10 , d2, d4 + KMAC_R d10 , d3, d5 + fmacd d11 , d2, d5 + KMAC_I d11 , d3, d4 + + fmacd d12 , d0, d6 + KMAC_R d12 , d1, d7 + fmacd d13 , d0, d7 + KMAC_I d13 , d1, d6 + + fmacd d14 , d2, d6 + KMAC_R d14 , d3, d7 + fmacd d15 , d2, d7 + KMAC_I d15 , d3, d6 + + add BO , BO, #32 + add AO , AO, #32 + +.endm + +.macro KERNEL2x2_M2 + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + fldd d6 , [ BO, #16 ] + fldd d7 , [ BO, #24 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d10 , d2, d4 + KMAC_R d10 , d3, d5 + fmacd d11 , d2, d5 + KMAC_I d11 , d3, d4 + + fmacd d12 , d0, d6 + KMAC_R d12 , d1, d7 + fmacd d13 , d0, d7 + KMAC_I d13 , d1, d6 + + fmacd d14 , d2, d6 + KMAC_R d14 , d3, d7 + fmacd d15 , d2, d7 + KMAC_I d15 , d3, d6 + + add BO , BO, #32 + add AO , AO, #32 + + +.endm + + +.macro KERNEL2x2_E + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + fldd d6 , [ BO, #16 ] + fldd d7 , [ BO, #24 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d10 , d2, d4 + KMAC_R d10 , d3, d5 + fmacd d11 , d2, d5 + KMAC_I d11 , d3, d4 + + fmacd d12 , d0, d6 + KMAC_R d12 , d1, d7 + fmacd d13 , d0, d7 + KMAC_I d13 , d1, d6 + + fmacd d14 , d2, d6 + KMAC_R d14 , d3, d7 + fmacd d15 , d2, d7 + KMAC_I d15 , d3, d6 + + add BO , BO, #32 + add AO , AO, #32 + +.endm + +.macro KERNEL2x2_SUB + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + fldd d6 , [ BO, #16 ] + fldd d7 , [ BO, #24 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d10 , d2, d4 + KMAC_R d10 , d3, d5 + fmacd d11 , d2, d5 + KMAC_I d11 , d3, d4 + + fmacd d12 , d0, d6 + KMAC_R d12 , d1, d7 + fmacd d13 , d0, d7 + KMAC_I d13 , d1, d6 + + fmacd d14 , d2, d6 + KMAC_R d14 , d3, d7 + fmacd d15 , d2, d7 + KMAC_I d15 , d3, d6 + + add BO , BO, #32 + add AO , AO, #32 + + +.endm + + +.macro SAVE2x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + fldmiad CO1, { d4 - 
d7 } + + FMAC_R1 d4 , d0 , d8 + FMAC_I1 d5 , d0 , d9 + FMAC_R2 d4 , d1 , d9 + FMAC_I2 d5 , d1 , d8 + + FMAC_R1 d6 , d0 , d10 + FMAC_I1 d7 , d0 , d11 + FMAC_R2 d6 , d1 , d11 + FMAC_I2 d7 , d1 , d10 + + fstmiad CO1, { d4 - d7 } + + fldmiad CO2, { d4 - d7 } + + FMAC_R1 d4 , d0 , d12 + FMAC_I1 d5 , d0 , d13 + FMAC_R2 d4 , d1 , d13 + FMAC_I2 d5 , d1 , d12 + + FMAC_R1 d6 , d0 , d14 + FMAC_I1 d7 , d0 , d15 + FMAC_R2 d6 , d1 , d15 + FMAC_I2 d7 , d1 , d14 + + fstmiad CO2, { d4 - d7 } + + add CO1, CO1, #32 + +.endm + +/******************************************************************************/ + +.macro INIT1x2 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9 , d8 + vmov.f64 d12, d8 + vmov.f64 d13, d8 + +.endm + +.macro KERNEL1x2_I + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fldd d6 , [ BO, #16 ] + fldd d7 , [ BO, #24 ] + + fmuld d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmuld d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmuld d12 , d0, d6 + KMAC_R d12 , d1, d7 + fmuld d13 , d0, d7 + KMAC_I d13 , d1, d6 + + add BO , BO, #32 + add AO , AO, #16 + +.endm + + + +.macro KERNEL1x2_M1 + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + fldd d6 , [ BO, #16 ] + fldd d7 , [ BO, #24 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d12 , d0, d6 + KMAC_R d12 , d1, d7 + fmacd d13 , d0, d7 + KMAC_I d13 , d1, d6 + + add BO , BO, #32 + add AO , AO, #16 + +.endm + +.macro KERNEL1x2_M2 + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + fldd d6 , [ BO, #16 ] + fldd d7 , [ BO, #24 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d12 , d0, d6 + KMAC_R d12 , d1, d7 + fmacd d13 , d0, d7 + KMAC_I d13 , d1, d6 + + add BO , BO, #32 + add AO , AO, #16 + + +.endm + + +.macro KERNEL1x2_E + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + fldd d6 , [ BO, #16 ] + fldd d7 , [ BO, #24 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d12 , d0, d6 + KMAC_R d12 , d1, d7 + fmacd d13 , d0, d7 + KMAC_I d13 , d1, d6 + + add BO , BO, #32 + add AO , AO, #16 + +.endm + +.macro KERNEL1x2_SUB + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + fldd d6 , [ BO, #16 ] + fldd d7 , [ BO, #24 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d12 , d0, d6 + KMAC_R d12 , d1, d7 + fmacd d13 , d0, d7 + KMAC_I d13 , d1, d6 + + add BO , BO, #32 + add AO , AO, #16 + + +.endm + + +.macro SAVE1x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + fldmiad CO1, { d4 - d5 } + + FMAC_R1 d4 , d0 , d8 + FMAC_I1 d5 , d0 , d9 + FMAC_R2 d4 , d1 , d9 + FMAC_I2 d5 , d1 , d8 + + fstmiad CO1, { d4 - d5 } + + fldmiad CO2, { d4 - d5 } + + FMAC_R1 d4 , d0 , d12 + FMAC_I1 d5 , d0 , d13 + FMAC_R2 d4 , d1 , d13 + FMAC_I2 d5 , d1 , d12 + + fstmiad CO2, { d4 - d5 } + + add CO1, CO1, #16 + +.endm + + +/******************************************************************************/ + +.macro INIT2x1 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9 , d8 + vmov.f64 d10, d8 + vmov.f64 d11, d8 + +.endm + +.macro KERNEL2x1_I + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmuld d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmuld d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmuld d10 , d2, d4 + KMAC_R d10 , d3, d5 + fmuld d11 , d2, 
d5 + KMAC_I d11 , d3, d4 + + add BO , BO, #16 + add AO , AO, #32 + +.endm + + + +.macro KERNEL2x1_M1 + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d10 , d2, d4 + KMAC_R d10 , d3, d5 + fmacd d11 , d2, d5 + KMAC_I d11 , d3, d4 + + add BO , BO, #16 + add AO , AO, #32 + +.endm + +.macro KERNEL2x1_M2 + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d10 , d2, d4 + KMAC_R d10 , d3, d5 + fmacd d11 , d2, d5 + KMAC_I d11 , d3, d4 + + add BO , BO, #16 + add AO , AO, #32 + + +.endm + + +.macro KERNEL2x1_E + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d10 , d2, d4 + KMAC_R d10 , d3, d5 + fmacd d11 , d2, d5 + KMAC_I d11 , d3, d4 + + add BO , BO, #16 + add AO , AO, #32 + +.endm + +.macro KERNEL2x1_SUB + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d10 , d2, d4 + KMAC_R d10 , d3, d5 + fmacd d11 , d2, d5 + KMAC_I d11 , d3, d4 + + add BO , BO, #16 + add AO , AO, #32 + + +.endm + + +.macro SAVE2x1 + + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + fldmiad CO1, { d4 - d7 } + + FMAC_R1 d4 , d0 , d8 + FMAC_I1 d5 , d0 , d9 + FMAC_R2 d4 , d1 , d9 + FMAC_I2 d5 , d1 , d8 + + FMAC_R1 d6 , d0 , d10 + FMAC_I1 d7 , d0 , d11 + FMAC_R2 d6 , d1 , d11 + FMAC_I2 d7 , d1 , d10 + + fstmiad CO1, { d4 - d7 } + + add CO1, CO1, #32 + +.endm + + +/******************************************************************************/ + +.macro INIT1x1 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9 , d8 + +.endm + +.macro KERNEL1x1_I + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmuld d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmuld d9 , d0, d5 + KMAC_I d9 , d1, d4 + + add BO , BO, #16 + add AO , AO, #16 + +.endm + + + +.macro KERNEL1x1_M1 + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + add BO , BO, #16 + add AO , AO, #16 + +.endm + +.macro KERNEL1x1_M2 + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + add BO , BO, #16 + add AO , AO, #16 + + +.endm + + +.macro KERNEL1x1_E + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + add BO , BO, #16 + add AO , AO, #16 + +.endm + +.macro KERNEL1x1_SUB + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + add BO , BO, #16 + add AO , AO, #16 + + +.endm + + +.macro SAVE1x1 + + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + fldmiad CO1, { d4 - d5 } + + FMAC_R1 d4 , d0 , d8 + FMAC_I1 d5 , d0 , d9 + FMAC_R2 d4 , d1 , d9 + FMAC_I2 d5 , d1 , d8 + + fstmiad CO1, { d4 - d5 } + + add 
CO1, CO1, #16 + +.endm + + +/******************************************************************************/ + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA_R, ALPHA_R + vstr OLD_ALPHA_I, ALPHA_I + + sub r3, fp, #128 + vstm r3, { d8 - d15} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #4 // ldc = ldc * 8 * 2 + str r3, LDC + + ldr K1, K + ldr BC, B + + ldr J, N + asrs J, J, #1 // J = J / 2 + ble zgemm_kernel_L1_BEGIN + +zgemm_kernel_L2_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + pld [AO , #A_PRE-64] + pld [AO , #A_PRE-32] + + + +zgemm_kernel_L2_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble zgemm_kernel_L2_M1_BEGIN + +zgemm_kernel_L2_M2_20: + + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt zgemm_kernel_L2_M2_30 + .align 5 + + + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + sub L, L, #2 + +zgemm_kernel_L2_M2_22: + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + subs L, L, #1 + bgt zgemm_kernel_L2_M2_22 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b zgemm_kernel_L2_M2_44 + + +zgemm_kernel_L2_M2_30: + tst L, #3 + ble zgemm_kernel_L2_M2_40 + + tst L, #2 + ble zgemm_kernel_L2_M2_32 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b zgemm_kernel_L2_M2_44 + +zgemm_kernel_L2_M2_32: + + tst L, #1 + ble zgemm_kernel_L2_M2_40 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b zgemm_kernel_L2_M2_44 + + +zgemm_kernel_L2_M2_40: + + INIT2x2 + + +zgemm_kernel_L2_M2_44: + + ands L , K1, #7 // L = L % 8 + ble zgemm_kernel_L2_M2_100 + +zgemm_kernel_L2_M2_46: + + KERNEL2x2_SUB + + subs L, L, #1 + bne zgemm_kernel_L2_M2_46 + +zgemm_kernel_L2_M2_100: + + SAVE2x2 + +zgemm_kernel_L2_M2_END: + + subs I, I, #1 + bne zgemm_kernel_L2_M2_20 + + +zgemm_kernel_L2_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble zgemm_kernel_L2_END + +zgemm_kernel_L2_M1_20: + + INIT1x2 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble zgemm_kernel_L2_M1_40 + +zgemm_kernel_L2_M1_22: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs L, L, #1 + bgt zgemm_kernel_L2_M1_22 + + +zgemm_kernel_L2_M1_40: + + ands L , K1, #7 // L = L % 8 + ble zgemm_kernel_L2_M1_100 + +zgemm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs L, L, #1 + bgt zgemm_kernel_L2_M1_42 + +zgemm_kernel_L2_M1_100: + + SAVE1x2 + + +zgemm_kernel_L2_END: + + mov r3, BC + mov r4, K1 + lsl r4, r4, #5 // k * 2 * 8 * 2 + add r3, r3, r4 // B = B + K * 4 * 8 + mov BC, r3 + + subs J , #1 // j-- + bgt zgemm_kernel_L2_BEGIN + + + 
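+/* The L1 section below handles the final column when N is odd.  It
+ * reuses the software pipeline from above -- KERNEL2x1_I starts the
+ * chain with fmuld so no separate INIT2x1 is needed on the long path,
+ * KERNEL2x1_M1/_M2 form the steady state, and KERNEL2x1_E drains it --
+ * with plain KERNEL1x1_SUB steps for a last odd row. */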
+/*********************************************************************************************/ + +zgemm_kernel_L1_BEGIN: + + ldr J , N + tst J , #1 + ble zgemm_kernel_L999 + + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + +zgemm_kernel_L1_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble zgemm_kernel_L1_M1_BEGIN + +zgemm_kernel_L1_M2_20: + + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt zgemm_kernel_L1_M2_30 + .align 5 + + + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + sub L, L, #2 + +zgemm_kernel_L1_M2_22: + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + subs L, L, #1 + bgt zgemm_kernel_L1_M2_22 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b zgemm_kernel_L1_M2_44 + + +zgemm_kernel_L1_M2_30: + tst L, #3 + ble zgemm_kernel_L1_M2_40 + + tst L, #2 + ble zgemm_kernel_L1_M2_32 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b zgemm_kernel_L1_M2_44 + +zgemm_kernel_L1_M2_32: + + tst L, #1 + ble zgemm_kernel_L1_M2_40 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b zgemm_kernel_L1_M2_44 + + +zgemm_kernel_L1_M2_40: + + INIT2x1 + + +zgemm_kernel_L1_M2_44: + + ands L , K1, #7 // L = L % 8 + ble zgemm_kernel_L1_M2_100 + +zgemm_kernel_L1_M2_46: + + KERNEL2x1_SUB + + subs L, L, #1 + bne zgemm_kernel_L1_M2_46 + +zgemm_kernel_L1_M2_100: + + SAVE2x1 + +zgemm_kernel_L1_M2_END: + + subs I, I, #1 + bne zgemm_kernel_L1_M2_20 + + +zgemm_kernel_L1_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble zgemm_kernel_L1_END + +zgemm_kernel_L1_M1_20: + + INIT1x1 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble zgemm_kernel_L1_M1_40 + +zgemm_kernel_L1_M1_22: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs L, L, #1 + bgt zgemm_kernel_L1_M1_22 + + +zgemm_kernel_L1_M1_40: + + ands L , K1, #7 // L = L % 8 + ble zgemm_kernel_L1_M1_100 + +zgemm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs L, L, #1 + bgt zgemm_kernel_L1_M1_42 + +zgemm_kernel_L1_M1_100: + + SAVE1x1 + + +zgemm_kernel_L1_END: + + + +zgemm_kernel_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + From a9bd12da2c4e38449a10ea00918849e1a9c76b4f Mon Sep 17 00:00:00 2001 From: wernsaar Date: Wed, 27 Nov 2013 17:37:38 +0100 Subject: [PATCH 087/127] optimized dgemm kernel for ARMV6 --- kernel/arm/dgemm_kernel_4x2_vfp.S | 43 +++++++++++++++++++------------ 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/kernel/arm/dgemm_kernel_4x2_vfp.S b/kernel/arm/dgemm_kernel_4x2_vfp.S index 56fd81513..55409a5ef 100644 --- a/kernel/arm/dgemm_kernel_4x2_vfp.S +++ b/kernel/arm/dgemm_kernel_4x2_vfp.S @@ -26,10 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** -* 2013/11/23 Saar -* BLASTEST : xOK -* CTEST : xOK -* TEST : xOK +* 2013/11/27 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK * **************************************************************************************/ @@ -77,7 +77,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define A_PRE 96 #define B_PRE 96 -#define C_PRE 64 +#define C_PRE 32 /************************************************************************************** * Macro definitions @@ -100,26 +100,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x2_SUB + pld [ AO, #A_PRE ] fldd d4 , [ BO ] - fldd d5 , [ BO, #8 ] fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] - fldd d2 , [ AO, #16 ] - fldd d3 , [ AO, #24 ] fmacd d8 , d0, d4 + fldd d2 , [ AO, #16 ] fmacd d9 , d1, d4 + fldd d3 , [ AO, #24 ] fmacd d10 , d2, d4 + fldd d5 , [ BO, #8 ] fmacd d11 , d3, d4 fmacd d12 , d0, d5 fmacd d13 , d1, d5 + add AO , AO, #32 fmacd d14 , d2, d5 + add BO , BO, #16 fmacd d15 , d3, d5 - add AO , AO, #32 - add BO , BO, #16 .endm @@ -130,37 +131,40 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d0, ALPHA + fldd d4 , [CO1] fldd d5 , [CO1, #8 ] - fldd d6 , [CO1, #16 ] - fldd d7 , [CO1, #24 ] + pld [ CO1, #C_PRE ] fmacd d4 , d0 , d8 + fldd d6 , [CO1, #16 ] fmacd d5 , d0 , d9 + fldd d7 , [CO1, #24 ] fmacd d6 , d0 , d10 + fstd d4 , [CO1] fmacd d7 , d0 , d11 - fstd d4 , [CO1] fstd d5 , [CO1, #8 ] fstd d6 , [CO1, #16 ] fstd d7 , [CO1, #24 ] fldd d4 , [CO2] fldd d5 , [CO2, #8 ] - fldd d6 , [CO2, #16 ] - fldd d7 , [CO2, #24 ] + pld [ CO2, #C_PRE ] fmacd d4 , d0 , d12 + fldd d6 , [CO2, #16 ] fmacd d5 , d0 , d13 + fldd d7 , [CO2, #24 ] fmacd d6 , d0 , d14 + fstd d4 , [CO2] fmacd d7 , d0 , d15 + add CO1, CO1, #32 - fstd d4 , [CO2] fstd d5 , [CO2, #8 ] fstd d6 , [CO2, #16 ] fstd d7 , [CO2, #24 ] - add CO1, CO1, #32 .endm @@ -469,13 +473,18 @@ dgemm_kernel_L2_M4_20: .align 5 dgemm_kernel_L2_M4_22: + + pld [ BO, #B_PRE ] KERNEL4x2_SUB KERNEL4x2_SUB + pld [ BO, #B_PRE ] KERNEL4x2_SUB KERNEL4x2_SUB + pld [ BO, #B_PRE ] KERNEL4x2_SUB KERNEL4x2_SUB + pld [ BO, #B_PRE ] KERNEL4x2_SUB KERNEL4x2_SUB From 3d5e792c72cd6d9894c4583c527e058481816657 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Wed, 27 Nov 2013 18:38:32 +0100 Subject: [PATCH 088/127] optimized sgemm kernel for ARMV6 --- kernel/arm/sgemm_kernel_4x2_vfp.S | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/kernel/arm/sgemm_kernel_4x2_vfp.S b/kernel/arm/sgemm_kernel_4x2_vfp.S index 3e20f86f0..e074e744c 100644 --- a/kernel/arm/sgemm_kernel_4x2_vfp.S +++ b/kernel/arm/sgemm_kernel_4x2_vfp.S @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2013/11/23 Saar +* 2013/11/27 Saar * BLASTEST : OK * CTEST : OK * TEST : OK @@ -101,16 +101,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL4x2_SUB flds s4 , [ BO ] - flds s5 , [ BO, #4 ] flds s0 , [ AO ] flds s1 , [ AO, #4 ] - flds s2 , [ AO, #8 ] - flds s3 , [ AO, #12 ] fmacs s8 , s0, s4 + flds s2 , [ AO, #8 ] fmacs s9 , s1, s4 + flds s3 , [ AO, #12 ] fmacs s10 , s2, s4 + flds s5 , [ BO, #4 ] fmacs s11 , s3, s4 fmacs s12 , s0, s5 @@ -469,13 +469,20 @@ sgemm_kernel_L2_M4_20: .align 5 sgemm_kernel_L2_M4_22: + + pld [ AO, #A_PRE ] + pld [ BO, #B_PRE ] KERNEL4x2_SUB KERNEL4x2_SUB + pld [ AO, #A_PRE ] KERNEL4x2_SUB KERNEL4x2_SUB + pld [ AO, #A_PRE ] + pld [ BO, #B_PRE ] KERNEL4x2_SUB KERNEL4x2_SUB + pld [ AO, #A_PRE ] KERNEL4x2_SUB KERNEL4x2_SUB From b42145834f2f9d6eb3dab4a206d48c9a1db2bbe4 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 28 Nov 2013 08:08:08 +0100 Subject: [PATCH 089/127] optimized sgemm kernel for ARMV6 --- kernel/arm/sgemm_kernel_4x2_vfp.S | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/kernel/arm/sgemm_kernel_4x2_vfp.S b/kernel/arm/sgemm_kernel_4x2_vfp.S index e074e744c..0e2061d77 100644 --- a/kernel/arm/sgemm_kernel_4x2_vfp.S +++ b/kernel/arm/sgemm_kernel_4x2_vfp.S @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2013/11/27 Saar +* 2013/11/28 Saar * BLASTEST : OK * CTEST : OK * TEST : OK @@ -100,17 +100,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x2_SUB - flds s4 , [ BO ] - - flds s0 , [ AO ] - flds s1 , [ AO, #4 ] + fldmias AO! , { s0 - s3 } + fldmias BO! , { s4 - s5 } fmacs s8 , s0, s4 - flds s2 , [ AO, #8 ] fmacs s9 , s1, s4 - flds s3 , [ AO, #12 ] fmacs s10 , s2, s4 - flds s5 , [ BO, #4 ] fmacs s11 , s3, s4 fmacs s12 , s0, s5 @@ -118,9 +113,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmacs s14 , s2, s5 fmacs s15 , s3, s5 - add AO , AO, #16 - add BO , BO, #8 - .endm .macro SAVE4x2 From a537d7d8d7a5936a66436ec1f184a43fb39507b5 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 28 Nov 2013 08:33:44 +0100 Subject: [PATCH 090/127] optimized zgemm_kernel_2x2_vfp.S --- kernel/arm/zgemm_kernel_2x2_vfp.S | 54 +++++++++++++------------------ 1 file changed, 22 insertions(+), 32 deletions(-) diff --git a/kernel/arm/zgemm_kernel_2x2_vfp.S b/kernel/arm/zgemm_kernel_2x2_vfp.S index 7f7664981..ad6b56ac0 100644 --- a/kernel/arm/zgemm_kernel_2x2_vfp.S +++ b/kernel/arm/zgemm_kernel_2x2_vfp.S @@ -26,28 +26,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2013/11/05 Saar +* 2013/11/28 Saar * BLASTEST : OK * CTEST : OK * TEST : OK * -* 2013/11/02 Saar -* UNROLL_N 2 -* UNROLL_M 2 -* ZGEMM_P 64 -* ZGEMM_Q 120 -* ZGEMM_R 4096 -* A_PRE 96 -* B_PRE 96 -* C_PRE 64 -* -* Performance on Odroid U2: -* -* 1 Core: 1.62 GFLOPS ATLAS: 1.39 GFLOPS -* 2 Cores: 3.20 GFLOPS ATLAS: 2.54 GFLOPS -* 3 Cores: 4.72 GFLOPS ATLAS: 3.76 GFLOPS -* 4 Cores: 5.93 GFLOPS ATLAS: 4.88 GFLOPS -**************************************************************************************/ +***************************************************************************************/ #define ASSEMBLER #include "common.h" @@ -159,6 +143,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL2x2_I + pld [ AO, #A_PRE ] + pld [ BO, #B_PRE ] fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] @@ -201,22 +187,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x2_M1 + pld [ AO, #A_PRE ] + pld [ BO, #B_PRE ] + fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] - fldd d2 , [ AO, #16 ] - fldd d3 , [ AO, #24 ] fldd d4 , [ BO ] fldd d5 , [ BO, #8 ] - fldd d6 , [ BO, #16 ] - fldd d7 , [ BO, #24 ] fmacd d8 , d0, d4 KMAC_R d8 , d1, d5 + fldd d2 , [ AO, #16 ] fmacd d9 , d0, d5 + fldd d3 , [ AO, #24 ] KMAC_I d9 , d1, d4 + fldd d6 , [ BO, #16 ] fmacd d10 , d2, d4 + fldd d7 , [ BO, #24 ] KMAC_R d10 , d3, d5 fmacd d11 , d2, d5 KMAC_I d11 , d3, d4 @@ -228,32 +217,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmacd d14 , d2, d6 KMAC_R d14 , d3, d7 + add BO , BO, #32 fmacd d15 , d2, d7 + add AO , AO, #32 KMAC_I d15 , d3, d6 - add BO , BO, #32 - add AO , AO, #32 .endm .macro KERNEL2x2_M2 + pld [ AO, #A_PRE ] + pld [ BO, #B_PRE ] + fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] - fldd d2 , [ AO, #16 ] - fldd d3 , [ AO, #24 ] fldd d4 , [ BO ] fldd d5 , [ BO, #8 ] - fldd d6 , [ BO, #16 ] - fldd d7 , [ BO, #24 ] fmacd d8 , d0, d4 KMAC_R d8 , d1, d5 + fldd d2 , [ AO, #16 ] fmacd d9 , d0, d5 + fldd d3 , [ AO, #24 ] KMAC_I d9 , d1, d4 + fldd d6 , [ BO, #16 ] fmacd d10 , d2, d4 + fldd d7 , [ BO, #24 ] KMAC_R d10 , d3, d5 fmacd d11 , d2, d5 KMAC_I d11 , d3, d4 @@ -265,12 +257,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmacd d14 , d2, d6 KMAC_R d14 , d3, d7 - fmacd d15 , d2, d7 - KMAC_I d15 , d3, d6 - add BO , BO, #32 + fmacd d15 , d2, d7 add AO , AO, #32 - + KMAC_I d15 , d3, d6 .endm From 5007a534c48c9c966c3bfd3de6b2cec1a696ccbf Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 28 Nov 2013 10:04:43 +0100 Subject: [PATCH 091/127] optimized zgemm kernel for ARMV6 --- kernel/arm/zgemm_kernel_2x2_vfp.S | 64 +++++++++++++++---------------- 1 file changed, 31 insertions(+), 33 deletions(-) diff --git a/kernel/arm/zgemm_kernel_2x2_vfp.S b/kernel/arm/zgemm_kernel_2x2_vfp.S index ad6b56ac0..8a5401858 100644 --- a/kernel/arm/zgemm_kernel_2x2_vfp.S +++ b/kernel/arm/zgemm_kernel_2x2_vfp.S @@ -187,38 +187,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x2_M1 - pld [ AO, #A_PRE ] - pld [ BO, #B_PRE ] fldd d0 , [ AO ] - fldd d1 , [ AO, #8 ] - fldd d4 , [ BO ] fldd d5 , [ BO, #8 ] fmacd d8 , d0, d4 - KMAC_R d8 , d1, d5 - fldd d2 , [ AO, #16 ] + fldd d1 , [ AO, #8 ] fmacd d9 , d0, d5 + fldd d2 , [ AO, #16 ] + KMAC_R d8 , d1, d5 fldd d3 , [ AO, #24 ] KMAC_I d9 , d1, d4 fldd d6 , [ BO, #16 ] fmacd d10 , d2, d4 fldd d7 , [ BO, #24 ] - KMAC_R d10 , d3, d5 fmacd d11 , d2, d5 + KMAC_R d10 , d3, d5 + pld [ AO, #A_PRE ] KMAC_I d11 , d3, d4 + pld [ BO, #B_PRE ] fmacd d12 , d0, d6 - KMAC_R d12 , d1, d7 fmacd d13 , d0, d7 + KMAC_R d12 , d1, d7 KMAC_I d13 , d1, d6 fmacd d14 , d2, d6 - KMAC_R d14 , d3, d7 - add BO , BO, #32 fmacd d15 , d2, d7 + add BO , BO, #32 + KMAC_R d14 , d3, d7 add AO , AO, #32 KMAC_I d15 , d3, d6 @@ -227,41 +226,40 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL2x2_M2 - pld [ AO, #A_PRE ] - pld [ BO, #B_PRE ] - fldd d0 , [ AO ] - fldd d1 , [ AO, #8 ] - fldd d4 , [ BO ] fldd d5 , [ BO, #8 ] fmacd d8 , d0, d4 - KMAC_R d8 , d1, d5 - fldd d2 , [ AO, #16 ] + fldd d1 , [ AO, #8 ] fmacd d9 , d0, d5 + fldd d2 , [ AO, #16 ] + KMAC_R d8 , d1, d5 fldd d3 , [ AO, #24 ] KMAC_I d9 , d1, d4 fldd d6 , [ BO, #16 ] fmacd d10 , d2, d4 fldd d7 , [ BO, #24 ] - KMAC_R d10 , d3, d5 fmacd d11 , d2, d5 + KMAC_R d10 , d3, d5 + pld [ AO, #A_PRE ] KMAC_I d11 , d3, d4 + pld [ BO, #B_PRE ] fmacd d12 , d0, d6 - KMAC_R d12 , d1, d7 fmacd d13 , d0, d7 + KMAC_R d12 , d1, d7 KMAC_I d13 , d1, d6 fmacd d14 , d2, d6 - KMAC_R d14 , d3, d7 - add BO , BO, #32 fmacd d15 , d2, d7 + add BO , BO, #32 + KMAC_R d14 , d3, d7 add AO , AO, #32 KMAC_I d15 , d3, d6 + .endm @@ -305,37 +303,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x2_SUB fldd d0 , [ AO ] - fldd d1 , [ AO, #8 ] - fldd d2 , [ AO, #16 ] - fldd d3 , [ AO, #24 ] - fldd d4 , [ BO ] fldd d5 , [ BO, #8 ] - fldd d6 , [ BO, #16 ] - fldd d7 , [ BO, #24 ] fmacd d8 , d0, d4 - KMAC_R d8 , d1, d5 + fldd d1 , [ AO, #8 ] fmacd d9 , d0, d5 + fldd d2 , [ AO, #16 ] + KMAC_R d8 , d1, d5 + fldd d3 , [ AO, #24 ] KMAC_I d9 , d1, d4 + fldd d6 , [ BO, #16 ] fmacd d10 , d2, d4 - KMAC_R d10 , d3, d5 + fldd d7 , [ BO, #24 ] fmacd d11 , d2, d5 + KMAC_R d10 , d3, d5 + pld [ AO, #A_PRE ] KMAC_I d11 , d3, d4 + pld [ BO, #B_PRE ] fmacd d12 , d0, d6 - KMAC_R d12 , d1, d7 fmacd d13 , d0, d7 + KMAC_R d12 , d1, d7 KMAC_I d13 , d1, d6 fmacd d14 , d2, d6 - KMAC_R d14 , d3, d7 fmacd d15 , d2, d7 - KMAC_I d15 , d3, d6 - add BO , BO, #32 + KMAC_R d14 , d3, d7 add AO , AO, #32 + KMAC_I d15 , d3, d6 .endm From 274304bd032990ba57d3b2e375e7bf916f1a1fe2 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 28 Nov 2013 11:54:38 +0100 Subject: [PATCH 092/127] add optimized cgemm kernel for ARMV6 --- kernel/arm/KERNEL.ARMV6 | 2 +- kernel/arm/cgemm_kernel_2x2_vfp.S | 1252 +++++++++++++++++++++++++++++ 2 files changed, 1253 insertions(+), 1 deletion(-) create mode 100644 kernel/arm/cgemm_kernel_2x2_vfp.S diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 58e4d2702..3d3847f13 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -105,7 +105,7 @@ DGEMMOTCOPY = ../generic/gemm_tcopy_2.c DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o -CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMKERNEL = cgemm_kernel_2x2_vfp.S CGEMMONCOPY = cgemm_ncopy_2_vfp.S CGEMMOTCOPY = cgemm_tcopy_2_vfp.S CGEMMONCOPYOBJ = cgemm_oncopy.o diff --git a/kernel/arm/cgemm_kernel_2x2_vfp.S b/kernel/arm/cgemm_kernel_2x2_vfp.S new file mode 100644 index 000000000..75fbf097b --- /dev/null +++ b/kernel/arm/cgemm_kernel_2x2_vfp.S @@ -0,0 +1,1252 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/28 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA_R s0 +#define OLD_ALPHA_I s1 + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define A [fp, #-248 ] +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] + +#define ALPHA_I [fp, #-272] +#define ALPHA_R [fp, #-280] + +#define B [fp, #4 ] +#define C [fp, #8 ] +#define OLD_LDC [fp, #12 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 + +#define A_PRE 96 +#define B_PRE 96 +#define C_PRE 64 + + + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + + #define KMAC_R fnmacs + #define KMAC_I fmacs + + #define FMAC_R1 fmacs + #define FMAC_R2 fnmacs + #define FMAC_I1 fmacs + #define FMAC_I2 fmacs + +#elif defined(CN) || defined(CT) + + #define KMAC_R fmacs + #define KMAC_I fnmacs + + #define FMAC_R1 fmacs + #define FMAC_R2 fnmacs + #define FMAC_I1 fmacs + #define FMAC_I2 fmacs + +#elif defined(NC) || defined(TC) + + #define KMAC_R fmacs + #define KMAC_I fnmacs + + #define FMAC_R1 fmacs + #define FMAC_R2 fmacs + #define FMAC_I1 fnmacs + #define FMAC_I2 fmacs + +#else + + #define KMAC_R fnmacs + #define KMAC_I fmacs + + #define FMAC_R1 fmacs + #define FMAC_R2 fmacs + #define FMAC_I1 fnmacs + #define FMAC_I2 fmacs + +#endif + + + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT2x2 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9 , s8 + vmov.f32 s10, s8 + vmov.f32 s11, s8 + vmov.f32 s12, s8 + vmov.f32 s13, s8 + vmov.f32 s14, s8 + vmov.f32 s15, s8 + +.endm + +.macro KERNEL2x2_I + + pld [ AO, #A_PRE ] + fldmias AO!, { s0 - s3 
} + pld [ BO, #B_PRE ] + fldmias BO!, { s4 - s7 } + + + fmuls s8 , s0, s4 + fmuls s9 , s0, s5 + fmuls s10 , s2, s4 + fmuls s11 , s2, s5 + + KMAC_R s8 , s1, s5 + KMAC_I s9 , s1, s4 + KMAC_R s10 , s3, s5 + KMAC_I s11 , s3, s4 + + fmuls s12 , s0, s6 + fmuls s13 , s0, s7 + fmuls s14 , s2, s6 + fmuls s15 , s2, s7 + + KMAC_R s12 , s1, s7 + KMAC_I s13 , s1, s6 + KMAC_R s14 , s3, s7 + KMAC_I s15 , s3, s6 + +.endm + + + +.macro KERNEL2x2_M1 + + pld [ AO, #A_PRE ] + fldmias AO!, { s0 - s3 } + pld [ BO, #B_PRE ] + fldmias BO!, { s4 - s7 } + + fmacs s8 , s0, s4 + fmacs s9 , s0, s5 + fmacs s10 , s2, s4 + fmacs s11 , s2, s5 + + KMAC_R s8 , s1, s5 + KMAC_I s9 , s1, s4 + KMAC_R s10 , s3, s5 + KMAC_I s11 , s3, s4 + + fmacs s12 , s0, s6 + fmacs s13 , s0, s7 + fmacs s14 , s2, s6 + fmacs s15 , s2, s7 + + KMAC_R s12 , s1, s7 + KMAC_I s13 , s1, s6 + KMAC_R s14 , s3, s7 + KMAC_I s15 , s3, s6 + +.endm + +.macro KERNEL2x2_M2 + + fldmias AO!, { s0 - s3 } + fldmias BO!, { s4 - s7 } + + fmacs s8 , s0, s4 + fmacs s9 , s0, s5 + fmacs s10 , s2, s4 + fmacs s11 , s2, s5 + + KMAC_R s8 , s1, s5 + KMAC_I s9 , s1, s4 + KMAC_R s10 , s3, s5 + KMAC_I s11 , s3, s4 + + fmacs s12 , s0, s6 + fmacs s13 , s0, s7 + fmacs s14 , s2, s6 + fmacs s15 , s2, s7 + + KMAC_R s12 , s1, s7 + KMAC_I s13 , s1, s6 + KMAC_R s14 , s3, s7 + KMAC_I s15 , s3, s6 + + +.endm + + +.macro KERNEL2x2_E + + fldmias AO!, { s0 - s3 } + fldmias BO!, { s4 - s7 } + + fmacs s8 , s0, s4 + fmacs s9 , s0, s5 + fmacs s10 , s2, s4 + fmacs s11 , s2, s5 + + KMAC_R s8 , s1, s5 + KMAC_I s9 , s1, s4 + KMAC_R s10 , s3, s5 + KMAC_I s11 , s3, s4 + + fmacs s12 , s0, s6 + fmacs s13 , s0, s7 + fmacs s14 , s2, s6 + fmacs s15 , s2, s7 + + KMAC_R s12 , s1, s7 + KMAC_I s13 , s1, s6 + KMAC_R s14 , s3, s7 + KMAC_I s15 , s3, s6 + + +.endm + +.macro KERNEL2x2_SUB + + fldmias AO!, { s0 - s3 } + fldmias BO!, { s4 - s7 } + + fmacs s8 , s0, s4 + fmacs s9 , s0, s5 + fmacs s10 , s2, s4 + fmacs s11 , s2, s5 + + KMAC_R s8 , s1, s5 + KMAC_I s9 , s1, s4 + KMAC_R s10 , s3, s5 + KMAC_I s11 , s3, s4 + + fmacs s12 , s0, s6 + fmacs s13 , s0, s7 + fmacs s14 , s2, s6 + fmacs s15 , s2, s7 + + KMAC_R s12 , s1, s7 + KMAC_I s13 , s1, s6 + KMAC_R s14 , s3, s7 + KMAC_I s15 , s3, s6 + + +.endm + + +.macro SAVE2x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + flds s0, ALPHA_R + flds s1, ALPHA_I + + fldmias CO1, { s4 - s7 } + + FMAC_R1 s4 , s0 , s8 + FMAC_I1 s5 , s0 , s9 + FMAC_R2 s4 , s1 , s9 + FMAC_I2 s5 , s1 , s8 + + FMAC_R1 s6 , s0 , s10 + FMAC_I1 s7 , s0 , s11 + FMAC_R2 s6 , s1 , s11 + FMAC_I2 s7 , s1 , s10 + + fstmias CO1, { s4 - s7 } + + fldmias CO2, { s4 - s7 } + + FMAC_R1 s4 , s0 , s12 + FMAC_I1 s5 , s0 , s13 + FMAC_R2 s4 , s1 , s13 + FMAC_I2 s5 , s1 , s12 + + FMAC_R1 s6 , s0 , s14 + FMAC_I1 s7 , s0 , s15 + FMAC_R2 s6 , s1 , s15 + FMAC_I2 s7 , s1 , s14 + + fstmias CO2, { s4 - s7 } + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + +.macro INIT1x2 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9 , s8 + vmov.f32 s12, s8 + vmov.f32 s13, s8 + +.endm + +.macro KERNEL1x2_I + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + flds s6 , [ BO, #8 ] + flds s7 , [ BO, #12 ] + + fmuls s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmuls s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmuls s12 , s0, s6 + KMAC_R s12 , s1, s7 + fmuls s13 , s0, s7 + KMAC_I s13 , s1, s6 + + add BO , BO, #16 + add AO , AO, #8 + +.endm + + + +.macro KERNEL1x2_M1 + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + flds s6 , [ BO, #8 ] + 
flds s7 , [ BO, #12 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmacs s12 , s0, s6 + KMAC_R s12 , s1, s7 + fmacs s13 , s0, s7 + KMAC_I s13 , s1, s6 + + add BO , BO, #16 + add AO , AO, #8 + +.endm + +.macro KERNEL1x2_M2 + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + flds s6 , [ BO, #8 ] + flds s7 , [ BO, #12 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmacs s12 , s0, s6 + KMAC_R s12 , s1, s7 + fmacs s13 , s0, s7 + KMAC_I s13 , s1, s6 + + add BO , BO, #16 + add AO , AO, #8 + + +.endm + + +.macro KERNEL1x2_E + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + flds s6 , [ BO, #8 ] + flds s7 , [ BO, #12 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmacs s12 , s0, s6 + KMAC_R s12 , s1, s7 + fmacs s13 , s0, s7 + KMAC_I s13 , s1, s6 + + add BO , BO, #16 + add AO , AO, #8 + +.endm + +.macro KERNEL1x2_SUB + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + flds s6 , [ BO, #8 ] + flds s7 , [ BO, #12 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmacs s12 , s0, s6 + KMAC_R s12 , s1, s7 + fmacs s13 , s0, s7 + KMAC_I s13 , s1, s6 + + add BO , BO, #16 + add AO , AO, #8 + + +.endm + + +.macro SAVE1x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + flds s0, ALPHA_R + flds s1, ALPHA_I + + fldmias CO1, { s4 - s5 } + + FMAC_R1 s4 , s0 , s8 + FMAC_I1 s5 , s0 , s9 + FMAC_R2 s4 , s1 , s9 + FMAC_I2 s5 , s1 , s8 + + fstmias CO1, { s4 - s5 } + + fldmias CO2, { s4 - s5 } + + FMAC_R1 s4 , s0 , s12 + FMAC_I1 s5 , s0 , s13 + FMAC_R2 s4 , s1 , s13 + FMAC_I2 s5 , s1 , s12 + + fstmias CO2, { s4 - s5 } + + add CO1, CO1, #8 + +.endm + + +/******************************************************************************/ + +.macro INIT2x1 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9 , s8 + vmov.f32 s10, s8 + vmov.f32 s11, s8 + +.endm + +.macro KERNEL2x1_I + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmuls s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmuls s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmuls s10 , s2, s4 + KMAC_R s10 , s3, s5 + fmuls s11 , s2, s5 + KMAC_I s11 , s3, s4 + + add BO , BO, #8 + add AO , AO, #16 + +.endm + + + +.macro KERNEL2x1_M1 + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmacs s10 , s2, s4 + KMAC_R s10 , s3, s5 + fmacs s11 , s2, s5 + KMAC_I s11 , s3, s4 + + add BO , BO, #8 + add AO , AO, #16 + +.endm + +.macro KERNEL2x1_M2 + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmacs s10 , s2, s4 + KMAC_R s10 , s3, s5 + fmacs s11 , s2, s5 + KMAC_I s11 , s3, s4 + + add BO , BO, #8 + add AO , AO, #16 + + +.endm + + +.macro KERNEL2x1_E + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmacs s10 , s2, s4 + KMAC_R s10 , s3, s5 + fmacs s11 , s2, s5 + KMAC_I s11 , s3, s4 + + add BO , BO, #8 + add AO , AO, #16 + +.endm + +.macro 
KERNEL2x1_SUB + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmacs s10 , s2, s4 + KMAC_R s10 , s3, s5 + fmacs s11 , s2, s5 + KMAC_I s11 , s3, s4 + + add BO , BO, #8 + add AO , AO, #16 + + +.endm + + +.macro SAVE2x1 + + flds s0, ALPHA_R + flds s1, ALPHA_I + + fldmias CO1, { s4 - s7 } + + FMAC_R1 s4 , s0 , s8 + FMAC_I1 s5 , s0 , s9 + FMAC_R2 s4 , s1 , s9 + FMAC_I2 s5 , s1 , s8 + + FMAC_R1 s6 , s0 , s10 + FMAC_I1 s7 , s0 , s11 + FMAC_R2 s6 , s1 , s11 + FMAC_I2 s7 , s1 , s10 + + fstmias CO1, { s4 - s7 } + + add CO1, CO1, #16 + +.endm + + +/******************************************************************************/ + +.macro INIT1x1 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9 , s8 + +.endm + +.macro KERNEL1x1_I + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmuls s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmuls s9 , s0, s5 + KMAC_I s9 , s1, s4 + + add BO , BO, #8 + add AO , AO, #8 + +.endm + + + +.macro KERNEL1x1_M1 + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + add BO , BO, #8 + add AO , AO, #8 + +.endm + +.macro KERNEL1x1_M2 + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + add BO , BO, #8 + add AO , AO, #8 + + +.endm + + +.macro KERNEL1x1_E + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + add BO , BO, #8 + add AO , AO, #8 + +.endm + +.macro KERNEL1x1_SUB + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + add BO , BO, #8 + add AO , AO, #8 + + +.endm + + +.macro SAVE1x1 + + flds s0, ALPHA_R + flds s1, ALPHA_I + + fldmias CO1, { s4 - s5 } + + FMAC_R1 s4 , s0 , s8 + FMAC_I1 s5 , s0 , s9 + FMAC_R2 s4 , s1 , s9 + FMAC_I2 s5 , s1 , s8 + + fstmias CO1, { s4 - s5 } + + add CO1, CO1, #8 + +.endm + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA_R, ALPHA_R + vstr OLD_ALPHA_I, ALPHA_I + + sub r3, fp, #128 + vstm r3, { s8 - s15} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #3 // ldc = ldc * 4 * 2 + str r3, LDC + + ldr K1, K + ldr BC, B + + ldr J, N + asrs J, J, #1 // J = J / 2 + ble cgemm_kernel_L1_BEGIN + +cgemm_kernel_L2_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + pld [AO , #A_PRE-64] + pld [AO , #A_PRE-32] + + + +cgemm_kernel_L2_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble cgemm_kernel_L2_M1_BEGIN + +cgemm_kernel_L2_M2_20: + + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt cgemm_kernel_L2_M2_30 + .align 5 + + + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + 
KERNEL2x2_M1 + KERNEL2x2_M2 + + sub L, L, #2 + +cgemm_kernel_L2_M2_22: + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + subs L, L, #1 + bgt cgemm_kernel_L2_M2_22 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b cgemm_kernel_L2_M2_44 + + +cgemm_kernel_L2_M2_30: + tst L, #3 + ble cgemm_kernel_L2_M2_40 + + tst L, #2 + ble cgemm_kernel_L2_M2_32 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b cgemm_kernel_L2_M2_44 + +cgemm_kernel_L2_M2_32: + + tst L, #1 + ble cgemm_kernel_L2_M2_40 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b cgemm_kernel_L2_M2_44 + + +cgemm_kernel_L2_M2_40: + + INIT2x2 + + +cgemm_kernel_L2_M2_44: + + ands L , K1, #7 // L = L % 8 + ble cgemm_kernel_L2_M2_100 + +cgemm_kernel_L2_M2_46: + + KERNEL2x2_SUB + + subs L, L, #1 + bne cgemm_kernel_L2_M2_46 + +cgemm_kernel_L2_M2_100: + + SAVE2x2 + +cgemm_kernel_L2_M2_END: + + subs I, I, #1 + bne cgemm_kernel_L2_M2_20 + + +cgemm_kernel_L2_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble cgemm_kernel_L2_END + +cgemm_kernel_L2_M1_20: + + INIT1x2 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble cgemm_kernel_L2_M1_40 + +cgemm_kernel_L2_M1_22: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs L, L, #1 + bgt cgemm_kernel_L2_M1_22 + + +cgemm_kernel_L2_M1_40: + + ands L , K1, #7 // L = L % 8 + ble cgemm_kernel_L2_M1_100 + +cgemm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs L, L, #1 + bgt cgemm_kernel_L2_M1_42 + +cgemm_kernel_L2_M1_100: + + SAVE1x2 + + +cgemm_kernel_L2_END: + + mov r3, BC + mov r4, K1 + lsl r4, r4, #4 // k * 2 * 4 * 2 + add r3, r3, r4 // B = B + K * 2 * 8 + mov BC, r3 + + subs J , #1 // j-- + bgt cgemm_kernel_L2_BEGIN + + + +/*********************************************************************************************/ + +cgemm_kernel_L1_BEGIN: + + ldr J , N + tst J , #1 + ble cgemm_kernel_L999 + + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + +cgemm_kernel_L1_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble cgemm_kernel_L1_M1_BEGIN + +cgemm_kernel_L1_M2_20: + + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt cgemm_kernel_L1_M2_30 + .align 5 + + + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + sub L, L, #2 + +cgemm_kernel_L1_M2_22: + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + subs L, L, #1 + bgt cgemm_kernel_L1_M2_22 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b cgemm_kernel_L1_M2_44 + + +cgemm_kernel_L1_M2_30: + tst L, #3 + ble cgemm_kernel_L1_M2_40 + + tst L, #2 + ble cgemm_kernel_L1_M2_32 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b cgemm_kernel_L1_M2_44 + 
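+/* A rough sketch of the software-pipelined K loop driven here: KERNEL2x1_I
+   starts the accumulators with fmuls, the alternating _M1/_M2 bodies cover
+   the steady state eight k-steps per loop trip, and _E closes the pipeline:
+
+       for (k = 0; k < K - (K % 8); k++)
+           acc[i] += a[i][k] * b[k];        // complex fmacs/KMAC pairs
+
+   The K % 8 remainder is handled further down with KERNEL2x1_SUB.          */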
+cgemm_kernel_L1_M2_32: + + tst L, #1 + ble cgemm_kernel_L1_M2_40 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b cgemm_kernel_L1_M2_44 + + +cgemm_kernel_L1_M2_40: + + INIT2x1 + + +cgemm_kernel_L1_M2_44: + + ands L , K1, #7 // L = L % 8 + ble cgemm_kernel_L1_M2_100 + +cgemm_kernel_L1_M2_46: + + KERNEL2x1_SUB + + subs L, L, #1 + bne cgemm_kernel_L1_M2_46 + +cgemm_kernel_L1_M2_100: + + SAVE2x1 + +cgemm_kernel_L1_M2_END: + + subs I, I, #1 + bne cgemm_kernel_L1_M2_20 + + +cgemm_kernel_L1_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble cgemm_kernel_L1_END + +cgemm_kernel_L1_M1_20: + + INIT1x1 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble cgemm_kernel_L1_M1_40 + +cgemm_kernel_L1_M1_22: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs L, L, #1 + bgt cgemm_kernel_L1_M1_22 + + +cgemm_kernel_L1_M1_40: + + ands L , K1, #7 // L = L % 8 + ble cgemm_kernel_L1_M1_100 + +cgemm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs L, L, #1 + bgt cgemm_kernel_L1_M1_42 + +cgemm_kernel_L1_M1_100: + + SAVE1x1 + + +cgemm_kernel_L1_END: + + + +cgemm_kernel_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s15} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + From dec7ad0dfd2738ac708aae408d16e280ee06b5cc Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 28 Nov 2013 12:32:12 +0100 Subject: [PATCH 093/127] optimized dtrmm kernel for ARMV7 --- kernel/arm/dtrmm_kernel_4x2_vfp.S | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/kernel/arm/dtrmm_kernel_4x2_vfp.S b/kernel/arm/dtrmm_kernel_4x2_vfp.S index 55a017a97..762b9c580 100644 --- a/kernel/arm/dtrmm_kernel_4x2_vfp.S +++ b/kernel/arm/dtrmm_kernel_4x2_vfp.S @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2013/11/23 Saar +* 2013/11/28 Saar * BLASTEST : OK * CTEST : OK * TEST : OK @@ -106,25 +106,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL4x2_SUB fldd d4 , [ BO ] - fldd d5 , [ BO, #8 ] - fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] - fldd d2 , [ AO, #16 ] - fldd d3 , [ AO, #24 ] + pld [ AO , #A_PRE ] fmacd d8 , d0, d4 + fldd d2 , [ AO, #16 ] fmacd d9 , d1, d4 + fldd d3 , [ AO, #24 ] fmacd d10 , d2, d4 + fldd d5 , [ BO, #8 ] fmacd d11 , d3, d4 fmacd d12 , d0, d5 fmacd d13 , d1, d5 + add AO , AO, #32 fmacd d14 , d2, d5 + add BO , BO, #16 fmacd d15 , d3, d5 - add AO , AO, #32 - add BO , BO, #16 .endm @@ -490,13 +491,18 @@ _L2_M4_20: .align 5 _L2_M4_22: + + pld [ BO , #B_PRE ] KERNEL4x2_SUB KERNEL4x2_SUB + pld [ BO , #B_PRE ] KERNEL4x2_SUB KERNEL4x2_SUB + pld [ BO , #B_PRE ] KERNEL4x2_SUB KERNEL4x2_SUB + pld [ BO , #B_PRE ] KERNEL4x2_SUB KERNEL4x2_SUB From 5bc322a66c1058eb1d600511ee7ee36e40633172 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 28 Nov 2013 12:45:38 +0100 Subject: [PATCH 094/127] optimized strmm kernel for ARMV6 --- kernel/arm/strmm_kernel_4x2_vfp.S | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/kernel/arm/strmm_kernel_4x2_vfp.S b/kernel/arm/strmm_kernel_4x2_vfp.S index 5394a6444..ab5ff7fa2 100644 --- a/kernel/arm/strmm_kernel_4x2_vfp.S +++ b/kernel/arm/strmm_kernel_4x2_vfp.S @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2013/11/23 Saar +* 2013/11/28 Saar * BLASTEST : OK * CTEST : OK * TEST : OK @@ -105,13 +105,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x2_SUB - flds s4 , [ BO ] - flds s5 , [ BO, #4 ] - - flds s0 , [ AO ] - flds s1 , [ AO, #4 ] - flds s2 , [ AO, #8 ] - flds s3 , [ AO, #12 ] + fldmias AO!, { s0 - s3 } + fldmias BO!, { s4 - s5 } fmacs s8 , s0, s4 fmacs s9 , s1, s4 @@ -123,9 +118,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
fmacs s14 , s2, s5 fmacs s15 , s3, s5 - add AO , AO, #16 - add BO , BO, #8 - .endm .macro SAVE4x2 @@ -490,13 +482,19 @@ _L2_M4_20: .align 5 _L2_M4_22: + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] KERNEL4x2_SUB KERNEL4x2_SUB + pld [ AO , #A_PRE ] KERNEL4x2_SUB KERNEL4x2_SUB + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] KERNEL4x2_SUB KERNEL4x2_SUB + pld [ AO , #A_PRE ] KERNEL4x2_SUB KERNEL4x2_SUB From 42a4dff0568d679c942cac8722cf01185ae97c21 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 28 Nov 2013 13:41:06 +0100 Subject: [PATCH 095/127] added optimized ztrmm kernel for ARMV6 --- kernel/arm/KERNEL.ARMV6 | 2 +- kernel/arm/ztrmm_kernel_2x2_vfp.S | 1537 +++++++++++++++++++++++++++++ 2 files changed, 1538 insertions(+), 1 deletion(-) create mode 100644 kernel/arm/ztrmm_kernel_2x2_vfp.S diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 3d3847f13..9ee6070fd 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -83,7 +83,7 @@ ZGEMVTKERNEL = zgemv_t.c STRMMKERNEL = strmm_kernel_4x2_vfp.S DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c -ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ztrmm_kernel_2x2_vfp.S SGEMMKERNEL = sgemm_kernel_4x2_vfp.S SGEMMINCOPY = sgemm_ncopy_4_vfp.S diff --git a/kernel/arm/ztrmm_kernel_2x2_vfp.S b/kernel/arm/ztrmm_kernel_2x2_vfp.S new file mode 100644 index 000000000..59039c32f --- /dev/null +++ b/kernel/arm/ztrmm_kernel_2x2_vfp.S @@ -0,0 +1,1537 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/28 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA_R d0 +#define OLD_ALPHA_I d1 + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define KKK [fp, #-240] +#define KK [fp, #-244 ] +#define A [fp, #-248 ] +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] + +#define ALPHA_I [fp, #-272] +#define ALPHA_R [fp, #-280] + +#define B [fp, #4 ] +#define C [fp, #8 ] +#define OLD_LDC [fp, #12 ] +#define OFFSET [fp, #16 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 + +#define A_PRE 96 +#define B_PRE 96 +#define C_PRE 64 + + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + + #define KMAC_R fnmacd + #define KMAC_I fmacd + + #define FMAC_R1 fmacd + #define FMAC_R2 fnmacd + #define FMAC_I1 fmacd + #define FMAC_I2 fmacd + +#elif defined(CN) || defined(CT) + + #define KMAC_R fmacd + #define KMAC_I fnmacd + + #define FMAC_R1 fmacd + #define FMAC_R2 fnmacd + #define FMAC_I1 fmacd + #define FMAC_I2 fmacd + +#elif defined(NC) || defined(TC) + + #define KMAC_R fmacd + #define KMAC_I fnmacd + + #define FMAC_R1 fmacd + #define FMAC_R2 fmacd + #define FMAC_I1 fnmacd + #define FMAC_I2 fmacd + +#else + + #define KMAC_R fnmacd + #define KMAC_I fmacd + + #define FMAC_R1 fmacd + #define FMAC_R2 fmacd + #define FMAC_I1 fnmacd + #define FMAC_I2 fmacd + +#endif + + + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT2x2 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9 , d8 + vmov.f64 d10, d8 + vmov.f64 d11, d8 + vmov.f64 d12, d8 + vmov.f64 d13, d8 + vmov.f64 d14, d8 + vmov.f64 d15, d8 + +.endm + +.macro KERNEL2x2_I + pld [ AO, #A_PRE ] + pld [ BO, #B_PRE ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fldd d6 , [ BO, #16 ] + fldd d7 , [ BO, #24 ] + + fmuld d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmuld d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmuld d10 , d2, d4 + KMAC_R d10 , d3, d5 + fmuld d11 , d2, d5 + KMAC_I d11 , d3, d4 + + fmuld d12 , d0, d6 + KMAC_R d12 , d1, d7 + fmuld d13 , d0, d7 + KMAC_I d13 , d1, d6 + + fmuld d14 , d2, d6 + KMAC_R d14 , d3, d7 + fmuld d15 , d2, d7 + KMAC_I d15 , d3, d6 + + add BO , BO, #32 + add AO , AO, #32 + +.endm + + + +.macro KERNEL2x2_M1 + + + fldd d0 , [ AO ] + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + fldd d1 , [ AO, #8 ] + fmacd d9 , d0, d5 + fldd d2 , [ AO, #16 ] + KMAC_R d8 , d1, d5 + fldd d3 , [ AO, #24 ] + KMAC_I d9 , d1, d4 + + fldd d6 , [ BO, #16 ] + fmacd d10 , d2, d4 + fldd 
d7 , [ BO, #24 ] + fmacd d11 , d2, d5 + KMAC_R d10 , d3, d5 + pld [ AO, #A_PRE ] + KMAC_I d11 , d3, d4 + + pld [ BO, #B_PRE ] + fmacd d12 , d0, d6 + fmacd d13 , d0, d7 + KMAC_R d12 , d1, d7 + KMAC_I d13 , d1, d6 + + fmacd d14 , d2, d6 + fmacd d15 , d2, d7 + add BO , BO, #32 + KMAC_R d14 , d3, d7 + add AO , AO, #32 + KMAC_I d15 , d3, d6 + + +.endm + +.macro KERNEL2x2_M2 + + fldd d0 , [ AO ] + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + fldd d1 , [ AO, #8 ] + fmacd d9 , d0, d5 + fldd d2 , [ AO, #16 ] + KMAC_R d8 , d1, d5 + fldd d3 , [ AO, #24 ] + KMAC_I d9 , d1, d4 + + fldd d6 , [ BO, #16 ] + fmacd d10 , d2, d4 + fldd d7 , [ BO, #24 ] + fmacd d11 , d2, d5 + KMAC_R d10 , d3, d5 + pld [ AO, #A_PRE ] + KMAC_I d11 , d3, d4 + + pld [ BO, #B_PRE ] + fmacd d12 , d0, d6 + fmacd d13 , d0, d7 + KMAC_R d12 , d1, d7 + KMAC_I d13 , d1, d6 + + fmacd d14 , d2, d6 + fmacd d15 , d2, d7 + add BO , BO, #32 + KMAC_R d14 , d3, d7 + add AO , AO, #32 + KMAC_I d15 , d3, d6 + + +.endm + + +.macro KERNEL2x2_E + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + fldd d6 , [ BO, #16 ] + fldd d7 , [ BO, #24 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d10 , d2, d4 + KMAC_R d10 , d3, d5 + fmacd d11 , d2, d5 + KMAC_I d11 , d3, d4 + + fmacd d12 , d0, d6 + KMAC_R d12 , d1, d7 + fmacd d13 , d0, d7 + KMAC_I d13 , d1, d6 + + fmacd d14 , d2, d6 + KMAC_R d14 , d3, d7 + fmacd d15 , d2, d7 + KMAC_I d15 , d3, d6 + + add BO , BO, #32 + add AO , AO, #32 + +.endm + +.macro KERNEL2x2_SUB + + fldd d0 , [ AO ] + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + fldd d1 , [ AO, #8 ] + fmacd d9 , d0, d5 + fldd d2 , [ AO, #16 ] + KMAC_R d8 , d1, d5 + fldd d3 , [ AO, #24 ] + KMAC_I d9 , d1, d4 + + fldd d6 , [ BO, #16 ] + fmacd d10 , d2, d4 + fldd d7 , [ BO, #24 ] + fmacd d11 , d2, d5 + KMAC_R d10 , d3, d5 + pld [ AO, #A_PRE ] + KMAC_I d11 , d3, d4 + + pld [ BO, #B_PRE ] + fmacd d12 , d0, d6 + fmacd d13 , d0, d7 + KMAC_R d12 , d1, d7 + KMAC_I d13 , d1, d6 + + fmacd d14 , d2, d6 + fmacd d15 , d2, d7 + add BO , BO, #32 + KMAC_R d14 , d3, d7 + add AO , AO, #32 + KMAC_I d15 , d3, d6 + + +.endm + + +.macro SAVE2x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + vsub.f64 d4, d4 , d4 + vsub.f64 d5, d5 , d5 + vsub.f64 d6, d6 , d6 + vsub.f64 d7, d7 , d7 + + FMAC_R1 d4 , d0 , d8 + FMAC_I1 d5 , d0 , d9 + FMAC_R2 d4 , d1 , d9 + FMAC_I2 d5 , d1 , d8 + + FMAC_R1 d6 , d0 , d10 + FMAC_I1 d7 , d0 , d11 + FMAC_R2 d6 , d1 , d11 + FMAC_I2 d7 , d1 , d10 + + fstmiad CO1, { d4 - d7 } + + vsub.f64 d4, d4 , d4 + vsub.f64 d5, d5 , d5 + vsub.f64 d6, d6 , d6 + vsub.f64 d7, d7 , d7 + + FMAC_R1 d4 , d0 , d12 + FMAC_I1 d5 , d0 , d13 + FMAC_R2 d4 , d1 , d13 + FMAC_I2 d5 , d1 , d12 + + FMAC_R1 d6 , d0 , d14 + FMAC_I1 d7 , d0 , d15 + FMAC_R2 d6 , d1 , d15 + FMAC_I2 d7 , d1 , d14 + + fstmiad CO2, { d4 - d7 } + + add CO1, CO1, #32 + +.endm + +/******************************************************************************/ + +.macro INIT1x2 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9 , d8 + vmov.f64 d12, d8 + vmov.f64 d13, d8 + +.endm + +.macro KERNEL1x2_I + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fldd d6 , [ BO, #16 ] + fldd d7 , [ BO, #24 ] + + fmuld d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmuld d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmuld d12 , d0, d6 + KMAC_R d12 , d1, d7 + fmuld d13 , d0, d7 + KMAC_I d13 , d1, d6 + + add BO , BO, #32 + 
add AO , AO, #16 + +.endm + + + +.macro KERNEL1x2_M1 + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + fldd d6 , [ BO, #16 ] + fldd d7 , [ BO, #24 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d12 , d0, d6 + KMAC_R d12 , d1, d7 + fmacd d13 , d0, d7 + KMAC_I d13 , d1, d6 + + add BO , BO, #32 + add AO , AO, #16 + +.endm + +.macro KERNEL1x2_M2 + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + fldd d6 , [ BO, #16 ] + fldd d7 , [ BO, #24 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d12 , d0, d6 + KMAC_R d12 , d1, d7 + fmacd d13 , d0, d7 + KMAC_I d13 , d1, d6 + + add BO , BO, #32 + add AO , AO, #16 + + +.endm + + +.macro KERNEL1x2_E + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + fldd d6 , [ BO, #16 ] + fldd d7 , [ BO, #24 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d12 , d0, d6 + KMAC_R d12 , d1, d7 + fmacd d13 , d0, d7 + KMAC_I d13 , d1, d6 + + add BO , BO, #32 + add AO , AO, #16 + +.endm + +.macro KERNEL1x2_SUB + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + fldd d6 , [ BO, #16 ] + fldd d7 , [ BO, #24 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d12 , d0, d6 + KMAC_R d12 , d1, d7 + fmacd d13 , d0, d7 + KMAC_I d13 , d1, d6 + + add BO , BO, #32 + add AO , AO, #16 + + +.endm + + +.macro SAVE1x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + vsub.f64 d4, d4 , d4 + vsub.f64 d5, d5 , d5 + + FMAC_R1 d4 , d0 , d8 + FMAC_I1 d5 , d0 , d9 + FMAC_R2 d4 , d1 , d9 + FMAC_I2 d5 , d1 , d8 + + fstmiad CO1, { d4 - d5 } + + vsub.f64 d4, d4 , d4 + vsub.f64 d5, d5 , d5 + + FMAC_R1 d4 , d0 , d12 + FMAC_I1 d5 , d0 , d13 + FMAC_R2 d4 , d1 , d13 + FMAC_I2 d5 , d1 , d12 + + fstmiad CO2, { d4 - d5 } + + add CO1, CO1, #16 + +.endm + + +/******************************************************************************/ + +.macro INIT2x1 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9 , d8 + vmov.f64 d10, d8 + vmov.f64 d11, d8 + +.endm + +.macro KERNEL2x1_I + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmuld d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmuld d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmuld d10 , d2, d4 + KMAC_R d10 , d3, d5 + fmuld d11 , d2, d5 + KMAC_I d11 , d3, d4 + + add BO , BO, #16 + add AO , AO, #32 + +.endm + + + +.macro KERNEL2x1_M1 + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d10 , d2, d4 + KMAC_R d10 , d3, d5 + fmacd d11 , d2, d5 + KMAC_I d11 , d3, d4 + + add BO , BO, #16 + add AO , AO, #32 + +.endm + +.macro KERNEL2x1_M2 + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d10 , d2, d4 + KMAC_R d10 , d3, d5 + fmacd d11 , d2, d5 + KMAC_I d11 , d3, d4 + + add BO , BO, #16 + add AO , AO, #32 + + +.endm + + +.macro KERNEL2x1_E + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 
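+	// with the next three ops this forms one complex MAC:
+	// d8 += d0*d4 - d1*d5 (Re), d9 += d0*d5 + d1*d4 (Im), signs per KMAC_R/KMAC_I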
+ KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d10 , d2, d4 + KMAC_R d10 , d3, d5 + fmacd d11 , d2, d5 + KMAC_I d11 , d3, d4 + + add BO , BO, #16 + add AO , AO, #32 + +.endm + +.macro KERNEL2x1_SUB + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d10 , d2, d4 + KMAC_R d10 , d3, d5 + fmacd d11 , d2, d5 + KMAC_I d11 , d3, d4 + + add BO , BO, #16 + add AO , AO, #32 + + +.endm + + +.macro SAVE2x1 + + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + vsub.f64 d4, d4 , d4 + vsub.f64 d5, d5 , d5 + vsub.f64 d6, d6 , d6 + vsub.f64 d7, d7 , d7 + + FMAC_R1 d4 , d0 , d8 + FMAC_I1 d5 , d0 , d9 + FMAC_R2 d4 , d1 , d9 + FMAC_I2 d5 , d1 , d8 + + FMAC_R1 d6 , d0 , d10 + FMAC_I1 d7 , d0 , d11 + FMAC_R2 d6 , d1 , d11 + FMAC_I2 d7 , d1 , d10 + + fstmiad CO1, { d4 - d7 } + + add CO1, CO1, #32 + +.endm + + +/******************************************************************************/ + +.macro INIT1x1 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9 , d8 + +.endm + +.macro KERNEL1x1_I + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmuld d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmuld d9 , d0, d5 + KMAC_I d9 , d1, d4 + + add BO , BO, #16 + add AO , AO, #16 + +.endm + + + +.macro KERNEL1x1_M1 + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + add BO , BO, #16 + add AO , AO, #16 + +.endm + +.macro KERNEL1x1_M2 + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + add BO , BO, #16 + add AO , AO, #16 + + +.endm + + +.macro KERNEL1x1_E + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + add BO , BO, #16 + add AO , AO, #16 + +.endm + +.macro KERNEL1x1_SUB + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + add BO , BO, #16 + add AO , AO, #16 + + +.endm + + +.macro SAVE1x1 + + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + vsub.f64 d4, d4 , d4 + vsub.f64 d5, d5 , d5 + + FMAC_R1 d4 , d0 , d8 + FMAC_I1 d5 , d0 , d9 + FMAC_R2 d4 , d1 , d9 + FMAC_I2 d5 , d1 , d8 + + fstmiad CO1, { d4 - d5 } + + add CO1, CO1, #16 + +.endm + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA_R, ALPHA_R + vstr OLD_ALPHA_I, ALPHA_I + + sub r3, fp, #128 + vstm r3, { d8 - d15} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #4 // ldc = ldc * 8 * 2 + str r3, LDC + + ldr r3, OFFSET +#ifndef LEFT + neg r3 , r3 +#endif + str r3 , KK + + ldr BC, B + + ldr J, N + asrs J, J, #1 // J = J / 2 + ble _L1_BEGIN + +_L2_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK 
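+	// left-sided trmm: restart KK at OFFSET for each new panel of C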
+#endif + + + ldr AO, A // AO = A + pld [AO , #A_PRE-64] + pld [AO , #A_PRE-32] + + + +_L2_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble _L2_M1_BEGIN + +_L2_M2_20: + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #2 // number of values in AO +#else + add K1, K1, #2 // number of values in BO +#endif + str K1, KKK +#endif + + + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt _L2_M2_30 + .align 5 + + + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + sub L, L, #2 + +_L2_M2_22: + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + subs L, L, #1 + bgt _L2_M2_22 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + + +_L2_M2_30: + tst L, #3 + ble _L2_M2_40 + + tst L, #2 + ble _L2_M2_32 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + +_L2_M2_32: + + tst L, #1 + ble _L2_M2_40 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + + +_L2_M2_40: + + INIT2x2 + + +_L2_M2_44: + + ands L , K1, #7 // L = L % 8 + ble _L2_M2_100 + +_L2_M2_46: + + KERNEL2x2_SUB + + subs L, L, #1 + bne _L2_M2_46 + +_L2_M2_100: + + SAVE2x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + + + +_L2_M2_END: + + subs I, I, #1 + bne _L2_M2_20 + + +_L2_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble _L2_END + +_L2_M1_20: + + INIT1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #1 // number of values in AO +#else + add K1, K1, #2 // number of values in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + ble _L2_M1_40 + +_L2_M1_22: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_22 + + +_L2_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M1_100 + +_L2_M1_42: + + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_42 + +_L2_M1_100: + + SAVE1x2 + +#if 
(defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #1 // number of values in AO + str r3 , KK +#endif + + + +_L2_END: + + mov r3, BC + ldr r4, K + lsl r4, r4, #5 // k * 2 * 8 * 2 + add r3, r3, r4 // B = B + K * 4 * 8 + mov BC, r3 + +#if !defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in BO + str r3 , KK +#endif + + + subs J , #1 // j-- + bgt _L2_BEGIN + + + +/*********************************************************************************************/ + +_L1_BEGIN: + + ldr J , N + tst J , #1 + ble _L999 + + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + + ldr AO, A // AO = A + +_L1_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble _L1_M1_BEGIN + +_L1_M2_20: + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #2 // number of values in AO +#else + add K1, K1, #1 // number of values in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt _L1_M2_30 + .align 5 + + + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + sub L, L, #2 + +_L1_M2_22: + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + subs L, L, #1 + bgt _L1_M2_22 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + + +_L1_M2_30: + tst L, #3 + ble _L1_M2_40 + + tst L, #2 + ble _L1_M2_32 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + +_L1_M2_32: + + tst L, #1 + ble _L1_M2_40 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + + +_L1_M2_40: + + INIT2x1 + + +_L1_M2_44: + + ands L , K1, #7 // L = L % 8 + ble _L1_M2_100 + +_L1_M2_46: + + KERNEL2x1_SUB + + subs L, L, #1 + bne _L1_M2_46 + +_L1_M2_100: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + + +_L1_M2_END: + + subs I, I, #1 + bne _L1_M2_20 + + +_L1_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble _L1_END + +_L1_M1_20: + + INIT1x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr 
r3 , KK + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #1 // number of values in AO +#else + add K1, K1, #1 // number of values in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + ble _L1_M1_40 + +_L1_M1_22: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_22 + + +_L1_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M1_100 + +_L1_M1_42: + + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_42 + +_L1_M1_100: + + SAVE1x1 + + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #1 // number of values in AO + str r3 , KK +#endif + + + +_L1_END: + + + +_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + From 86afb47e8365f012f1c0fbd8fa6c1a639aa7a8fb Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 28 Nov 2013 14:35:07 +0100 Subject: [PATCH 096/127] added optimized ctrmm kernel for ARMV6 --- kernel/arm/KERNEL.ARMV6 | 2 +- kernel/arm/ctrmm_kernel_2x2_vfp.S | 1455 +++++++++++++++++++++++++++++ 2 files changed, 1456 insertions(+), 1 deletion(-) create mode 100644 kernel/arm/ctrmm_kernel_2x2_vfp.S diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 9ee6070fd..e79ea65d1 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -82,7 +82,7 @@ ZGEMVTKERNEL = zgemv_t.c STRMMKERNEL = strmm_kernel_4x2_vfp.S DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S -CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S ZTRMMKERNEL = ztrmm_kernel_2x2_vfp.S SGEMMKERNEL = sgemm_kernel_4x2_vfp.S diff --git a/kernel/arm/ctrmm_kernel_2x2_vfp.S b/kernel/arm/ctrmm_kernel_2x2_vfp.S new file mode 100644 index 000000000..a68434f97 --- /dev/null +++ b/kernel/arm/ctrmm_kernel_2x2_vfp.S @@ -0,0 +1,1455 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/10/16 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA_R s0 +#define OLD_ALPHA_I s1 + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define KKK [fp, #-240] +#define KK [fp, #-244 ] +#define A [fp, #-248 ] +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] + +#define ALPHA_I [fp, #-272] +#define ALPHA_R [fp, #-280] + +#define B [fp, #4 ] +#define C [fp, #8 ] +#define OLD_LDC [fp, #12 ] +#define OFFSET [fp, #16 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 + +#define A_PRE 96 +#define B_PRE 96 +#define C_PRE 64 + + + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + + #define KMAC_R fnmacs + #define KMAC_I fmacs + + #define FMAC_R1 fmacs + #define FMAC_R2 fnmacs + #define FMAC_I1 fmacs + #define FMAC_I2 fmacs + +#elif defined(CN) || defined(CT) + + #define KMAC_R fmacs + #define KMAC_I fnmacs + + #define FMAC_R1 fmacs + #define FMAC_R2 fnmacs + #define FMAC_I1 fmacs + #define FMAC_I2 fmacs + +#elif defined(NC) || defined(TC) + + #define KMAC_R fmacs + #define KMAC_I fnmacs + + #define FMAC_R1 fmacs + #define FMAC_R2 fmacs + #define FMAC_I1 fnmacs + #define FMAC_I2 fmacs + +#else + + #define KMAC_R fnmacs + #define KMAC_I fmacs + + #define FMAC_R1 fmacs + #define FMAC_R2 fmacs + #define FMAC_I1 fnmacs + #define FMAC_I2 fmacs + +#endif + + +.macro INIT2x2 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9 , s8 + vmov.f32 s10, s8 + vmov.f32 s11, s8 + vmov.f32 s12, s8 + vmov.f32 s13, s8 + vmov.f32 s14, s8 + vmov.f32 s15, s8 + +.endm + +.macro KERNEL2x2_I + + pld [ AO, #A_PRE ] + fldmias AO!, { s0 - s3 } + pld [ BO, #B_PRE ] + fldmias BO!, { s4 - s7 } + + + fmuls s8 , s0, s4 + fmuls s9 , s0, s5 + fmuls s10 , s2, s4 + fmuls s11 , s2, s5 + + KMAC_R s8 , s1, s5 + KMAC_I s9 , s1, s4 + KMAC_R s10 , s3, s5 + KMAC_I s11 , s3, s4 + + fmuls s12 , s0, s6 + fmuls s13 , s0, s7 + fmuls s14 , s2, s6 + fmuls s15 , s2, s7 + + KMAC_R s12 , s1, s7 + KMAC_I s13 , s1, s6 + KMAC_R s14 , s3, s7 + KMAC_I s15 , s3, s6 + +.endm + + + +.macro KERNEL2x2_M1 + + pld [ AO, #A_PRE ] + fldmias AO!, { s0 - s3 } + pld [ BO, #B_PRE ] + fldmias BO!, { s4 - s7 } + + fmacs s8 , s0, s4 + 
fmacs s9 , s0, s5 + fmacs s10 , s2, s4 + fmacs s11 , s2, s5 + + KMAC_R s8 , s1, s5 + KMAC_I s9 , s1, s4 + KMAC_R s10 , s3, s5 + KMAC_I s11 , s3, s4 + + fmacs s12 , s0, s6 + fmacs s13 , s0, s7 + fmacs s14 , s2, s6 + fmacs s15 , s2, s7 + + KMAC_R s12 , s1, s7 + KMAC_I s13 , s1, s6 + KMAC_R s14 , s3, s7 + KMAC_I s15 , s3, s6 + +.endm + +.macro KERNEL2x2_M2 + + fldmias AO!, { s0 - s3 } + fldmias BO!, { s4 - s7 } + + fmacs s8 , s0, s4 + fmacs s9 , s0, s5 + fmacs s10 , s2, s4 + fmacs s11 , s2, s5 + + KMAC_R s8 , s1, s5 + KMAC_I s9 , s1, s4 + KMAC_R s10 , s3, s5 + KMAC_I s11 , s3, s4 + + fmacs s12 , s0, s6 + fmacs s13 , s0, s7 + fmacs s14 , s2, s6 + fmacs s15 , s2, s7 + + KMAC_R s12 , s1, s7 + KMAC_I s13 , s1, s6 + KMAC_R s14 , s3, s7 + KMAC_I s15 , s3, s6 + + +.endm + + +.macro KERNEL2x2_E + + fldmias AO!, { s0 - s3 } + fldmias BO!, { s4 - s7 } + + fmacs s8 , s0, s4 + fmacs s9 , s0, s5 + fmacs s10 , s2, s4 + fmacs s11 , s2, s5 + + KMAC_R s8 , s1, s5 + KMAC_I s9 , s1, s4 + KMAC_R s10 , s3, s5 + KMAC_I s11 , s3, s4 + + fmacs s12 , s0, s6 + fmacs s13 , s0, s7 + fmacs s14 , s2, s6 + fmacs s15 , s2, s7 + + KMAC_R s12 , s1, s7 + KMAC_I s13 , s1, s6 + KMAC_R s14 , s3, s7 + KMAC_I s15 , s3, s6 + + +.endm + +.macro KERNEL2x2_SUB + + fldmias AO!, { s0 - s3 } + fldmias BO!, { s4 - s7 } + + fmacs s8 , s0, s4 + fmacs s9 , s0, s5 + fmacs s10 , s2, s4 + fmacs s11 , s2, s5 + + KMAC_R s8 , s1, s5 + KMAC_I s9 , s1, s4 + KMAC_R s10 , s3, s5 + KMAC_I s11 , s3, s4 + + fmacs s12 , s0, s6 + fmacs s13 , s0, s7 + fmacs s14 , s2, s6 + fmacs s15 , s2, s7 + + KMAC_R s12 , s1, s7 + KMAC_I s13 , s1, s6 + KMAC_R s14 , s3, s7 + KMAC_I s15 , s3, s6 + + +.endm + +.macro SAVE2x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + flds s0, ALPHA_R + flds s1, ALPHA_I + + vsub.f32 s4, s4, s4 + vsub.f32 s5, s5, s5 + vsub.f32 s6, s6, s6 + vsub.f32 s7, s7, s7 + + FMAC_R1 s4 , s0 , s8 + FMAC_I1 s5 , s0 , s9 + FMAC_R2 s4 , s1 , s9 + FMAC_I2 s5 , s1 , s8 + + FMAC_R1 s6 , s0 , s10 + FMAC_I1 s7 , s0 , s11 + FMAC_R2 s6 , s1 , s11 + FMAC_I2 s7 , s1 , s10 + + fstmias CO1, { s4 - s7 } + + vsub.f32 s4, s4, s4 + vsub.f32 s5, s5, s5 + vsub.f32 s6, s6, s6 + vsub.f32 s7, s7, s7 + + FMAC_R1 s4 , s0 , s12 + FMAC_I1 s5 , s0 , s13 + FMAC_R2 s4 , s1 , s13 + FMAC_I2 s5 , s1 , s12 + + FMAC_R1 s6 , s0 , s14 + FMAC_I1 s7 , s0 , s15 + FMAC_R2 s6 , s1 , s15 + FMAC_I2 s7 , s1 , s14 + + fstmias CO2, { s4 - s7 } + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + +.macro INIT1x2 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9 , s8 + vmov.f32 s12, s8 + vmov.f32 s13, s8 + +.endm + +.macro KERNEL1x2_I + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + flds s6 , [ BO, #8 ] + flds s7 , [ BO, #12 ] + + fmuls s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmuls s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmuls s12 , s0, s6 + KMAC_R s12 , s1, s7 + fmuls s13 , s0, s7 + KMAC_I s13 , s1, s6 + + add BO , BO, #16 + add AO , AO, #8 + +.endm + + + +.macro KERNEL1x2_M1 + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + flds s6 , [ BO, #8 ] + flds s7 , [ BO, #12 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmacs s12 , s0, s6 + KMAC_R s12 , s1, s7 + fmacs s13 , s0, s7 + KMAC_I s13 , s1, s6 + + add BO , BO, #16 + add AO , AO, #8 + +.endm + +.macro KERNEL1x2_M2 + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + flds s6 , [ BO, #8 ] + flds s7 , [ BO, #12 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, 
s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmacs s12 , s0, s6 + KMAC_R s12 , s1, s7 + fmacs s13 , s0, s7 + KMAC_I s13 , s1, s6 + + add BO , BO, #16 + add AO , AO, #8 + + +.endm + + +.macro KERNEL1x2_E + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + flds s6 , [ BO, #8 ] + flds s7 , [ BO, #12 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmacs s12 , s0, s6 + KMAC_R s12 , s1, s7 + fmacs s13 , s0, s7 + KMAC_I s13 , s1, s6 + + add BO , BO, #16 + add AO , AO, #8 + +.endm + +.macro KERNEL1x2_SUB + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + flds s6 , [ BO, #8 ] + flds s7 , [ BO, #12 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmacs s12 , s0, s6 + KMAC_R s12 , s1, s7 + fmacs s13 , s0, s7 + KMAC_I s13 , s1, s6 + + add BO , BO, #16 + add AO , AO, #8 + + +.endm + + +.macro SAVE1x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + flds s0, ALPHA_R + flds s1, ALPHA_I + + vsub.f32 s4, s4, s4 + vsub.f32 s5, s5, s5 + + FMAC_R1 s4 , s0 , s8 + FMAC_I1 s5 , s0 , s9 + FMAC_R2 s4 , s1 , s9 + FMAC_I2 s5 , s1 , s8 + + fstmias CO1, { s4 - s5 } + + vsub.f32 s4, s4, s4 + vsub.f32 s5, s5, s5 + + FMAC_R1 s4 , s0 , s12 + FMAC_I1 s5 , s0 , s13 + FMAC_R2 s4 , s1 , s13 + FMAC_I2 s5 , s1 , s12 + + fstmias CO2, { s4 - s5 } + + add CO1, CO1, #8 + +.endm + + +/******************************************************************************/ + +.macro INIT2x1 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9 , s8 + vmov.f32 s10, s8 + vmov.f32 s11, s8 + +.endm + +.macro KERNEL2x1_I + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmuls s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmuls s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmuls s10 , s2, s4 + KMAC_R s10 , s3, s5 + fmuls s11 , s2, s5 + KMAC_I s11 , s3, s4 + + add BO , BO, #8 + add AO , AO, #16 + +.endm + + + +.macro KERNEL2x1_M1 + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmacs s10 , s2, s4 + KMAC_R s10 , s3, s5 + fmacs s11 , s2, s5 + KMAC_I s11 , s3, s4 + + add BO , BO, #8 + add AO , AO, #16 + +.endm + +.macro KERNEL2x1_M2 + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmacs s10 , s2, s4 + KMAC_R s10 , s3, s5 + fmacs s11 , s2, s5 + KMAC_I s11 , s3, s4 + + add BO , BO, #8 + add AO , AO, #16 + + +.endm + + +.macro KERNEL2x1_E + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmacs s10 , s2, s4 + KMAC_R s10 , s3, s5 + fmacs s11 , s2, s5 + KMAC_I s11 , s3, s4 + + add BO , BO, #8 + add AO , AO, #16 + +.endm + +.macro KERNEL2x1_SUB + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmacs s10 , s2, s4 + KMAC_R s10 , s3, s5 + fmacs s11 , s2, s5 + KMAC_I s11 , s3, s4 + + add BO , BO, #8 + add AO , AO, #16 + + +.endm + + +.macro SAVE2x1 + + flds s0, ALPHA_R + 
flds s1, ALPHA_I + + vsub.f32 s4, s4, s4 + vsub.f32 s5, s5, s5 + vsub.f32 s6, s6, s6 + vsub.f32 s7, s7, s7 + + FMAC_R1 s4 , s0 , s8 + FMAC_I1 s5 , s0 , s9 + FMAC_R2 s4 , s1 , s9 + FMAC_I2 s5 , s1 , s8 + + FMAC_R1 s6 , s0 , s10 + FMAC_I1 s7 , s0 , s11 + FMAC_R2 s6 , s1 , s11 + FMAC_I2 s7 , s1 , s10 + + fstmias CO1, { s4 - s7 } + + add CO1, CO1, #16 + +.endm + + +/******************************************************************************/ + +.macro INIT1x1 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9 , s8 + +.endm + +.macro KERNEL1x1_I + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmuls s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmuls s9 , s0, s5 + KMAC_I s9 , s1, s4 + + add BO , BO, #8 + add AO , AO, #8 + +.endm + + + +.macro KERNEL1x1_M1 + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + add BO , BO, #8 + add AO , AO, #8 + +.endm + +.macro KERNEL1x1_M2 + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + add BO , BO, #8 + add AO , AO, #8 + + +.endm + + +.macro KERNEL1x1_E + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + add BO , BO, #8 + add AO , AO, #8 + +.endm + +.macro KERNEL1x1_SUB + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + add BO , BO, #8 + add AO , AO, #8 + + +.endm + + +.macro SAVE1x1 + + flds s0, ALPHA_R + flds s1, ALPHA_I + + vsub.f32 s4, s4, s4 + vsub.f32 s5, s5, s5 + + FMAC_R1 s4 , s0 , s8 + FMAC_I1 s5 , s0 , s9 + FMAC_R2 s4 , s1 , s9 + FMAC_I2 s5 , s1 , s8 + + fstmias CO1, { s4 - s5 } + + add CO1, CO1, #8 + +.endm + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA_R, ALPHA_R + vstr OLD_ALPHA_I, ALPHA_I + + sub r3, fp, #128 + vstm r3, { s8 - s15} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #3 // ldc = ldc * 4 * 2 + str r3, LDC + + ldr r3, OFFSET +#ifndef LEFT + neg r3 , r3 +#endif + str r3 , KK + + ldr BC, B + + ldr J, N + asrs J, J, #1 // J = J / 2 + ble _L1_BEGIN + +_L2_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + ldr AO, A // AO = A + pld [AO , #A_PRE-64] + pld [AO , #A_PRE-32] + + + +_L2_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble _L2_M1_BEGIN + +_L2_M2_20: + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef 
LEFT + add K1, K1, #2 // number of values in AO +#else + add K1, K1, #2 // number of values in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt _L2_M2_30 + .align 5 + + + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + sub L, L, #2 + +_L2_M2_22: + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + subs L, L, #1 + bgt _L2_M2_22 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + + +_L2_M2_30: + tst L, #3 + ble _L2_M2_40 + + tst L, #2 + ble _L2_M2_32 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + +_L2_M2_32: + + tst L, #1 + ble _L2_M2_40 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + + +_L2_M2_40: + + INIT2x2 + + +_L2_M2_44: + + ands L , K1, #7 // L = L % 8 + ble _L2_M2_100 + +_L2_M2_46: + + KERNEL2x2_SUB + + subs L, L, #1 + bne _L2_M2_46 + +_L2_M2_100: + + SAVE2x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + +_L2_M2_END: + + subs I, I, #1 + bne _L2_M2_20 + + +_L2_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble _L2_END + +_L2_M1_20: + + INIT1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add BO , BO , r4 + lsls r4 , r3 , #3 // 1 * 4 * 2 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #1 // number of values in AO +#else + add K1, K1, #2 // number of values in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + ble _L2_M1_40 + +_L2_M1_22: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_22 + + +_L2_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M1_100 + +_L2_M1_42: + + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_42 + +_L2_M1_100: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add BO , BO , r4 + lsls r4 , r3 , #3 // 1 * 4 * 2 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #1 // number of values in AO + str r3 , KK +#endif + + + +_L2_END: + + mov r3, BC + ldr r4, K + lsl r4, r4, #4 // k * 2 * 4 * 2 + add r3, r3, r4 // B = B + K * 2 * 8 + mov BC, r3 + +#if !defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in BO + str r3 , KK +#endif + + subs J , #1 // j-- + bgt _L2_BEGIN + + + 
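+/* All column pairs of B are done: BC was advanced by K * 2 complex floats
+   (K * 16 bytes) per pair, and for the non-LEFT case KK grew by 2 per
+   pair, tracking the values of BO already consumed. */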
+/*********************************************************************************************/ + +_L1_BEGIN: + + ldr J , N + tst J , #1 + ble _L999 + + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + ldr AO, A // AO = A + +_L1_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble _L1_M1_BEGIN + +_L1_M2_20: + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 1 * 4 * 2 float values + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #2 // number of values in AO +#else + add K1, K1, #1 // number of values in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt _L1_M2_30 + .align 5 + + + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + sub L, L, #2 + +_L1_M2_22: + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + subs L, L, #1 + bgt _L1_M2_22 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + + +_L1_M2_30: + tst L, #3 + ble _L1_M2_40 + + tst L, #2 + ble _L1_M2_32 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + +_L1_M2_32: + + tst L, #1 + ble _L1_M2_40 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + + +_L1_M2_40: + + INIT2x1 + + +_L1_M2_44: + + ands L , K1, #7 // L = L % 8 + ble _L1_M2_100 + +_L1_M2_46: + + KERNEL2x1_SUB + + subs L, L, #1 + bne _L1_M2_46 + +_L1_M2_100: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #3 // 1 * 4 * 2 float values + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + + +_L1_M2_END: + + subs I, I, #1 + bne _L1_M2_20 + + +_L1_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble _L1_END + +_L1_M1_20: + + INIT1x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 1 * 4 * 2 float values + add BO , BO , r4 + lsls r4 , r3 , #3 // 1 * 4 * 2 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #1 // number of values in AO +#else + add K1, K1, #1 // number of values in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + ble _L1_M1_40 + +_L1_M1_22: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + 
KERNEL1x1_SUB + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_22 + + +_L1_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M1_100 + +_L1_M1_42: + + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_42 + +_L1_M1_100: + + SAVE1x1 + + +_L1_END: + + + +_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s15} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + From d54a06171351f545cebca73737a102c88cb1ff74 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 28 Nov 2013 17:40:21 +0100 Subject: [PATCH 097/127] optimized gemv_n_vfp.S --- kernel/arm/gemv_n_vfp.S | 157 ++++++++++++++++++++++++++++------------ 1 file changed, 111 insertions(+), 46 deletions(-) diff --git a/kernel/arm/gemv_n_vfp.S b/kernel/arm/gemv_n_vfp.S index 47265994c..f1cf9a05e 100644 --- a/kernel/arm/gemv_n_vfp.S +++ b/kernel/arm/gemv_n_vfp.S @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2013/11/24 Saar +* 2013/11/28 Saar * BLASTEST : OK * CTEST : OK * TEST : OK @@ -74,44 +74,75 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(DOUBLE) -.macro INIT_F4 +.macro INIT_F8 pld [ YO , #Y_PRE ] + pld [ YO , #Y_PRE+32 ] - vsub.f64 d12 , d12 , d12 - vmov.f64 d13 , d12 - vmov.f64 d14 , d12 - vmov.f64 d15 , d12 + vsub.f64 d8 , d8 , d8 + vmov.f64 d9 , d8 + vmov.f64 d10 , d8 + vmov.f64 d11 , d8 + vmov.f64 d12 , d8 + vmov.f64 d13 , d8 + vmov.f64 d14 , d8 + vmov.f64 d15 , d8 .endm -.macro KERNEL_F4X4 +.macro KERNEL_F8X8 pld [ XO , #X_PRE ] - KERNEL_F4X1 - KERNEL_F4X1 - KERNEL_F4X1 - KERNEL_F4X1 + KERNEL_F8X1 + KERNEL_F8X1 + KERNEL_F8X1 + KERNEL_F8X1 + + pld [ XO , #X_PRE ] + KERNEL_F8X1 + KERNEL_F8X1 + KERNEL_F8X1 + KERNEL_F8X1 .endm -.macro KERNEL_F4X1 +.macro KERNEL_F8X1 + pld [ AO2 , #A_PRE ] fldmiad XO! , { d2 } - fldmiad AO1 , { d8 - d11 } + fldmiad AO1 , { d4 - d7 } - vmla.f64 d12 , d2 , d8 - pld [ AO2 , #A_PRE ] - vmla.f64 d13 , d2 , d9 + vmla.f64 d8 , d2 , d4 + pld [ AO2 , #4*SIZE ] + vmla.f64 d9 , d2 , d5 + add r3, AO1, #4*SIZE + vmla.f64 d10 , d2 , d6 + vmla.f64 d11 , d2 , d7 + + + fldmiad r3 , { d4 - d7 } + + vmla.f64 d12 , d2 , d4 + vmla.f64 d13 , d2 , d5 add AO1, AO1, LDA - vmla.f64 d14 , d2 , d10 - vmla.f64 d15 , d2 , d11 + vmla.f64 d14 , d2 , d6 add AO2, AO2, LDA + vmla.f64 d15 , d2 , d7 + .endm -.macro SAVE_F4 +.macro SAVE_F8 + + fldmiad YO, { d4 - d7 } + + vmla.f64 d4 , d0, d8 + vmla.f64 d5 , d0, d9 + vmla.f64 d6 , d0, d10 + vmla.f64 d7 , d0, d11 + + fstmiad YO!, { d4 - d7 } fldmiad YO, { d4 - d7 } @@ -244,43 +275,73 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else /************************* SINGLE PRECISION *****************************************/ -.macro INIT_F4 +.macro INIT_F8 pld [ YO , #Y_PRE ] - vsub.f32 s12 , s12 , s12 - vmov.f32 s13 , s12 - vmov.f32 s14 , s12 - vmov.f32 s15 , s12 + vsub.f32 s8 , s8 , s8 + vmov.f32 s9 , s8 + vmov.f32 s10 , s8 + vmov.f32 s11 , s8 + vmov.f32 s12 , s8 + vmov.f32 s13 , s8 + vmov.f32 s14 , s8 + vmov.f32 s15 , s8 .endm -.macro KERNEL_F4X4 +.macro KERNEL_F8X8 pld [ XO , #X_PRE ] - KERNEL_F4X1 - KERNEL_F4X1 - KERNEL_F4X1 - KERNEL_F4X1 + KERNEL_F8X1 + KERNEL_F8X1 + KERNEL_F8X1 + KERNEL_F8X1 + + KERNEL_F8X1 + KERNEL_F8X1 + KERNEL_F8X1 + KERNEL_F8X1 .endm -.macro KERNEL_F4X1 +.macro KERNEL_F8X1 + pld [ AO2, #A_PRE ] fldmias XO! 
, { s2 } - fldmias AO1 , { s8 - s11 } + fldmias AO1 , { s4 - s7 } + + vmla.f32 s8 , s2 , s4 + vmla.f32 s9 , s2 , s5 + vmla.f32 s10 , s2 , s6 + vmla.f32 s11 , s2 , s7 + + add r3, AO1, #4*SIZE + + fldmias r3 , { s4 - s7 } + + vmla.f32 s12 , s2 , s4 + vmla.f32 s13 , s2 , s5 + vmla.f32 s14 , s2 , s6 + vmla.f32 s15 , s2 , s7 - vmla.f32 s12 , s2 , s8 - vmla.f32 s13 , s2 , s9 - vmla.f32 s14 , s2 , s10 - vmla.f32 s15 , s2 , s11 add AO1, AO1, LDA add AO2, AO2, LDA .endm -.macro SAVE_F4 +.macro SAVE_F8 + + fldmias YO, { s4 - s7 } + + vmla.f32 s4 , s0, s8 + vmla.f32 s5 , s0, s9 + vmla.f32 s6 , s0, s10 + vmla.f32 s7 , s0, s11 + + fstmias YO!, { s4 - s7 } + fldmias YO, { s4 - s7 } @@ -332,8 +393,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S4X4 + pld [ AO2 , #A_PRE ] KERNEL_S4X1 KERNEL_S4X1 + pld [ AO2 , #A_PRE ] KERNEL_S4X1 KERNEL_S4X1 @@ -342,7 +405,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S4X1 - pld [ AO2 , #A_PRE ] fldmias XO , { s2 } fldmias AO1 , { s8 - s11 } @@ -471,27 +533,30 @@ gemvn_kernel_F4_BEGIN: ldr YO , Y ldr I, M - asrs I, I, #2 // I = M / 4 + asrs I, I, #3 // I = M / 8 ble gemvn_kernel_F1_BEGIN gemvn_kernel_F4X4: ldr AO1, A add AO2, AO1, LDA - add r3 , AO1, #4*SIZE + add r3 , AO1, #8*SIZE str r3 , A + add AO2, AO2, LDA + add AO2, AO2, LDA + ldr XO , X - INIT_F4 + INIT_F8 - asrs J, N, #2 // J = N / 4 + asrs J, N, #3 // J = N / 8 ble gemvn_kernel_F4X1 gemvn_kernel_F4X4_10: - KERNEL_F4X4 + KERNEL_F8X8 subs J, J, #1 bne gemvn_kernel_F4X4_10 @@ -499,12 +564,12 @@ gemvn_kernel_F4X4_10: gemvn_kernel_F4X1: - ands J, N , #3 + ands J, N , #7 ble gemvn_kernel_F4_END gemvn_kernel_F4X1_10: - KERNEL_F4X1 + KERNEL_F8X1 subs J, J, #1 bne gemvn_kernel_F4X1_10 @@ -512,7 +577,7 @@ gemvn_kernel_F4X1_10: gemvn_kernel_F4_END: - SAVE_F4 + SAVE_F8 subs I , I , #1 bne gemvn_kernel_F4X4 @@ -521,7 +586,7 @@ gemvn_kernel_F4_END: gemvn_kernel_F1_BEGIN: ldr I, M - ands I, I , #3 + ands I, I , #7 ble gemvn_kernel_L999 gemvn_kernel_F1X1: From 2d3c88429403f3302531352ce293583f37871bfc Mon Sep 17 00:00:00 2001 From: wernsaar Date: Fri, 29 Nov 2013 17:06:33 +0100 Subject: [PATCH 098/127] added complex gemv kernels for ARMV6 and ARMV7 --- kernel/arm/KERNEL.ARMV6 | 8 +- kernel/arm/KERNEL.ARMV7 | 18 +- kernel/arm/cgemv_n_vfp.S | 697 ++++++++++++++++++++++++++++++++++++++ kernel/arm/cgemv_t_vfp.S | 607 +++++++++++++++++++++++++++++++++ kernel/arm/zgemv_n_vfp.S | 699 +++++++++++++++++++++++++++++++++++++++ kernel/arm/zgemv_t_vfp.S | 608 ++++++++++++++++++++++++++++++++++ 6 files changed, 2624 insertions(+), 13 deletions(-) create mode 100644 kernel/arm/cgemv_n_vfp.S create mode 100644 kernel/arm/cgemv_t_vfp.S create mode 100644 kernel/arm/zgemv_n_vfp.S create mode 100644 kernel/arm/zgemv_t_vfp.S diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index e79ea65d1..f47a843f3 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -72,13 +72,13 @@ ZSWAPKERNEL = swap_vfp.S SGEMVNKERNEL = gemv_n_vfp.S DGEMVNKERNEL = gemv_n_vfp.S -CGEMVNKERNEL = zgemv_n.c -ZGEMVNKERNEL = zgemv_n.c +CGEMVNKERNEL = cgemv_n_vfp.S +ZGEMVNKERNEL = zgemv_n_vfp.S SGEMVTKERNEL = gemv_t_vfp.S DGEMVTKERNEL = gemv_t_vfp.S -CGEMVTKERNEL = zgemv_t.c -ZGEMVTKERNEL = zgemv_t.c +CGEMVTKERNEL = cgemv_t_vfp.S +ZGEMVTKERNEL = zgemv_t_vfp.S STRMMKERNEL = strmm_kernel_4x2_vfp.S DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index 143327074..507f9813c 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ 
b/kernel/arm/KERNEL.ARMV7 @@ -70,15 +70,15 @@ DSCALKERNEL = scal_vfp.S CSCALKERNEL = scal_vfp.S ZSCALKERNEL = scal_vfp.S -SGEMVNKERNEL = gemv_n_vfpv3.S -DGEMVNKERNEL = gemv_n_vfpv3.S -CGEMVNKERNEL = zgemv_n.c -ZGEMVNKERNEL = zgemv_n.c - -SGEMVTKERNEL = gemv_t_vfpv3.S -DGEMVTKERNEL = gemv_t_vfpv3.S -CGEMVTKERNEL = zgemv_t.c -ZGEMVTKERNEL = zgemv_t.c +SGEMVNKERNEL = gemv_n_vfp.S +DGEMVNKERNEL = gemv_n_vfp.S +CGEMVNKERNEL = cgemv_n_vfp.S +ZGEMVNKERNEL = zgemv_n_vfp.S + +SGEMVTKERNEL = gemv_t_vfp.S +DGEMVTKERNEL = gemv_t_vfp.S +CGEMVTKERNEL = cgemv_t_vfp.S +ZGEMVTKERNEL = zgemv_t_vfp.S STRMMKERNEL = strmm_kernel_4x4_vfpv3.S DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S diff --git a/kernel/arm/cgemv_n_vfp.S b/kernel/arm/cgemv_n_vfp.S new file mode 100644 index 000000000..522c4c764 --- /dev/null +++ b/kernel/arm/cgemv_n_vfp.S @@ -0,0 +1,697 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/29 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_LDA [fp, #0 ] +#define X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] +#define OLD_A r3 +#define OLD_M r0 + +#define AO1 r0 +#define N r1 +#define J r2 + +#define AO2 r4 +#define XO r5 +#define YO r6 +#define LDA r7 +#define INC_X r8 +#define INC_Y r9 + +#define I r12 + +#define ALPHA_I [fp, #-236] +#define ALPHA_R [fp, #-244] + +#define M [fp, #-252 ] +#define A [fp, #-256 ] + + +#define X_PRE 64 +#define Y_PRE 0 +#define A_PRE 0 + +/**************************************************************************************/ + +#if !defined(CONJ) && !defined(XCONJ) + + #define KMAC_R fnmacs + #define KMAC_I fmacs + + #define FMAC_R1 fmacs + #define FMAC_R2 fnmacs + #define FMAC_I1 fmacs + #define FMAC_I2 fmacs + +#elif defined(CONJ) && !defined(XCONJ) + + #define KMAC_R fmacs + #define KMAC_I fnmacs + + #define FMAC_R1 fmacs + #define FMAC_R2 fnmacs + #define FMAC_I1 fmacs + #define FMAC_I2 fmacs + +#elif !defined(CONJ) && defined(XCONJ) + + #define KMAC_R fmacs + #define KMAC_I fnmacs + + #define FMAC_R1 fmacs + #define FMAC_R2 fmacs + #define FMAC_I1 fnmacs + #define FMAC_I2 fmacs + +#else + + #define KMAC_R fnmacs + #define KMAC_I fmacs + + #define FMAC_R1 fmacs + #define FMAC_R2 fmacs + #define FMAC_I1 fnmacs + #define FMAC_I2 fmacs + +#endif + +.macro INIT_F4 + + pld [ YO, #Y_PRE ] + vsub.f32 s8 , s8 , s8 + vmov.f32 s9 , s8 + vmov.f32 s10, s8 + vmov.f32 s11, s8 + vmov.f32 s12, s8 + vmov.f32 s13, s8 + vmov.f32 s14, s8 + vmov.f32 s15, s8 + +.endm + +.macro KERNEL_F4X4 + + pld [ XO, #X_PRE ] + KERNEL_F4X1 + KERNEL_F4X1 + KERNEL_F4X1 + KERNEL_F4X1 + +.endm + +.macro KERNEL_F4X1 + + pld [ AO2, #A_PRE ] + flds s0 , [ AO1 ] + flds s1 , [ AO1, #4 ] + flds s2 , [ AO1, #8 ] + flds s3 , [ AO1, #12 ] + + flds s4 , [ XO ] + flds s5 , [ XO, #4 ] + + fmacs s8 , s0, s4 + fmacs s9 , s0, s5 + fmacs s10 , s2, s4 + fmacs s11 , s2, s5 + + KMAC_R s8 , s1, s5 + KMAC_I s9 , s1, s4 + KMAC_R s10 , s3, s5 + KMAC_I s11 , s3, s4 + + flds s0 , [ AO1, #16 ] + flds s1 , [ AO1, #20 ] + flds s2 , [ AO1, #24 ] + flds s3 , [ AO1, #28 ] + + fmacs s12 , s0, s4 + fmacs s13 , s0, s5 + fmacs s14 , s2, s4 + fmacs s15 , s2, s5 + + KMAC_R s12 , s1, s5 + KMAC_I s13 , s1, s4 + KMAC_R s14 , s3, s5 + KMAC_I s15 , s3, s4 + + add XO , XO, #8 + add AO1 , AO1, LDA + add AO2 , AO2, LDA + +.endm + +.macro SAVE_F4 + + flds s0, ALPHA_R + flds s1, ALPHA_I + + fldmias YO, { s4 - s7 } + + FMAC_R1 s4 , s0 , s8 + FMAC_I1 s5 , s0 , s9 + FMAC_R2 s4 , s1 , s9 + FMAC_I2 s5 , s1 , s8 + + FMAC_R1 s6 , s0 , s10 + FMAC_I1 s7 , s0 , s11 + FMAC_R2 s6 , s1 , s11 + FMAC_I2 s7 , s1 , s10 + + fstmias YO!, { s4 - s7 } + + fldmias YO, { s4 - s7 } + + FMAC_R1 s4 , s0 , s12 + FMAC_I1 s5 , s0 , s13 + FMAC_R2 s4 , s1 , s13 + FMAC_I2 s5 , s1 , s12 + + FMAC_R1 s6 , s0 , s14 + FMAC_I1 s7 , s0 , s15 + FMAC_R2 s6 , s1 , s15 + FMAC_I2 s7 , s1 , s14 + + fstmias YO!, { s4 - s7 } + +.endm + + + + +.macro INIT_F1 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9 , s8 + +.endm + +.macro KERNEL_F1X1 + + flds s0 , [ AO1 ] + flds s1 , [ AO1, #4 ] + + flds s4 , [ XO ] + flds s5 , [ XO, #4 ] + + fmacs s8 , s0, s4 + fmacs s9 , s0, s5 + + KMAC_R s8 , 
s1, s5 + KMAC_I s9 , s1, s4 + + add XO , XO, #8 + add AO1 , AO1, LDA + + +.endm + +.macro SAVE_F1 + + flds s0, ALPHA_R + flds s1, ALPHA_I + + fldmias YO, { s4 - s5 } + + FMAC_R1 s4 , s0 , s8 + FMAC_I1 s5 , s0 , s9 + FMAC_R2 s4 , s1 , s9 + FMAC_I2 s5 , s1 , s8 + + fstmias YO, { s4 - s5 } + + add YO, YO, #8 + +.endm + +/****************************************************************************************/ + +.macro INIT_S4 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9 , s8 + vmov.f32 s10, s8 + vmov.f32 s11, s8 + vmov.f32 s12, s8 + vmov.f32 s13, s8 + vmov.f32 s14, s8 + vmov.f32 s15, s8 + +.endm + +.macro KERNEL_S4X4 + + KERNEL_S4X1 + KERNEL_S4X1 + KERNEL_S4X1 + KERNEL_S4X1 + +.endm + +.macro KERNEL_S4X1 + + flds s0 , [ AO1 ] + flds s1 , [ AO1, #4 ] + flds s2 , [ AO1, #8 ] + flds s3 , [ AO1, #12 ] + + flds s4 , [ XO ] + flds s5 , [ XO, #4 ] + + fmacs s8 , s0, s4 + fmacs s9 , s0, s5 + fmacs s10 , s2, s4 + fmacs s11 , s2, s5 + + KMAC_R s8 , s1, s5 + KMAC_I s9 , s1, s4 + KMAC_R s10 , s3, s5 + KMAC_I s11 , s3, s4 + + flds s0 , [ AO1, #16 ] + flds s1 , [ AO1, #20 ] + flds s2 , [ AO1, #24 ] + flds s3 , [ AO1, #28 ] + + fmacs s12 , s0, s4 + fmacs s13 , s0, s5 + fmacs s14 , s2, s4 + fmacs s15 , s2, s5 + + KMAC_R s12 , s1, s5 + KMAC_I s13 , s1, s4 + KMAC_R s14 , s3, s5 + KMAC_I s15 , s3, s4 + + add XO , XO, INC_X + add AO1 , AO1, LDA + add AO2 , AO2, LDA + +.endm + +.macro SAVE_S4 + + flds s0, ALPHA_R + flds s1, ALPHA_I + + fldmias YO, { s4 - s5 } + + FMAC_R1 s4 , s0 , s8 + FMAC_I1 s5 , s0 , s9 + FMAC_R2 s4 , s1 , s9 + FMAC_I2 s5 , s1 , s8 + + fstmias YO, { s4 - s5 } + + add YO, YO, INC_Y + + fldmias YO, { s6 - s7 } + + FMAC_R1 s6 , s0 , s10 + FMAC_I1 s7 , s0 , s11 + FMAC_R2 s6 , s1 , s11 + FMAC_I2 s7 , s1 , s10 + + fstmias YO, { s6 - s7 } + + add YO, YO, INC_Y + + fldmias YO, { s4 - s5 } + + FMAC_R1 s4 , s0 , s12 + FMAC_I1 s5 , s0 , s13 + FMAC_R2 s4 , s1 , s13 + FMAC_I2 s5 , s1 , s12 + + fstmias YO, { s4 - s5 } + + add YO, YO, INC_Y + + fldmias YO, { s6 - s7 } + + FMAC_R1 s6 , s0 , s14 + FMAC_I1 s7 , s0 , s15 + FMAC_R2 s6 , s1 , s15 + FMAC_I2 s7 , s1 , s14 + + fstmias YO, { s6 - s7 } + + add YO, YO, INC_Y + +.endm + + + + +.macro INIT_S1 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9 , s8 + +.endm + +.macro KERNEL_S1X1 + + flds s0 , [ AO1 ] + flds s1 , [ AO1, #4 ] + + flds s4 , [ XO ] + flds s5 , [ XO, #4 ] + + fmacs s8 , s0, s4 + fmacs s9 , s0, s5 + + KMAC_R s8 , s1, s5 + KMAC_I s9 , s1, s4 + + add XO , XO, INC_X + add AO1 , AO1, LDA + + +.endm + +.macro SAVE_S1 + + flds s0, ALPHA_R + flds s1, ALPHA_I + + fldmias YO, { s4 - s5 } + + FMAC_R1 s4 , s0 , s8 + FMAC_I1 s5 , s0 , s9 + FMAC_R2 s4 , s1 , s9 + FMAC_I2 s5 , s1 , s8 + + fstmias YO, { s4 - s5 } + + add YO, YO, INC_Y + +.endm + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + push {r4 - r9 , fp} + add fp, sp, #28 + sub sp, sp, #STACKSIZE // reserve stack + + sub r12, fp, #192 + +#if defined(DOUBLE) + vstm r12, { d8 - d15 } // store floating point registers +#else + vstm r12, { s8 - s15 } // store floating point registers +#endif + + cmp OLD_M, #0 + ble cgemvn_kernel_L999 + + cmp N, #0 + ble cgemvn_kernel_L999 + + str OLD_A, A + str OLD_M, M + vstr s0 , ALPHA_R + vstr s1 , ALPHA_I + + + ldr INC_X , OLD_INC_X + ldr INC_Y , OLD_INC_Y + + cmp INC_X, #0 + beq cgemvn_kernel_L999 + + cmp INC_Y, #0 + beq cgemvn_kernel_L999 + + ldr LDA, OLD_LDA + + +#if defined(DOUBLE) + lsl LDA, 
LDA, #4 // LDA * SIZE * 2 +#else + lsl LDA, LDA, #3 // LDA * SIZE * 2 +#endif + + cmp INC_X, #1 + bne cgemvn_kernel_S4_BEGIN + + cmp INC_Y, #1 + bne cgemvn_kernel_S4_BEGIN + + +cgemvn_kernel_F4_BEGIN: + + ldr YO , Y + + ldr I, M + asrs I, I, #2 // I = M / 4 + ble cgemvn_kernel_F1_BEGIN + +cgemvn_kernel_F4X4: + + ldr AO1, A + add AO2, AO1, LDA + add r3 , AO1, #32 + str r3 , A + + add AO2, AO2, LDA + add AO2, AO2, LDA + + ldr XO , X + + INIT_F4 + + asrs J, N, #2 // J = N / 4 + ble cgemvn_kernel_F4X1 + + +cgemvn_kernel_F4X4_10: + + KERNEL_F4X4 + + subs J, J, #1 + bne cgemvn_kernel_F4X4_10 + + +cgemvn_kernel_F4X1: + + ands J, N , #3 + ble cgemvn_kernel_F4_END + +cgemvn_kernel_F4X1_10: + + KERNEL_F4X1 + + subs J, J, #1 + bne cgemvn_kernel_F4X1_10 + + +cgemvn_kernel_F4_END: + + SAVE_F4 + + subs I , I , #1 + bne cgemvn_kernel_F4X4 + + +cgemvn_kernel_F1_BEGIN: + + ldr I, M + ands I, I , #3 + ble cgemvn_kernel_L999 + +cgemvn_kernel_F1X1: + + ldr AO1, A + add r3, AO1, #8 + str r3, A + + ldr XO , X + + INIT_F1 + + mov J, N + + +cgemvn_kernel_F1X1_10: + + KERNEL_F1X1 + + subs J, J, #1 + bne cgemvn_kernel_F1X1_10 + + +cgemvn_kernel_F1_END: + + SAVE_F1 + + subs I , I , #1 + bne cgemvn_kernel_F1X1 + + b cgemvn_kernel_L999 + + + +/*************************************************************************************************************/ + +cgemvn_kernel_S4_BEGIN: + +#if defined(DOUBLE) + lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 + lsl INC_Y, INC_Y, #4 // INC_Y * SIZE * 2 +#else + lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 + lsl INC_Y, INC_Y, #3 // INC_Y * SIZE * 2 +#endif + + ldr YO , Y + + ldr I, M + asrs I, I, #2 // I = M / 4 + ble cgemvn_kernel_S1_BEGIN + +cgemvn_kernel_S4X4: + + ldr AO1, A + add AO2, AO1, LDA + add r3 , AO1, #32 + str r3 , A + + ldr XO , X + + INIT_S4 + + asrs J, N, #2 // J = N / 4 + ble cgemvn_kernel_S4X1 + + +cgemvn_kernel_S4X4_10: + + KERNEL_S4X4 + + subs J, J, #1 + bne cgemvn_kernel_S4X4_10 + + +cgemvn_kernel_S4X1: + + ands J, N , #3 + ble cgemvn_kernel_S4_END + +cgemvn_kernel_S4X1_10: + + KERNEL_S4X1 + + subs J, J, #1 + bne cgemvn_kernel_S4X1_10 + + +cgemvn_kernel_S4_END: + + SAVE_S4 + + subs I , I , #1 + bne cgemvn_kernel_S4X4 + + +cgemvn_kernel_S1_BEGIN: + + ldr I, M + ands I, I , #3 + ble cgemvn_kernel_L999 + +cgemvn_kernel_S1X1: + + ldr AO1, A + add r3, AO1, #8 + str r3, A + + ldr XO , X + + INIT_S1 + + mov J, N + + +cgemvn_kernel_S1X1_10: + + KERNEL_S1X1 + + subs J, J, #1 + bne cgemvn_kernel_S1X1_10 + + +cgemvn_kernel_S1_END: + + SAVE_S1 + + subs I , I , #1 + bne cgemvn_kernel_S1X1 + + +/*************************************************************************************************************/ + +cgemvn_kernel_L999: + + sub r3, fp, #192 + +#if defined(DOUBLE) + vldm r3, { d8 - d15 } // restore floating point registers +#else + vldm r3, { s8 - s15 } // restore floating point registers +#endif + + mov r0, #0 // set return value + + sub sp, fp, #28 + pop {r4 -r9 ,fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/cgemv_t_vfp.S b/kernel/arm/cgemv_t_vfp.S new file mode 100644 index 000000000..52276a06f --- /dev/null +++ b/kernel/arm/cgemv_t_vfp.S @@ -0,0 +1,607 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. 
+2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/29 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_LDA [fp, #0 ] +#define X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] +#define OLD_A r3 +#define OLD_N r1 + +#define M r0 +#define AO1 r1 +#define J r2 + +#define AO2 r4 +#define XO r5 +#define YO r6 +#define LDA r7 +#define INC_X r8 +#define INC_Y r9 + +#define I r12 + +#define N [fp, #-252 ] +#define A [fp, #-256 ] + + +#define X_PRE 512 +#define A_PRE 512 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +#if !defined(CONJ) && !defined(XCONJ) + + #define KMAC_R fnmacs + #define KMAC_I fmacs + + #define FMAC_R1 fmacs + #define FMAC_R2 fnmacs + #define FMAC_I1 fmacs + #define FMAC_I2 fmacs + +#elif defined(CONJ) && !defined(XCONJ) + + #define KMAC_R fmacs + #define KMAC_I fnmacs + + #define FMAC_R1 fmacs + #define FMAC_R2 fnmacs + #define FMAC_I1 fmacs + #define FMAC_I2 fmacs + +#elif !defined(CONJ) && defined(XCONJ) + + #define KMAC_R fmacs + #define KMAC_I fnmacs + + #define FMAC_R1 fmacs + #define FMAC_R2 fmacs + #define FMAC_I1 fnmacs + #define FMAC_I2 fmacs + +#else + + #define KMAC_R fnmacs + #define KMAC_I fmacs + + #define FMAC_R1 fmacs + #define FMAC_R2 fmacs + #define FMAC_I1 fnmacs + #define FMAC_I2 fmacs + +#endif + + + +.macro INIT_F2 + + vsub.f32 s12, s12, s12 + vsub.f32 s13, s13, s13 + vsub.f32 s14, s14, s14 + vsub.f32 s15, s15, s15 + +.endm + +.macro KERNEL_F2X4 + + KERNEL_F2X1 + KERNEL_F2X1 + KERNEL_F2X1 + KERNEL_F2X1 + +.endm + +.macro KERNEL_F2X1 + + fldmias XO! 
, { s2 - s3 } + fldmias AO1!, { s4 - s5 } + fldmias AO2!, { s8 - s9 } + + fmacs s12 , s4 , s2 + fmacs s13 , s4 , s3 + KMAC_R s12 , s5 , s3 + KMAC_I s13 , s5 , s2 + + fmacs s14 , s8 , s2 + fmacs s15 , s8 , s3 + KMAC_R s14 , s9 , s3 + KMAC_I s15 , s9 , s2 + +.endm + +.macro SAVE_F2 + + fldmias YO, { s4 - s7 } + + FMAC_R1 s4 , s0 , s12 + FMAC_I1 s5 , s0 , s13 + FMAC_R2 s4 , s1 , s13 + FMAC_I2 s5 , s1 , s12 + + FMAC_R1 s6 , s0 , s14 + FMAC_I1 s7 , s0 , s15 + FMAC_R2 s6 , s1 , s15 + FMAC_I2 s7 , s1 , s14 + + fstmias YO!, { s4 - s7 } + +.endm + +/************************************************************************************************/ + +.macro INIT_F1 + + vsub.f32 s12, s12, s12 + vsub.f32 s13, s13, s13 + +.endm + +.macro KERNEL_F1X4 + + KERNEL_F1X1 + KERNEL_F1X1 + KERNEL_F1X1 + KERNEL_F1X1 + +.endm + +.macro KERNEL_F1X1 + + fldmias XO! , { s2 - s3 } + fldmias AO1!, { s4 - s5 } + + fmacs s12 , s4 , s2 + fmacs s13 , s4 , s3 + KMAC_R s12 , s5 , s3 + KMAC_I s13 , s5 , s2 + +.endm + +.macro SAVE_F1 + + fldmias YO, { s4 - s5 } + + FMAC_R1 s4 , s0 , s12 + FMAC_I1 s5 , s0 , s13 + FMAC_R2 s4 , s1 , s13 + FMAC_I2 s5 , s1 , s12 + + fstmias YO!, { s4 - s5 } + +.endm + +/************************************************************************************************/ + +.macro INIT_S2 + + vsub.f32 s12, s12, s12 + vsub.f32 s13, s13, s13 + vsub.f32 s14, s14, s14 + vsub.f32 s15, s15, s15 + +.endm + +.macro KERNEL_S2X4 + + KERNEL_S2X1 + KERNEL_S2X1 + KERNEL_S2X1 + KERNEL_S2X1 + +.endm + +.macro KERNEL_S2X1 + + fldmias XO , { s2 - s3 } + fldmias AO1!, { s4 - s5 } + fldmias AO2!, { s8 - s9 } + + fmacs s12 , s4 , s2 + fmacs s13 , s4 , s3 + KMAC_R s12 , s5 , s3 + KMAC_I s13 , s5 , s2 + + fmacs s14 , s8 , s2 + fmacs s15 , s8 , s3 + KMAC_R s14 , s9 , s3 + KMAC_I s15 , s9 , s2 + + add XO, XO, INC_X + +.endm + +.macro SAVE_S2 + + fldmias YO, { s4 - s5 } + + FMAC_R1 s4 , s0 , s12 + FMAC_I1 s5 , s0 , s13 + FMAC_R2 s4 , s1 , s13 + FMAC_I2 s5 , s1 , s12 + + fstmias YO, { s4 - s5 } + + add YO, YO, INC_Y + + fldmias YO, { s6 - s7 } + + FMAC_R1 s6 , s0 , s14 + FMAC_I1 s7 , s0 , s15 + FMAC_R2 s6 , s1 , s15 + FMAC_I2 s7 , s1 , s14 + + fstmias YO, { s6 - s7 } + + add YO, YO, INC_Y + +.endm + +/************************************************************************************************/ + +.macro INIT_S1 + + vsub.f32 s12, s12, s12 + vsub.f32 s13, s13, s13 + +.endm + +.macro KERNEL_S1X4 + + KERNEL_S1X1 + KERNEL_S1X1 + KERNEL_S1X1 + KERNEL_S1X1 + +.endm + +.macro KERNEL_S1X1 + + fldmias XO , { s2 - s3 } + fldmias AO1!, { s4 - s5 } + + fmacs s12 , s4 , s2 + fmacs s13 , s4 , s3 + KMAC_R s12 , s5 , s3 + KMAC_I s13 , s5 , s2 + + add XO, XO, INC_X + +.endm + +.macro SAVE_S1 + + fldmias YO, { s4 - s5 } + + FMAC_R1 s4 , s0 , s12 + FMAC_I1 s5 , s0 , s13 + FMAC_R2 s4 , s1 , s13 + FMAC_I2 s5 , s1 , s12 + + fstmias YO, { s4 - s5 } + + add YO, YO, INC_Y + +.endm + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + push {r4 - r9 , fp} + add fp, sp, #28 + sub sp, sp, #STACKSIZE // reserve stack + + sub r12, fp, #192 + +#if defined(DOUBLE) + vstm r12, { d8 - d15 } // store floating point registers +#else + vstm r12, { s8 - s15 } // store floating point registers +#endif + + cmp M, #0 + ble cgemvt_kernel_L999 + + cmp OLD_N, #0 + ble cgemvt_kernel_L999 + + str OLD_A, A + str OLD_N, N + + ldr INC_X , OLD_INC_X + ldr INC_Y , OLD_INC_Y + + cmp INC_X, #0 + 
beq cgemvt_kernel_L999 + + cmp INC_Y, #0 + beq cgemvt_kernel_L999 + + ldr LDA, OLD_LDA + + +#if defined(DOUBLE) + lsl LDA, LDA, #4 // LDA * SIZE +#else + lsl LDA, LDA, #3 // LDA * SIZE +#endif + + cmp INC_X, #1 + bne cgemvt_kernel_S2_BEGIN + + cmp INC_Y, #1 + bne cgemvt_kernel_S2_BEGIN + + +cgemvt_kernel_F2_BEGIN: + + ldr YO , Y + + ldr J, N + asrs J, J, #1 // J = N / 2 + ble cgemvt_kernel_F1_BEGIN + +cgemvt_kernel_F2X4: + + ldr AO1, A + add AO2, AO1, LDA + add r3 , AO2, LDA + str r3 , A + + ldr XO , X + + INIT_F2 + + asrs I, M, #2 // I = M / 4 + ble cgemvt_kernel_F2X1 + + +cgemvt_kernel_F2X4_10: + + KERNEL_F2X4 + + subs I, I, #1 + bne cgemvt_kernel_F2X4_10 + + +cgemvt_kernel_F2X1: + + ands I, M , #3 + ble cgemvt_kernel_F2_END + +cgemvt_kernel_F2X1_10: + + KERNEL_F2X1 + + subs I, I, #1 + bne cgemvt_kernel_F2X1_10 + + +cgemvt_kernel_F2_END: + + SAVE_F2 + + subs J , J , #1 + bne cgemvt_kernel_F2X4 + + +cgemvt_kernel_F1_BEGIN: + + ldr J, N + ands J, J, #1 + ble cgemvt_kernel_L999 + +cgemvt_kernel_F1X4: + + ldr AO1, A + + ldr XO , X + + INIT_F1 + + asrs I, M, #2 // I = M / 4 + ble cgemvt_kernel_F1X1 + + +cgemvt_kernel_F1X4_10: + + KERNEL_F1X4 + + subs I, I, #1 + bne cgemvt_kernel_F1X4_10 + + +cgemvt_kernel_F1X1: + + ands I, M , #3 + ble cgemvt_kernel_F1_END + +cgemvt_kernel_F1X1_10: + + KERNEL_F1X1 + + subs I, I, #1 + bne cgemvt_kernel_F1X1_10 + + +cgemvt_kernel_F1_END: + + SAVE_F1 + + b cgemvt_kernel_L999 + + + +/*************************************************************************************************************/ + +cgemvt_kernel_S2_BEGIN: + +#if defined(DOUBLE) + lsl INC_X, INC_X, #4 // INC_X * SIZE + lsl INC_Y, INC_Y, #4 // INC_Y * SIZE +#else + lsl INC_X, INC_X, #3 // INC_X * SIZE + lsl INC_Y, INC_Y, #3 // INC_Y * SIZE +#endif + + ldr YO , Y + + ldr J, N + asrs J, J, #1 // J = N / 2 + ble cgemvt_kernel_S1_BEGIN + +cgemvt_kernel_S2X4: + + ldr AO1, A + add AO2, AO1, LDA + add r3 , AO2, LDA + str r3 , A + + ldr XO , X + + INIT_S2 + + asrs I, M, #2 // I = M / 4 + ble cgemvt_kernel_S2X1 + + +cgemvt_kernel_S2X4_10: + + KERNEL_S2X4 + + subs I, I, #1 + bne cgemvt_kernel_S2X4_10 + + +cgemvt_kernel_S2X1: + + ands I, M , #3 + ble cgemvt_kernel_S2_END + +cgemvt_kernel_S2X1_10: + + KERNEL_S2X1 + + subs I, I, #1 + bne cgemvt_kernel_S2X1_10 + + +cgemvt_kernel_S2_END: + + SAVE_S2 + + subs J , J , #1 + bne cgemvt_kernel_S2X4 + + +cgemvt_kernel_S1_BEGIN: + + ldr J, N + ands J, J, #1 + ble cgemvt_kernel_L999 + +cgemvt_kernel_S1X4: + + ldr AO1, A + + ldr XO , X + + INIT_S1 + + asrs I, M, #2 // I = M / 4 + ble cgemvt_kernel_S1X1 + + +cgemvt_kernel_S1X4_10: + + KERNEL_S1X4 + + subs I, I, #1 + bne cgemvt_kernel_S1X4_10 + + +cgemvt_kernel_S1X1: + + ands I, M , #3 + ble cgemvt_kernel_S1_END + +cgemvt_kernel_S1X1_10: + + KERNEL_S1X1 + + subs I, I, #1 + bne cgemvt_kernel_S1X1_10 + + +cgemvt_kernel_S1_END: + + SAVE_S1 + + + +/*************************************************************************************************************/ + +cgemvt_kernel_L999: + + sub r3, fp, #192 + +#if defined(DOUBLE) + vldm r3, { d8 - d15 } // restore floating point registers +#else + vldm r3, { s8 - s15 } // restore floating point registers +#endif + + mov r0, #0 // set return value + + sub sp, fp, #28 + pop {r4 -r9 ,fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/zgemv_n_vfp.S b/kernel/arm/zgemv_n_vfp.S new file mode 100644 index 000000000..3b51d5553 --- /dev/null +++ b/kernel/arm/zgemv_n_vfp.S @@ -0,0 +1,699 @@ +/*************************************************************************** +Copyright (c) 2013, The 
OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/29 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_LDA [fp, #0 ] +#define X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] +#define OLD_A r3 +#define OLD_M r0 + +#define AO1 r0 +#define N r1 +#define J r2 + +#define AO2 r4 +#define XO r5 +#define YO r6 +#define LDA r7 +#define INC_X r8 +#define INC_Y r9 + +#define I r12 + +#define ALPHA_I [fp, #-236] +#define ALPHA_R [fp, #-244] + +#define M [fp, #-252 ] +#define A [fp, #-256 ] + + +#define X_PRE 64 +#define Y_PRE 0 +#define A_PRE 0 + +/**************************************************************************************/ + +#if !defined(CONJ) && !defined(XCONJ) + + #define KMAC_R fnmacd + #define KMAC_I fmacd + + #define FMAC_R1 fmacd + #define FMAC_R2 fnmacd + #define FMAC_I1 fmacd + #define FMAC_I2 fmacd + +#elif defined(CONJ) && !defined(XCONJ) + + #define KMAC_R fmacd + #define KMAC_I fnmacd + + #define FMAC_R1 fmacd + #define FMAC_R2 fnmacd + #define FMAC_I1 fmacd + #define FMAC_I2 fmacd + +#elif !defined(CONJ) && defined(XCONJ) + + #define KMAC_R fmacd + #define KMAC_I fnmacd + + #define FMAC_R1 fmacd + #define FMAC_R2 fmacd + #define FMAC_I1 fnmacd + #define FMAC_I2 fmacd + +#else + + #define KMAC_R fnmacd + #define KMAC_I fmacd + + #define FMAC_R1 fmacd + #define FMAC_R2 fmacd + #define FMAC_I1 fnmacd + #define FMAC_I2 fmacd + +#endif + +.macro INIT_F4 + + pld [ YO, #Y_PRE ] + vsub.f64 d8 , d8 , d8 + vmov.f64 d9 , d8 + vmov.f64 d10, d8 + vmov.f64 d11, d8 + vmov.f64 d12, d8 + vmov.f64 d13, d8 + vmov.f64 d14, d8 + vmov.f64 d15, d8 + +.endm + +.macro KERNEL_F4X4 + + pld [ XO, #X_PRE ] + KERNEL_F4X1 + KERNEL_F4X1 + pld [ XO, #X_PRE ] + KERNEL_F4X1 + KERNEL_F4X1 + +.endm + +.macro 
KERNEL_F4X1 + + fldd d0 , [ AO1 ] + + fldd d4 , [ XO ] + fldd d5 , [ XO, #8 ] + + pld [ AO2, #A_PRE ] + + fldd d1 , [ AO1, #8 ] + fmacd d8 , d0, d4 + fldd d2 , [ AO1, #16 ] + fmacd d9 , d0, d5 + fldd d3 , [ AO1, #24 ] + fmacd d10 , d2, d4 + fldd d0 , [ AO1, #32 ] + fmacd d11 , d2, d5 + + KMAC_R d8 , d1, d5 + KMAC_I d9 , d1, d4 + KMAC_R d10 , d3, d5 + fldd d1 , [ AO1, #40 ] + KMAC_I d11 , d3, d4 + + fldd d2 , [ AO1, #48 ] + + fmacd d12 , d0, d4 + fldd d3 , [ AO1, #56 ] + fmacd d13 , d0, d5 + pld [ AO2, #A_PRE+32 ] + fmacd d14 , d2, d4 + fmacd d15 , d2, d5 + + KMAC_R d12 , d1, d5 + add XO , XO, #16 + KMAC_I d13 , d1, d4 + add AO1 , AO1, LDA + KMAC_R d14 , d3, d5 + add AO2 , AO2, LDA + KMAC_I d15 , d3, d4 + +.endm + +.macro SAVE_F4 + + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + fldmiad YO, { d4 - d7 } + + FMAC_R1 d4 , d0 , d8 + FMAC_I1 d5 , d0 , d9 + FMAC_R2 d4 , d1 , d9 + FMAC_I2 d5 , d1 , d8 + + FMAC_R1 d6 , d0 , d10 + FMAC_I1 d7 , d0 , d11 + FMAC_R2 d6 , d1 , d11 + FMAC_I2 d7 , d1 , d10 + + fstmiad YO!, { d4 - d7 } + + fldmiad YO, { d4 - d7 } + + FMAC_R1 d4 , d0 , d12 + FMAC_I1 d5 , d0 , d13 + FMAC_R2 d4 , d1 , d13 + FMAC_I2 d5 , d1 , d12 + + FMAC_R1 d6 , d0 , d14 + FMAC_I1 d7 , d0 , d15 + FMAC_R2 d6 , d1 , d15 + FMAC_I2 d7 , d1 , d14 + + fstmiad YO!, { d4 - d7 } + +.endm + + + + +.macro INIT_F1 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9 , d8 + +.endm + +.macro KERNEL_F1X1 + + fldd d0 , [ AO1 ] + fldd d1 , [ AO1, #8 ] + + fldd d4 , [ XO ] + fldd d5 , [ XO, #8 ] + + fmacd d8 , d0, d4 + fmacd d9 , d0, d5 + + KMAC_R d8 , d1, d5 + KMAC_I d9 , d1, d4 + + add XO , XO, #16 + add AO1 , AO1, LDA + + +.endm + +.macro SAVE_F1 + + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + fldmiad YO, { d4 - d5 } + + FMAC_R1 d4 , d0 , d8 + FMAC_I1 d5 , d0 , d9 + FMAC_R2 d4 , d1 , d9 + FMAC_I2 d5 , d1 , d8 + + fstmiad YO, { d4 - d5 } + + add YO, YO, #16 + +.endm + +/****************************************************************************************/ + +.macro INIT_S4 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9 , d8 + vmov.f64 d10, d8 + vmov.f64 d11, d8 + vmov.f64 d12, d8 + vmov.f64 d13, d8 + vmov.f64 d14, d8 + vmov.f64 d15, d8 + +.endm + +.macro KERNEL_S4X4 + + KERNEL_S4X1 + KERNEL_S4X1 + KERNEL_S4X1 + KERNEL_S4X1 + +.endm + +.macro KERNEL_S4X1 + + fldd d0 , [ AO1 ] + fldd d1 , [ AO1, #8 ] + fldd d2 , [ AO1, #16 ] + fldd d3 , [ AO1, #24 ] + + fldd d4 , [ XO ] + fldd d5 , [ XO, #8 ] + + fmacd d8 , d0, d4 + fmacd d9 , d0, d5 + fmacd d10 , d2, d4 + fmacd d11 , d2, d5 + + KMAC_R d8 , d1, d5 + KMAC_I d9 , d1, d4 + KMAC_R d10 , d3, d5 + KMAC_I d11 , d3, d4 + + fldd d0 , [ AO1, #32 ] + fldd d1 , [ AO1, #40 ] + fldd d2 , [ AO1, #48 ] + fldd d3 , [ AO1, #56 ] + + fmacd d12 , d0, d4 + fmacd d13 , d0, d5 + fmacd d14 , d2, d4 + fmacd d15 , d2, d5 + + KMAC_R d12 , d1, d5 + KMAC_I d13 , d1, d4 + KMAC_R d14 , d3, d5 + KMAC_I d15 , d3, d4 + + add XO , XO, INC_X + add AO1 , AO1, LDA + add AO2 , AO2, LDA + +.endm + +.macro SAVE_S4 + + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + fldmiad YO, { d4 - d5 } + + FMAC_R1 d4 , d0 , d8 + FMAC_I1 d5 , d0 , d9 + FMAC_R2 d4 , d1 , d9 + FMAC_I2 d5 , d1 , d8 + + fstmiad YO, { d4 - d5 } + + add YO, YO, INC_Y + + fldmiad YO, { d6 - d7 } + + FMAC_R1 d6 , d0 , d10 + FMAC_I1 d7 , d0 , d11 + FMAC_R2 d6 , d1 , d11 + FMAC_I2 d7 , d1 , d10 + + fstmiad YO, { d6 - d7 } + + add YO, YO, INC_Y + + fldmiad YO, { d4 - d5 } + + FMAC_R1 d4 , d0 , d12 + FMAC_I1 d5 , d0 , d13 + FMAC_R2 d4 , d1 , d13 + FMAC_I2 d5 , d1 , d12 + + fstmiad YO, { d4 - d5 } + + add YO, YO, INC_Y + + fldmiad YO, { d6 - d7 } + + FMAC_R1 d6 , d0 , d14 + 
FMAC_I1 d7 , d0 , d15 + FMAC_R2 d6 , d1 , d15 + FMAC_I2 d7 , d1 , d14 + + fstmiad YO, { d6 - d7 } + + add YO, YO, INC_Y + +.endm + + + + +.macro INIT_S1 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9 , d8 + +.endm + +.macro KERNEL_S1X1 + + fldd d0 , [ AO1 ] + fldd d1 , [ AO1, #8 ] + + fldd d4 , [ XO ] + fldd d5 , [ XO, #8 ] + + fmacd d8 , d0, d4 + fmacd d9 , d0, d5 + + KMAC_R d8 , d1, d5 + KMAC_I d9 , d1, d4 + + add XO , XO, INC_X + add AO1 , AO1, LDA + + +.endm + +.macro SAVE_S1 + + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + fldmiad YO, { d4 - d5 } + + FMAC_R1 d4 , d0 , d8 + FMAC_I1 d5 , d0 , d9 + FMAC_R2 d4 , d1 , d9 + FMAC_I2 d5 , d1 , d8 + + fstmiad YO, { d4 - d5 } + + add YO, YO, INC_Y + +.endm + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + push {r4 - r9 , fp} + add fp, sp, #28 + sub sp, sp, #STACKSIZE // reserve stack + + sub r12, fp, #192 + +#if defined(DOUBLE) + vstm r12, { d8 - d15 } // store floating point registers +#else + vstm r12, { s8 - s15 } // store floating point registers +#endif + + cmp OLD_M, #0 + ble zgemvn_kernel_L999 + + cmp N, #0 + ble zgemvn_kernel_L999 + + str OLD_A, A + str OLD_M, M + vstr d0 , ALPHA_R + vstr d1 , ALPHA_I + + + ldr INC_X , OLD_INC_X + ldr INC_Y , OLD_INC_Y + + cmp INC_X, #0 + beq zgemvn_kernel_L999 + + cmp INC_Y, #0 + beq zgemvn_kernel_L999 + + ldr LDA, OLD_LDA + + +#if defined(DOUBLE) + lsl LDA, LDA, #4 // LDA * SIZE * 2 +#else + lsl LDA, LDA, #3 // LDA * SIZE * 2 +#endif + + cmp INC_X, #1 + bne zgemvn_kernel_S4_BEGIN + + cmp INC_Y, #1 + bne zgemvn_kernel_S4_BEGIN + + +zgemvn_kernel_F4_BEGIN: + + ldr YO , Y + + ldr I, M + asrs I, I, #2 // I = M / 4 + ble zgemvn_kernel_F1_BEGIN + +zgemvn_kernel_F4X4: + + ldr AO1, A + add AO2, AO1, LDA + add r3 , AO1, #64 + str r3 , A + + add AO2, AO2, LDA + add AO2, AO2, LDA + + ldr XO , X + + INIT_F4 + + asrs J, N, #2 // J = N / 4 + ble zgemvn_kernel_F4X1 + + +zgemvn_kernel_F4X4_10: + + KERNEL_F4X4 + + subs J, J, #1 + bne zgemvn_kernel_F4X4_10 + + +zgemvn_kernel_F4X1: + + ands J, N , #3 + ble zgemvn_kernel_F4_END + +zgemvn_kernel_F4X1_10: + + KERNEL_F4X1 + + subs J, J, #1 + bne zgemvn_kernel_F4X1_10 + + +zgemvn_kernel_F4_END: + + SAVE_F4 + + subs I , I , #1 + bne zgemvn_kernel_F4X4 + + +zgemvn_kernel_F1_BEGIN: + + ldr I, M + ands I, I , #3 + ble zgemvn_kernel_L999 + +zgemvn_kernel_F1X1: + + ldr AO1, A + add r3, AO1, #16 + str r3, A + + ldr XO , X + + INIT_F1 + + mov J, N + + +zgemvn_kernel_F1X1_10: + + KERNEL_F1X1 + + subs J, J, #1 + bne zgemvn_kernel_F1X1_10 + + +zgemvn_kernel_F1_END: + + SAVE_F1 + + subs I , I , #1 + bne zgemvn_kernel_F1X1 + + b zgemvn_kernel_L999 + + + +/*************************************************************************************************************/ + +zgemvn_kernel_S4_BEGIN: + +#if defined(DOUBLE) + lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 + lsl INC_Y, INC_Y, #4 // INC_Y * SIZE * 2 +#else + lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 + lsl INC_Y, INC_Y, #3 // INC_Y * SIZE * 2 +#endif + + ldr YO , Y + + ldr I, M + asrs I, I, #2 // I = M / 4 + ble zgemvn_kernel_S1_BEGIN + +zgemvn_kernel_S4X4: + + ldr AO1, A + add AO2, AO1, LDA + add r3 , AO1, #64 + str r3 , A + + ldr XO , X + + INIT_S4 + + asrs J, N, #2 // J = N / 4 + ble zgemvn_kernel_S4X1 + + +zgemvn_kernel_S4X4_10: + + KERNEL_S4X4 + + subs J, J, #1 + bne zgemvn_kernel_S4X4_10 + + +zgemvn_kernel_S4X1: + + ands J, N , #3 + ble zgemvn_kernel_S4_END + 
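+/* The tail loop below drains the N & 3 leftover columns for the current
+   block of four rows; the unrolled loop above has already consumed the
+   N / 4 groups of columns.  As a rough C sketch of this driver (a reading
+   aid, not the exact dispatch; the M & 3 leftover rows are handled by the
+   S1 path further down):
+
+       for (i = 0; i < m / 4; i++) {                   /* four rows of y per pass */
+           INIT_S4();                                  /* clear d8 - d15          */
+           for (j = 0; j < n / 4; j++) KERNEL_S4X4();  /* four columns per call   */
+           for (j = 0; j < n % 4; j++) KERNEL_S4X1();  /* leftover columns        */
+           SAVE_S4();                                  /* y[4i..4i+3] += alpha * acc */
+       }
+*/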
+zgemvn_kernel_S4X1_10: + + KERNEL_S4X1 + + subs J, J, #1 + bne zgemvn_kernel_S4X1_10 + + +zgemvn_kernel_S4_END: + + SAVE_S4 + + subs I , I , #1 + bne zgemvn_kernel_S4X4 + + +zgemvn_kernel_S1_BEGIN: + + ldr I, M + ands I, I , #3 + ble zgemvn_kernel_L999 + +zgemvn_kernel_S1X1: + + ldr AO1, A + add r3, AO1, #16 + str r3, A + + ldr XO , X + + INIT_S1 + + mov J, N + + +zgemvn_kernel_S1X1_10: + + KERNEL_S1X1 + + subs J, J, #1 + bne zgemvn_kernel_S1X1_10 + + +zgemvn_kernel_S1_END: + + SAVE_S1 + + subs I , I , #1 + bne zgemvn_kernel_S1X1 + + +/*************************************************************************************************************/ + +zgemvn_kernel_L999: + + sub r3, fp, #192 + +#if defined(DOUBLE) + vldm r3, { d8 - d15 } // restore floating point registers +#else + vldm r3, { s8 - s15 } // restore floating point registers +#endif + + mov r0, #0 // set return value + + sub sp, fp, #28 + pop {r4 -r9 ,fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/zgemv_t_vfp.S b/kernel/arm/zgemv_t_vfp.S new file mode 100644 index 000000000..500a3b608 --- /dev/null +++ b/kernel/arm/zgemv_t_vfp.S @@ -0,0 +1,608 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/29 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_LDA [fp, #0 ] +#define X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] +#define OLD_A r3 +#define OLD_N r1 + +#define M r0 +#define AO1 r1 +#define J r2 + +#define AO2 r4 +#define XO r5 +#define YO r6 +#define LDA r7 +#define INC_X r8 +#define INC_Y r9 + +#define I r12 + +#define N [fp, #-252 ] +#define A [fp, #-256 ] + + +#define X_PRE 512 +#define A_PRE 512 +#define Y_PRE 32 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +#if !defined(CONJ) && !defined(XCONJ) + + #define KMAC_R fnmacd + #define KMAC_I fmacd + + #define FMAC_R1 fmacd + #define FMAC_R2 fnmacd + #define FMAC_I1 fmacd + #define FMAC_I2 fmacd + +#elif defined(CONJ) && !defined(XCONJ) + + #define KMAC_R fmacd + #define KMAC_I fnmacd + + #define FMAC_R1 fmacd + #define FMAC_R2 fnmacd + #define FMAC_I1 fmacd + #define FMAC_I2 fmacd + +#elif !defined(CONJ) && defined(XCONJ) + + #define KMAC_R fmacd + #define KMAC_I fnmacd + + #define FMAC_R1 fmacd + #define FMAC_R2 fmacd + #define FMAC_I1 fnmacd + #define FMAC_I2 fmacd + +#else + + #define KMAC_R fnmacd + #define KMAC_I fmacd + + #define FMAC_R1 fmacd + #define FMAC_R2 fmacd + #define FMAC_I1 fnmacd + #define FMAC_I2 fmacd + +#endif + + + +.macro INIT_F2 + + vsub.f64 d12, d12, d12 + vsub.f64 d13, d13, d13 + vsub.f64 d14, d14, d14 + vsub.f64 d15, d15, d15 + +.endm + +.macro KERNEL_F2X4 + + KERNEL_F2X1 + KERNEL_F2X1 + KERNEL_F2X1 + KERNEL_F2X1 + +.endm + +.macro KERNEL_F2X1 + + fldmiad XO! , { d2 - d3 } + fldmiad AO1!, { d4 - d5 } + + fmacd d12 , d4 , d2 + fmacd d13 , d4 , d3 + fldmiad AO2!, { d8 - d9 } + KMAC_R d12 , d5 , d3 + KMAC_I d13 , d5 , d2 + + fmacd d14 , d8 , d2 + fmacd d15 , d8 , d3 + KMAC_R d14 , d9 , d3 + KMAC_I d15 , d9 , d2 + +.endm + +.macro SAVE_F2 + + fldmiad YO, { d4 - d7 } + + FMAC_R1 d4 , d0 , d12 + FMAC_I1 d5 , d0 , d13 + FMAC_R2 d4 , d1 , d13 + FMAC_I2 d5 , d1 , d12 + + FMAC_R1 d6 , d0 , d14 + FMAC_I1 d7 , d0 , d15 + FMAC_R2 d6 , d1 , d15 + FMAC_I2 d7 , d1 , d14 + + fstmiad YO!, { d4 - d7 } + +.endm + +/************************************************************************************************/ + +.macro INIT_F1 + + vsub.f64 d12, d12, d12 + vsub.f64 d13, d13, d13 + +.endm + +.macro KERNEL_F1X4 + + KERNEL_F1X1 + KERNEL_F1X1 + KERNEL_F1X1 + KERNEL_F1X1 + +.endm + +.macro KERNEL_F1X1 + + fldmiad XO! 
, { d2 - d3 } + fldmiad AO1!, { d4 - d5 } + + fmacd d12 , d4 , d2 + fmacd d13 , d4 , d3 + KMAC_R d12 , d5 , d3 + KMAC_I d13 , d5 , d2 + +.endm + +.macro SAVE_F1 + + fldmiad YO, { d4 - d5 } + + FMAC_R1 d4 , d0 , d12 + FMAC_I1 d5 , d0 , d13 + FMAC_R2 d4 , d1 , d13 + FMAC_I2 d5 , d1 , d12 + + fstmiad YO!, { d4 - d5 } + +.endm + +/************************************************************************************************/ + +.macro INIT_S2 + + vsub.f64 d12, d12, d12 + vsub.f64 d13, d13, d13 + vsub.f64 d14, d14, d14 + vsub.f64 d15, d15, d15 + +.endm + +.macro KERNEL_S2X4 + + KERNEL_S2X1 + KERNEL_S2X1 + KERNEL_S2X1 + KERNEL_S2X1 + +.endm + +.macro KERNEL_S2X1 + + fldmiad XO , { d2 - d3 } + fldmiad AO1!, { d4 - d5 } + fldmiad AO2!, { d8 - d9 } + + fmacd d12 , d4 , d2 + fmacd d13 , d4 , d3 + KMAC_R d12 , d5 , d3 + KMAC_I d13 , d5 , d2 + + fmacd d14 , d8 , d2 + fmacd d15 , d8 , d3 + KMAC_R d14 , d9 , d3 + KMAC_I d15 , d9 , d2 + + add XO, XO, INC_X + +.endm + +.macro SAVE_S2 + + fldmiad YO, { d4 - d5 } + + FMAC_R1 d4 , d0 , d12 + FMAC_I1 d5 , d0 , d13 + FMAC_R2 d4 , d1 , d13 + FMAC_I2 d5 , d1 , d12 + + fstmiad YO, { d4 - d5 } + + add YO, YO, INC_Y + + fldmiad YO, { d6 - d7 } + + FMAC_R1 d6 , d0 , d14 + FMAC_I1 d7 , d0 , d15 + FMAC_R2 d6 , d1 , d15 + FMAC_I2 d7 , d1 , d14 + + fstmiad YO, { d6 - d7 } + + add YO, YO, INC_Y + +.endm + +/************************************************************************************************/ + +.macro INIT_S1 + + vsub.f64 d12, d12, d12 + vsub.f64 d13, d13, d13 + +.endm + +.macro KERNEL_S1X4 + + KERNEL_S1X1 + KERNEL_S1X1 + KERNEL_S1X1 + KERNEL_S1X1 + +.endm + +.macro KERNEL_S1X1 + + fldmiad XO , { d2 - d3 } + fldmiad AO1!, { d4 - d5 } + + fmacd d12 , d4 , d2 + fmacd d13 , d4 , d3 + KMAC_R d12 , d5 , d3 + KMAC_I d13 , d5 , d2 + + add XO, XO, INC_X + +.endm + +.macro SAVE_S1 + + fldmiad YO, { d4 - d5 } + + FMAC_R1 d4 , d0 , d12 + FMAC_I1 d5 , d0 , d13 + FMAC_R2 d4 , d1 , d13 + FMAC_I2 d5 , d1 , d12 + + fstmiad YO, { d4 - d5 } + + add YO, YO, INC_Y + +.endm + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + push {r4 - r9 , fp} + add fp, sp, #28 + sub sp, sp, #STACKSIZE // reserve stack + + sub r12, fp, #192 + +#if defined(DOUBLE) + vstm r12, { d8 - d15 } // store floating point registers +#else + vstm r12, { s8 - s15 } // store floating point registers +#endif + + cmp M, #0 + ble zgemvt_kernel_L999 + + cmp OLD_N, #0 + ble zgemvt_kernel_L999 + + str OLD_A, A + str OLD_N, N + + ldr INC_X , OLD_INC_X + ldr INC_Y , OLD_INC_Y + + cmp INC_X, #0 + beq zgemvt_kernel_L999 + + cmp INC_Y, #0 + beq zgemvt_kernel_L999 + + ldr LDA, OLD_LDA + + +#if defined(DOUBLE) + lsl LDA, LDA, #4 // LDA * SIZE +#else + lsl LDA, LDA, #3 // LDA * SIZE +#endif + + cmp INC_X, #1 + bne zgemvt_kernel_S2_BEGIN + + cmp INC_Y, #1 + bne zgemvt_kernel_S2_BEGIN + + +zgemvt_kernel_F2_BEGIN: + + ldr YO , Y + + ldr J, N + asrs J, J, #1 // J = N / 2 + ble zgemvt_kernel_F1_BEGIN + +zgemvt_kernel_F2X4: + + ldr AO1, A + add AO2, AO1, LDA + add r3 , AO2, LDA + str r3 , A + + ldr XO , X + + INIT_F2 + + asrs I, M, #2 // I = M / 4 + ble zgemvt_kernel_F2X1 + + +zgemvt_kernel_F2X4_10: + + KERNEL_F2X4 + + subs I, I, #1 + bne zgemvt_kernel_F2X4_10 + + +zgemvt_kernel_F2X1: + + ands I, M , #3 + ble zgemvt_kernel_F2_END + +zgemvt_kernel_F2X1_10: + + KERNEL_F2X1 + + subs I, I, #1 + bne zgemvt_kernel_F2X1_10 + + +zgemvt_kernel_F2_END: 
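+/* SAVE_F2 below performs the alpha update for the two dot products
+   accumulated in d12/d13 and d14/d15.  With alpha kept in d0/d1, the
+   default build (neither CONJ nor XCONJ defined) stores back, as a C
+   sketch:
+
+       y0_r += alpha_r * acc0_r - alpha_i * acc0_i;
+       y0_i += alpha_r * acc0_i + alpha_i * acc0_r;
+       y1_r += alpha_r * acc1_r - alpha_i * acc1_i;
+       y1_i += alpha_r * acc1_i + alpha_i * acc1_r;
+
+   The XCONJ builds only flip the signs selected by the FMAC_R2 and
+   FMAC_I1 definitions at the top of this file; CONJ affects the KMAC_*
+   accumulation in the kernels above. */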
+
+	SAVE_F2
+
+	subs	J , J , #1
+	bne	zgemvt_kernel_F2X4
+
+
+zgemvt_kernel_F1_BEGIN:
+
+	ldr	J, N
+	ands	J, J, #1
+	ble	zgemvt_kernel_L999
+
+zgemvt_kernel_F1X4:
+
+	ldr	AO1, A
+
+	ldr	XO , X
+
+	INIT_F1
+
+	asrs	I, M, #2	// I = M / 4
+	ble	zgemvt_kernel_F1X1
+
+
+zgemvt_kernel_F1X4_10:
+
+	KERNEL_F1X4
+
+	subs	I, I, #1
+	bne	zgemvt_kernel_F1X4_10
+
+
+zgemvt_kernel_F1X1:
+
+	ands	I, M , #3
+	ble	zgemvt_kernel_F1_END
+
+zgemvt_kernel_F1X1_10:
+
+	KERNEL_F1X1
+
+	subs	I, I, #1
+	bne	zgemvt_kernel_F1X1_10
+
+
+zgemvt_kernel_F1_END:
+
+	SAVE_F1
+
+	b	zgemvt_kernel_L999
+
+
+
+/*************************************************************************************************************/
+
+zgemvt_kernel_S2_BEGIN:
+
+#if defined(DOUBLE)
+	lsl	INC_X, INC_X, #4	// INC_X * SIZE
+	lsl	INC_Y, INC_Y, #4	// INC_Y * SIZE
+#else
+	lsl	INC_X, INC_X, #3	// INC_X * SIZE
+	lsl	INC_Y, INC_Y, #3	// INC_Y * SIZE
+#endif
+
+	ldr	YO , Y
+
+	ldr	J, N
+	asrs	J, J, #1	// J = N / 2
+	ble	zgemvt_kernel_S1_BEGIN
+
+zgemvt_kernel_S2X4:
+
+	ldr	AO1, A
+	add	AO2, AO1, LDA
+	add	r3 , AO2, LDA
+	str	r3 , A
+
+	ldr	XO , X
+
+	INIT_S2
+
+	asrs	I, M, #2	// I = M / 4
+	ble	zgemvt_kernel_S2X1
+
+
+zgemvt_kernel_S2X4_10:
+
+	KERNEL_S2X4
+
+	subs	I, I, #1
+	bne	zgemvt_kernel_S2X4_10
+
+
+zgemvt_kernel_S2X1:
+
+	ands	I, M , #3
+	ble	zgemvt_kernel_S2_END
+
+zgemvt_kernel_S2X1_10:
+
+	KERNEL_S2X1
+
+	subs	I, I, #1
+	bne	zgemvt_kernel_S2X1_10
+
+
+zgemvt_kernel_S2_END:
+
+	SAVE_S2
+
+	subs	J , J , #1
+	bne	zgemvt_kernel_S2X4
+
+
+zgemvt_kernel_S1_BEGIN:
+
+	ldr	J, N
+	ands	J, J, #1
+	ble	zgemvt_kernel_L999
+
+zgemvt_kernel_S1X4:
+
+	ldr	AO1, A
+
+	ldr	XO , X
+
+	INIT_S1
+
+	asrs	I, M, #2	// I = M / 4
+	ble	zgemvt_kernel_S1X1
+
+
+zgemvt_kernel_S1X4_10:
+
+	KERNEL_S1X4
+
+	subs	I, I, #1
+	bne	zgemvt_kernel_S1X4_10
+
+
+zgemvt_kernel_S1X1:
+
+	ands	I, M , #3
+	ble	zgemvt_kernel_S1_END
+
+zgemvt_kernel_S1X1_10:
+
+	KERNEL_S1X1
+
+	subs	I, I, #1
+	bne	zgemvt_kernel_S1X1_10
+
+
+zgemvt_kernel_S1_END:
+
+	SAVE_S1
+
+
+
+/*************************************************************************************************************/
+
+zgemvt_kernel_L999:
+
+	sub	r3, fp, #192
+
+#if defined(DOUBLE)
+	vldm	r3, { d8 - d15 }	// restore floating point registers
+#else
+	vldm	r3, { s8 - s15 }	// restore floating point registers
+#endif
+
+	mov	r0, #0	// set return value
+
+	sub	sp, fp, #28
+	pop	{r4 - r9, fp}
+	bx	lr
+
+	EPILOGUE
+

From 271ceeba1569e7545f03c9500fa443c302c8b17a Mon Sep 17 00:00:00 2001
From: wernsaar
Date: Sun, 1 Dec 2013 10:58:22 +0100
Subject: [PATCH 099/127] merged from develop

---
 common_x86_64.h | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/common_x86_64.h b/common_x86_64.h
index 2da627848..39e5a5eb1 100644
--- a/common_x86_64.h
+++ b/common_x86_64.h
@@ -372,20 +372,10 @@ REALNAME:
 #define PROFCODE
 #endif
 
-#if defined(C_PATHSCALE) || defined(OS_DARWIN)
-
 #define EPILOGUE \
	.size	REALNAME, .-REALNAME; \
	.section .note.GNU-stack,"",@progbits
 
-#else
-
-#define EPILOGUE \
-	.size	REALNAME, .-REALNAME; \
-	.section .note.GNU-stack,"",@progbits
-
-#endif
-
 #endif

From ac7735e01fe93039285c6f26916f1382209a65e5 Mon Sep 17 00:00:00 2001
From: wernsaar
Date: Sun, 1 Dec 2013 11:05:10 +0100
Subject: [PATCH 100/127] merge from develop

---
 common_x86.h    | 15 +--------------
 common_x86_64.h | 10 ----------
 2 files changed, 1 insertion(+), 24 deletions(-)

diff --git a/common_x86.h b/common_x86.h
index 8245f7078..5f42843be 100644
--- a/common_x86.h
+++ b/common_x86.h
@@ -301,23 +301,10 @@ REALNAME:
 #define PROFCODE
 #endif
 
-
-#if 
defined(C_PATHSCALE) || defined(OS_DARWIN) - #define EPILOGUE \ - .size REALNAME, .-REALNAME; \ + .size REALNAME, .-REALNAME; \ .section .note.GNU-stack,"",@progbits -#else - -#define EPILOGUE \ - .size REALNAME, .-REALNAME; \ - .section .note.GNU-stack,"",%progbits - -#endif - - - #endif #ifdef XDOUBLE diff --git a/common_x86_64.h b/common_x86_64.h index 4fe23448f..39e5a5eb1 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -372,20 +372,10 @@ REALNAME: #define PROFCODE #endif -#if defined(C_PATHSCALE) || defined(OS_DARWIN) - #define EPILOGUE \ .size REALNAME, .-REALNAME; \ .section .note.GNU-stack,"",@progbits -#else - -#define EPILOGUE \ - .size REALNAME, .-REALNAME; \ - .section .note.GNU-stack,"",%progbits - -#endif - #endif From 7581e2e9cbbc91528b6bd2aa1b4d88223a7a4461 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 1 Dec 2013 11:23:36 +0100 Subject: [PATCH 101/127] merged common_x86.h and common_x86_64.h from develop --- common_x86.h | 15 +-------------- common_x86_64.h | 10 ---------- 2 files changed, 1 insertion(+), 24 deletions(-) diff --git a/common_x86.h b/common_x86.h index 8245f7078..5f42843be 100644 --- a/common_x86.h +++ b/common_x86.h @@ -301,23 +301,10 @@ REALNAME: #define PROFCODE #endif - -#if defined(C_PATHSCALE) || defined(OS_DARWIN) - #define EPILOGUE \ - .size REALNAME, .-REALNAME; \ + .size REALNAME, .-REALNAME; \ .section .note.GNU-stack,"",@progbits -#else - -#define EPILOGUE \ - .size REALNAME, .-REALNAME; \ - .section .note.GNU-stack,"",%progbits - -#endif - - - #endif #ifdef XDOUBLE diff --git a/common_x86_64.h b/common_x86_64.h index 4fe23448f..39e5a5eb1 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -372,20 +372,10 @@ REALNAME: #define PROFCODE #endif -#if defined(C_PATHSCALE) || defined(OS_DARWIN) - #define EPILOGUE \ .size REALNAME, .-REALNAME; \ .section .note.GNU-stack,"",@progbits -#else - -#define EPILOGUE \ - .size REALNAME, .-REALNAME; \ - .section .note.GNU-stack,"",%progbits - -#endif - #endif From c947ab85dc5867fdaa2a2c390d4278e59dff909e Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 1 Dec 2013 13:46:30 +0100 Subject: [PATCH 102/127] changed level3.c --- driver/level3/level3.c | 4 +--- driver/level3/level3_thread.c | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/driver/level3/level3.c b/driver/level3/level3.c index edba5359e..5f746642c 100644 --- a/driver/level3/level3.c +++ b/driver/level3/level3.c @@ -333,9 +333,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; -#if ( defined(BULLDOZER) || defined(PILEDRIVER) ) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX) - if (min_jj >= 12*GEMM_UNROLL_N) min_jj = 12*GEMM_UNROLL_N; - else +#if ( defined(BULLDOZER) || defined(PILEDRIVER) || defined(HASWELL) ) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX) if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; else if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 850580504..ee1a8db7c 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -367,9 +367,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){ min_jj = MIN(n_to, xxx + div_n) - jjs; -#if ( defined(BULLDOZER) || defined(PILEDRIVER) ) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX) - if (min_jj >= 
12*GEMM_UNROLL_N) min_jj = 12*GEMM_UNROLL_N; - else +#if ( defined(BULLDOZER) || defined(PILEDRIVER) || defined(HASWELL) ) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX) if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; else if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; From 04d51536a4c01a4bdb4ff96b2ab8ed0efcbd5ebd Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 1 Dec 2013 13:49:59 +0100 Subject: [PATCH 103/127] changed level3.c --- driver/level3/level3.c | 4 +--- driver/level3/level3_thread.c | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/driver/level3/level3.c b/driver/level3/level3.c index 2fe889527..5f746642c 100644 --- a/driver/level3/level3.c +++ b/driver/level3/level3.c @@ -333,9 +333,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; -#if defined(BULLDOZER) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX) - if (min_jj >= 12*GEMM_UNROLL_N) min_jj = 12*GEMM_UNROLL_N; - else +#if ( defined(BULLDOZER) || defined(PILEDRIVER) || defined(HASWELL) ) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX) if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; else if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 3242790fa..ee1a8db7c 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -367,9 +367,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){ min_jj = MIN(n_to, xxx + div_n) - jjs; -#if defined(BULLDOZER) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX) - if (min_jj >= 12*GEMM_UNROLL_N) min_jj = 12*GEMM_UNROLL_N; - else +#if ( defined(BULLDOZER) || defined(PILEDRIVER) || defined(HASWELL) ) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX) if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; else if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; From 39dc69db4a83cb92b5610dc0cc61979f76b4b957 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 1 Dec 2013 13:51:04 +0100 Subject: [PATCH 104/127] changed level3.c --- driver/level3/level3.c | 4 +--- driver/level3/level3_thread.c | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/driver/level3/level3.c b/driver/level3/level3.c index 2fe889527..5f746642c 100644 --- a/driver/level3/level3.c +++ b/driver/level3/level3.c @@ -333,9 +333,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; -#if defined(BULLDOZER) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX) - if (min_jj >= 12*GEMM_UNROLL_N) min_jj = 12*GEMM_UNROLL_N; - else +#if ( defined(BULLDOZER) || defined(PILEDRIVER) || defined(HASWELL) ) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX) if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; else if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 3242790fa..ee1a8db7c 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -367,9 +367,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){ min_jj = MIN(n_to, xxx + div_n) - jjs; -#if defined(BULLDOZER) && 
defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX) - if (min_jj >= 12*GEMM_UNROLL_N) min_jj = 12*GEMM_UNROLL_N; - else +#if ( defined(BULLDOZER) || defined(PILEDRIVER) || defined(HASWELL) ) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX) if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; else if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; From 8191efc4203584cc02b694c3179ea5c3a1b9237f Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 1 Dec 2013 13:52:08 +0100 Subject: [PATCH 105/127] changed level3.c --- driver/level3/level3.c | 3 +-- driver/level3/level3_thread.c | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/driver/level3/level3.c b/driver/level3/level3.c index a44022398..5f746642c 100644 --- a/driver/level3/level3.c +++ b/driver/level3/level3.c @@ -333,13 +333,12 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; -#if defined(HASWELL) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX) +#if ( defined(BULLDOZER) || defined(PILEDRIVER) || defined(HASWELL) ) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX) if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; else if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; - #else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 673afcf97..ee1a8db7c 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -367,7 +367,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){ min_jj = MIN(n_to, xxx + div_n) - jjs; -#if defined(HASWELL) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX) +#if ( defined(BULLDOZER) || defined(PILEDRIVER) || defined(HASWELL) ) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX) if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; else if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; From be18cd47f6223cbe3d7fc45fcecec1035dd4d1db Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 1 Dec 2013 13:52:48 +0100 Subject: [PATCH 106/127] changed level3.c --- driver/level3/level3.c | 26 +------------ driver/level3/level3_thread.c | 70 +---------------------------------- 2 files changed, 3 insertions(+), 93 deletions(-) diff --git a/driver/level3/level3.c b/driver/level3/level3.c index d87c5f546..5f746642c 100644 --- a/driver/level3/level3.c +++ b/driver/level3/level3.c @@ -36,8 +36,6 @@ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ -// #define TIMING 1 - /* This file is a template for level 3 operation */ #ifndef BETA_OPERATION @@ -335,24 +333,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; -#if defined(BULLDOZER) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX) - if (min_jj >= 12*GEMM_UNROLL_N) min_jj = 12*GEMM_UNROLL_N; - else +#if ( defined(BULLDOZER) || defined(PILEDRIVER) || defined(HASWELL) ) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX) if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; else if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; -#elif defined(ARMV7) - if (min_jj >= 32) min_jj = 32; - else - if (min_jj >= 16) min_jj = 16; - else - if (min_jj >= 8) min_jj = 8; - else - if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; - #else + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #endif @@ -412,22 +400,12 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, #ifdef TIMING total = (double)outercost + (double)innercost + (double)kernelcost; -#ifdef ARMV7 - - printf( "Copy A : %5.2f Copy B: %5.2f Kernel : %5.2f\n", - innercost / total * 100., outercost / total * 100., - kernelcost / total * 100.); - - -#else - printf( "Copy A : %5.2f Copy B: %5.2f Kernel : %5.2f kernel Effi. : %5.2f Total Effi. : %5.2f\n", innercost / total * 100., outercost / total * 100., kernelcost / total * 100., (double)(m_to - m_from) * (double)(n_to - n_from) * (double)k / (double)kernelcost * 100. * (double)COMPSIZE / 2., (double)(m_to - m_from) * (double)(n_to - n_from) * (double)k / total * 100. * (double)COMPSIZE / 2.); -#endif #endif return 0; diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 56c4d6eca..ee1a8db7c 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -36,8 +36,6 @@ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ -// #define TIMING 1 - #ifndef CACHE_LINE_SIZE #define CACHE_LINE_SIZE 8 #endif @@ -235,21 +233,6 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, BLASLONG l1stride, l2size; #ifdef TIMING - -#ifdef ARMV7 - - unsigned long long rpcc_counter; - unsigned long long copy_A = 0; - unsigned long long copy_B = 0; - unsigned long long kernel = 0; - unsigned long long waiting1 = 0; - unsigned long long waiting2 = 0; - unsigned long long waiting3 = 0; - unsigned long long waiting6[MAX_CPU_NUMBER]; - unsigned long long ops = 0; - -#else - BLASULONG rpcc_counter; BLASULONG copy_A = 0; BLASULONG copy_B = 0; @@ -260,8 +243,6 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, BLASULONG waiting6[MAX_CPU_NUMBER]; BLASULONG ops = 0; -#endif - for (i = 0; i < args -> nthreads; i++) waiting6[i] = 0; #endif @@ -339,35 +320,15 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, min_l = k - ls; -#ifdef ARMV7_1 - if (min_l >= GEMM_Q / 4 * 2) { - min_l = GEMM_Q / 4; - } else { - if (min_l > GEMM_Q / 4) min_l = (min_l + 1) / 2; - } - -#else if (min_l >= GEMM_Q * 2) { min_l = GEMM_Q; } else { if (min_l > GEMM_Q) min_l = (min_l + 1) / 2; } -#endif l1stride = 1; min_i = m_to - m_from; -#ifdef ARMV7_1 - if (min_i >= GEMM_P / 4 * 2) { - min_i = GEMM_P / 4; - } else { - if (min_i > GEMM_P / 4) { - min_i = (min_i / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1); - } else { - if (args -> nthreads == 1) l1stride = 0; - } - } -#else if (min_i >= GEMM_P * 2) { min_i = GEMM_P; } else { @@ -378,8 +339,6 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, } } -#endif - START_RPCC(); ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_from, sa); @@ -408,22 +367,12 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){ min_jj = MIN(n_to, xxx + div_n) - jjs; -#if defined(BULLDOZER) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX) - if (min_jj >= 12*GEMM_UNROLL_N) min_jj = 12*GEMM_UNROLL_N; - else +#if ( defined(BULLDOZER) || defined(PILEDRIVER) || defined(HASWELL) ) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX) if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; else if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; -#elif defined(ARMV7) - if (min_jj >= 16) min_jj = 16; - else - if (min_jj >= 8) min_jj = 8; - else - if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; - - #else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; @@ -555,21 +504,6 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, STOP_RPCC(waiting3); #ifdef TIMING - -#ifdef ARMV7 - - unsigned long long waiting = waiting1 + waiting2 + waiting3; - unsigned long long total = copy_A + copy_B + kernel + waiting; - - fprintf(stderr, "GEMM [%2ld] Copy_A : %6.2f Copy_B : %6.2f Wait1 : %6.2f Wait2 : %6.2f Wait3 : %6.2f Kernel : %6.2f", - mypos, (double)copy_A /(double)total * 100., (double)copy_B /(double)total * 100., - (double)waiting1 /(double)total * 100., - (double)waiting2 /(double)total * 100., - (double)waiting3 /(double)total * 100., - (double)kernel /(double)total * 100.); - -#else - BLASLONG waiting = waiting1 + waiting2 + waiting3; BLASLONG total = copy_A + copy_B + kernel + waiting; @@ -580,8 +514,6 @@ static int inner_thread(blas_arg_t *args, 
BLASLONG *range_m, BLASLONG *range_n, (double)waiting3 /(double)total * 100., (double)ops/(double)kernel / 4. * 100.); -#endif - #if 0 fprintf(stderr, "GEMM [%2ld] Copy_A : %6.2ld Copy_B : %6.2ld Wait : %6.2ld\n", mypos, copy_A, copy_B, waiting); From 2a625447ead3e3c89f542fe23a3821b4b04163f4 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 1 Dec 2013 16:11:13 +0100 Subject: [PATCH 107/127] modified common.h --- common.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/common.h b/common.h index 309f246e2..4e3230d7e 100644 --- a/common.h +++ b/common.h @@ -310,6 +310,15 @@ typedef int blasint; #define YIELDING SwitchToThread() #endif +#if defined(ARMV7) || defined(ARMV6) || defined(ARMV8) +#define YIELDING asm volatile ("nop;nop;nop;nop;nop;nop;nop;nop; \n"); +#endif + +#ifdef PILEDRIVER +#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n"); +#endif + + #ifndef YIELDING #define YIELDING sched_yield() #endif @@ -363,6 +372,15 @@ please https://github.com/xianyi/OpenBLAS/issues/246 #include "common_mips64.h" #endif +#ifdef ARCH_ARM +#include "common_arm.h" +#endif + +#ifdef ARCH_ARM64 +#include "common_arm64.h" +#endif + + #ifdef OS_LINUX #include "common_linux.h" #endif From 9b1b01a478d2ed04aa2f177cc16a0599a4b1616a Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 1 Dec 2013 16:15:51 +0100 Subject: [PATCH 108/127] modified common.h --- common.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/common.h b/common.h index 309f246e2..4e3230d7e 100644 --- a/common.h +++ b/common.h @@ -310,6 +310,15 @@ typedef int blasint; #define YIELDING SwitchToThread() #endif +#if defined(ARMV7) || defined(ARMV6) || defined(ARMV8) +#define YIELDING asm volatile ("nop;nop;nop;nop;nop;nop;nop;nop; \n"); +#endif + +#ifdef PILEDRIVER +#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n"); +#endif + + #ifndef YIELDING #define YIELDING sched_yield() #endif @@ -363,6 +372,15 @@ please https://github.com/xianyi/OpenBLAS/issues/246 #include "common_mips64.h" #endif +#ifdef ARCH_ARM +#include "common_arm.h" +#endif + +#ifdef ARCH_ARM64 +#include "common_arm64.h" +#endif + + #ifdef OS_LINUX #include "common_linux.h" #endif From 3169f524e417348faa11998cdeec18708e17ccb9 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 1 Dec 2013 16:22:49 +0100 Subject: [PATCH 109/127] modified common.h --- common.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/common.h b/common.h index 309f246e2..4e3230d7e 100644 --- a/common.h +++ b/common.h @@ -310,6 +310,15 @@ typedef int blasint; #define YIELDING SwitchToThread() #endif +#if defined(ARMV7) || defined(ARMV6) || defined(ARMV8) +#define YIELDING asm volatile ("nop;nop;nop;nop;nop;nop;nop;nop; \n"); +#endif + +#ifdef PILEDRIVER +#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n"); +#endif + + #ifndef YIELDING #define YIELDING sched_yield() #endif @@ -363,6 +372,15 @@ please https://github.com/xianyi/OpenBLAS/issues/246 #include "common_mips64.h" #endif +#ifdef ARCH_ARM +#include "common_arm.h" +#endif + +#ifdef ARCH_ARM64 +#include "common_arm64.h" +#endif + + #ifdef OS_LINUX #include "common_linux.h" #endif From 759421641235481481ad663ee6cab0e5e49f6fed Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 1 Dec 2013 16:32:54 +0100 Subject: [PATCH 110/127] modified common.h --- common.h | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/common.h b/common.h index 310fcad93..4e3230d7e 100644 --- a/common.h +++ b/common.h @@ 
-310,9 +310,12 @@ typedef int blasint;
 #define YIELDING SwitchToThread()
 #endif
 
-
 #if defined(ARMV7) || defined(ARMV6) || defined(ARMV8)
-#define YIELDING asm volatile ("nop;nop;nop;nop;nop;nop;nop;nop; \n");
+#define YIELDING asm volatile ("nop;nop;nop;nop;nop;nop;nop;nop; \n");
+#endif
+
+#ifdef PILEDRIVER
+#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n");
 #endif
 
@@ -320,8 +323,6 @@ typedef int blasint;
 #define YIELDING sched_yield()
 #endif
 
-
-
 /***
 To alloc job_t on heap or stack.
 please https://github.com/xianyi/OpenBLAS/issues/246
@@ -379,6 +380,7 @@ please https://github.com/xianyi/OpenBLAS/issues/246
 #include "common_arm64.h"
 #endif
 
+
 #ifdef OS_LINUX
 #include "common_linux.h"
 #endif
@@ -590,9 +592,10 @@ typedef struct {
 #include "common_level2.h"
 #include "common_level3.h"
 #include "common_lapack.h"
+
 #ifdef CBLAS
-/* This header file is generated from "cblas.h" (see Makefile.prebuild). */
-#include "cblas_noconst.h"
+# define OPENBLAS_CONST /* see comment in cblas.h */
+# include "cblas.h"
 #endif
 
 #ifndef ASSEMBLER

From 65ebab068808e90c40203e1ff4ec719c4837f30b Mon Sep 17 00:00:00 2001
From: wernsaar
Date: Sun, 1 Dec 2013 16:46:32 +0100
Subject: [PATCH 111/127] modified Makefile.system

---
 Makefile.system | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/Makefile.system b/Makefile.system
index 7da074a65..ee6a89046 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -373,6 +373,19 @@ NO_BINARY_MODE = 1
 BINARY_DEFINED = 1
 endif
 
+ifeq ($(ARCH), arm)
+NO_BINARY_MODE = 1
+BINARY_DEFINED = 1
+endif
+
+ifeq ($(ARCH), arm64)
+NO_BINARY_MODE = 1
+BINARY_DEFINED = 1
+endif
+
+
+
+
 #
 # C Compiler dependent settings
 #
@@ -833,6 +846,19 @@ ifeq ($(DEBUG), 1)
 COMMON_OPT += -g
 endif
 
+ifndef COMMON_OPT
+ifeq ($(ARCH), arm)
+COMMON_OPT = -O3
+endif
+endif
+
+ifndef COMMON_OPT
+ifeq ($(ARCH), arm64)
+COMMON_OPT = -O3
+endif
+endif
+
+
 ifndef COMMON_OPT
 COMMON_OPT = -O2
 endif
@@ -958,6 +984,10 @@ export HAVE_SSE4_2
 export HAVE_SSE4A
 export HAVE_SSE5
 export HAVE_AVX
+export HAVE_VFP
+export HAVE_VFPV3
+export HAVE_VFPV4
+export HAVE_NEON
 export KERNELDIR
 export FUNCTION_PROFILE
 export TARGET_CORE

From 51e59835995b71de11e38ec872ed7d2d53ea3fbf Mon Sep 17 00:00:00 2001
From: wernsaar
Date: Sun, 1 Dec 2013 16:48:08 +0100
Subject: [PATCH 112/127] modified Makefile.system

---
 Makefile.system | 43 ++++++++++++++++++++++++++++++++++---------
 1 file changed, 34 insertions(+), 9 deletions(-)

diff --git a/Makefile.system b/Makefile.system
index d6c172f3d..ee6a89046 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -82,12 +82,19 @@ ifeq ($(HOSTCC), loongcc)
 GETARCH_FLAGS += -static
 endif
 
+# If you don't use Fortran, only CBLAS will be compiled.
+ifeq ($(ONLY_CBLAS), 1)
+NO_LAPACK = 1
+else
+ONLY_CBLAS = 0
+endif
+
 # This operation is expensive, so it should be executed only once.
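+# (The guard below makes sure that one-time generation really runs only once
+# per build. A Fortran-free build using the ONLY_CBLAS switch added above
+# would be invoked as, for example:  make ONLY_CBLAS=1 )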
ifndef GOTOBLAS_MAKEFILE export GOTOBLAS_MAKEFILE = 1 # Generating Makefile.conf and config.h -DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" CFLAGS="$(GETARCH_FLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) all) +DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" CFLAGS="$(GETARCH_FLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) all) ifndef TARGET_CORE include $(TOPDIR)/Makefile.conf @@ -329,16 +336,14 @@ ifeq ($(ARCH), x86) DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO ifneq ($(NO_AVX), 1) -DYNAMIC_CORE += SANDYBRIDGE -#BULLDOZER PILEDRIVER +DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER endif endif ifeq ($(ARCH), x86_64) DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO ifneq ($(NO_AVX), 1) -DYNAMIC_CORE += SANDYBRIDGE -#BULLDOZER PILEDRIVER +DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER endif endif @@ -369,17 +374,18 @@ BINARY_DEFINED = 1 endif ifeq ($(ARCH), arm) -NO_BINARY_MODE = 1 -BINARY_DEFINED = 1 +NO_BINARY_MODE = 1 +BINARY_DEFINED = 1 endif ifeq ($(ARCH), arm64) -NO_BINARY_MODE = 1 -BINARY_DEFINED = 1 +NO_BINARY_MODE = 1 +BINARY_DEFINED = 1 endif + # # C Compiler dependent settings # @@ -852,6 +858,7 @@ COMMON_OPT = -O3 endif endif + ifndef COMMON_OPT COMMON_OPT = -O2 endif @@ -921,6 +928,23 @@ LIBZIPNAME = $(LIBNAME:.$(LIBSUFFIX)=.zip) LIBS = $(TOPDIR)/$(LIBNAME) LIBS_P = $(TOPDIR)/$(LIBNAME_P) + +LIB_COMPONENTS = BLAS +ifneq ($(NO_CBLAS), 1) +LIB_COMPONENTS += CBLAS +endif + +ifneq ($(NO_LAPACK), 1) +LIB_COMPONENTS += LAPACK +ifneq ($(NO_LAPACKE), 1) +LIB_COMPONENTS += LAPACKE +endif +endif + +ifeq ($(ONLY_CBLAS), 1) +LIB_COMPONENTS = CBLAS +endif + export OSNAME export ARCH export CORE @@ -947,6 +971,7 @@ export USE_OPENMP export CROSS export CROSS_SUFFIX export NOFORTRAN +export NO_FBLAS export EXTRALIB export CEXTRALIB export FEXTRALIB From d13aa79d26918d1fe99349795d0e2b2d29d9c642 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 1 Dec 2013 17:31:22 +0100 Subject: [PATCH 113/127] modified getarch.c --- getarch.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 58 insertions(+), 1 deletion(-) diff --git a/getarch.c b/getarch.c index 3ffda6244..c502f4a22 100644 --- a/getarch.c +++ b/getarch.c @@ -679,6 +679,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define CORENAME "generic"
#endif

+#ifdef FORCE_ARMV7
+#define FORCE
+#define ARCHITECTURE    "ARM"
+#define SUBARCHITECTURE "ARMV7"
+#define SUBDIRNAME      "arm"
+#define ARCHCONFIG   "-DARMV7 " \
+       "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
+       "-DL2_SIZE=524288 -DL2_LINESIZE=32 " \
+       "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " \
+       "-DHAVE_VFPV3 -DHAVE_VFP"
+#define LIBNAME   "armv7"
+#define CORENAME  "ARMV7"
+#else
+#endif

+#ifdef FORCE_ARMV6
+#define FORCE
+#define ARCHITECTURE    "ARM"
+#define SUBARCHITECTURE "ARMV6"
+#define SUBDIRNAME      "arm"
+#define ARCHCONFIG   "-DARMV6 " \
+       "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
+       "-DL2_SIZE=524288 -DL2_LINESIZE=32 " \
+       "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " \
+       "-DHAVE_VFP"
+#define LIBNAME   "armv6"
+#define CORENAME  "ARMV6"
+#else
+#endif

+#ifdef FORCE_ARMV8
+#define FORCE
+#define ARCHITECTURE    "ARM64"
+#define SUBARCHITECTURE "ARMV8"
+#define SUBDIRNAME      "arm64"
+#define ARCHCONFIG   "-DARMV8 " \
+       "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
+       "-DL2_SIZE=524288 -DL2_LINESIZE=32 " \
+       "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " \
+       "-DHAVE_VFP -DHAVE_VFPV3 -DHAVE_VFPV4"
+#define LIBNAME   "armv8"
+#define CORENAME  "ARMV8"
+#else
+#endif
+
+
 #ifndef FORCE

 #if defined(__powerpc__) || defined(__powerpc) || defined(powerpc) || \
@@ -719,6 +765,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define OPENBLAS_SUPPORTED
 #endif

+#ifdef __arm__
+#include "cpuid_arm.c"
+#define OPENBLAS_SUPPORTED
+#endif
+
+
 #ifndef OPENBLAS_SUPPORTED
 #error "This arch/CPU is not supported by OpenBLAS."
 #endif
@@ -773,7 +825,7 @@ int main(int argc, char *argv[]){
 #ifdef FORCE
   printf("CORE=%s\n", CORENAME);
 #else
-#if defined(__i386__) || defined(__x86_64__) || defined(POWER) || defined(__mips__)
+#if defined(__i386__) || defined(__x86_64__) || defined(POWER) || defined(__mips__) || defined(__arm__)
   printf("CORE=%s\n", get_corename());
 #endif
 #endif
@@ -788,6 +840,11 @@ int main(int argc, char *argv[]){

   printf("NUM_CORES=%d\n", get_num_cores());

+#if defined(__arm__) && !defined(FORCE)
+  get_features();
+#endif
+
+
 #if defined(__i386__) || defined(__x86_64__)
 #ifndef FORCE
   get_sse();

From eab79631c34f2f8a1ba4f5ff9ca5a9a6f216327e Mon Sep 17 00:00:00 2001
From: wernsaar
Date: Sun, 1 Dec 2013 17:35:18 +0100
Subject: [PATCH 114/127] modified c_check

---
 c_check | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/c_check b/c_check
index d5fe59f75..0828a5bba 100644
--- a/c_check
+++ b/c_check
@@ -63,6 +63,8 @@ $architecture = mips64 if ($data =~ /ARCH_MIPS64/);
 $architecture = alpha if ($data =~ /ARCH_ALPHA/);
 $architecture = sparc if ($data =~ /ARCH_SPARC/);
 $architecture = ia64 if ($data =~ /ARCH_IA64/);
+$architecture = arm if ($data =~ /ARCH_ARM/);
+$architecture = arm64 if ($data =~ /ARCH_ARM64/);

 $defined = 0;

@@ -149,6 +151,8 @@ $architecture = mips64 if ($data =~ /ARCH_MIPS64/);
 $architecture = alpha if ($data =~ /ARCH_ALPHA/);
 $architecture = sparc if ($data =~ /ARCH_SPARC/);
 $architecture = ia64 if ($data =~ /ARCH_IA64/);
+$architecture = arm if ($data =~ /ARCH_ARM/);
+$architecture = arm64 if ($data =~ /ARCH_ARM64/);

 $binformat = bin32;
 $binformat = bin64 if ($data =~ /BINARY_64/);

From d844901062c94343356146bc9e8c5a0f1404bf15 Mon Sep 17 00:00:00 2001
From: wernsaar
Date: Sun, 1 Dec 2013 17:38:58 +0100
Subject: [PATCH 115/127] modified Makefile.rule

---
 Makefile.rule | 34 +++++++++++++++++++---------------
 1 file changed, 19 insertions(+), 15 deletions(-)

diff --git 
a/Makefile.rule b/Makefile.rule
index c8433288b..e357d5ccc 100644
--- a/Makefile.rule
+++ b/Makefile.rule
@@ -12,33 +12,33 @@ VERSION = 0.2.8
 # You can specify the target architecture, otherwise it's
 # automatically detected.
-TARGET = ARMV6
+# TARGET = PENRYN

 # If you want to support multiple architectures in one binary
 # DYNAMIC_ARCH = 1

 # C compiler including binary type (32bit / 64bit). Default is gcc.
 # Don't use the Intel Compiler or PGI; they won't generate the right code.
-CC = gcc
+# CC = gcc

 # Fortran compiler. Default is g77.
-FC = gfortran
+# FC = gfortran

 # You can even specify a cross compiler. In that case, please also set HOSTCC.
-#CC = arm-linux-gnueabihf-gcc
-#FC = arm-linux-gnueabihf-gfortran
+# CC = x86_64-w64-mingw32-gcc
+# FC = x86_64-w64-mingw32-gfortran

 # If you use a cross compiler, please set this host compiler.
-HOSTCC = gcc
+# HOSTCC = gcc

 # If you need a 32bit binary, define BINARY=32, otherwise define BINARY=64
-#BINARY=32
+# BINARY=64

 # About threaded BLAS. It will be automatically detected if you don't
 # specify it.
 # To force a single-threaded build, specify USE_THREAD = 0
 # To force a multi-threaded build, specify USE_THREAD = 1
-#USE_THREAD = 0
+# USE_THREAD = 0

 # If you're going to use this library with OpenMP, please comment it in.
 # USE_OPENMP = 1
@@ -46,7 +46,7 @@ HOSTCC = gcc
 # You can define the maximum number of threads. Basically it should be
 # less than the actual number of cores. If you don't specify one, it's
 # automatically detected by the script.
-NUM_THREADS = 16
+# NUM_THREADS = 24

 # If you don't need to generate the shared library, please comment it in.
 # NO_SHARED = 1

 # If you don't need the CBLAS interface, please comment it in.
 # NO_CBLAS = 1

+# If you only want the CBLAS interface without installing a Fortran compiler,
+# please comment it in.
+# ONLY_CBLAS = 1
+
 # If you don't need LAPACK, please comment it in.
 # If you set NO_LAPACK=1, the library automatically sets NO_LAPACKE=1.
-#NO_LAPACK = 1
+# NO_LAPACK = 1

 # If you don't need LAPACKE (C Interface to LAPACK), please comment it in.
-#NO_LAPACKE = 1
+# NO_LAPACKE = 1

 # If you want to use the legacy threaded Level 3 implementation.
 # USE_SIMPLE_THREADED_LEVEL3 = 1
@@ -72,10 +76,10 @@
 # Unfortunately most kernels won't give us a high-quality buffer.
 # BLAS tries to find the best region before entering the main function,
 # but that takes time. If you don't like it, you can disable it.
-NO_WARMUP = 1
+# NO_WARMUP = 1

 # If you want to disable CPU/Memory affinity on Linux.
-NO_AFFINITY = 1
+# NO_AFFINITY = 1

 # Don't use the AVX kernels on Sandy Bridge. They are compatible with old
 # compilers and OSes, but the performance is low.
@@ -123,13 +127,13 @@
 # Common Optimization Flag;
 # The default -O2 is enough.
-#COMMON_OPT = -O3 -marm -mfpu=vfpv3 -mfloat-abi=hard
+# COMMON_OPT = -O2

 # Profiling flags
 COMMON_PROF = -pg

 # Build Debug version
-DEBUG = 1
+# DEBUG = 1

 #
 # End of user configuration

From 9e38dbb658f71b393411c5189260a4d5a22ed7db Mon Sep 17 00:00:00 2001
From: wernsaar
Date: Sun, 1 Dec 2013 17:51:39 +0100
Subject: [PATCH 116/127] modified param.h

---
 param.h | 131 ++++++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 117 insertions(+), 14 deletions(-)

diff --git a/param.h b/param.h
index 0628a1972..b865287be 100644
--- a/param.h
+++ b/param.h
@@ -304,9 +304,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #ifdef PILEDRIVER - -#define SNUMOPT 8 -#define DNUMOPT 4 +#define SNUMOPT 8 +#define DNUMOPT 4 #define GEMM_DEFAULT_OFFSET_A 64 #define GEMM_DEFAULT_OFFSET_B 832 @@ -344,39 +343,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMV_UNROLL 8 #endif - #if defined(ARCH_X86_64) #define SGEMM_DEFAULT_P 768 -#define DGEMM_DEFAULT_P 384 +#define DGEMM_DEFAULT_P 768 +#define ZGEMM_DEFAULT_P 384 +#define CGEMM_DEFAULT_P 768 #else #define SGEMM_DEFAULT_P 448 -#define DGEMM_DEFAULT_P 224 +#define DGEMM_DEFAULT_P 480 +#define ZGEMM_DEFAULT_P 112 +#define CGEMM_DEFAULT_P 224 #endif #define QGEMM_DEFAULT_P 112 -#define CGEMM_DEFAULT_P 224 -#define ZGEMM_DEFAULT_P 112 #define XGEMM_DEFAULT_P 56 #if defined(ARCH_X86_64) -#define SGEMM_DEFAULT_Q 168 +#define SGEMM_DEFAULT_Q 192 #define DGEMM_DEFAULT_Q 168 +#define ZGEMM_DEFAULT_Q 168 +#define CGEMM_DEFAULT_Q 168 #else #define SGEMM_DEFAULT_Q 224 #define DGEMM_DEFAULT_Q 224 +#define ZGEMM_DEFAULT_Q 224 +#define CGEMM_DEFAULT_Q 224 #endif #define QGEMM_DEFAULT_Q 224 -#define CGEMM_DEFAULT_Q 224 -#define ZGEMM_DEFAULT_Q 224 #define XGEMM_DEFAULT_Q 224 -#define SGEMM_DEFAULT_R sgemm_r +#define SGEMM_DEFAULT_R 12288 #define QGEMM_DEFAULT_R qgemm_r -#define DGEMM_DEFAULT_R dgemm_r +#define DGEMM_DEFAULT_R 12288 #define CGEMM_DEFAULT_R cgemm_r #define ZGEMM_DEFAULT_R zgemm_r #define XGEMM_DEFAULT_R xgemm_r -#define SYMV_P 16 +#define SYMV_P 16 #define HAVE_EXCLUSIVE_CACHE #define GEMM_THREAD gemm_thread_mn @@ -1150,6 +1152,107 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif +#ifdef HASWELL + +#define SNUMOPT 8 +#define DNUMOPT 4 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SYMV_P 8 + +#define SWITCH_RATIO 4 + +#ifdef ARCH_X86 + +#define SGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#else + +#define SGEMM_DEFAULT_UNROLL_M 16 +#define DGEMM_DEFAULT_UNROLL_M 4 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 8 +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#endif + +#ifdef ARCH_X86 + +#define SGEMM_DEFAULT_P 512 +#define SGEMM_DEFAULT_R sgemm_r +#define DGEMM_DEFAULT_P 512 +#define DGEMM_DEFAULT_R dgemm_r +#define QGEMM_DEFAULT_P 504 +#define QGEMM_DEFAULT_R qgemm_r +#define CGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_R 1024 +#define ZGEMM_DEFAULT_P 512 +#define ZGEMM_DEFAULT_R zgemm_r +#define XGEMM_DEFAULT_P 252 +#define XGEMM_DEFAULT_R xgemm_r +#define SGEMM_DEFAULT_Q 256 +#define DGEMM_DEFAULT_Q 256 +#define QGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 256 +#define ZGEMM_DEFAULT_Q 192 +#define XGEMM_DEFAULT_Q 128 + +#else + +#define SGEMM_DEFAULT_P 768 +#define DGEMM_DEFAULT_P 512 +#define CGEMM_DEFAULT_P 384 +#define ZGEMM_DEFAULT_P 256 + +#define SGEMM_DEFAULT_Q 384 +#define DGEMM_DEFAULT_Q 256 +#define CGEMM_DEFAULT_Q 192 +#define ZGEMM_DEFAULT_Q 128 + +#define SGEMM_DEFAULT_R sgemm_r +#define DGEMM_DEFAULT_R 13824 
+#define CGEMM_DEFAULT_R cgemm_r +#define ZGEMM_DEFAULT_R zgemm_r + +#define QGEMM_DEFAULT_Q 128 +#define QGEMM_DEFAULT_P 504 +#define QGEMM_DEFAULT_R qgemm_r +#define XGEMM_DEFAULT_P 252 +#define XGEMM_DEFAULT_R xgemm_r +#define XGEMM_DEFAULT_Q 128 + +#define CGEMM3M_DEFAULT_UNROLL_N 4 +#define CGEMM3M_DEFAULT_UNROLL_M 8 +#define ZGEMM3M_DEFAULT_UNROLL_N 2 +#define ZGEMM3M_DEFAULT_UNROLL_M 8 +#endif + + +#endif + #ifdef ATOM From ecbc85b9545397918ce418323ab2f89e09184d4b Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 1 Dec 2013 17:54:53 +0100 Subject: [PATCH 117/127] modified param.h --- param.h | 254 ++++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 240 insertions(+), 14 deletions(-) diff --git a/param.h b/param.h index 0c3df6951..b865287be 100644 --- a/param.h +++ b/param.h @@ -304,9 +304,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef PILEDRIVER - -#define SNUMOPT 8 -#define DNUMOPT 4 +#define SNUMOPT 8 +#define DNUMOPT 4 #define GEMM_DEFAULT_OFFSET_A 64 #define GEMM_DEFAULT_OFFSET_B 832 @@ -344,39 +343,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMV_UNROLL 8 #endif - #if defined(ARCH_X86_64) #define SGEMM_DEFAULT_P 768 -#define DGEMM_DEFAULT_P 384 +#define DGEMM_DEFAULT_P 768 +#define ZGEMM_DEFAULT_P 384 +#define CGEMM_DEFAULT_P 768 #else #define SGEMM_DEFAULT_P 448 -#define DGEMM_DEFAULT_P 224 +#define DGEMM_DEFAULT_P 480 +#define ZGEMM_DEFAULT_P 112 +#define CGEMM_DEFAULT_P 224 #endif #define QGEMM_DEFAULT_P 112 -#define CGEMM_DEFAULT_P 224 -#define ZGEMM_DEFAULT_P 112 #define XGEMM_DEFAULT_P 56 #if defined(ARCH_X86_64) -#define SGEMM_DEFAULT_Q 168 +#define SGEMM_DEFAULT_Q 192 #define DGEMM_DEFAULT_Q 168 +#define ZGEMM_DEFAULT_Q 168 +#define CGEMM_DEFAULT_Q 168 #else #define SGEMM_DEFAULT_Q 224 #define DGEMM_DEFAULT_Q 224 +#define ZGEMM_DEFAULT_Q 224 +#define CGEMM_DEFAULT_Q 224 #endif #define QGEMM_DEFAULT_Q 224 -#define CGEMM_DEFAULT_Q 224 -#define ZGEMM_DEFAULT_Q 224 #define XGEMM_DEFAULT_Q 224 -#define SGEMM_DEFAULT_R sgemm_r +#define SGEMM_DEFAULT_R 12288 #define QGEMM_DEFAULT_R qgemm_r -#define DGEMM_DEFAULT_R dgemm_r +#define DGEMM_DEFAULT_R 12288 #define CGEMM_DEFAULT_R cgemm_r #define ZGEMM_DEFAULT_R zgemm_r #define XGEMM_DEFAULT_R xgemm_r -#define SYMV_P 16 +#define SYMV_P 16 #define HAVE_EXCLUSIVE_CACHE #define GEMM_THREAD gemm_thread_mn @@ -1150,6 +1152,107 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
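A word on the repetition that follows: patches 117, 119 and 120 carry the same param.h edit as patch 116, each starting from a different pre-image blob (0628a1972, 0c3df6951, c4d15323a) but all landing on the identical result blob b865287be. The change was evidently replayed while the haswell branch was being merged back and forth, which is why the HASWELL and ARM parameter blocks below appear several times, verbatim. The history is reproduced here as recorded.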
#endif +#ifdef HASWELL + +#define SNUMOPT 8 +#define DNUMOPT 4 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SYMV_P 8 + +#define SWITCH_RATIO 4 + +#ifdef ARCH_X86 + +#define SGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#else + +#define SGEMM_DEFAULT_UNROLL_M 16 +#define DGEMM_DEFAULT_UNROLL_M 4 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 8 +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#endif + +#ifdef ARCH_X86 + +#define SGEMM_DEFAULT_P 512 +#define SGEMM_DEFAULT_R sgemm_r +#define DGEMM_DEFAULT_P 512 +#define DGEMM_DEFAULT_R dgemm_r +#define QGEMM_DEFAULT_P 504 +#define QGEMM_DEFAULT_R qgemm_r +#define CGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_R 1024 +#define ZGEMM_DEFAULT_P 512 +#define ZGEMM_DEFAULT_R zgemm_r +#define XGEMM_DEFAULT_P 252 +#define XGEMM_DEFAULT_R xgemm_r +#define SGEMM_DEFAULT_Q 256 +#define DGEMM_DEFAULT_Q 256 +#define QGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 256 +#define ZGEMM_DEFAULT_Q 192 +#define XGEMM_DEFAULT_Q 128 + +#else + +#define SGEMM_DEFAULT_P 768 +#define DGEMM_DEFAULT_P 512 +#define CGEMM_DEFAULT_P 384 +#define ZGEMM_DEFAULT_P 256 + +#define SGEMM_DEFAULT_Q 384 +#define DGEMM_DEFAULT_Q 256 +#define CGEMM_DEFAULT_Q 192 +#define ZGEMM_DEFAULT_Q 128 + +#define SGEMM_DEFAULT_R sgemm_r +#define DGEMM_DEFAULT_R 13824 +#define CGEMM_DEFAULT_R cgemm_r +#define ZGEMM_DEFAULT_R zgemm_r + +#define QGEMM_DEFAULT_Q 128 +#define QGEMM_DEFAULT_P 504 +#define QGEMM_DEFAULT_R qgemm_r +#define XGEMM_DEFAULT_P 252 +#define XGEMM_DEFAULT_R xgemm_r +#define XGEMM_DEFAULT_Q 128 + +#define CGEMM3M_DEFAULT_UNROLL_N 4 +#define CGEMM3M_DEFAULT_UNROLL_M 8 +#define ZGEMM3M_DEFAULT_UNROLL_N 2 +#define ZGEMM3M_DEFAULT_UNROLL_M 8 +#endif + + +#endif + #ifdef ATOM @@ -1793,6 +1896,129 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
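Before the ARM blocks that follow, it helps to decode the parameter names. In the Goto-style blocking that OpenBLAS uses, P and Q set the dimensions of the packed block of A (P x Q elements, sized to stay resident in a mid-level cache), Q also sets the depth of the packed B panels, and R sets the third blocking dimension; R entries may be either a fixed constant (such as 12288) or a symbol like sgemm_r that is filled in at runtime. A minimal footprint check, under the assumption that packed A really is P x Q elements:

    #include <stdio.h>

    /* Rough working-set check for the ARMV7 values added below;
       assumes the usual Goto blocking where packed A is P x Q. */
    int main(void)
    {
        long P = 128, Q = 120;              /* DGEMM_DEFAULT_P/Q (ARMV7) */
        long a_block = P * Q * (long)sizeof(double);
        printf("packed A block: %ld KiB\n", a_block / 1024);  /* 120 KiB */
        return 0;
    }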
#define SYMV_P 16 #endif + +#ifdef ARMV7 +#define SNUMOPT 2 +#define DNUMOPT 2 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 4 +#define SGEMM_DEFAULT_UNROLL_N 4 + +#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 2 + +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_P 96 +#define ZGEMM_DEFAULT_P 64 + +#define SGEMM_DEFAULT_Q 240 +#define DGEMM_DEFAULT_Q 120 +#define CGEMM_DEFAULT_Q 120 +#define ZGEMM_DEFAULT_Q 120 + +#define SGEMM_DEFAULT_R 12288 +#define DGEMM_DEFAULT_R 8192 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + + + +#define SYMV_P 16 +#endif + + +#if defined(ARMV6) +#define SNUMOPT 2 +#define DNUMOPT 2 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 4 +#define SGEMM_DEFAULT_UNROLL_N 2 + +#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_N 2 + +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 2 + +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_P 96 +#define ZGEMM_DEFAULT_P 64 + +#define SGEMM_DEFAULT_Q 240 +#define DGEMM_DEFAULT_Q 120 +#define CGEMM_DEFAULT_Q 120 +#define ZGEMM_DEFAULT_Q 120 + +#define SGEMM_DEFAULT_R 12288 +#define DGEMM_DEFAULT_R 8192 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + + +#define SYMV_P 16 +#endif + +#if defined(ARMV8) +#define SNUMOPT 2 +#define DNUMOPT 2 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 2 +#define SGEMM_DEFAULT_UNROLL_N 2 + +#define DGEMM_DEFAULT_UNROLL_M 2 +#define DGEMM_DEFAULT_UNROLL_N 2 + +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 2 + +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_P 96 +#define ZGEMM_DEFAULT_P 64 + +#define SGEMM_DEFAULT_Q 240 +#define DGEMM_DEFAULT_Q 120 +#define CGEMM_DEFAULT_Q 120 +#define ZGEMM_DEFAULT_Q 120 + +#define SGEMM_DEFAULT_R 12288 +#define DGEMM_DEFAULT_R 8192 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + + +#define SYMV_P 16 +#endif + + + + #ifdef GENERIC #define SNUMOPT 2 From ffe70b1fdc810e783ab295afd0f60325b33a5d66 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 1 Dec 2013 17:58:46 +0100 Subject: [PATCH 118/127] modified Makefile.L3 --- kernel/Makefile.L3 | 202 ++++----------------------------------------- 1 file changed, 16 insertions(+), 186 deletions(-) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index f8152ac50..b9b4bef1e 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -14,6 +14,20 @@ ifeq ($(ARCH), MIPS) USE_GEMM3M = 1 endif +ifeq ($(ARCH), arm) +USE_TRMM = 1 +endif + +ifeq ($(ARCH), arm64) +USE_TRMM = 1 +endif + +ifeq ($(TARGET), LOONGSON3B) +USE_TRMM = 1 +endif + + + SKERNELOBJS += \ sgemm_kernel$(TSUFFIX).$(SUFFIX) \ $(SGEMMINCOPYOBJ) $(SGEMMITCOPYOBJ) \ @@ -498,7 +512,8 @@ $(KDIR)xgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMD $(KDIR)xgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $@ -ifeq ($(TARGET), 
LOONGSON3B) + +ifdef USE_TRMM $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ @@ -582,24 +597,6 @@ $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ - -else - -ifdef STRMMKERNEL - -$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ - -$(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ - -$(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ - -$(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ - - else $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ @@ -613,93 +610,17 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ -endif - -ifdef DTRMMKERNEL - -ifdef DTRMMKERNEL_LN -$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL_LN) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ -else -$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ -endif - -ifdef DTRMMKERNEL_LT -$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL_LT) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ -else -$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ -endif - -ifdef DTRMMKERNEL_RN -$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL_RN) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ -else -$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ -endif - -ifdef DTRMMKERNEL_RT -$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL_RT) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ -else -$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ -endif - -else - -ifdef DTRMMKERNEL_LN -$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL_LN) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ -else $(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ -endif -ifdef DTRMMKERNEL_LT -$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL_LT) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ -else $(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT 
-DTRANSA $< -o $@ -endif -ifdef DTRMMKERNEL_RN -$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL_RN) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ -else $(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ -endif -ifdef DTRMMKERNEL_RT -$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL_RT) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ -else $(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ -endif - -endif - -ifdef QTRMMKERNEL - -$(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ - -$(KDIR)qtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ - -$(KDIR)qtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ - -$(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ - -else $(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ @@ -713,36 +634,6 @@ $(KDIR)qtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ -endif - -ifdef CTRMMKERNEL - -$(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ - -$(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ - -$(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ - -$(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ - -else - $(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ @@ -767,37 +658,6 @@ $(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ -endif - -ifdef 
ZTRMMKERNEL - -$(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ - -$(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ - -$(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ - -$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ - - -else - $(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ @@ -821,37 +681,10 @@ $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ - endif -endif - -ifdef XTRMMKERNEL - -$(KDIR)xtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)xtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)xtrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ - -$(KDIR)xtrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ -$(KDIR)xtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)xtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)xtrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ -$(KDIR)xtrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ -else $(KDIR)xtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ @@ -877,9 +710,6 @@ $(KDIR)xtrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(KDIR)xtrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< 
-o $@ -endif - - $(KDIR)cgemm3m_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM3MKERNEL) $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@ From ec2dadde9b22a1289a5891d530e41167274410ff Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 1 Dec 2013 18:02:11 +0100 Subject: [PATCH 119/127] modified param.h --- param.h | 254 ++++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 240 insertions(+), 14 deletions(-) diff --git a/param.h b/param.h index 0c3df6951..b865287be 100644 --- a/param.h +++ b/param.h @@ -304,9 +304,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef PILEDRIVER - -#define SNUMOPT 8 -#define DNUMOPT 4 +#define SNUMOPT 8 +#define DNUMOPT 4 #define GEMM_DEFAULT_OFFSET_A 64 #define GEMM_DEFAULT_OFFSET_B 832 @@ -344,39 +343,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMV_UNROLL 8 #endif - #if defined(ARCH_X86_64) #define SGEMM_DEFAULT_P 768 -#define DGEMM_DEFAULT_P 384 +#define DGEMM_DEFAULT_P 768 +#define ZGEMM_DEFAULT_P 384 +#define CGEMM_DEFAULT_P 768 #else #define SGEMM_DEFAULT_P 448 -#define DGEMM_DEFAULT_P 224 +#define DGEMM_DEFAULT_P 480 +#define ZGEMM_DEFAULT_P 112 +#define CGEMM_DEFAULT_P 224 #endif #define QGEMM_DEFAULT_P 112 -#define CGEMM_DEFAULT_P 224 -#define ZGEMM_DEFAULT_P 112 #define XGEMM_DEFAULT_P 56 #if defined(ARCH_X86_64) -#define SGEMM_DEFAULT_Q 168 +#define SGEMM_DEFAULT_Q 192 #define DGEMM_DEFAULT_Q 168 +#define ZGEMM_DEFAULT_Q 168 +#define CGEMM_DEFAULT_Q 168 #else #define SGEMM_DEFAULT_Q 224 #define DGEMM_DEFAULT_Q 224 +#define ZGEMM_DEFAULT_Q 224 +#define CGEMM_DEFAULT_Q 224 #endif #define QGEMM_DEFAULT_Q 224 -#define CGEMM_DEFAULT_Q 224 -#define ZGEMM_DEFAULT_Q 224 #define XGEMM_DEFAULT_Q 224 -#define SGEMM_DEFAULT_R sgemm_r +#define SGEMM_DEFAULT_R 12288 #define QGEMM_DEFAULT_R qgemm_r -#define DGEMM_DEFAULT_R dgemm_r +#define DGEMM_DEFAULT_R 12288 #define CGEMM_DEFAULT_R cgemm_r #define ZGEMM_DEFAULT_R zgemm_r #define XGEMM_DEFAULT_R xgemm_r -#define SYMV_P 16 +#define SYMV_P 16 #define HAVE_EXCLUSIVE_CACHE #define GEMM_THREAD gemm_thread_mn @@ -1150,6 +1152,107 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
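Stepping back to the Makefile.L3 change in patch 118: it collapses the long per-type conditionals (STRMMKERNEL, DTRMMKERNEL, CTRMMKERNEL, ZTRMMKERNEL and XTRMMKERNEL, each with a fallback else branch) into a single USE_TRMM switch, set for the arm and arm64 architectures and for the LOONGSON3B target. When USE_TRMM is unset, every trmm object is now built from the corresponding GEMM kernel source, specialized purely by preprocessor flags. A minimal sketch of that flag-driven specialization, with a hypothetical file name and messages:

    #include <stdio.h>

    /* One translation unit, many objects: compiled once per flag set,
       e.g.  cc -DTRMMKERNEL -DLEFT demo.c && ./a.out
       mirrors how Makefile.L3 derives trmm kernels from gemm sources. */
    int main(void)
    {
    #ifdef TRMMKERNEL
    # ifdef LEFT
        puts("trmm variant: triangular factor on the left");
    # else
        puts("trmm variant: triangular factor on the right");
    # endif
    #else
        puts("plain gemm build");
    #endif
        return 0;
    }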
#endif +#ifdef HASWELL + +#define SNUMOPT 8 +#define DNUMOPT 4 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SYMV_P 8 + +#define SWITCH_RATIO 4 + +#ifdef ARCH_X86 + +#define SGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#else + +#define SGEMM_DEFAULT_UNROLL_M 16 +#define DGEMM_DEFAULT_UNROLL_M 4 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 8 +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#endif + +#ifdef ARCH_X86 + +#define SGEMM_DEFAULT_P 512 +#define SGEMM_DEFAULT_R sgemm_r +#define DGEMM_DEFAULT_P 512 +#define DGEMM_DEFAULT_R dgemm_r +#define QGEMM_DEFAULT_P 504 +#define QGEMM_DEFAULT_R qgemm_r +#define CGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_R 1024 +#define ZGEMM_DEFAULT_P 512 +#define ZGEMM_DEFAULT_R zgemm_r +#define XGEMM_DEFAULT_P 252 +#define XGEMM_DEFAULT_R xgemm_r +#define SGEMM_DEFAULT_Q 256 +#define DGEMM_DEFAULT_Q 256 +#define QGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 256 +#define ZGEMM_DEFAULT_Q 192 +#define XGEMM_DEFAULT_Q 128 + +#else + +#define SGEMM_DEFAULT_P 768 +#define DGEMM_DEFAULT_P 512 +#define CGEMM_DEFAULT_P 384 +#define ZGEMM_DEFAULT_P 256 + +#define SGEMM_DEFAULT_Q 384 +#define DGEMM_DEFAULT_Q 256 +#define CGEMM_DEFAULT_Q 192 +#define ZGEMM_DEFAULT_Q 128 + +#define SGEMM_DEFAULT_R sgemm_r +#define DGEMM_DEFAULT_R 13824 +#define CGEMM_DEFAULT_R cgemm_r +#define ZGEMM_DEFAULT_R zgemm_r + +#define QGEMM_DEFAULT_Q 128 +#define QGEMM_DEFAULT_P 504 +#define QGEMM_DEFAULT_R qgemm_r +#define XGEMM_DEFAULT_P 252 +#define XGEMM_DEFAULT_R xgemm_r +#define XGEMM_DEFAULT_Q 128 + +#define CGEMM3M_DEFAULT_UNROLL_N 4 +#define CGEMM3M_DEFAULT_UNROLL_M 8 +#define ZGEMM3M_DEFAULT_UNROLL_N 2 +#define ZGEMM3M_DEFAULT_UNROLL_M 8 +#endif + + +#endif + #ifdef ATOM @@ -1793,6 +1896,129 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define SYMV_P 16 #endif + +#ifdef ARMV7 +#define SNUMOPT 2 +#define DNUMOPT 2 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 4 +#define SGEMM_DEFAULT_UNROLL_N 4 + +#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 2 + +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_P 96 +#define ZGEMM_DEFAULT_P 64 + +#define SGEMM_DEFAULT_Q 240 +#define DGEMM_DEFAULT_Q 120 +#define CGEMM_DEFAULT_Q 120 +#define ZGEMM_DEFAULT_Q 120 + +#define SGEMM_DEFAULT_R 12288 +#define DGEMM_DEFAULT_R 8192 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + + + +#define SYMV_P 16 +#endif + + +#if defined(ARMV6) +#define SNUMOPT 2 +#define DNUMOPT 2 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 4 +#define SGEMM_DEFAULT_UNROLL_N 2 + +#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_N 2 + +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 2 + +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_P 96 +#define ZGEMM_DEFAULT_P 64 + +#define SGEMM_DEFAULT_Q 240 +#define DGEMM_DEFAULT_Q 120 +#define CGEMM_DEFAULT_Q 120 +#define ZGEMM_DEFAULT_Q 120 + +#define SGEMM_DEFAULT_R 12288 +#define DGEMM_DEFAULT_R 8192 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + + +#define SYMV_P 16 +#endif + +#if defined(ARMV8) +#define SNUMOPT 2 +#define DNUMOPT 2 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 2 +#define SGEMM_DEFAULT_UNROLL_N 2 + +#define DGEMM_DEFAULT_UNROLL_M 2 +#define DGEMM_DEFAULT_UNROLL_N 2 + +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 2 + +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_P 96 +#define ZGEMM_DEFAULT_P 64 + +#define SGEMM_DEFAULT_Q 240 +#define DGEMM_DEFAULT_Q 120 +#define CGEMM_DEFAULT_Q 120 +#define ZGEMM_DEFAULT_Q 120 + +#define SGEMM_DEFAULT_R 12288 +#define DGEMM_DEFAULT_R 8192 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + + +#define SYMV_P 16 +#endif + + + + #ifdef GENERIC #define SNUMOPT 2 From d2385f0d524e6431374062cdcddeda8b83008e34 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 1 Dec 2013 18:02:54 +0100 Subject: [PATCH 120/127] modified param.h --- param.h | 232 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 227 insertions(+), 5 deletions(-) diff --git a/param.h b/param.h index c4d15323a..b865287be 100644 --- a/param.h +++ b/param.h @@ -304,9 +304,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef PILEDRIVER - -#define SNUMOPT 8 -#define DNUMOPT 4 +#define SNUMOPT 8 +#define DNUMOPT 4 #define GEMM_DEFAULT_OFFSET_A 64 #define GEMM_DEFAULT_OFFSET_B 832 @@ -344,7 +343,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMV_UNROLL 8 #endif - #if defined(ARCH_X86_64) #define SGEMM_DEFAULT_P 768 #define DGEMM_DEFAULT_P 768 @@ -380,7 +378,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define ZGEMM_DEFAULT_R zgemm_r #define XGEMM_DEFAULT_R xgemm_r -#define SYMV_P 16 +#define SYMV_P 16 #define HAVE_EXCLUSIVE_CACHE #define GEMM_THREAD gemm_thread_mn @@ -1154,6 +1152,107 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif +#ifdef HASWELL + +#define SNUMOPT 8 +#define DNUMOPT 4 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SYMV_P 8 + +#define SWITCH_RATIO 4 + +#ifdef ARCH_X86 + +#define SGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#else + +#define SGEMM_DEFAULT_UNROLL_M 16 +#define DGEMM_DEFAULT_UNROLL_M 4 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 8 +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#endif + +#ifdef ARCH_X86 + +#define SGEMM_DEFAULT_P 512 +#define SGEMM_DEFAULT_R sgemm_r +#define DGEMM_DEFAULT_P 512 +#define DGEMM_DEFAULT_R dgemm_r +#define QGEMM_DEFAULT_P 504 +#define QGEMM_DEFAULT_R qgemm_r +#define CGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_R 1024 +#define ZGEMM_DEFAULT_P 512 +#define ZGEMM_DEFAULT_R zgemm_r +#define XGEMM_DEFAULT_P 252 +#define XGEMM_DEFAULT_R xgemm_r +#define SGEMM_DEFAULT_Q 256 +#define DGEMM_DEFAULT_Q 256 +#define QGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 256 +#define ZGEMM_DEFAULT_Q 192 +#define XGEMM_DEFAULT_Q 128 + +#else + +#define SGEMM_DEFAULT_P 768 +#define DGEMM_DEFAULT_P 512 +#define CGEMM_DEFAULT_P 384 +#define ZGEMM_DEFAULT_P 256 + +#define SGEMM_DEFAULT_Q 384 +#define DGEMM_DEFAULT_Q 256 +#define CGEMM_DEFAULT_Q 192 +#define ZGEMM_DEFAULT_Q 128 + +#define SGEMM_DEFAULT_R sgemm_r +#define DGEMM_DEFAULT_R 13824 +#define CGEMM_DEFAULT_R cgemm_r +#define ZGEMM_DEFAULT_R zgemm_r + +#define QGEMM_DEFAULT_Q 128 +#define QGEMM_DEFAULT_P 504 +#define QGEMM_DEFAULT_R qgemm_r +#define XGEMM_DEFAULT_P 252 +#define XGEMM_DEFAULT_R xgemm_r +#define XGEMM_DEFAULT_Q 128 + +#define CGEMM3M_DEFAULT_UNROLL_N 4 +#define CGEMM3M_DEFAULT_UNROLL_M 8 +#define ZGEMM3M_DEFAULT_UNROLL_N 2 +#define ZGEMM3M_DEFAULT_UNROLL_M 8 +#endif + + +#endif + #ifdef ATOM @@ -1797,6 +1896,129 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define SYMV_P 16 #endif + +#ifdef ARMV7 +#define SNUMOPT 2 +#define DNUMOPT 2 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 4 +#define SGEMM_DEFAULT_UNROLL_N 4 + +#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 2 + +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_P 96 +#define ZGEMM_DEFAULT_P 64 + +#define SGEMM_DEFAULT_Q 240 +#define DGEMM_DEFAULT_Q 120 +#define CGEMM_DEFAULT_Q 120 +#define ZGEMM_DEFAULT_Q 120 + +#define SGEMM_DEFAULT_R 12288 +#define DGEMM_DEFAULT_R 8192 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + + + +#define SYMV_P 16 +#endif + + +#if defined(ARMV6) +#define SNUMOPT 2 +#define DNUMOPT 2 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 4 +#define SGEMM_DEFAULT_UNROLL_N 2 + +#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_N 2 + +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 2 + +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_P 96 +#define ZGEMM_DEFAULT_P 64 + +#define SGEMM_DEFAULT_Q 240 +#define DGEMM_DEFAULT_Q 120 +#define CGEMM_DEFAULT_Q 120 +#define ZGEMM_DEFAULT_Q 120 + +#define SGEMM_DEFAULT_R 12288 +#define DGEMM_DEFAULT_R 8192 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + + +#define SYMV_P 16 +#endif + +#if defined(ARMV8) +#define SNUMOPT 2 +#define DNUMOPT 2 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 2 +#define SGEMM_DEFAULT_UNROLL_N 2 + +#define DGEMM_DEFAULT_UNROLL_M 2 +#define DGEMM_DEFAULT_UNROLL_N 2 + +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 2 + +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_P 96 +#define ZGEMM_DEFAULT_P 64 + +#define SGEMM_DEFAULT_Q 240 +#define DGEMM_DEFAULT_Q 120 +#define CGEMM_DEFAULT_Q 120 +#define ZGEMM_DEFAULT_Q 120 + +#define SGEMM_DEFAULT_R 12288 +#define DGEMM_DEFAULT_R 8192 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + + +#define SYMV_P 16 +#endif + + + + #ifdef GENERIC #define SNUMOPT 2 From 339fef2649de26f7f2a32c7120167782c38f4199 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 1 Dec 2013 18:10:23 +0100 Subject: [PATCH 121/127] modified param.h --- param.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/param.h b/param.h index b865287be..b96375376 100644 --- a/param.h +++ b/param.h @@ -1154,6 +1154,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifdef HASWELL +<<<<<<< HEAD #define SNUMOPT 8 #define DNUMOPT 4 @@ -1164,6 +1165,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SYMV_P 8 #define SWITCH_RATIO 4 +======= +#define SNUMOPT 8 +#define DNUMOPT 4 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SYMV_P 8 + +#define SWITCH_RATIO 4 +>>>>>>> origin/haswell #ifdef ARCH_X86 @@ -1233,6 +1246,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
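The "<<<<<<< HEAD", "=======" and ">>>>>>> origin/haswell" lines above are not transcription damage in this document: patch 121 really did commit unresolved git merge-conflict markers into param.h. Patch 122 below removes them again and, in the course of resolving the conflict, raises the Haswell SNUMOPT/DNUMOPT from 8/4 to 16/8, so the tree is clean again two commits later.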
#define ZGEMM_DEFAULT_Q 128 #define SGEMM_DEFAULT_R sgemm_r +<<<<<<< HEAD +======= +//#define DGEMM_DEFAULT_R dgemm_r +>>>>>>> origin/haswell #define DGEMM_DEFAULT_R 13824 #define CGEMM_DEFAULT_R cgemm_r #define ZGEMM_DEFAULT_R zgemm_r From aae75b2461c3a0b4648743282ec453a6ae9a3b8e Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 1 Dec 2013 18:43:24 +0100 Subject: [PATCH 122/127] modified param.h --- param.h | 21 ++------------------- 1 file changed, 2 insertions(+), 19 deletions(-) diff --git a/param.h b/param.h index b96375376..e4b095a84 100644 --- a/param.h +++ b/param.h @@ -1154,9 +1154,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifdef HASWELL -<<<<<<< HEAD -#define SNUMOPT 8 -#define DNUMOPT 4 +#define SNUMOPT 16 +#define DNUMOPT 8 #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 @@ -1164,19 +1163,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SYMV_P 8 -#define SWITCH_RATIO 4 -======= -#define SNUMOPT 8 -#define DNUMOPT 4 - -#define GEMM_DEFAULT_OFFSET_A 0 -#define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL - -#define SYMV_P 8 - #define SWITCH_RATIO 4 ->>>>>>> origin/haswell #ifdef ARCH_X86 @@ -1246,10 +1233,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM_DEFAULT_Q 128 #define SGEMM_DEFAULT_R sgemm_r -<<<<<<< HEAD -======= -//#define DGEMM_DEFAULT_R dgemm_r ->>>>>>> origin/haswell #define DGEMM_DEFAULT_R 13824 #define CGEMM_DEFAULT_R cgemm_r #define ZGEMM_DEFAULT_R zgemm_r From 27d4234d4d02858e93456a24a2e5f33da6223415 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 1 Dec 2013 20:56:02 +0100 Subject: [PATCH 123/127] merged symv --- kernel/x86_64/symv_L_sse.S | 2 +- kernel/x86_64/symv_L_sse2.S | 2 +- kernel/x86_64/symv_U_sse.S | 2 +- kernel/x86_64/symv_U_sse2.S | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/x86_64/symv_L_sse.S b/kernel/x86_64/symv_L_sse.S index aad66b4ac..5083d0b01 100644 --- a/kernel/x86_64/symv_L_sse.S +++ b/kernel/x86_64/symv_L_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) diff --git a/kernel/x86_64/symv_L_sse2.S b/kernel/x86_64/symv_L_sse2.S index 1b28bcd20..a8bbb1cad 100644 --- a/kernel/x86_64/symv_L_sse2.S +++ b/kernel/x86_64/symv_L_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) diff --git a/kernel/x86_64/symv_U_sse.S b/kernel/x86_64/symv_U_sse.S index 12aab61ed..47af7726a 100644 --- a/kernel/x86_64/symv_U_sse.S +++ b/kernel/x86_64/symv_U_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) diff --git a/kernel/x86_64/symv_U_sse2.S b/kernel/x86_64/symv_U_sse2.S index d044dfbe2..57d8c2a20 100644 --- a/kernel/x86_64/symv_U_sse2.S +++ b/kernel/x86_64/symv_U_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || 
defined(HASWELL) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) From 034a5b20837129d131ee485365536e5473a663b5 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 1 Dec 2013 21:07:49 +0100 Subject: [PATCH 124/127] modified zsymv --- kernel/x86_64/zsymv_L_sse.S | 2 +- kernel/x86_64/zsymv_L_sse2.S | 2 +- kernel/x86_64/zsymv_U_sse.S | 2 +- kernel/x86_64/zsymv_U_sse2.S | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/x86_64/zsymv_L_sse.S b/kernel/x86_64/zsymv_L_sse.S index f818170a3..204e5e6ab 100644 --- a/kernel/x86_64/zsymv_L_sse.S +++ b/kernel/x86_64/zsymv_L_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/zsymv_L_sse2.S b/kernel/x86_64/zsymv_L_sse2.S index 7b6c5976d..5769d242a 100644 --- a/kernel/x86_64/zsymv_L_sse2.S +++ b/kernel/x86_64/zsymv_L_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/zsymv_U_sse.S b/kernel/x86_64/zsymv_U_sse.S index 6dabf0735..6f782b1e2 100644 --- a/kernel/x86_64/zsymv_U_sse.S +++ b/kernel/x86_64/zsymv_U_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/zsymv_U_sse2.S b/kernel/x86_64/zsymv_U_sse2.S index df64d8045..f92779e24 100644 --- a/kernel/x86_64/zsymv_U_sse2.S +++ b/kernel/x86_64/zsymv_U_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) From c6156b2ef2ed55178132adeeb1dc72b12d9f9f16 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 1 Dec 2013 22:39:39 +0100 Subject: [PATCH 125/127] added trsm kernels from origin --- kernel/x86/trsm_kernel_LN_2x4_penryn.S | 2 +- kernel/x86/trsm_kernel_LN_4x4_penryn.S | 2 +- kernel/x86/trsm_kernel_LT_2x4_penryn.S | 2 +- kernel/x86/trsm_kernel_LT_4x4_penryn.S | 2 +- kernel/x86/trsm_kernel_RT_2x4_penryn.S | 2 +- kernel/x86/trsm_kernel_RT_4x4_penryn.S | 2 +- kernel/x86/ztrsm_kernel_LN_2x2_penryn.S | 2 +- kernel/x86/ztrsm_kernel_LT_1x2_penryn.S | 2 +- kernel/x86/ztrsm_kernel_LT_2x2_penryn.S | 2 +- kernel/x86/ztrsm_kernel_RT_1x2_penryn.S | 2 +- kernel/x86/ztrsm_kernel_RT_2x2_penryn.S | 2 +- 11 files changed, 11 insertions(+), 11 deletions(-) diff --git a/kernel/x86/trsm_kernel_LN_2x4_penryn.S b/kernel/x86/trsm_kernel_LN_2x4_penryn.S index b620f6d12..ebd1377f1 100644 --- a/kernel/x86/trsm_kernel_LN_2x4_penryn.S +++ b/kernel/x86/trsm_kernel_LN_2x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_LN_4x4_penryn.S b/kernel/x86/trsm_kernel_LN_4x4_penryn.S index 3823aa08b..6fa7d410e 100644 --- 
a/kernel/x86/trsm_kernel_LN_4x4_penryn.S +++ b/kernel/x86/trsm_kernel_LN_4x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_LT_2x4_penryn.S b/kernel/x86/trsm_kernel_LT_2x4_penryn.S index e60444b08..9ce4cd8d4 100644 --- a/kernel/x86/trsm_kernel_LT_2x4_penryn.S +++ b/kernel/x86/trsm_kernel_LT_2x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_LT_4x4_penryn.S b/kernel/x86/trsm_kernel_LT_4x4_penryn.S index f19047d14..a1a35a7a5 100644 --- a/kernel/x86/trsm_kernel_LT_4x4_penryn.S +++ b/kernel/x86/trsm_kernel_LT_4x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_RT_2x4_penryn.S b/kernel/x86/trsm_kernel_RT_2x4_penryn.S index 4a87539bf..a5333640d 100644 --- a/kernel/x86/trsm_kernel_RT_2x4_penryn.S +++ b/kernel/x86/trsm_kernel_RT_2x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_RT_4x4_penryn.S b/kernel/x86/trsm_kernel_RT_4x4_penryn.S index 79222a07a..c3619ec3d 100644 --- a/kernel/x86/trsm_kernel_RT_4x4_penryn.S +++ b/kernel/x86/trsm_kernel_RT_4x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S b/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S index 0a646e980..53e53c3ce 100644 --- a/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S @@ -61,7 +61,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S b/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S index 58d8b4d6d..3c056cdff 100644 --- a/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S @@ -63,7 +63,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S b/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S index 1604da17f..1efa1fd25 100644 --- a/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S @@ -61,7 +61,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S b/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S index fce813eb4..849afed73 100644 --- 
a/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S @@ -63,7 +63,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S b/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S index fda1a9860..c1833abe2 100644 --- a/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S @@ -61,7 +61,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif From 9423f980f64182353ed218d6f9631f5f92f04a1c Mon Sep 17 00:00:00 2001 From: wernsaar Date: Mon, 2 Dec 2013 10:08:14 +0100 Subject: [PATCH 126/127] modified trsm kernel --- kernel/x86/trsm_kernel_LN_2x4_penryn.S | 2 +- kernel/x86/trsm_kernel_LN_4x4_penryn.S | 2 +- kernel/x86/trsm_kernel_LT_2x4_penryn.S | 2 +- kernel/x86/trsm_kernel_LT_4x4_penryn.S | 2 +- kernel/x86/trsm_kernel_RT_2x4_penryn.S | 2 +- kernel/x86/trsm_kernel_RT_4x4_penryn.S | 2 +- kernel/x86/ztrsm_kernel_LN_2x2_penryn.S | 2 +- kernel/x86/ztrsm_kernel_LT_1x2_penryn.S | 2 +- kernel/x86/ztrsm_kernel_LT_2x2_penryn.S | 2 +- kernel/x86/ztrsm_kernel_RT_1x2_penryn.S | 2 +- kernel/x86/ztrsm_kernel_RT_2x2_penryn.S | 2 +- 11 files changed, 11 insertions(+), 11 deletions(-) diff --git a/kernel/x86/trsm_kernel_LN_2x4_penryn.S b/kernel/x86/trsm_kernel_LN_2x4_penryn.S index ebd1377f1..16ba9a0e3 100644 --- a/kernel/x86/trsm_kernel_LN_2x4_penryn.S +++ b/kernel/x86/trsm_kernel_LN_2x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_LN_4x4_penryn.S b/kernel/x86/trsm_kernel_LN_4x4_penryn.S index 6fa7d410e..03f8e3d79 100644 --- a/kernel/x86/trsm_kernel_LN_4x4_penryn.S +++ b/kernel/x86/trsm_kernel_LN_4x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_LT_2x4_penryn.S b/kernel/x86/trsm_kernel_LT_2x4_penryn.S index 9ce4cd8d4..65a6cf091 100644 --- a/kernel/x86/trsm_kernel_LT_2x4_penryn.S +++ b/kernel/x86/trsm_kernel_LT_2x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_LT_4x4_penryn.S b/kernel/x86/trsm_kernel_LT_4x4_penryn.S index a1a35a7a5..d27880b8d 100644 --- a/kernel/x86/trsm_kernel_LT_4x4_penryn.S +++ b/kernel/x86/trsm_kernel_LT_4x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_RT_2x4_penryn.S b/kernel/x86/trsm_kernel_RT_2x4_penryn.S index a5333640d..ff8231e16 100644 --- a/kernel/x86/trsm_kernel_RT_2x4_penryn.S +++ b/kernel/x86/trsm_kernel_RT_2x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 
+ 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_RT_4x4_penryn.S b/kernel/x86/trsm_kernel_RT_4x4_penryn.S index c3619ec3d..857866552 100644 --- a/kernel/x86/trsm_kernel_RT_4x4_penryn.S +++ b/kernel/x86/trsm_kernel_RT_4x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S b/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S index 53e53c3ce..6d9880556 100644 --- a/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S @@ -61,7 +61,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S b/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S index 3c056cdff..452e3bf87 100644 --- a/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S @@ -63,7 +63,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S b/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S index 1efa1fd25..64232fdfb 100644 --- a/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S @@ -61,7 +61,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S b/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S index 849afed73..71246d7a6 100644 --- a/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S @@ -63,7 +63,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S b/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S index c1833abe2..dfd555c88 100644 --- a/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S @@ -61,7 +61,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif From 53eaf419013c215b033fbc4254bd54d5fdeb83f2 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Mon, 2 Dec 2013 13:17:51 +0100 Subject: [PATCH 127/127] added support for HASWELL --- kernel/setparam-ref.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index 4a7526196..cf868f103 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -762,6 +762,23 @@ static void init_parameter(void) { #endif #endif +#ifdef HASWELL + +#ifdef DEBUG + fprintf(stderr, "Haswell\n"); +#endif + + TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; + TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; + TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; + TABLE_NAME.zgemm_p = 
ZGEMM_DEFAULT_P; +#ifdef EXPRECISION + TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; + TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; +#endif +#endif + + #ifdef OPTERON #ifdef DEBUG
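Patches 123 through 126 above are mechanical: the symv and zsymv kernel guards gain SANDYBRIDGE and HASWELL outright, while the Penryn trsm kernels first swap the stale NEHALEM_OPTIMIZATION symbol for SANDYBRIDGE (patch 125) and then add HASWELL (patch 126). The effect is that those cores get the explicit prefetcht0/prefetcht1 hints at the distances (PREFETCHSIZE) already tuned for Nehalem. The idea is sketched in portable C below; this is illustrative only, with a hypothetical function name, since the real kernels do this in assembly:

    #include <stddef.h>

    /* Minimal software-prefetch sketch using the GCC/Clang builtin.
       DIST plays the role of PREFETCHSIZE: far enough ahead that the
       line arrives before it is needed, near enough to stay cached.
       Real kernels prefetch once per cache line inside unrolled loops
       rather than on every element as done here. */
    #define DIST 192   /* elements ahead; cf. (16 * 12) in symv_L_sse.S */

    double dot_prefetched(const double *x, const double *y, size_t n)
    {
        double s = 0.0;
        for (size_t i = 0; i < n; i++) {
            __builtin_prefetch(x + i + DIST, 0, 3);  /* read, keep cached */
            __builtin_prefetch(y + i + DIST, 0, 3);
            s += x[i] * y[i];
        }
        return s;
    }

Patch 127, cut off above, follows the pattern used for the other CPUs in setparam-ref.c: at startup, init_parameter() copies the compile-time *_DEFAULT_P values (and, under EXPRECISION, the extended-precision ones) into the runtime parameter table for HASWELL.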