You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

sgemm_kernel_8x4_haswell.c 24 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490
  /* Inline-asm operands: %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 = k_count, %5 = c_store, %6 = &alpha, %7 = b_pref */
  /* Scratch registers: r11 = m_counter, r12 = k << 2 (const), r13 = k_skip << 2, r14 = b_head_pos (const), r15 = prefetch helper */
  /* Recommended settings: GEMM_P = 320, GEMM_Q = 320. */
  /*
   * mult_alpha(acc, alpha, addr...) expands to the assembly text that applies
   * alpha to the accumulator register `acc`.
   *   - TRMMKERNEL:  acc = acc * alpha (vmulps); the address arguments are unused.
   *   - otherwise:   acc = acc * alpha + mem[addr] (vfmadd213ps with a memory
   *                  source), where the variadic arguments are pasted between
   *                  parentheses to form the address, e.g. `%5,%3,1` -> (%5,%3,1).
   */
  #ifdef TRMMKERNEL
  #define mult_alpha(acc,alpha,...) "vmulps "#acc","#alpha","#acc";"
  #else
  #define mult_alpha(acc,alpha,...) "vfmadd213ps ("#__VA_ARGS__"),"#alpha","#acc";"
  #endif
  /*
   * HEAD_SET_OFFSET(ndim) / TAIL_SET_OFFSET(ndim) update the C variable `off`
   * before and after a panel of ndim columns is processed by COMPUTE.
   * They only have an effect in the right-side TRMM build (TRMMKERNEL defined,
   * LEFT undefined):
   *   - TRANSA:   nothing before the panel, off += ndim after it;
   *   - !TRANSA:  off += min(ndim, 4) before the panel and the remaining
   *               max(ndim - 4, 0) after it.
   * In every other configuration both expand to an empty block.
   * The argument is parenthesised so an expression can be passed safely.
   */
  #if defined(TRMMKERNEL) && !defined(LEFT)
    #ifdef TRANSA
      #define HEAD_SET_OFFSET(ndim) {}
      #define TAIL_SET_OFFSET(ndim) {off+=(ndim);}
    #else
      #define HEAD_SET_OFFSET(ndim) {off+=((ndim)>4?4:(ndim));}
      #define TAIL_SET_OFFSET(ndim) {off+=((ndim)>4?((ndim)-4):0);}
    #endif
  #else
    #define HEAD_SET_OFFSET(ndim) {}
    #define TAIL_SET_OFFSET(ndim) {}
  #endif
  /*
   * init_update_kskip(val) / save_update_kskip(val) expand to assembly text that
   * adjusts r13 (the k_skip << 2 register) by `val` bytes at the start / end of
   * one m-block. The COMPUTE_m* macros pass 32, 16, 8 and 4 for m = 8, 4, 2, 1.
   * Only the left-side TRMM build (TRMMKERNEL and LEFT defined) emits code:
   *   - TRANSA:   r13 -= val before the block;
   *   - !TRANSA:  r13 += val after the block.
   * Otherwise both are empty strings.
   */
  #if defined(TRMMKERNEL) && defined(LEFT)
    #ifdef TRANSA
      #define init_update_kskip(val) "subq $"#val",%%r13;"
      #define save_update_kskip(val) ""
    #else
      #define init_update_kskip(val) ""
      #define save_update_kskip(val) "addq $"#val",%%r13;"
    #endif
  #else
    #define init_update_kskip(val) ""
    #define save_update_kskip(val) ""
  #endif
  /*
   * k-counter and pointer set-up fragments used by every COMPUTE_m* macro.
   *
   *   init_set_k        loads the k counter %4 with r12 (k << 2); the TRMM build
   *                     additionally subtracts r13 (k_skip << 2).
   *   INIT_SET_KSKIP    initial value of r13, with %9 being the `off` operand:
   *                       TRMM, (LEFT && !TRANSA) || (!LEFT && TRANSA):  off << 2
   *                       TRMM, other two cases:                         (%4 - off) << 2
   *                       plain GEMM:                                    0
   *   init_set_pointers(a_copy, b_copy) / save_set_pointers(a_copy, b_copy)
   *                     advance the A pointer %0 by r13 * a_copy and the B pointer
   *                     %1 by r13 * b_copy. In the first TRMM case this happens
   *                     before the k loop (init_*), in the second after it
   *                     (save_*); the other macro of the pair is empty. Both are
   *                     empty for plain GEMM.
   *
   * The *_set_pa_pb_n<N>(mdim) wrappers pick the B scale for each panel width:
   * 4 for N = 12, 8 and 4; 2 for N = 2; 1 for N = 1. mdim is the A scale.
   */
  #ifdef TRMMKERNEL
    #define init_set_k "movq %%r12,%4; subq %%r13,%4;"
    #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
      #define INIT_SET_KSKIP "movq %9,%%r13; salq $2,%%r13;"
      #define init_set_pointers(a_copy,b_copy) "leaq (%0,%%r13,"#a_copy"),%0; leaq (%1,%%r13,"#b_copy"),%1;"
      #define save_set_pointers(a_copy,b_copy) ""
    #else
      #define INIT_SET_KSKIP "movq %4,%%r13; subq %9,%%r13; salq $2,%%r13;"
      #define init_set_pointers(a_copy,b_copy) ""
      #define save_set_pointers(a_copy,b_copy) "leaq (%0,%%r13,"#a_copy"),%0; leaq (%1,%%r13,"#b_copy"),%1;"
    #endif
  #else
    #define INIT_SET_KSKIP "xorq %%r13,%%r13;"
    #define init_set_k "movq %%r12,%4;"
    #define init_set_pointers(a_copy,b_copy) ""
    #define save_set_pointers(a_copy,b_copy) ""
  #endif
  #define init_set_pa_pb_n12(mdim) init_set_pointers(mdim,4)
  #define init_set_pa_pb_n8(mdim) init_set_pointers(mdim,4)
  #define init_set_pa_pb_n4(mdim) init_set_pointers(mdim,4)
  #define init_set_pa_pb_n2(mdim) init_set_pointers(mdim,2)
  #define init_set_pa_pb_n1(mdim) init_set_pointers(mdim,1)
  #define save_set_pa_pb_n12(mdim) save_set_pointers(mdim,4)
  #define save_set_pa_pb_n8(mdim) save_set_pointers(mdim,4)
  #define save_set_pa_pb_n4(mdim) save_set_pointers(mdim,4)
  #define save_set_pa_pb_n2(mdim) save_set_pointers(mdim,2)
  #define save_set_pa_pb_n1(mdim) save_set_pointers(mdim,1)
  /*
   * kernel_kstart_n<N>(mdim): extra k-steps executed before the main k loop.
   * Only the right-side transposed TRMM build (TRMMKERNEL && !LEFT && TRANSA)
   * emits code, and only for the 8- and 12-column panels:
   *   - n8:  four KERNEL_k1m<mdim>n4 steps, then the k counter %4 is reduced by 16;
   *   - n12: four KERNEL_k1m<mdim>n4 steps followed by four KERNEL_k1m<mdim>n8
   *          steps, then %4 is reduced by 32.
   * (The main loops subtract 4 from %4 per k-step, so 16 / 32 match 4 / 8 steps.)
   * For every other configuration and for n4, n2, n1 the macro is an empty string.
   */
  #if defined(TRMMKERNEL) && !defined(LEFT) && defined(TRANSA)
    #define kernel_kstart_n8(mdim) \
      KERNEL_k1m##mdim##n4 KERNEL_k1m##mdim##n4 KERNEL_k1m##mdim##n4 KERNEL_k1m##mdim##n4 "subq $16,%4;"
    #define kernel_kstart_n12(mdim) \
      KERNEL_k1m##mdim##n4 KERNEL_k1m##mdim##n4 KERNEL_k1m##mdim##n4 KERNEL_k1m##mdim##n4 \
      KERNEL_k1m##mdim##n8 KERNEL_k1m##mdim##n8 KERNEL_k1m##mdim##n8 KERNEL_k1m##mdim##n8 "subq $32,%4;"
  #else
    #define kernel_kstart_n8(mdim) ""
    #define kernel_kstart_n12(mdim) ""
  #endif
  #define kernel_kstart_n4(mdim) ""
  #define kernel_kstart_n2(mdim) ""
  #define kernel_kstart_n1(mdim) ""
  /* m = 8 *//* ymm0 for alpha, ymm1-ymm3 for temporary use, ymm4-ymm15 for accumulators */
  /*
   * One-k-step ("k1") and two-k-step ("k2") multiply-accumulate fragments for an
   * 8-row block and N = 1, 2, 4, 8, 12 columns.
   *   - n1: loads 8 floats of A (%0), broadcasts one float of B (%1), FMA into ymm4.
   *   - n2..n12: A is loaded twice, with even elements duplicated (vmovsldup ->
   *     ymm1) and odd elements duplicated (vmovshdup -> ymm2); each vbroadcastsd
   *     replicates one pair of B floats, so every accumulator pair (c1, c2) covers
   *     two columns.
   *   - The "_h_" variants do the arithmetic and advance A but leave the B
   *     pointer untouched; the plain variants append the B increment.
   *   - unit_kernel_k1m8n4(c1..c4, boff1, boff2, addr...) accumulates four columns
   *     read from byte offsets boff1 / boff2 of the address built from the
   *     variadic arguments. n8 and n12 use (%1,%r12,4) and (%1,%r12,8), i.e. the
   *     B pointer plus r12*4 and r12*8 bytes (r12 = k << 2).
   *   - KERNEL_k2m8n12 is written out by hand: two k-steps, one prefetcht0 of A
   *     at 512(%0), A advanced by 64 bytes and B by 32 bytes.
   */
  #define KERNEL_k1m8n1 \
      "vmovups (%0),%%ymm1; addq $32,%0;" \
      "vbroadcastss (%1),%%ymm2; vfmadd231ps %%ymm1,%%ymm2,%%ymm4;" \
      "addq $4,%1;"
  #define KERNEL_h_k1m8n2 \
      "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; addq $32,%0;" \
      "vbroadcastsd (%1),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"
  #define KERNEL_k1m8n2 KERNEL_h_k1m8n2 "addq $8,%1;"
  #define KERNEL_h_k1m8n4 \
      KERNEL_h_k1m8n2 "vbroadcastsd 8(%1),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"
  #define KERNEL_k1m8n4 KERNEL_h_k1m8n4 "addq $16,%1;"
  #define unit_kernel_k1m8n4(c1,c2,c3,c4,boff1,boff2,...) \
      "vbroadcastsd "#boff1"("#__VA_ARGS__"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,"#c1"; vfmadd231ps %%ymm2,%%ymm3,"#c2";" \
      "vbroadcastsd "#boff2"("#__VA_ARGS__"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,"#c3"; vfmadd231ps %%ymm2,%%ymm3,"#c4";"
  #define KERNEL_h_k1m8n8 KERNEL_h_k1m8n4 unit_kernel_k1m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,0,8,%1,%%r12,4)
  #define KERNEL_k1m8n8 KERNEL_h_k1m8n8 "addq $16,%1;"
  #define KERNEL_h_k1m8n12 KERNEL_h_k1m8n8 unit_kernel_k1m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,0,8,%1,%%r12,8)
  #define KERNEL_k1m8n12 KERNEL_h_k1m8n12 "addq $16,%1;"
  #define KERNEL_k2m8n1 KERNEL_k1m8n1 KERNEL_k1m8n1
  #define KERNEL_k2m8n2 KERNEL_k1m8n2 KERNEL_k1m8n2
  #define KERNEL_k2m8n4 KERNEL_k1m8n4 KERNEL_k1m8n4
  #define KERNEL_k2m8n8 KERNEL_k1m8n8 KERNEL_k1m8n8
  #define KERNEL_k2m8n12 \
      "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2;" \
      unit_kernel_k1m8n4(%%ymm4,%%ymm5,%%ymm6,%%ymm7,0,8,%1) \
      unit_kernel_k1m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,0,8,%1,%%r12,4) \
      unit_kernel_k1m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,0,8,%1,%%r12,8) \
      "vmovsldup 32(%0),%%ymm1; vmovshdup 32(%0),%%ymm2; prefetcht0 512(%0); addq $64,%0;" \
      unit_kernel_k1m8n4(%%ymm4,%%ymm5,%%ymm6,%%ymm7,16,24,%1) \
      unit_kernel_k1m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,16,24,%1,%%r12,4) \
      unit_kernel_k1m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,16,24,%1,%%r12,8) "addq $32,%1;"
  /*
   * kernel_kend_m8n<N>: extra accumulation emitted after the k loop of an 8-row
   * block. Only the right-side non-transposed TRMM build
   * (TRMMKERNEL && !LEFT && !TRANSA) emits code, and only for n8 and n12:
   *   - n8:  four k-steps that update only the second column group (ymm8-ymm11);
   *   - n12: four k-steps that update the second and third groups
   *          (ymm8-ymm15), then four more that update only the third group
   *          (ymm12-ymm15).
   * A is read at fixed byte offsets from %0 (32 bytes per step) and B at fixed
   * offsets from (%1,%r12,4) / (%1,%r12,8); neither pointer is advanced here.
   * In all other cases the macro is an empty string.
   */
  #if defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)
    #define unit_kernel_endn4_k1m8n8(offa1,offb1,offb2) \
      "vmovsldup "#offa1"(%0),%%ymm1; vmovshdup "#offa1"(%0),%%ymm2;" \
      unit_kernel_k1m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,offb1,offb2,%1,%%r12,4)
    #define unit_kernel_endn4_k1m8n12(offa1,offb1,offb2) \
      "vmovsldup "#offa1"(%0),%%ymm1; vmovshdup "#offa1"(%0),%%ymm2;" \
      unit_kernel_k1m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,offb1,offb2,%1,%%r12,8)
    #define unit_kernel_endn8_k1m8n12(offa1,offb1,offb2) unit_kernel_endn4_k1m8n8(offa1,offb1,offb2) \
      unit_kernel_k1m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,offb1,offb2,%1,%%r12,8)
    #define kernel_kend_m8n8 \
      unit_kernel_endn4_k1m8n8(0,0,8) unit_kernel_endn4_k1m8n8(32,16,24) \
      unit_kernel_endn4_k1m8n8(64,32,40) unit_kernel_endn4_k1m8n8(96,48,56)
    #define kernel_kend_m8n12 \
      unit_kernel_endn8_k1m8n12(0,0,8) unit_kernel_endn8_k1m8n12(32,16,24) \
      unit_kernel_endn8_k1m8n12(64,32,40) unit_kernel_endn8_k1m8n12(96,48,56) \
      unit_kernel_endn4_k1m8n12(128,64,72) unit_kernel_endn4_k1m8n12(160,80,88) \
      unit_kernel_endn4_k1m8n12(192,96,104) unit_kernel_endn4_k1m8n12(224,112,120)
  #else
    #define kernel_kend_m8n8 ""
    #define kernel_kend_m8n12 ""
  #endif
  #define kernel_kend_m8n4 ""
  #define kernel_kend_m8n2 ""
  #define kernel_kend_m8n1 ""
  /*
   * INIT_m8n<N>: zero the accumulators used by an 8-row, N-column block
   * (ymm4 for n1, ymm4-5 for n2, ymm4-7 for n4, ymm4-11 for n8, ymm4-15 for n12).
   *
   * SAVE_m8n<N>: apply alpha (ymm0) through mult_alpha and store the block to C.
   *   - n1 works directly on the address in %2.
   *   - n2 and wider copy %2 into %5 and call unit_save_m8n2 once per pair of
   *     columns. unit_save_m8n2(c1, c2) first un-shuffles the two accumulators
   *     with vunpcklps/vunpckhps + vunpcklpd/vunpckhpd so that c1 and c2 each
   *     hold one C column, stores them at (%5) and (%5,%3,1) (%3 = ldc in bytes),
   *     and then advances %5 by two columns.
   */
  #define INIT_m8n1 "vpxor %%ymm4,%%ymm4,%%ymm4;"
  #define INIT_m8n2 INIT_m8n1 "vpxor %%ymm5,%%ymm5,%%ymm5;"
  #define INIT_m8n4 INIT_m8n2 "vpxor %%ymm6,%%ymm6,%%ymm6;vpxor %%ymm7,%%ymm7,%%ymm7;"
  #define unit_init_m8n4(c1,c2,c3,c4) \
      "vpxor "#c1","#c1","#c1";vpxor "#c2","#c2","#c2";vpxor "#c3","#c3","#c3";vpxor "#c4","#c4","#c4";"
  #define INIT_m8n8 INIT_m8n4 unit_init_m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11)
  #define INIT_m8n12 INIT_m8n8 unit_init_m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15)
  #define SAVE_m8n1 mult_alpha(%%ymm4,%%ymm0,%2) "vmovups %%ymm4,(%2);"
  #define unit_save_m8n2(c1,c2) \
      "vunpcklps "#c2","#c1",%%ymm2; vunpckhps "#c2","#c1",%%ymm3; vunpcklpd %%ymm3,%%ymm2,"#c1"; vunpckhpd %%ymm3,%%ymm2,"#c2";" \
      mult_alpha(c1,%%ymm0,%5) "vmovups "#c1",(%5);" \
      mult_alpha(c2,%%ymm0,%5,%3,1) "vmovups "#c2",(%5,%3,1);" \
      "leaq (%5,%3,2),%5;"
  #define SAVE_m8n2 "movq %2,%5;" unit_save_m8n2(%%ymm4,%%ymm5)
  #define SAVE_m8n4 SAVE_m8n2 unit_save_m8n2(%%ymm6,%%ymm7)
  #define SAVE_m8n8 SAVE_m8n4 unit_save_m8n2(%%ymm8,%%ymm9) unit_save_m8n2(%%ymm10,%%ymm11)
  #define SAVE_m8n12 SAVE_m8n8 unit_save_m8n2(%%ymm12,%%ymm13) unit_save_m8n2(%%ymm14,%%ymm15)
  /*
   * COMPUTE_m8(ndim): assembly text for one 8-row by ndim-column block of C.
   *
   * Sequence:
   *   1. optional r13 update, zero the accumulators, load the k counter %4,
   *      reset the B pointer %1 to r14, optional pointer skip, copy %2 to %5
   *      and clear r15, optional kernel_kstart steps;
   *   2. label <ndim>881: unrolled loop, entered while %4 >= 64. Each pass runs
   *      four KERNEL_k2 fragments (8 k-steps) and subtracts 32 from %4. It also
   *      issues prefetcht1 on (%5) and on (%7) (the b_pref operand, advanced by
   *      16 bytes per pass). r15 alternates between 62 and %3 on successive
   *      passes, so %5 moves by -31 + r15 each time. %5 is reset to %2 on exit;
   *   3. label <ndim>882: remainder loop, one KERNEL_k1 step per pass
   *      (%4 -= 4) with prefetcht0 on (%5), 31(%5), (%5,%3,4), 31(%5,%3,4)
   *      and %5 advanced by %3 per pass;
   *   4. label <ndim>883: optional kernel_kend steps, prefetcht0 of the start
   *      of B (r14), optional pointer skip, SAVE, then %2 += 32 bytes (8 floats).
   *
   * The numeric local labels are prefixed with the stringified ndim so that the
   * different panel widths get distinct label numbers.
   */
  #define COMPUTE_m8(ndim) \
      init_update_kskip(32) INIT_m8n##ndim \
      init_set_k "movq %%r14,%1;" init_set_pa_pb_n##ndim(8) "movq %2,%5; movq $0,%%r15;" \
      kernel_kstart_n##ndim(8) \
      "cmpq $64,%4; jb "#ndim"882f;" \
      #ndim"881:\n\t" \
      "cmpq $62,%%r15; movq $62,%%r15; cmoveq %3,%%r15;" \
      KERNEL_k2m8n##ndim KERNEL_k2m8n##ndim \
      "prefetcht1 (%5); subq $31,%5;" \
      KERNEL_k2m8n##ndim KERNEL_k2m8n##ndim \
      "addq %%r15,%5; prefetcht1 (%7); addq $16,%7;" \
      "subq $32,%4; cmpq $64,%4; jnb "#ndim"881b;" \
      "movq %2,%5;" \
      #ndim"882:\n\t" \
      "testq %4,%4; jz "#ndim"883f;" \
      "prefetcht0 (%5); prefetcht0 31(%5);" \
      KERNEL_k1m8n##ndim \
      "prefetcht0 (%5,%3,4); prefetcht0 31(%5,%3,4); addq %3,%5;" \
      "subq $4,%4; jmp "#ndim"882b;" \
      #ndim"883:\n\t" \
      kernel_kend_m8n##ndim "prefetcht0 (%%r14); prefetcht0 64(%%r14);" \
      save_set_pa_pb_n##ndim(8) SAVE_m8n##ndim "addq $32,%2;" save_update_kskip(32)
  /* m = 4 *//* xmm0 for alpha, xmm1-xmm3 for temporary use, xmm4-xmm15 for accumulators */
  /*
   * One-k-step multiply-accumulate fragments for a 4-row block and
   * N = 1, 2, 4, 8, 12 columns. Same structure as the 8-row fragments, but on
   * 128-bit xmm registers:
   *   - n1: loads 4 floats of A (%0), broadcasts one float of B (%1), FMA into xmm4;
   *   - n2..n12: A is loaded with even (vmovsldup -> xmm1) and odd
   *     (vmovshdup -> xmm2) elements duplicated, and vmovddup replicates one pair
   *     of B floats per accumulator pair;
   *   - "_h_" variants advance A (16 bytes) but not B; the plain variants append
   *     the B increment;
   *   - n8 / n12 read their extra column groups from (%1,%r12,4) and (%1,%r12,8).
   */
  #define KERNEL_k1m4n1 \
      "vmovups (%0),%%xmm1; addq $16,%0;" \
      "vbroadcastss (%1),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;" \
      "addq $4,%1;"
  #define KERNEL_h_k1m4n2 \
      "vmovsldup (%0),%%xmm1; vmovshdup (%0),%%xmm2; addq $16,%0;" \
      "vmovddup (%1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm4; vfmadd231ps %%xmm2,%%xmm3,%%xmm5;"
  #define KERNEL_k1m4n2 KERNEL_h_k1m4n2 "addq $8,%1;"
  #define KERNEL_h_k1m4n4 \
      KERNEL_h_k1m4n2 "vmovddup 8(%1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm6; vfmadd231ps %%xmm2,%%xmm3,%%xmm7;"
  #define KERNEL_k1m4n4 KERNEL_h_k1m4n4 "addq $16,%1;"
  #define unit_kernel_k1m4n4(c1,c2,c3,c4,offb1,offb2,...) \
      "vmovddup "#offb1"("#__VA_ARGS__"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,"#c1"; vfmadd231ps %%xmm2,%%xmm3,"#c2";" \
      "vmovddup "#offb2"("#__VA_ARGS__"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,"#c3"; vfmadd231ps %%xmm2,%%xmm3,"#c4";"
  #define KERNEL_h_k1m4n8 KERNEL_h_k1m4n4 unit_kernel_k1m4n4(%%xmm8,%%xmm9,%%xmm10,%%xmm11,0,8,%1,%%r12,4)
  #define KERNEL_k1m4n8 KERNEL_h_k1m4n8 "addq $16,%1;"
  #define KERNEL_h_k1m4n12 KERNEL_h_k1m4n8 unit_kernel_k1m4n4(%%xmm12,%%xmm13,%%xmm14,%%xmm15,0,8,%1,%%r12,8)
  #define KERNEL_k1m4n12 KERNEL_h_k1m4n12 "addq $16,%1;"
  /*
   * kernel_kend_m4n<N>: extra accumulation emitted after the k loop of a 4-row
   * block. Only the right-side non-transposed TRMM build
   * (TRMMKERNEL && !LEFT && !TRANSA) emits code, and only for n8 and n12:
   *   - n8:  four k-steps that update only xmm8-xmm11;
   *   - n12: four k-steps that update xmm8-xmm15, then four more that update
   *          only xmm12-xmm15.
   * A is read at fixed byte offsets from %0 (16 bytes per step); no pointer is
   * advanced. In all other cases the macro is an empty string.
   */
  #if defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)
    #define unit_kernel_endn4_k1m4n8(offa1,offb1,offb2) \
      "vmovsldup "#offa1"(%0),%%xmm1; vmovshdup "#offa1"(%0),%%xmm2;" \
      unit_kernel_k1m4n4(%%xmm8,%%xmm9,%%xmm10,%%xmm11,offb1,offb2,%1,%%r12,4)
    #define unit_kernel_endn4_k1m4n12(offa1,offb1,offb2) \
      "vmovsldup "#offa1"(%0),%%xmm1; vmovshdup "#offa1"(%0),%%xmm2;" \
      unit_kernel_k1m4n4(%%xmm12,%%xmm13,%%xmm14,%%xmm15,offb1,offb2,%1,%%r12,8)
    #define unit_kernel_endn8_k1m4n12(offa1,offb1,offb2) unit_kernel_endn4_k1m4n8(offa1,offb1,offb2) \
      unit_kernel_k1m4n4(%%xmm12,%%xmm13,%%xmm14,%%xmm15,offb1,offb2,%1,%%r12,8)
    #define kernel_kend_m4n8 \
      unit_kernel_endn4_k1m4n8(0,0,8) unit_kernel_endn4_k1m4n8(16,16,24) \
      unit_kernel_endn4_k1m4n8(32,32,40) unit_kernel_endn4_k1m4n8(48,48,56)
    #define kernel_kend_m4n12 \
      unit_kernel_endn8_k1m4n12(0,0,8) unit_kernel_endn8_k1m4n12(16,16,24) \
      unit_kernel_endn8_k1m4n12(32,32,40) unit_kernel_endn8_k1m4n12(48,48,56) \
      unit_kernel_endn4_k1m4n12(64,64,72) unit_kernel_endn4_k1m4n12(80,80,88) \
      unit_kernel_endn4_k1m4n12(96,96,104) unit_kernel_endn4_k1m4n12(112,112,120)
  #else
    #define kernel_kend_m4n8 ""
    #define kernel_kend_m4n12 ""
  #endif
  #define kernel_kend_m4n4 ""
  #define kernel_kend_m4n2 ""
  #define kernel_kend_m4n1 ""
  /*
   * INIT_m4n<N>: zero the accumulators used by a 4-row, N-column block
   * (xmm4 for n1, xmm4-5 for n2, xmm4-7 for n4, xmm4-11 for n8, xmm4-15 for n12).
   *
   * SAVE_m4n<N>: apply alpha (xmm0) through mult_alpha and store the block to C.
   *   - n1 works directly on the address in %2.
   *   - n2 and wider copy %2 into %5 and call unit_save_m4n2 once per pair of
   *     columns: the two accumulators are un-shuffled with vunpcklps/vunpckhps +
   *     vunpcklpd/vunpckhpd so that each holds one C column, stored at (%5) and
   *     (%5,%3,1) (%3 = ldc in bytes), and %5 is advanced by two columns.
   */
  #define INIT_m4n1 "vpxor %%xmm4,%%xmm4,%%xmm4;"
  #define INIT_m4n2 INIT_m4n1 "vpxor %%xmm5,%%xmm5,%%xmm5;"
  #define INIT_m4n4 INIT_m4n2 "vpxor %%xmm6,%%xmm6,%%xmm6;vpxor %%xmm7,%%xmm7,%%xmm7;"
  #define unit_init_m4n4(c1,c2,c3,c4) \
      "vpxor "#c1","#c1","#c1";vpxor "#c2","#c2","#c2";vpxor "#c3","#c3","#c3";vpxor "#c4","#c4","#c4";"
  #define INIT_m4n8 INIT_m4n4 unit_init_m4n4(%%xmm8,%%xmm9,%%xmm10,%%xmm11)
  #define INIT_m4n12 INIT_m4n8 unit_init_m4n4(%%xmm12,%%xmm13,%%xmm14,%%xmm15)
  #define SAVE_m4n1 \
      mult_alpha(%%xmm4,%%xmm0,%2) "vmovups %%xmm4,(%2);"
  #define unit_save_m4n2(c1,c2) \
      "vunpcklps "#c2","#c1",%%xmm2; vunpckhps "#c2","#c1",%%xmm3; vunpcklpd %%xmm3,%%xmm2,"#c1"; vunpckhpd %%xmm3,%%xmm2,"#c2";" \
      mult_alpha(c1,%%xmm0,%5) "vmovups "#c1",(%5);" \
      mult_alpha(c2,%%xmm0,%5,%3,1) "vmovups "#c2",(%5,%3,1);" \
      "leaq (%5,%3,2),%5;"
  #define SAVE_m4n2 "movq %2,%5;" unit_save_m4n2(%%xmm4,%%xmm5)
  #define SAVE_m4n4 SAVE_m4n2 unit_save_m4n2(%%xmm6,%%xmm7)
  #define SAVE_m4n8 SAVE_m4n4 unit_save_m4n2(%%xmm8,%%xmm9) unit_save_m4n2(%%xmm10,%%xmm11)
  #define SAVE_m4n12 SAVE_m4n8 unit_save_m4n2(%%xmm12,%%xmm13) unit_save_m4n2(%%xmm14,%%xmm15)
  /*
   * COMPUTE_m4(ndim): assembly text for one 4-row by ndim-column block of C.
   *
   * Optional r13 update, zero the accumulators, load the k counter %4, reset
   * the B pointer %1 to r14, optional pointer skip and kernel_kstart steps;
   * then a simple loop (label <ndim>442) that runs one KERNEL_k1m4 step and
   * subtracts 4 from %4 until it reaches zero; finally (label <ndim>443)
   * optional kernel_kend steps, optional pointer skip, SAVE, and %2 += 16
   * bytes (4 floats). Unlike COMPUTE_m8 there is no unrolling or prefetching.
   */
  #define COMPUTE_m4(ndim) \
      init_update_kskip(16) INIT_m4n##ndim \
      init_set_k "movq %%r14,%1;" init_set_pa_pb_n##ndim(4) \
      kernel_kstart_n##ndim(4) \
      #ndim"442:\n\t" \
      "testq %4,%4; jz "#ndim"443f;" \
      KERNEL_k1m4n##ndim \
      "subq $4,%4; jmp "#ndim"442b;" \
      #ndim"443:\n\t" \
      kernel_kend_m4n##ndim save_set_pa_pb_n##ndim(4) SAVE_m4n##ndim "addq $16,%2;" save_update_kskip(16)
  /* m = 2 *//* xmm0 for alpha, xmm1-xmm3 and xmm10 for temporary use, xmm4-xmm9 for accumulators */
  /*
   * Fragments for a 2-row block.
   *   - n1 / n2: two floats of A are loaded with vmovsd, each B float is
   *     broadcast, and the products accumulate in xmm4 (column 0) and xmm5
   *     (column 1). SAVE_m2n1 / SAVE_m2n2 store two floats per column with
   *     vmovsd at (%2) and (%2,%3,1); the non-TRMM build first loads the
   *     existing C values and uses vfmadd213ps to add alpha * acc to them, the
   *     TRMM build only multiplies by alpha.
   *   - n4 / n8 / n12: the roles are swapped - four B floats are loaded with
   *     vmovups (further groups from (%1,%r12,4) and (%1,%r12,8)) and each of
   *     the two A floats is broadcast. Accumulator pairs are xmm4/5, xmm6/7 and
   *     xmm8/9 for the first, second and third column group.
   * Every KERNEL_k1m2* step advances A by 8 bytes.
   */
  #define INIT_m2n1 "vpxor %%xmm4,%%xmm4,%%xmm4;"
  #define KERNEL_k1m2n1 \
      "vmovsd (%0),%%xmm1; addq $8,%0;" \
      "vbroadcastss (%1),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;" \
      "addq $4,%1;"
  #ifdef TRMMKERNEL
    #define SAVE_m2n1 "vmulps %%xmm4,%%xmm0,%%xmm4; vmovsd %%xmm4,(%2);"
  #else
    #define SAVE_m2n1 "vmovsd (%2),%%xmm1; vfmadd213ps %%xmm1,%%xmm0,%%xmm4; vmovsd %%xmm4,(%2);"
  #endif
  #define INIT_m2n2 INIT_m2n1 "vpxor %%xmm5,%%xmm5,%%xmm5;"
  #define KERNEL_k1m2n2 \
      "vmovsd (%0),%%xmm1; addq $8,%0;" \
      "vbroadcastss (%1),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;" \
      "vbroadcastss 4(%1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm5;" \
      "addq $8,%1;"
  #ifdef TRMMKERNEL
    #define SAVE_m2n2 SAVE_m2n1 "vmulps %%xmm5,%%xmm0,%%xmm5; vmovsd %%xmm5,(%2,%3,1);"
  #else
    #define SAVE_m2n2 SAVE_m2n1 "vmovsd (%2,%3,1),%%xmm1; vfmadd213ps %%xmm1,%%xmm0,%%xmm5; vmovsd %%xmm5,(%2,%3,1);"
  #endif
  #define INIT_m2n4 INIT_m2n2
  #define INIT_m2n8 INIT_m2n4 "vpxor %%xmm6,%%xmm6,%%xmm6; vpxor %%xmm7,%%xmm7,%%xmm7;"
  #define INIT_m2n12 INIT_m2n8 "vpxor %%xmm8,%%xmm8,%%xmm8; vpxor %%xmm9,%%xmm9,%%xmm9;"
  #define KERNEL_k1m2n4 \
      "vmovups (%1),%%xmm3; addq $16,%1;" \
      "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;" \
      "vbroadcastss 4(%0),%%xmm2; vfmadd231ps %%xmm3,%%xmm2,%%xmm5;" \
      "addq $8,%0;"
  #define KERNEL_k1m2n8 \
      "vmovups (%1),%%xmm3; vmovups (%1,%%r12,4),%%xmm2; addq $16,%1;" \
      "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4; vfmadd231ps %%xmm2,%%xmm1,%%xmm6;" \
      "vbroadcastss 4(%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm5; vfmadd231ps %%xmm2,%%xmm1,%%xmm7;" \
      "addq $8,%0;"
  #define KERNEL_k1m2n12 \
      "vmovups (%1),%%xmm3; vmovups (%1,%%r12,4),%%xmm2; vmovups (%1,%%r12,8),%%xmm1; addq $16,%1;" \
      "vbroadcastss (%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm4; vfmadd231ps %%xmm2,%%xmm10,%%xmm6; vfmadd231ps %%xmm1,%%xmm10,%%xmm8;" \
      "vbroadcastss 4(%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm5; vfmadd231ps %%xmm2,%%xmm10,%%xmm7; vfmadd231ps %%xmm1,%%xmm10,%%xmm9;" \
      "addq $8,%0;"
  /*
   * kernel_kend_m2n<N>: extra accumulation emitted after the k loop of a 2-row
   * block. Only the right-side non-transposed TRMM build
   * (TRMMKERNEL && !LEFT && !TRANSA) emits code, and only for n8 and n12:
   *   - n8:  four k-steps that update only xmm6/xmm7;
   *   - n12: four k-steps that update xmm6-xmm9, then four more that update
   *          only xmm8/xmm9.
   * A is read at fixed byte offsets from %0 (8 bytes per step) and B at fixed
   * offsets from (%1,%r12,4) / (%1,%r12,8); no pointer is advanced.
   * In all other cases the macro is an empty string.
   */
  #if defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)
    #define unit_kernel_endn4_k1m2n8(aoff1,aoff2,boff) \
      "vmovups "#boff"(%1,%%r12,4),%%xmm3;" \
      "vbroadcastss "#aoff1"(%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm6;" \
      "vbroadcastss "#aoff2"(%0),%%xmm2; vfmadd231ps %%xmm3,%%xmm2,%%xmm7;"
    #define unit_kernel_endn4_k1m2n12(aoff1,aoff2,boff) \
      "vmovups "#boff"(%1,%%r12,8),%%xmm3;" \
      "vbroadcastss "#aoff1"(%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm8;" \
      "vbroadcastss "#aoff2"(%0),%%xmm2; vfmadd231ps %%xmm3,%%xmm2,%%xmm9;"
    #define unit_kernel_endn8_k1m2n12(aoff1,aoff2,boff) \
      "vmovups "#boff"(%1,%%r12,4),%%xmm3; vmovups "#boff"(%1,%%r12,8),%%xmm2;" \
      "vbroadcastss "#aoff1"(%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm6; vfmadd231ps %%xmm2,%%xmm1,%%xmm8;" \
      "vbroadcastss "#aoff2"(%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm7; vfmadd231ps %%xmm2,%%xmm1,%%xmm9;"
    #define kernel_kend_m2n8 \
      unit_kernel_endn4_k1m2n8(0,4,0) unit_kernel_endn4_k1m2n8(8,12,16) \
      unit_kernel_endn4_k1m2n8(16,20,32) unit_kernel_endn4_k1m2n8(24,28,48)
    #define kernel_kend_m2n12 \
      unit_kernel_endn8_k1m2n12(0,4,0) unit_kernel_endn8_k1m2n12(8,12,16) \
      unit_kernel_endn8_k1m2n12(16,20,32) unit_kernel_endn8_k1m2n12(24,28,48) \
      unit_kernel_endn4_k1m2n12(32,36,64) unit_kernel_endn4_k1m2n12(40,44,80) \
      unit_kernel_endn4_k1m2n12(48,52,96) unit_kernel_endn4_k1m2n12(56,60,112)
  #else
    #define kernel_kend_m2n8 ""
    #define kernel_kend_m2n12 ""
  #endif
  #define kernel_kend_m2n4 ""
  #define kernel_kend_m2n2 ""
  #define kernel_kend_m2n1 ""
  /*
   * Store macros for a 2-row block with 4, 8 or 12 columns.
   *
   * unit_save_m2n4(c1, c2) writes one group of four columns. c1 and c2 are the
   * two accumulators of the group (one per A element); vunpcklps / vunpckhps
   * interleave them so that each 64-bit half of xmm1 / xmm2 holds the two rows
   * of one column. The halves are stored with vmovsd (low) and vmovhpd (high)
   * at (%5) and (%5,%3,1), and %5 advances by two columns after each pair.
   *   - TRMMKERNEL:  the values are multiplied by alpha (xmm0) and stored.
   *   - otherwise:   the existing C values are loaded into xmm3 first and
   *                  vfmadd213ps adds alpha * acc to them before the store.
   *
   * SAVE_m2n4 / n8 / n12 copy %2 to %5 and chain one unit per column group
   * (xmm4/5, xmm6/7, xmm8/9).
   */
  #ifdef TRMMKERNEL
    #define unit_save_m2n4(c1,c2) \
      "vunpcklps "#c2","#c1",%%xmm1; vunpckhps "#c2","#c1",%%xmm2;" \
      "vmulps %%xmm1,%%xmm0,%%xmm1; vmovsd %%xmm1,(%5); vmovhpd %%xmm1,(%5,%3,1); leaq (%5,%3,2),%5;" \
      "vmulps %%xmm2,%%xmm0,%%xmm2; vmovsd %%xmm2,(%5); vmovhpd %%xmm2,(%5,%3,1); leaq (%5,%3,2),%5;"
  #else
    #define unit_save_m2n4(c1,c2) \
      "vunpcklps "#c2","#c1",%%xmm1; vunpckhps "#c2","#c1",%%xmm2;" \
      "vmovsd (%5),%%xmm3; vmovhpd (%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm1;" \
      "vmovsd %%xmm1,(%5); vmovhpd %%xmm1,(%5,%3,1); leaq (%5,%3,2),%5;" \
      "vmovsd (%5),%%xmm3; vmovhpd (%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm2;" \
      "vmovsd %%xmm2,(%5); vmovhpd %%xmm2,(%5,%3,1); leaq (%5,%3,2),%5;"
  #endif
  #define SAVE_m2n4 "movq %2,%5;" unit_save_m2n4(%%xmm4,%%xmm5)
  #define SAVE_m2n8 SAVE_m2n4 unit_save_m2n4(%%xmm6,%%xmm7)
  #define SAVE_m2n12 SAVE_m2n8 unit_save_m2n4(%%xmm8,%%xmm9)
  /*
   * COMPUTE_m2(ndim): assembly text for one 2-row by ndim-column block of C.
   *
   * Optional r13 update, zero the accumulators, load the k counter %4, reset
   * the B pointer %1 to r14, optional pointer skip and kernel_kstart steps;
   * then a loop (label <ndim>222) that runs one KERNEL_k1m2 step and subtracts
   * 4 from %4 until it reaches zero; finally (label <ndim>223) optional
   * kernel_kend steps, optional pointer skip, SAVE, and %2 += 8 bytes (2 floats).
   */
  #define COMPUTE_m2(ndim) \
      init_update_kskip(8) INIT_m2n##ndim \
      init_set_k "movq %%r14,%1;" init_set_pa_pb_n##ndim(2) \
      kernel_kstart_n##ndim(2) \
      #ndim"222:\n\t" \
      "testq %4,%4; jz "#ndim"223f;" \
      KERNEL_k1m2n##ndim \
      "subq $4,%4; jmp "#ndim"222b;" \
      #ndim"223:\n\t" \
      kernel_kend_m2n##ndim save_set_pa_pb_n##ndim(2) SAVE_m2n##ndim "addq $8,%2;" save_update_kskip(8)
  /* m = 1 *//* xmm0 for alpha, xmm1-xmm3 and xmm10 for temporary use, xmm4-xmm6 for accumulators */
  /*
   * Fragments for a single-row block.
   *   - n1: scalar path - one float of B and one of A, vfmadd231ss into xmm4;
   *     SAVE_m1n1 stores one float at (%2).
   *   - n2: two B floats are loaded with vmovsd, the A float is broadcast and
   *     both products live in lanes 0 and 1 of xmm4; SAVE_m1n2 writes lane 0 to
   *     (%2) and lane 1 to (%2,%3,1).
   *   - n4 / n8 / n12: four B floats per group are loaded with vmovups (further
   *     groups from (%1,%r12,4) and (%1,%r12,8)); one accumulator per group:
   *     xmm4, xmm5, xmm6.
   * In the non-TRMM build the SAVE macros add alpha * acc to the existing C
   * values (vfmadd213ss / vfmadd213ps); in the TRMM build they only multiply
   * by alpha. Every KERNEL_k1m1* step advances A by 4 bytes.
   */
  #define INIT_m1n1 "vpxor %%xmm4,%%xmm4,%%xmm4;"
  #define KERNEL_k1m1n1 \
      "vmovss (%1),%%xmm3; addq $4,%1;" \
      "vmovss (%0),%%xmm1; vfmadd231ss %%xmm3,%%xmm1,%%xmm4;" \
      "addq $4,%0;"
  #ifdef TRMMKERNEL
    #define SAVE_m1n1 "vmulss %%xmm4,%%xmm0,%%xmm4; vmovss %%xmm4,(%2);"
  #else
    #define SAVE_m1n1 "vfmadd213ss (%2),%%xmm0,%%xmm4; vmovss %%xmm4,(%2);"
  #endif
  #define INIT_m1n2 INIT_m1n1
  #define KERNEL_k1m1n2 \
      "vmovsd (%1),%%xmm3; addq $8,%1;" \
      "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;" \
      "addq $4,%0;"
  #ifdef TRMMKERNEL
    #define SAVE_m1n2 \
      "vmulps %%xmm4,%%xmm0,%%xmm4; vmovss %%xmm4,(%2); vextractps $1,%%xmm4,(%2,%3,1);"
  #else
    #define SAVE_m1n2 \
      "vmovss (%2),%%xmm3; vinsertps $16,(%2,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm4;" \
      "vmovss %%xmm4,(%2); vextractps $1,%%xmm4,(%2,%3,1);"
  #endif
  #define INIT_m1n4 INIT_m1n2
  #define INIT_m1n8 INIT_m1n4 "vpxor %%xmm5,%%xmm5,%%xmm5;"
  #define INIT_m1n12 INIT_m1n8 "vpxor %%xmm6,%%xmm6,%%xmm6;"
  #define KERNEL_k1m1n4 \
      "vmovups (%1),%%xmm3; addq $16,%1;" \
      "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;" \
      "addq $4,%0;"
  #define KERNEL_k1m1n8 \
      "vmovups (%1),%%xmm3; vmovups (%1,%%r12,4),%%xmm2; addq $16,%1;" \
      "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4; vfmadd231ps %%xmm2,%%xmm1,%%xmm5;" \
      "addq $4,%0;"
  #define KERNEL_k1m1n12 \
      "vmovups (%1),%%xmm3; vmovups (%1,%%r12,4),%%xmm2; vmovups (%1,%%r12,8),%%xmm1; addq $16,%1;" \
      "vbroadcastss (%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm4; vfmadd231ps %%xmm2,%%xmm10,%%xmm5; vfmadd231ps %%xmm1,%%xmm10,%%xmm6;" \
      "addq $4,%0;"
  /*
   * kernel_kend_m1n<N>: extra accumulation emitted after the k loop of a
   * single-row block. Only the right-side non-transposed TRMM build
   * (TRMMKERNEL && !LEFT && !TRANSA) emits code, and only for n8 and n12:
   *   - n8:  four k-steps that update only xmm5;
   *   - n12: four k-steps that update xmm5 and xmm6, then four more that update
   *          only xmm6.
   * A is read at fixed byte offsets from %0 (4 bytes per step) and B at fixed
   * offsets from (%1,%r12,4) / (%1,%r12,8); no pointer is advanced.
   * In all other cases the macro is an empty string.
   */
  #if defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)
    #define unit_kernel_endn4_k1m1n8(aoff,boff) \
      "vmovups "#boff"(%1,%%r12,4),%%xmm3;" \
      "vbroadcastss "#aoff"(%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm5;"
    #define unit_kernel_endn4_k1m1n12(aoff,boff) \
      "vmovups "#boff"(%1,%%r12,8),%%xmm3;" \
      "vbroadcastss "#aoff"(%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm6;"
    #define unit_kernel_endn8_k1m1n12(aoff,boff) \
      "vmovups "#boff"(%1,%%r12,4),%%xmm3; vmovups "#boff"(%1,%%r12,8),%%xmm2;" \
      "vbroadcastss "#aoff"(%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm5; vfmadd231ps %%xmm2,%%xmm1,%%xmm6;"
    #define kernel_kend_m1n8 \
      unit_kernel_endn4_k1m1n8(0,0) unit_kernel_endn4_k1m1n8(4,16) \
      unit_kernel_endn4_k1m1n8(8,32) unit_kernel_endn4_k1m1n8(12,48)
    #define kernel_kend_m1n12 \
      unit_kernel_endn8_k1m1n12(0,0) unit_kernel_endn8_k1m1n12(4,16) \
      unit_kernel_endn8_k1m1n12(8,32) unit_kernel_endn8_k1m1n12(12,48) \
      unit_kernel_endn4_k1m1n12(16,64) unit_kernel_endn4_k1m1n12(20,80) \
      unit_kernel_endn4_k1m1n12(24,96) unit_kernel_endn4_k1m1n12(28,112)
  #else
    #define kernel_kend_m1n8 ""
    #define kernel_kend_m1n12 ""
  #endif
  #define kernel_kend_m1n4 ""
  #define kernel_kend_m1n2 ""
  #define kernel_kend_m1n1 ""
  /*
   * Store macros for a single-row block with 4, 8 or 12 columns.
   *
   * unit_save_m1n4(c1) writes the four lanes of accumulator c1 to four
   * consecutive C columns. With xmm10 zeroed, vmovsd puts the low two lanes of
   * c1 into xmm2 and vmovhlps puts the high two lanes into xmm1. For each pair,
   * lane 0 is stored with vmovss at (%5) and lane 1 with vextractps at
   * (%5,%3,1), then %5 advances by two columns.
   *   - TRMMKERNEL:  the values are multiplied by alpha (xmm0) and stored.
   *   - otherwise:   the two existing C values are gathered into xmm3
   *                  (vmovss + vinsertps) and vfmadd213ps adds alpha * acc.
   *
   * SAVE_m1n4 / n8 / n12 copy %2 to %5 and chain one unit per column group
   * (xmm4, xmm5, xmm6).
   */
  #ifdef TRMMKERNEL
    #define unit_save_m1n4(c1) \
      "vpxor %%xmm10,%%xmm10,%%xmm10; vmovsd "#c1",%%xmm10,%%xmm2; vmovhlps "#c1",%%xmm10,%%xmm1;" \
      "vmulps %%xmm2,%%xmm0,%%xmm2; vmovss %%xmm2,(%5); vextractps $1,%%xmm2,(%5,%3,1); leaq (%5,%3,2),%5;" \
      "vmulps %%xmm1,%%xmm0,%%xmm1; vmovss %%xmm1,(%5); vextractps $1,%%xmm1,(%5,%3,1); leaq (%5,%3,2),%5;"
  #else
    #define unit_save_m1n4(c1) \
      "vpxor %%xmm10,%%xmm10,%%xmm10; vmovsd "#c1",%%xmm10,%%xmm2; vmovhlps "#c1",%%xmm10,%%xmm1;" \
      "vmovss (%5),%%xmm3; vinsertps $16,(%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm2;" \
      "vmovss %%xmm2,(%5); vextractps $1,%%xmm2,(%5,%3,1); leaq (%5,%3,2),%5;" \
      "vmovss (%5),%%xmm3; vinsertps $16,(%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm1;" \
      "vmovss %%xmm1,(%5); vextractps $1,%%xmm1,(%5,%3,1); leaq (%5,%3,2),%5;"
  #endif
  #define SAVE_m1n4 "movq %2,%5;" unit_save_m1n4(%%xmm4)
  #define SAVE_m1n8 SAVE_m1n4 unit_save_m1n4(%%xmm5)
  #define SAVE_m1n12 SAVE_m1n8 unit_save_m1n4(%%xmm6)
  /*
   * COMPUTE_m1(ndim): assembly text for one single-row by ndim-column block of C.
   *
   * Optional r13 update, zero the accumulators, load the k counter %4, reset
   * the B pointer %1 to r14, optional pointer skip and kernel_kstart steps;
   * then a loop (label <ndim>112) that runs one KERNEL_k1m1 step and subtracts
   * 4 from %4 until it reaches zero; finally (label <ndim>113) optional
   * kernel_kend steps, optional pointer skip, SAVE, and %2 += 4 bytes (1 float).
   */
  #define COMPUTE_m1(ndim) \
      init_update_kskip(4) INIT_m1n##ndim \
      init_set_k "movq %%r14,%1;" init_set_pa_pb_n##ndim(1) \
      kernel_kstart_n##ndim(1) \
      #ndim"112:\n\t" \
      "testq %4,%4; jz "#ndim"113f;" \
      KERNEL_k1m1n##ndim \
      "subq $4,%4; jmp "#ndim"112b;" \
      #ndim"113:\n\t" \
      kernel_kend_m1n##ndim save_set_pa_pb_n##ndim(1) SAVE_m1n##ndim "addq $4,%2;" save_update_kskip(4)
  /*
   * COMPUTE(ndim): process one panel of ndim columns for all M rows.
   *
   * Expands to a braced C block that expects these locals in scope:
   * a_pointer, b_pointer, c_pointer, ldc_in_bytes, K, ctemp, const_val, next_b,
   * M, off and LDC.
   *
   *   1. HEAD_SET_OFFSET(ndim); next_b = b_pointer + ndim * K (passed as the
   *      prefetch operand %7).
   *   2. One asm statement:
   *        - broadcast alpha (*%6) into ymm0;
   *        - r12 = K << 2, r14 = b_pointer, r11 = M (%8), r13 via INIT_SET_KSKIP;
   *        - repeat COMPUTE_m8 while r11 >= 8, then at most one COMPUTE_m4,
   *          one COMPUTE_m2 and one COMPUTE_m1 for the remaining rows;
   *        - restore %4 = r12 >> 2 and %1 = r14, then vzeroupper.
   *      Clobbers: r11-r15, xmm0-xmm15, condition codes and memory.
   *   3. TAIL_SET_OFFSET(ndim); a_pointer is rewound by M * K, b_pointer is
   *      advanced by ndim * K and c_pointer by LDC * ndim - M elements.
   *
   * The local labels 33101/33103/33104/33105/33109 have the stringified ndim
   * appended, so each panel width uses its own label numbers.
   */
  #define COMPUTE(ndim) {\
      HEAD_SET_OFFSET(ndim) next_b = b_pointer + ndim * K;\
      __asm__ __volatile__(\
      "vbroadcastss (%6),%%ymm0;"\
      "movq %4,%%r12; salq $2,%%r12; movq %1,%%r14; movq %8,%%r11;" INIT_SET_KSKIP\
      "cmpq $8,%%r11;jb 33101"#ndim"f;"\
      "33109"#ndim":\n\t"\
      COMPUTE_m8(ndim)\
      "subq $8,%%r11;cmpq $8,%%r11;jnb 33109"#ndim"b;"\
      "33101"#ndim":\n\t"\
      "cmpq $4,%%r11;jb 33103"#ndim"f;"\
      COMPUTE_m4(ndim)\
      "subq $4,%%r11;"\
      "33103"#ndim":\n\t"\
      "cmpq $2,%%r11;jb 33104"#ndim"f;"\
      COMPUTE_m2(ndim)\
      "subq $2,%%r11;"\
      "33104"#ndim":\n\t"\
      "testq %%r11,%%r11;jz 33105"#ndim"f;"\
      COMPUTE_m1(ndim)\
      "33105"#ndim":\n\t"\
      "movq %%r12,%4; sarq $2,%4; movq %%r14,%1; vzeroupper;"\
      :"+r"(a_pointer),"+r"(b_pointer),"+r"(c_pointer),"+r"(ldc_in_bytes),"+r"(K),"+r"(ctemp),"+r"(const_val),"+r"(next_b)\
      :"m"(M),"m"(off):"r11","r12","r13","r14","r15",\
      "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15","cc","memory");\
      TAIL_SET_OFFSET(ndim) a_pointer -= M * K; b_pointer += ndim * K; c_pointer += (LDC * ndim - M);\
  }
  450. #include "common.h"
  451. #include <stdint.h>
  452. int __attribute__ ((noinline))
  453. CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, BLASLONG LDC
  454. #ifdef TRMMKERNEL
  455. ,BLASLONG offset
  456. #endif
  457. ){
  458. if(m==0||n==0) return 0;
  459. int64_t ldc_in_bytes = (int64_t)LDC * sizeof(float);
  460. float constval = alpha;
  461. float *const_val=&constval;
  462. int64_t M = (int64_t)m, K = (int64_t)k, off = 0;
  463. #ifdef TRMMKERNEL
  464. #ifdef LEFT
  465. off = offset;
  466. #else
  467. off = -offset;
  468. #endif
  469. #endif
  470. BLASLONG n_count = n;
  471. float *a_pointer = A,*b_pointer = B,*c_pointer = C,*ctemp = C,*next_b = B;
  472. for(;n_count>11;n_count-=12) COMPUTE(12)
  473. for(;n_count>7;n_count-=8) COMPUTE(8)
  474. for(;n_count>3;n_count-=4) COMPUTE(4)
  475. for(;n_count>1;n_count-=2) COMPUTE(2)
  476. if(n_count>0) COMPUTE(1)
  477. return 0;
  478. }