You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

sbgemm_kernel_8x4_neoversen2_impl.c 22 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665
  1. /***************************************************************************
  2. * Copyright (c) 2022, The OpenBLAS Project
  3. * All rights reserved.
  4. * Redistribution and use in source and binary forms, with or without
  5. * modification, are permitted provided that the following conditions are
  6. * met:
  7. * 1. Redistributions of source code must retain the above copyright
  8. * notice, this list of conditions and the following disclaimer.
  9. * 2. Redistributions in binary form must reproduce the above copyright
  10. * notice, this list of conditions and the following disclaimer in
  11. * the documentation and/or other materials provided with the
  12. * distribution.
  13. * 3. Neither the name of the OpenBLAS project nor the names of
  14. * its contributors may be used to endorse or promote products
  15. * derived from this software without specific prior written permission.
  16. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  21. * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  22. * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  23. * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  24. * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  25. * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  26. * POSSIBILITY OF SUCH DAMAGE.
  27. * *****************************************************************************/
  28. #include <arm_sve.h>
  29. #include "common.h"
  30. #ifdef ALPHA_ONE
  31. #define LOAD_C(M, N) \
  32. mc##M##N = svld1_gather_index(pg32, ptr_c0##N + 2 * M , off_vc);
  33. #define LOAD_C_LOW(M, N) \
  34. mc##M##N = svld1_gather_index(pg32_low, ptr_c0##N + 2 * M, off_vc);
  35. #define LOAD_C_EVEN(M, N) \
  36. mc##M##N = svld1_gather_index(pg32_even, ptr_c0##N + 2 * M, off_vc);
  37. #define LOAD_C_FIRST(M, N) \
  38. mc##M##N = svld1_gather_index(pg32_first, ptr_c0##N + 2 * M, off_vc);
  39. #define STORE_C(M, N) \
  40. svst1_scatter_index(pg32, ptr_c0##N + 2 * M, off_vc, mc##M##N);
  41. #define STORE_C_LOW(M, N) \
  42. svst1_scatter_index(pg32_low, ptr_c0##N + 2 * M, off_vc, mc##M##N);
  43. #define STORE_C_EVEN(M, N) \
  44. svst1_scatter_index(pg32_even, ptr_c0##N + 2 * M, off_vc, mc##M##N);
  45. #define STORE_C_FIRST(M, N) \
  46. svst1_scatter_index(pg32_first, ptr_c0##N + 2 * M, off_vc, mc##M##N);
  47. #else
  48. #define LOAD_C(M, N) \
  49. mc##M##N = svdup_f32(0); \
  50. oc##M##N = svld1_gather_index(pg32, ptr_c0##N + 2 * M , off_vc);
  51. #define LOAD_C_LOW(M, N) \
  52. mc##M##N = svdup_f32(0); \
  53. oc##M##N = svld1_gather_index(pg32_low, ptr_c0##N + 2 * M , off_vc);
  54. #define LOAD_C_EVEN(M, N) \
  55. mc##M##N = svdup_f32(0); \
  56. oc##M##N = svld1_gather_index(pg32_even, ptr_c0##N + 2 * M , off_vc);
  57. #define LOAD_C_FIRST(M, N) \
  58. mc##M##N = svdup_f32(0); \
  59. oc##M##N = svld1_gather_index(pg32_first, ptr_c0##N + 2 * M , off_vc);
  60. #define STORE_C(M, N) \
  61. mc##M##N = svmad_z(pg32, svalpha, mc##M##N, oc##M##N); \
  62. svst1_scatter_index(pg32, ptr_c0##N + 2 * M, off_vc, mc##M##N);
  63. #define STORE_C_LOW(M, N) \
  64. mc##M##N = svmad_z(pg32_low, svalpha, mc##M##N, oc##M##N); \
  65. svst1_scatter_index(pg32_low, ptr_c0##N + 2 * M, off_vc, mc##M##N);
  66. #define STORE_C_EVEN(M, N) \
  67. mc##M##N = svmad_z(pg32_even, svalpha, mc##M##N, oc##M##N); \
  68. svst1_scatter_index(pg32_even, ptr_c0##N + 2 * M, off_vc, mc##M##N);
  69. #define STORE_C_FIRST(M, N) \
  70. mc##M##N = svmad_z(pg32_first, svalpha, mc##M##N, oc##M##N); \
  71. svst1_scatter_index(pg32_first, ptr_c0##N + 2 * M, off_vc, mc##M##N);
  72. #endif
  73. #define LOAD_A(M) ma##M = svld1_bf16(pg16, ptr_a##M);
  74. #define LOAD_B(N) mb##N = svld1_bf16(pg16, ptr_b##N);
  75. #define MATMUL(M, N) mc##M##N = svbfmmla(mc##M##N, ma##M, mb##N);
  76. #define LOAD_KREST_1(NAME, M) \
  77. m##NAME##M = svdupq_bf16(*(ptr_##NAME##M), zero, zero, zero, \
  78. *(ptr_##NAME##M + 1), zero, zero, zero);
  79. #define LOAD_KREST_1_LOW(NAME, M) \
  80. m##NAME##M = svdupq_bf16(*(ptr_##NAME##M), zero, zero, zero, zero, zero, \
  81. zero, zero);
  82. #define LOAD_KREST_2(NAME, M) \
  83. m##NAME##M = \
  84. svdupq_bf16(*(ptr_##NAME##M), *(ptr_##NAME##M + 1), zero, zero, \
  85. *(ptr_##NAME##M + 2), *(ptr_##NAME##M + 3), zero, zero);
  86. #define LOAD_KREST_2_LOW(NAME, M) \
  87. m##NAME##M = svdupq_bf16(*(ptr_##NAME##M), *(ptr_##NAME##M + 1), zero, \
  88. zero, zero, zero, zero, zero);
  89. #define LOAD_KREST_3(NAME, M) \
  90. m##NAME##M = \
  91. svdupq_bf16(*(ptr_##NAME##M), *(ptr_##NAME##M + 1), \
  92. *(ptr_##NAME##M + 2), zero, *(ptr_##NAME##M + 3), \
  93. *(ptr_##NAME##M + 4), *(ptr_##NAME##M + 5), zero);
  94. #define LOAD_KREST_3_LOW(NAME, M) \
  95. m##NAME##M = \
  96. svdupq_bf16(*(ptr_##NAME##M), *(ptr_##NAME##M + 1), \
  97. *(ptr_##NAME##M + 2), zero, zero, zero, zero, zero);
  98. #ifdef ALPHA_ONE
  99. int sbgemm_kernel_neoversen2_alpha_one(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * B, FLOAT * C, BLASLONG ldc)
  100. #else
  101. int sbgemm_kernel_neoversen2_alpha(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * B, FLOAT * C, BLASLONG ldc)
  102. #endif
  103. {
  104. bfloat16_t *ptr_a = (bfloat16_t *)A;
  105. bfloat16_t *ptr_b = (bfloat16_t *)B;
  106. FLOAT *ptr_c = C;
  107. bfloat16_t *ptr_a0, *ptr_a1, *ptr_a2, *ptr_a3;
  108. bfloat16_t *ptr_b0, *ptr_b1;
  109. FLOAT *ptr_c00, *ptr_c01;
  110. svbfloat16_t ma0, ma1, ma2, ma3, mb0, mb1;
  111. svfloat32_t mc00, mc01, mc10, mc11, mc20, mc21, mc30, mc31;
  112. #ifndef ALPHA_ONE
  113. svfloat32_t oc00, oc01, oc10, oc11, oc20, oc21, oc30, oc31;
  114. #endif
  115. svbool_t pg16 = svptrue_b16();
  116. svbool_t pg16_low = svdupq_b16(1, 1, 1, 1, 0, 0, 0, 0);
  117. svbool_t pg32 = svptrue_b32();
  118. svbool_t pg32_low = svdupq_b32(1, 1, 0, 0);
  119. svbool_t pg32_even = svdupq_b32(1, 0, 1, 0);
  120. svbool_t pg32_first = svdupq_b32(1, 0, 0, 0);
  121. svfloat32_t svalpha = svdup_f32(alpha);
  122. bfloat16 tmp = 0;
  123. bfloat16_t zero = *((bfloat16_t *)&tmp);
  124. BLASLONG krest = k & 3;
  125. // 00 01 10 11
  126. svuint32_t off_vc = svdupq_u32(0, (uint32_t)ldc, 1, (uint32_t)ldc + 1);
  127. for (BLASLONG j = 0; j < n / 4; j++) {
  128. ptr_c00 = ptr_c;
  129. ptr_c01 = ptr_c + 2 * ldc;
  130. ptr_c += 4 * ldc;
  131. ptr_a = (bfloat16_t *)A;
  132. for (BLASLONG i = 0; i < m / 8; i++) {
  133. ptr_a0 = ptr_a;
  134. ptr_a1 = ptr_a0 + 2 * k;
  135. ptr_a2 = ptr_a1 + 2 * k;
  136. ptr_a3 = ptr_a2 + 2 * k;
  137. ptr_a += 8 * k;
  138. ptr_b0 = ptr_b;
  139. ptr_b1 = ptr_b0 + 2 * k;
  140. LOAD_C(0, 0); LOAD_C(0, 1);
  141. LOAD_C(1, 0); LOAD_C(1, 1);
  142. LOAD_C(2, 0); LOAD_C(2, 1);
  143. LOAD_C(3, 0); LOAD_C(3, 1);
  144. for (BLASLONG p = 0; p < k / 4; p++) {
  145. LOAD_A(0); LOAD_A(1); LOAD_A(2); LOAD_A(3);
  146. LOAD_B(0); LOAD_B(1);
  147. MATMUL(0, 0); MATMUL(0, 1);
  148. MATMUL(1, 0); MATMUL(1, 1);
  149. MATMUL(2, 0); MATMUL(2, 1);
  150. MATMUL(3, 0); MATMUL(3, 1);
  151. ptr_a0 += 8; ptr_a1 += 8; ptr_a2 += 8; ptr_a3 += 8;
  152. ptr_b0 += 8; ptr_b1 += 8;
  153. }
  154. if (krest) {
  155. if (krest == 1) {
  156. LOAD_KREST_1(a, 0); LOAD_KREST_1(a, 1);
  157. LOAD_KREST_1(a, 2); LOAD_KREST_1(a, 3);
  158. LOAD_KREST_1(b, 0); LOAD_KREST_1(b, 1);
  159. } else if (krest == 2) {
  160. LOAD_KREST_2(a, 0); LOAD_KREST_2(a, 1);
  161. LOAD_KREST_2(a, 2); LOAD_KREST_2(a, 3);
  162. LOAD_KREST_2(b, 0); LOAD_KREST_2(b, 1);
  163. } else if (krest == 3) {
  164. LOAD_KREST_3(a, 0); LOAD_KREST_3(a, 1);
  165. LOAD_KREST_3(a, 2); LOAD_KREST_3(a, 3);
  166. LOAD_KREST_3(b, 0); LOAD_KREST_3(b, 1);
  167. }
  168. MATMUL(0, 0); MATMUL(0, 1);
  169. MATMUL(1, 0); MATMUL(1, 1);
  170. MATMUL(2, 0); MATMUL(2, 1);
  171. MATMUL(3, 0); MATMUL(3, 1);
  172. }
  173. STORE_C(0, 0); STORE_C(0, 1);
  174. STORE_C(1, 0); STORE_C(1, 1);
  175. STORE_C(2, 0); STORE_C(2, 1);
  176. STORE_C(3, 0); STORE_C(3, 1);
  177. ptr_c00 += 8; ptr_c01 += 8;
  178. }
  179. if (m & 4) {
  180. ptr_a0 = ptr_a;
  181. ptr_a1 = ptr_a0 + 2 * k;
  182. ptr_a += 4 * k;
  183. ptr_b0 = ptr_b;
  184. ptr_b1 = ptr_b0 + 2 * k;
  185. LOAD_C(0, 0); LOAD_C(0, 1);
  186. LOAD_C(1, 0); LOAD_C(1, 1);
  187. for (BLASLONG p = 0; p < k / 4; p++) {
  188. LOAD_A(0); LOAD_A(1);
  189. LOAD_B(0); LOAD_B(1);
  190. MATMUL(0, 0); MATMUL(0, 1);
  191. MATMUL(1, 0); MATMUL(1, 1);
  192. ptr_a0 += 8; ptr_a1 += 8;
  193. ptr_b0 += 8; ptr_b1 += 8;
  194. }
  195. if (krest) {
  196. if (krest == 1) {
  197. LOAD_KREST_1(a, 0); LOAD_KREST_1(a, 1);
  198. LOAD_KREST_1(b, 0); LOAD_KREST_1(b, 1);
  199. } else if (krest == 2) {
  200. LOAD_KREST_2(a, 0); LOAD_KREST_2(a, 1);
  201. LOAD_KREST_2(b, 0); LOAD_KREST_2(b, 1);
  202. } else if (krest == 3) {
  203. LOAD_KREST_3(a, 0); LOAD_KREST_3(a, 1);
  204. LOAD_KREST_3(b, 0); LOAD_KREST_3(b, 1);
  205. }
  206. MATMUL(0, 0); MATMUL(0, 1);
  207. MATMUL(1, 0); MATMUL(1, 1);
  208. }
  209. STORE_C(0, 0); STORE_C(0, 1);
  210. STORE_C(1, 0); STORE_C(1, 1);
  211. ptr_c00 += 4; ptr_c01 += 4;
  212. }
  213. if (m & 2) {
  214. ptr_a0 = ptr_a;
  215. ptr_a += 2 * k;
  216. ptr_b0 = ptr_b;
  217. ptr_b1 = ptr_b0 + 2 * k;
  218. LOAD_C(0, 0); LOAD_C(0, 1);
  219. for (BLASLONG p = 0; p < k / 4; p++) {
  220. LOAD_A(0);
  221. LOAD_B(0); LOAD_B(1);
  222. MATMUL(0, 0); MATMUL(0, 1);
  223. ptr_a0 += 8;
  224. ptr_b0 += 8; ptr_b1 += 8;
  225. }
  226. if (krest) {
  227. if (krest == 1) {
  228. LOAD_KREST_1(a, 0);
  229. LOAD_KREST_1(b, 0); LOAD_KREST_1(b, 1);
  230. } else if (krest == 2) {
  231. LOAD_KREST_2(a, 0);
  232. LOAD_KREST_2(b, 0); LOAD_KREST_2(b, 1);
  233. } else if (krest == 3) {
  234. LOAD_KREST_3(a, 0);
  235. LOAD_KREST_3(b, 0); LOAD_KREST_3(b, 1);
  236. }
  237. MATMUL(0, 0); MATMUL(0, 1);
  238. }
  239. STORE_C(0, 0); STORE_C(0, 1);
  240. ptr_c00 += 2; ptr_c01 += 2;
  241. }
  242. if (m & 1) {
  243. ptr_a0 = ptr_a;
  244. ptr_b0 = ptr_b;
  245. ptr_b1 = ptr_b0 + 2 * k;
  246. LOAD_C_LOW(0, 0); LOAD_C_LOW(0, 1);
  247. for (BLASLONG p = 0; p < k / 4; p++) {
  248. ma0 = svld1_bf16(pg16_low, ptr_a0);
  249. LOAD_B(0); LOAD_B(1);
  250. MATMUL(0, 0); MATMUL(0, 1);
  251. ptr_a0 += 4;
  252. ptr_b0 += 8;
  253. ptr_b1 += 8;
  254. }
  255. if (krest) {
  256. if (krest == 1) {
  257. LOAD_KREST_1_LOW(a, 0);
  258. LOAD_KREST_1(b, 0); LOAD_KREST_1(b, 1);
  259. } else if (krest == 2) {
  260. LOAD_KREST_2_LOW(a, 0);
  261. LOAD_KREST_2(b, 0); LOAD_KREST_2(b, 1);
  262. } else if (krest == 3) {
  263. LOAD_KREST_3_LOW(a, 0);
  264. LOAD_KREST_3(b, 0); LOAD_KREST_3(b, 1);
  265. }
  266. MATMUL(0, 0); MATMUL(0, 1);
  267. }
  268. STORE_C_LOW(0, 0); STORE_C_LOW(0, 1);
  269. }
  270. ptr_b += 4 * k;
  271. }
  272. if (n & 2) {
  273. ptr_c00 = ptr_c;
  274. ptr_c += 2 * ldc;
  275. ptr_a = (bfloat16_t *)A;
  276. for (BLASLONG i = 0; i < m / 8; i++) {
  277. ptr_a0 = ptr_a;
  278. ptr_a1 = ptr_a0 + 2 * k;
  279. ptr_a2 = ptr_a1 + 2 * k;
  280. ptr_a3 = ptr_a2 + 2 * k;
  281. ptr_a += 8 * k;
  282. ptr_b0 = ptr_b;
  283. LOAD_C(0, 0);
  284. LOAD_C(1, 0);
  285. LOAD_C(2, 0);
  286. LOAD_C(3, 0);
  287. for (BLASLONG p = 0; p < k / 4; p++) {
  288. LOAD_A(0); LOAD_A(1); LOAD_A(2); LOAD_A(3);
  289. LOAD_B(0);
  290. MATMUL(0, 0);
  291. MATMUL(1, 0);
  292. MATMUL(2, 0);
  293. MATMUL(3, 0);
  294. ptr_a0 += 8; ptr_a1 += 8; ptr_a2 += 8; ptr_a3 += 8;
  295. ptr_b0 += 8;
  296. }
  297. if (krest) {
  298. if (krest == 1) {
  299. LOAD_KREST_1(a, 0); LOAD_KREST_1(a, 1);
  300. LOAD_KREST_1(a, 2); LOAD_KREST_1(a, 3);
  301. LOAD_KREST_1(b, 0);
  302. } else if (krest == 2) {
  303. LOAD_KREST_2(a, 0); LOAD_KREST_2(a, 1);
  304. LOAD_KREST_2(a, 2); LOAD_KREST_2(a, 3);
  305. LOAD_KREST_2(b, 0);
  306. } else if (krest == 3) {
  307. LOAD_KREST_3(a, 0); LOAD_KREST_3(a, 1);
  308. LOAD_KREST_3(a, 2); LOAD_KREST_3(a, 3);
  309. LOAD_KREST_3(b, 0);
  310. }
  311. MATMUL(0, 0);
  312. MATMUL(1, 0);
  313. MATMUL(2, 0);
  314. MATMUL(3, 0);
  315. }
  316. STORE_C(0, 0);
  317. STORE_C(1, 0);
  318. STORE_C(2, 0);
  319. STORE_C(3, 0);
  320. ptr_c00 += 8;
  321. }
  322. if (m & 4) {
  323. ptr_a0 = ptr_a;
  324. ptr_a1 = ptr_a0 + 2 * k;
  325. ptr_a += 4 * k;
  326. ptr_b0 = ptr_b;
  327. LOAD_C(0, 0);
  328. LOAD_C(1, 0);
  329. for (BLASLONG p = 0; p < k / 4; p++) {
  330. LOAD_A(0); LOAD_A(1);
  331. LOAD_B(0);
  332. MATMUL(0, 0);
  333. MATMUL(1, 0);
  334. ptr_a0 += 8; ptr_a1 += 8;
  335. ptr_b0 += 8;
  336. }
  337. if (krest) {
  338. if (krest == 1) {
  339. LOAD_KREST_1(a, 0); LOAD_KREST_1(a, 1);
  340. LOAD_KREST_1(b, 0);
  341. } else if (krest == 2) {
  342. LOAD_KREST_2(a, 0); LOAD_KREST_2(a, 1);
  343. LOAD_KREST_2(b, 0);
  344. } else if (krest == 3) {
  345. LOAD_KREST_3(a, 0); LOAD_KREST_3(a, 1);
  346. LOAD_KREST_3(b, 0);
  347. }
  348. MATMUL(0, 0);
  349. MATMUL(1, 0);
  350. }
  351. STORE_C(0, 0)
  352. STORE_C(1, 0)
  353. ptr_c00 += 4;
  354. }
  355. if (m & 2) {
  356. ptr_a0 = ptr_a;
  357. ptr_a += 2 * k;
  358. ptr_b0 = ptr_b;
  359. LOAD_C(0, 0);
  360. for (BLASLONG p = 0; p < k / 4; p++) {
  361. LOAD_A(0);
  362. LOAD_B(0);
  363. MATMUL(0, 0);
  364. ptr_a0 += 8;
  365. ptr_b0 += 8;
  366. }
  367. if (krest) {
  368. if (krest == 1) {
  369. LOAD_KREST_1(a, 0);
  370. LOAD_KREST_1(b, 0);
  371. } else if (krest == 2) {
  372. LOAD_KREST_2(a, 0);
  373. LOAD_KREST_2(b, 0);
  374. } else if (krest == 3) {
  375. LOAD_KREST_3(a, 0);
  376. LOAD_KREST_3(b, 0);
  377. }
  378. MATMUL(0, 0);
  379. }
  380. STORE_C(0, 0);
  381. ptr_c00 += 2;
  382. }
  383. if (m & 1) {
  384. ptr_a0 = ptr_a;
  385. ptr_b0 = ptr_b;
  386. LOAD_C(0, 0);
  387. for (BLASLONG p = 0; p < k / 4; p++) {
  388. ma0 = svld1_bf16(pg16_low, ptr_a0);
  389. LOAD_B(0);
  390. MATMUL(0, 0);
  391. ptr_a0 += 4;
  392. ptr_b0 += 8;
  393. }
  394. if (krest) {
  395. if (krest == 1) {
  396. LOAD_KREST_1_LOW(a, 0);
  397. LOAD_KREST_1(b, 0);
  398. } else if (krest == 2) {
  399. LOAD_KREST_2_LOW(a, 0);
  400. LOAD_KREST_2(b, 0);
  401. } else if (krest == 3) {
  402. LOAD_KREST_3_LOW(a, 0);
  403. LOAD_KREST_3(b, 0);
  404. }
  405. MATMUL(0, 0);
  406. }
  407. STORE_C_LOW(0, 0);
  408. }
  409. ptr_b += 2 * k;
  410. }
  411. if (n & 1) {
  412. ptr_c00 = ptr_c;
  413. ptr_a = (bfloat16_t *) A;
  414. for (BLASLONG i = 0; i < m / 8; i++) {
  415. ptr_a0 = ptr_a;
  416. ptr_a1 = ptr_a0 + 2 * k;
  417. ptr_a2 = ptr_a1 + 2 * k;
  418. ptr_a3 = ptr_a2 + 2 * k;
  419. ptr_a += 8 * k;
  420. ptr_b0 = ptr_b;
  421. LOAD_C_EVEN(0, 0);
  422. LOAD_C_EVEN(1, 0);
  423. LOAD_C_EVEN(2, 0);
  424. LOAD_C_EVEN(3, 0);
  425. for (BLASLONG p = 0; p < k / 4; p++) {
  426. LOAD_A(0); LOAD_A(1); LOAD_A(2); LOAD_A(3);
  427. mb0 = svld1_bf16(pg16_low, ptr_b0);
  428. MATMUL(0, 0);
  429. MATMUL(1, 0);
  430. MATMUL(2, 0);
  431. MATMUL(3, 0);
  432. ptr_a0 += 8; ptr_a1 += 8; ptr_a2 += 8; ptr_a3 += 8;
  433. ptr_b0 += 4;
  434. }
  435. if (krest) {
  436. if (krest == 1) {
  437. LOAD_KREST_1(a, 0); LOAD_KREST_1(a, 1);
  438. LOAD_KREST_1(a, 2); LOAD_KREST_1(a, 3);
  439. LOAD_KREST_1_LOW(b, 0);
  440. } else if (krest == 2) {
  441. LOAD_KREST_2(a, 0); LOAD_KREST_2(a, 1);
  442. LOAD_KREST_2(a, 2); LOAD_KREST_2(a, 3);
  443. LOAD_KREST_2_LOW(b, 0);
  444. } else if (krest == 3) {
  445. LOAD_KREST_3(a, 0); LOAD_KREST_3(a, 1);
  446. LOAD_KREST_3(a, 2); LOAD_KREST_3(a, 3);
  447. LOAD_KREST_3_LOW(b, 0);
  448. }
  449. MATMUL(0, 0);
  450. MATMUL(1, 0);
  451. MATMUL(2, 0);
  452. MATMUL(3, 0);
  453. }
  454. STORE_C_EVEN(0, 0)
  455. STORE_C_EVEN(1, 0);
  456. STORE_C_EVEN(2, 0);
  457. STORE_C_EVEN(3, 0);
  458. ptr_c00 += 8;
  459. }
  460. if (m & 4) {
  461. ptr_a0 = ptr_a;
  462. ptr_a1 = ptr_a0 + 2 * k;
  463. ptr_a += 4 * k;
  464. ptr_b0 = ptr_b;
  465. LOAD_C_EVEN(0, 0);
  466. LOAD_C_EVEN(1, 0);
  467. for (BLASLONG p = 0; p < k / 4; p++) {
  468. LOAD_A(0); LOAD_A(1);
  469. mb0 = svld1_bf16(pg16_low, ptr_b0);
  470. MATMUL(0, 0);
  471. MATMUL(1, 0);
  472. ptr_a0 += 8; ptr_a1 += 8;
  473. ptr_b0 += 4;
  474. }
  475. if (krest) {
  476. if (krest == 1) {
  477. LOAD_KREST_1(a, 0); LOAD_KREST_1(a, 1);
  478. LOAD_KREST_1_LOW(b, 0);
  479. } else if (krest == 2) {
  480. LOAD_KREST_2(a, 0); LOAD_KREST_2(a, 1);
  481. LOAD_KREST_2_LOW(b, 0);
  482. } else if (krest == 3) {
  483. LOAD_KREST_3(a, 0); LOAD_KREST_3(a, 1);
  484. LOAD_KREST_3_LOW(b, 0);
  485. }
  486. MATMUL(0, 0);
  487. MATMUL(1, 0);
  488. }
  489. STORE_C_EVEN(0, 0)
  490. STORE_C_EVEN(1, 0)
  491. ptr_c00 += 4;
  492. }
  493. if (m & 2) {
  494. ptr_a0 = ptr_a;
  495. ptr_a += 2 * k;
  496. ptr_b0 = ptr_b;
  497. LOAD_C_EVEN(0, 0);
  498. for (BLASLONG p = 0; p < k / 4; p++) {
  499. LOAD_A(0);
  500. mb0 = svld1_bf16(pg16_low, ptr_b0);
  501. MATMUL(0, 0);
  502. ptr_a0 += 8;
  503. ptr_b0 += 4;
  504. }
  505. if (krest) {
  506. if (krest == 1) {
  507. LOAD_KREST_1(a, 0);
  508. LOAD_KREST_1_LOW(b, 0);
  509. } else if (krest == 2) {
  510. LOAD_KREST_2(a, 0);
  511. LOAD_KREST_2_LOW(b, 0);
  512. } else if (krest == 3) {
  513. LOAD_KREST_3(a, 0);
  514. LOAD_KREST_3_LOW(b, 0);
  515. }
  516. MATMUL(0, 0);
  517. }
  518. STORE_C_EVEN(0, 0);
  519. ptr_c00 += 2;
  520. }
  521. if (m & 1) {
  522. ptr_a0 = ptr_a;
  523. ptr_b0 = ptr_b;
  524. LOAD_C_FIRST(0, 0);
  525. for (BLASLONG p = 0; p < k / 4; p++) {
  526. ma0 = svld1_bf16(pg16_low, ptr_a0);
  527. mb0 = svld1_bf16(pg16_low, ptr_b0);
  528. MATMUL(0, 0);
  529. ptr_a0 += 4;
  530. ptr_b0 += 4;
  531. }
  532. if (krest) {
  533. if (krest == 1) {
  534. LOAD_KREST_1_LOW(a, 0);
  535. LOAD_KREST_1_LOW(b, 0);
  536. } else if (krest == 2) {
  537. LOAD_KREST_2_LOW(a, 0);
  538. LOAD_KREST_2_LOW(b, 0);
  539. } else if (krest == 3) {
  540. LOAD_KREST_3_LOW(a, 0);
  541. LOAD_KREST_3_LOW(b, 0);
  542. }
  543. MATMUL(0, 0);
  544. }
  545. STORE_C_FIRST(0, 0);
  546. }
  547. }
  548. return 0;
  549. }