You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

dgemm_kernel_8x4_c910v.c 24 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977
  1. #include "common.h"
  2. #include <riscv-vector.h>
  // KERNEL8x4_I: prologue of the software-pipelined 8x4 inner loop.
  // - Sets t1-t3 to PB + 1/2/3 doubles and t4-t6 to PA + 2/4/6 doubles.
  // - Loads the first A slice into v0-v3 and the first four B scalars
  //   (ft0-ft3), splatting them into v8-v11.
  // - Preloads the following slice into v4-v7 (A) and ft4-ft7 -> v12-v15 (B)
  //   while accumulating v8-v11 * v0-v3 into v16-v31.
  // - On exit PA, PB and t1-t6 have each been advanced twice (past both slices).
  // The interleaving of loads, address updates and FMAs is deliberate; do not
  // reorder the instructions.
  3. #define KERNEL8x4_I \
  4. "addi t1, %[PB], 1*8 \n\t"\
  5. "addi t2, %[PB], 2*8 \n\t"\
  6. "addi t3, %[PB], 3*8 \n\t"\
  7. "fld ft0, (%[PB]) \n\t"\
  8. "fld ft1, (t1) \n\t"\
  9. "fld ft2, (t2) \n\t"\
  10. "fld ft3, (t3) \n\t"\
  11. "vle.v v0, (%[PA]) \n\t"\
  12. "addi t4, %[PA], 2*8 \n\t"\
  13. "addi t5, %[PA], 4*8 \n\t"\
  14. "vfmv.v.f v8, ft0 \n\t"\
  15. "addi t6, %[PA], 6*8 \n\t"\
  16. "addi %[PA], %[PA], 8*8 \n\t"\
  17. "vle.v v1, (t4) \n\t"\
  18. "addi t4, t4, 8*8 \n\t"\
  19. "vfmv.v.f v9, ft1 \n\t"\
  20. "vle.v v2, (t5) \n\t"\
  21. "addi t5, t5, 8*8 \n\t"\
  22. "vle.v v3, (t6) \n\t"\
  23. "addi t6, t6, 8*8 \n\t"\
  24. "vfmv.v.f v10, ft2 \n\t"\
  25. "addi %[PB], %[PB], 4*8 \n\t"\
  26. "vle.v v4, (%[PA]) \n\t"\
  27. "addi %[PA], %[PA], 8*8 \n\t"\
  28. "vfmv.v.f v11, ft3 \n\t"\
  29. "vfmacc.vv v16, v8, v0 \n\t"\
  30. "addi t1, t1, 4*8 \n\t"\
  31. "vle.v v5, (t4) \n\t"\
  32. "addi t4, t4, 8*8 \n\t"\
  33. "vfmacc.vv v17, v8, v1 \n\t"\
  34. "addi t2, t2, 4*8 \n\t"\
  35. "vle.v v6, (t5) \n\t"\
  36. "addi t5, t5, 8*8 \n\t"\
  37. "vfmacc.vv v18, v8, v2 \n\t"\
  38. "addi t3, t3, 4*8 \n\t"\
  39. "vle.v v7, (t6) \n\t"\
  40. "addi t6, t6, 8*8 \n\t"\
  41. "vfmacc.vv v19, v8, v3 \n\t"\
  42. "fld ft4, (%[PB]) \n\t"\
  43. "vfmacc.vv v20, v9, v0 \n\t"\
  44. "fld ft5, (t1) \n\t"\
  45. "vfmacc.vv v21, v9, v1 \n\t"\
  46. "fld ft6, (t2) \n\t"\
  47. "vfmacc.vv v22, v9, v2 \n\t"\
  48. "fld ft7, (t3) \n\t"\
  49. "vfmacc.vv v23, v9, v3 \n\t"\
  50. "vfmv.v.f v12, ft4 \n\t"\
  51. "vfmacc.vv v24, v10, v0 \n\t"\
  52. "vfmv.v.f v13, ft5 \n\t"\
  53. "vfmacc.vv v25, v10, v1 \n\t"\
  54. "vfmv.v.f v14, ft6 \n\t"\
  55. "vfmacc.vv v26, v10, v2 \n\t"\
  56. "vfmv.v.f v15, ft7 \n\t"\
  57. "vfmacc.vv v27, v10, v3 \n\t"\
  58. "addi %[PB], %[PB], 4*8 \n\t"\
  59. "vfmacc.vv v28, v11, v0 \n\t"\
  60. "addi t1, t1, 4*8 \n\t"\
  61. "vfmacc.vv v29, v11, v1 \n\t"\
  62. "addi t2, t2, 4*8 \n\t"\
  63. "vfmacc.vv v30, v11, v2 \n\t"\
  64. "addi t3, t3, 4*8 \n\t"\
  65. "vfmacc.vv v31, v11, v3 \n\t"
  // KERNEL8x4_M1: steady-state step operating on the "0" register set.
  // Accumulates v8-v11 (splatted B) * v0-v3 (A) into v16-v31 while loading
  // the next A slice into v4-v7 and the next B scalars into ft4-ft7, which
  // are splatted into v12-v15 for the following KERNEL8x4_M2 / KERNEL8x4_E.
  // Advances PA, PB and t1-t6 by one slice each.
  66. #define KERNEL8x4_M1 \
  67. "vfmacc.vv v16, v8, v0 \n\t"\
  68. "vle.v v4, (%[PA]) \n\t"\
  69. "addi %[PA], %[PA], 8*8 \n\t"\
  70. "vfmacc.vv v17, v8, v1 \n\t"\
  71. "vle.v v5, (t4) \n\t"\
  72. "addi t4, t4, 8*8 \n\t"\
  73. "vfmacc.vv v18, v8, v2 \n\t"\
  74. "vle.v v6, (t5) \n\t"\
  75. "addi t5, t5, 8*8 \n\t"\
  76. "vfmacc.vv v19, v8, v3 \n\t"\
  77. "vle.v v7, (t6) \n\t"\
  78. "addi t6, t6, 8*8 \n\t"\
  79. "vfmacc.vv v20, v9, v0 \n\t"\
  80. "fld ft4, (%[PB]) \n\t"\
  81. "vfmacc.vv v21, v9, v1 \n\t"\
  82. "fld ft5, (t1) \n\t"\
  83. "vfmacc.vv v22, v9, v2 \n\t"\
  84. "fld ft6, (t2) \n\t"\
  85. "vfmacc.vv v23, v9, v3 \n\t"\
  86. "fld ft7, (t3) \n\t"\
  87. "addi %[PB], %[PB], 4*8 \n\t"\
  88. "vfmacc.vv v24, v10, v0 \n\t"\
  89. "addi t1, t1, 4*8 \n\t"\
  90. "vfmacc.vv v25, v10, v1 \n\t"\
  91. "vfmv.v.f v12, ft4 \n\t"\
  92. "vfmacc.vv v26, v10, v2 \n\t"\
  93. "addi t2, t2, 4*8 \n\t"\
  94. "vfmacc.vv v27, v10, v3 \n\t"\
  95. "vfmv.v.f v13, ft5 \n\t"\
  96. "vfmacc.vv v28, v11, v0 \n\t"\
  97. "addi t3, t3, 4*8 \n\t"\
  98. "vfmacc.vv v29, v11, v1 \n\t"\
  99. "vfmv.v.f v14, ft6 \n\t"\
  100. "vfmacc.vv v30, v11, v2 \n\t"\
  101. "vfmacc.vv v31, v11, v3 \n\t"\
  102. "vfmv.v.f v15, ft7 \n\t"
  // KERNEL8x4_M2: steady-state step operating on the "1" register set; the
  // mirror image of KERNEL8x4_M1.
  // Accumulates v12-v15 (splatted B) * v4-v7 (A) into v16-v31 while loading
  // the next A slice into v0-v3 and the next B scalars into ft0-ft3, which
  // are splatted into v8-v11 for the following KERNEL8x4_M1.
  // Advances PA, PB and t1-t6 by one slice each.
  103. #define KERNEL8x4_M2 \
  104. "vfmacc.vv v16, v12, v4 \n\t"\
  105. "vle.v v0, (%[PA]) \n\t"\
  106. "addi %[PA], %[PA], 8*8 \n\t"\
  107. "vfmacc.vv v17, v12, v5 \n\t"\
  108. "vle.v v1, (t4) \n\t"\
  109. "addi t4, t4, 8*8 \n\t"\
  110. "vfmacc.vv v18, v12, v6 \n\t"\
  111. "vle.v v2, (t5) \n\t"\
  112. "addi t5, t5, 8*8 \n\t"\
  113. "vfmacc.vv v19, v12, v7 \n\t"\
  114. "vle.v v3, (t6) \n\t"\
  115. "addi t6, t6, 8*8 \n\t"\
  116. "vfmacc.vv v20, v13, v4 \n\t"\
  117. "fld ft0, (%[PB]) \n\t"\
  118. "vfmacc.vv v21, v13, v5 \n\t"\
  119. "fld ft1, (t1) \n\t"\
  120. "vfmacc.vv v22, v13, v6 \n\t"\
  121. "fld ft2, (t2) \n\t"\
  122. "vfmacc.vv v23, v13, v7 \n\t"\
  123. "fld ft3, (t3) \n\t"\
  124. "addi %[PB], %[PB], 4*8 \n\t"\
  125. "vfmacc.vv v24, v14, v4 \n\t"\
  126. "addi t1, t1, 4*8 \n\t"\
  127. "vfmacc.vv v25, v14, v5 \n\t"\
  128. "vfmv.v.f v8, ft0 \n\t"\
  129. "vfmacc.vv v26, v14, v6 \n\t"\
  130. "addi t2, t2, 4*8 \n\t"\
  131. "vfmacc.vv v27, v14, v7 \n\t"\
  132. "vfmv.v.f v9, ft1 \n\t"\
  133. "vfmacc.vv v28, v15, v4 \n\t"\
  134. "addi t3, t3, 4*8 \n\t"\
  135. "vfmacc.vv v29, v15, v5 \n\t"\
  136. "vfmv.v.f v10, ft2 \n\t"\
  137. "vfmacc.vv v30, v15, v6 \n\t"\
  138. "vfmacc.vv v31, v15, v7 \n\t"\
  139. "vfmv.v.f v11, ft3 \n\t"
  // KERNEL8x4_E: pipeline epilogue. Accumulates the last preloaded slice,
  // v12-v15 (splatted B) * v4-v7 (A), into v16-v31. It issues no loads and
  // leaves every pointer register untouched.
  140. #define KERNEL8x4_E \
  141. "vfmacc.vv v16, v12, v4 \n\t"\
  142. "vfmacc.vv v17, v12, v5 \n\t"\
  143. "vfmacc.vv v18, v12, v6 \n\t"\
  144. "vfmacc.vv v19, v12, v7 \n\t"\
  145. "vfmacc.vv v20, v13, v4 \n\t"\
  146. "vfmacc.vv v21, v13, v5 \n\t"\
  147. "vfmacc.vv v22, v13, v6 \n\t"\
  148. "vfmacc.vv v23, v13, v7 \n\t"\
  149. "vfmacc.vv v24, v14, v4 \n\t"\
  150. "vfmacc.vv v25, v14, v5 \n\t"\
  151. "vfmacc.vv v26, v14, v6 \n\t"\
  152. "vfmacc.vv v27, v14, v7 \n\t"\
  153. "vfmacc.vv v28, v15, v4 \n\t"\
  154. "vfmacc.vv v29, v15, v5 \n\t"\
  155. "vfmacc.vv v30, v15, v6 \n\t"\
  156. "vfmacc.vv v31, v15, v7 \n\t"
  157. int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc
  158. #ifdef TRMMKERNEL
  159. ,BLASLONG offset
  160. #endif
  161. )
  162. {
  163. BLASLONG i,j,k;
  164. FLOAT *C0,*C1,*C2,*C3;
  165. FLOAT *ptrba,*ptrbb;
  166. FLOAT loadb0,loadb1,loadb2,loadb3;
  167. FLOAT load0,load1,load2,load3,load4,load5,load6,load7;
  168. FLOAT res0,res1,res2,res3;
  169. FLOAT res4,res5,res6,res7;
  170. FLOAT res8,res9,res10,res11;
  171. FLOAT res12,res13,res14,res15;
  172. for (j=0; j<bn/4; j+=1){
  173. C0 = C;
  174. C1 = C0+ldc;
  175. C2 = C1+ldc;
  176. C3 = C2+ldc;
  177. ptrba = ba;
  178. for(i=0; i<bm/8; i+=1){
  179. ptrbb = bb;
  180. //t0 for k
  181. //ft0-ft3,ft4-ft7,v8-v15 for B, t1-t3 for PB1-3
  182. //v0-v3,v4-v7 for A, t4-t6 for PA1-3
  183. //v16-v31 for temp C
  184. asm volatile(
  185. "vsetvli zero, zero, e64,m1 \n\t"
  186. "fmv.w.x ft11, zero \n\t"
  187. "mv t0, %[BK] \n\t"
  188. "vfmv.v.f v16, ft11 \n\t"
  189. "vfmv.v.f v17, ft11 \n\t"
  190. "vfmv.v.f v18, ft11 \n\t"
  191. "vfmv.v.f v19, ft11 \n\t"
  192. "vfmv.v.f v20, ft11 \n\t"
  193. "vfmv.v.f v21, ft11 \n\t"
  194. "vfmv.v.f v22, ft11 \n\t"
  195. "vfmv.v.f v23, ft11 \n\t"
  196. "vfmv.v.f v24, ft11 \n\t"
  197. "vfmv.v.f v25, ft11 \n\t"
  198. "vfmv.v.f v26, ft11 \n\t"
  199. "vfmv.v.f v27, ft11 \n\t"
  200. "vfmv.v.f v28, ft11 \n\t"
  201. "vfmv.v.f v29, ft11 \n\t"
  202. "vfmv.v.f v30, ft11 \n\t"
  203. "vfmv.v.f v31, ft11 \n\t"
  204. //unloop 8
  205. "srli t0, %[BK], 3 \n\t"
  206. "blez t0, M8x4_TAIL \n\t"
  207. //preloop
  208. KERNEL8x4_I
  209. KERNEL8x4_M2
  210. KERNEL8x4_M1
  211. KERNEL8x4_M2
  212. "addi t0, t0, -1 \n\t"
  213. "blez t0, M8x4_MAINLOOP_TAIL \n\t"
  214. ".align 4 \n\t"
  215. "M8x4_MAINLOOP: \n\t"
  216. KERNEL8x4_M1
  217. KERNEL8x4_M2
  218. KERNEL8x4_M1
  219. KERNEL8x4_M2
  220. KERNEL8x4_M1
  221. KERNEL8x4_M2
  222. KERNEL8x4_M1
  223. KERNEL8x4_M2
  224. "addi t0, t0, -1 \n\t"
  225. "bgtz t0, M8x4_MAINLOOP \n\t"
  226. "M8x4_MAINLOOP_TAIL: \n\t"
  227. KERNEL8x4_M1
  228. KERNEL8x4_M2
  229. KERNEL8x4_M1
  230. KERNEL8x4_E
  231. //tail
  232. "M8x4_TAIL: \n\t"
  233. "andi t0, %[BK], 7 \n\t"
  234. "blez t0, M8x4_SAVERESULT \n\t"
  235. "addi t4, %[PA], 2*8 \n\t"
  236. "addi t5, %[PA], 4*8 \n\t"
  237. "addi t6, %[PA], 6*8 \n\t"
  238. "addi t1, %[PB], 1*8 \n\t"
  239. "addi t2, %[PB], 2*8 \n\t"
  240. "addi t3, %[PB], 3*8 \n\t"
  241. ".align 4 \n\t"
  242. "M8x4_TAILLOOP: \n\t"
  243. "fld ft0, (%[PB]) \n\t"
  244. "addi %[PB], %[PB], 4*8 \n\t"
  245. "vle.v v0, (%[PA]) \n\t"
  246. "add %[PA], %[PA], 8*8 \n\t"
  247. "vle.v v1, (t4) \n\t"
  248. "addi t4, t4, 8*8 \n\t"
  249. "vfmv.v.f v8, ft0 \n\t"
  250. "fld ft1, (t1) \n\t"
  251. "addi t1, t1, 4*8 \n\t"
  252. "vle.v v2, (t5) \n\t"
  253. "addi t5, t5, 8*8 \n\t"
  254. "vle.v v3, (t6) \n\t"
  255. "addi t6, t6, 8*8 \n\t"
  256. "vfmacc.vv v16, v8, v0 \n\t"
  257. "fld ft2, (t2) \n\t"
  258. "addi t2, t2, 4*8 \n\t"
  259. "vfmacc.vv v17, v8, v1 \n\t"
  260. "vfmacc.vv v18, v8, v2 \n\t"
  261. "vfmv.v.f v9, ft1 \n\t"
  262. "vfmacc.vv v19, v8, v3 \n\t"
  263. "vfmacc.vv v20, v9, v0 \n\t"
  264. "fld ft3, (t3) \n\t"
  265. "addi t3, t3, 4*8 \n\t"
  266. "vfmacc.vv v21, v9, v1 \n\t"
  267. "vfmacc.vv v22, v9, v2 \n\t"
  268. "vfmv.v.f v10, ft2 \n\t"
  269. "vfmacc.vv v23, v9, v3 \n\t"
  270. "vfmv.v.f v11, ft3 \n\t"
  271. "vfmacc.vv v24, v10, v0 \n\t"
  272. "vfmacc.vv v25, v10, v1 \n\t"
  273. "vfmacc.vv v26, v10, v2 \n\t"
  274. "vfmacc.vv v27, v10, v3 \n\t"
  275. "vfmacc.vv v28, v11, v0 \n\t"
  276. "vfmacc.vv v29, v11, v1 \n\t"
  277. "vfmacc.vv v30, v11, v2 \n\t"
  278. "vfmacc.vv v31, v11, v3 \n\t"
  279. "addi t0, t0, -1 \n\t"
  280. "bgtz t0, M8x4_TAILLOOP \n\t"
  281. //Save result
  282. //load C
  283. "M8x4_SAVERESULT: \n\t"
  284. //use v8 to store alpha
  285. "vfmv.v.f v8, %[ALPHA] \n\t"
  286. "vle.v v0, (%[C0]) \n\t"
  287. "addi t4, %[C0], 2*8 \n\t"
  288. "vle.v v1, (%[C1]) \n\t"
  289. "addi t5, %[C1], 2*8 \n\t"
  290. "vle.v v2, (%[C2]) \n\t"
  291. "addi t6, %[C2], 2*8 \n\t"
  292. "vle.v v3, (%[C3]) \n\t"
  293. "addi t3, %[C3], 2*8 \n\t"
  294. //Multiply Alpha
  295. "vfmacc.vv v0, v8, v16 \n\t"
  296. "vle.v v4, (t4) \n\t"
  297. "vfmacc.vv v1, v8, v20 \n\t"
  298. "vle.v v5, (t5) \n\t"
  299. "vfmacc.vv v2, v8, v24 \n\t"
  300. "vle.v v6, (t6) \n\t"
  301. "vfmacc.vv v3, v8, v28 \n\t"
  302. "vle.v v7, (t3) \n\t"
  303. "vfmacc.vv v4, v8, v17 \n\t"
  304. "vse.v v0, (%[C0]) \n\t"
  305. "add %[C0], %[C0], 4*8 \n\t"
  306. "vfmacc.vv v5, v8, v21 \n\t"
  307. "vse.v v1, (%[C1]) \n\t"
  308. "add %[C1], %[C1], 4*8 \n\t"
  309. "vfmacc.vv v6, v8, v25 \n\t"
  310. "vse.v v2, (%[C2]) \n\t"
  311. "add %[C2], %[C2], 4*8 \n\t"
  312. "vfmacc.vv v7, v8, v29 \n\t"
  313. "vse.v v3, (%[C3]) \n\t"
  314. "add %[C3], %[C3], 4*8 \n\t"
  315. "vle.v v0, (%[C0]) \n\t"
  316. "vse.v v4, (t4) \n\t"
  317. "add t4, t4, 4*8 \n\t"
  318. "vle.v v1, (%[C1]) \n\t"
  319. "vse.v v5, (t5) \n\t"
  320. "add t5, t5, 4*8 \n\t"
  321. "vle.v v2, (%[C2]) \n\t"
  322. "vse.v v6, (t6) \n\t"
  323. "add t6, t6, 4*8 \n\t"
  324. "vle.v v3, (%[C3]) \n\t"
  325. "vse.v v7, (t3) \n\t"
  326. "add t3, t3, 4*8 \n\t"
  327. "vfmacc.vv v0, v8, v18 \n\t"
  328. "vle.v v4, (t4) \n\t"
  329. "vfmacc.vv v1, v8, v22 \n\t"
  330. "vle.v v5, (t5) \n\t"
  331. "vfmacc.vv v2, v8, v26 \n\t"
  332. "vle.v v6, (t6) \n\t"
  333. "vfmacc.vv v3, v8, v30 \n\t"
  334. "vle.v v7, (t3) \n\t"
  335. "vfmacc.vv v4, v8, v19 \n\t"
  336. "vse.v v0, (%[C0]) \n\t"
  337. "add %[C0], %[C0], 4*8 \n\t"
  338. "vfmacc.vv v5, v8, v23 \n\t"
  339. "vse.v v1, (%[C1]) \n\t"
  340. "add %[C1], %[C1], 4*8 \n\t"
  341. "vfmacc.vv v6, v8, v27 \n\t"
  342. "vse.v v2, (%[C2]) \n\t"
  343. "add %[C2], %[C2], 4*8 \n\t"
  344. "vfmacc.vv v7, v8, v31 \n\t"
  345. "vse.v v3, (%[C3]) \n\t"
  346. "add %[C3], %[C3], 4*8 \n\t"
  347. "vse.v v4, (t4) \n\t"
  348. "vse.v v5, (t5) \n\t"
  349. "vse.v v6, (t6) \n\t"
  350. "vse.v v7, (t3) \n\t"
  351. "M8x4_END: \n\t"
  352. :[C0]"+r"(C0),[C1]"+r"(C1),[C2]"+r"(C2),[C3]"+r"(C3),
  353. [PA]"+r"(ptrba), [PB]"+r"(ptrbb)
  354. :[ALPHA]"f"(alpha), [BK]"r"(bk)
  355. :"cc", "t0", "t4","t5","t6","t3","t1","t2",
  356. "ft11", "ft0", "ft1", "ft2","ft3","ft4", "ft5", "ft6","ft7",
  357. "v0", "v1", "v2", "v3","v4", "v5", "v6", "v7",
  358. "v8", "v9", "v10", "v11","v12", "v13", "v14", "v15",
  359. "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
  360. "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
  361. }
  362. if(bm&4){
  363. ptrbb = bb;
  364. res0 = 0;
  365. res1 = 0;
  366. res2 = 0;
  367. res3 = 0;
  368. res4 = 0;
  369. res5 = 0;
  370. res6 = 0;
  371. res7 = 0;
  372. res8 = 0;
  373. res9 = 0;
  374. res10 = 0;
  375. res11 = 0;
  376. res12 = 0;
  377. res13 = 0;
  378. res14 = 0;
  379. res15 = 0;
  380. for(k=0; k<bk; k+=1){
  381. loadb0 = ptrbb[0];
  382. loadb1 = ptrbb[1];
  383. load0 = ptrba[0];
  384. load1 = ptrba[1];
  385. load2 = ptrba[2];
  386. load3 = ptrba[3];
  387. res0 = res0 + load0 * loadb0;
  388. res1 = res1 + load1 * loadb0;
  389. res2 = res2 + load2 * loadb0;
  390. res3 = res3 + load3 * loadb0;
  391. res4 = res4 + load0 * loadb1;
  392. res5 = res5 + load1 * loadb1;
  393. res6 = res6 + load2 * loadb1;
  394. res7 = res7 + load3 * loadb1;
  395. loadb2 = ptrbb[2];
  396. loadb3 = ptrbb[3];
  397. res8 = res8 + load0 * loadb2;
  398. res9 = res9 + load1 * loadb2;
  399. res10 = res10 + load2 * loadb2;
  400. res11 = res11 + load3 * loadb2;
  401. res12 = res12 + load0 * loadb3;
  402. res13 = res13 + load1 * loadb3;
  403. res14 = res14 + load2 * loadb3;
  404. res15 = res15 + load3 * loadb3;
  405. ptrba += 4;
  406. ptrbb += 4;
  407. }
  408. res0 = res0 * alpha;
  409. res1 = res1 * alpha;
  410. res2 = res2 * alpha;
  411. res3 = res3 * alpha;
  412. res4 = res4 * alpha;
  413. res5 = res5 * alpha;
  414. res6 = res6 * alpha;
  415. res7 = res7 * alpha;
  416. res8 = res8 * alpha;
  417. res9 = res9 * alpha;
  418. res10 = res10 * alpha;
  419. res11 = res11 * alpha;
  420. res12 = res12 * alpha;
  421. res13 = res13 * alpha;
  422. res14 = res14 * alpha;
  423. res15 = res15 * alpha;
  424. C0[0] += res0;
  425. C0[1] += res1;
  426. C0[2] += res2;
  427. C0[3] += res3;
  428. C1[0] += res4;
  429. C1[1] += res5;
  430. C1[2] += res6;
  431. C1[3] += res7;
  432. C2[0] += res8;
  433. C2[1] += res9;
  434. C2[2] += res10;
  435. C2[3] += res11;
  436. C3[0] += res12;
  437. C3[1] += res13;
  438. C3[2] += res14;
  439. C3[3] += res15;
  440. C0 += 4;
  441. C1 += 4;
  442. C2 += 4;
  443. C3 += 4;
  444. }
  445. if(bm&2){
  446. ptrbb = bb;
  447. res0 = 0;
  448. res1 = 0;
  449. res4 = 0;
  450. res5 = 0;
  451. res8 = 0;
  452. res9 = 0;
  453. res12 = 0;
  454. res13 = 0;
  455. for(k=0; k<bk; k+=1){
  456. loadb0 = ptrbb[0];
  457. loadb1 = ptrbb[1];
  458. load0 = ptrba[0];
  459. load1 = ptrba[1];
  460. res0 = res0 + load0 * loadb0;
  461. res1 = res1 + load1 * loadb0;
  462. res4 = res4 + load0 * loadb1;
  463. res5 = res5 + load1 * loadb1;
  464. loadb2 = ptrbb[2];
  465. loadb3 = ptrbb[3];
  466. res8 = res8 + load0 * loadb2;
  467. res9 = res9 + load1 * loadb2;
  468. res12 = res12 + load0 * loadb3;
  469. res13 = res13 + load1 * loadb3;
  470. ptrba += 2;
  471. ptrbb += 4;
  472. }
  473. res0 = res0 * alpha;
  474. res1 = res1 * alpha;
  475. res4 = res4 * alpha;
  476. res5 = res5 * alpha;
  477. res8 = res8 * alpha;
  478. res9 = res9 * alpha;
  479. res12 = res12 * alpha;
  480. res13 = res13 * alpha;
  481. C0[0] += res0;
  482. C0[1] += res1;
  483. C1[0] += res4;
  484. C1[1] += res5;
  485. C2[0] += res8;
  486. C2[1] += res9;
  487. C3[0] += res12;
  488. C3[1] += res13;
  489. C0 += 2;
  490. C1 += 2;
  491. C2 += 2;
  492. C3 += 2;
  493. }
  494. if(bm&1){
  495. ptrbb = bb;
  496. res0 = 0;
  497. res4 = 0;
  498. res8 = 0;
  499. res12 = 0;
  500. for(k=0; k<bk; k+=1){
  501. loadb0 = ptrbb[0];
  502. loadb1 = ptrbb[1];
  503. load0 = ptrba[0];
  504. res0 = res0 + load0 * loadb0;
  505. res4 = res4 + load0 * loadb1;
  506. loadb2 = ptrbb[2];
  507. loadb3 = ptrbb[3];
  508. res8 = res8 + load0 * loadb2;
  509. res12 = res12 + load0 * loadb3;
  510. ptrba += 1;
  511. ptrbb += 4;
  512. }
  513. res0 = res0 * alpha;
  514. res4 = res4 * alpha;
  515. res8 = res8 * alpha;
  516. res12 = res12 * alpha;
  517. C0[0] += res0;
  518. C1[0] += res4;
  519. C2[0] += res8;
  520. C3[0] += res12;
  521. C0 += 1;
  522. C1 += 1;
  523. C2 += 1;
  524. C3 += 1;
  525. }
  526. k = bk<<2;
  527. bb = bb+k;
  528. i = ldc<<2;
  529. C = C+i;
  530. }
  531. if(bn&2){
  532. C0 = C;
  533. C1 = C0+ldc;
  534. ptrba = ba;
  535. for(i=0; i<bm/8; i+=1){
  536. ptrbb = bb;
  537. res0 = 0;
  538. res1 = 0;
  539. res2 = 0;
  540. res3 = 0;
  541. res4 = 0;
  542. res5 = 0;
  543. res6 = 0;
  544. res7 = 0;
  545. res8 = 0;
  546. res9 = 0;
  547. res10 = 0;
  548. res11 = 0;
  549. res12 = 0;
  550. res13 = 0;
  551. res14 = 0;
  552. res15 = 0;
  553. for(k=0; k<bk; k+=1){
  554. loadb0 = ptrbb[0];
  555. loadb1 = ptrbb[1];
  556. load0 = ptrba[0];
  557. load1 = ptrba[1];
  558. load2 = ptrba[2];
  559. load3 = ptrba[3];
  560. load4 = ptrba[4];
  561. load5 = ptrba[5];
  562. load6 = ptrba[6];
  563. load7 = ptrba[7];
  564. res0 = res0 + load0 * loadb0;
  565. res1 = res1 + load1 * loadb0;
  566. res2 = res2 + load2 * loadb0;
  567. res3 = res3 + load3 * loadb0;
  568. res4 = res4 + load4 * loadb0;
  569. res5 = res5 + load5 * loadb0;
  570. res6 = res6 + load6 * loadb0;
  571. res7 = res7 + load7 * loadb0;
  572. res8 = res8 + load0 * loadb1;
  573. res9 = res9 + load1 * loadb1;
  574. res10 = res10 + load2 * loadb1;
  575. res11 = res11 + load3 * loadb1;
  576. res12 = res12 + load4 * loadb1;
  577. res13 = res13 + load5 * loadb1;
  578. res14 = res14 + load6 * loadb1;
  579. res15 = res15 + load7 * loadb1;
  580. ptrba += 8;
  581. ptrbb += 2;
  582. }
  583. res0 = res0 * alpha;
  584. res1 = res1 * alpha;
  585. res2 = res2 * alpha;
  586. res3 = res3 * alpha;
  587. res4 = res4 * alpha;
  588. res5 = res5 * alpha;
  589. res6 = res6 * alpha;
  590. res7 = res7 * alpha;
  591. res8 = res8 * alpha;
  592. res9 = res9 * alpha;
  593. res10 = res10 * alpha;
  594. res11 = res11 * alpha;
  595. res12 = res12 * alpha;
  596. res13 = res13 * alpha;
  597. res14 = res14 * alpha;
  598. res15 = res15 * alpha;
  599. C0[0] += res0;
  600. C0[1] += res1;
  601. C0[2] += res2;
  602. C0[3] += res3;
  603. C0[4] += res4;
  604. C0[5] += res5;
  605. C0[6] += res6;
  606. C0[7] += res7;
  607. C1[0] += res8;
  608. C1[1] += res9;
  609. C1[2] += res10;
  610. C1[3] += res11;
  611. C1[4] += res12;
  612. C1[5] += res13;
  613. C1[6] += res14;
  614. C1[7] += res15;
  615. C0 += 8;
  616. C1 += 8;
  617. }
  618. if(bm&4){
  619. ptrbb = bb;
  620. res0 = 0;
  621. res1 = 0;
  622. res2 = 0;
  623. res3 = 0;
  624. res8 = 0;
  625. res9 = 0;
  626. res10 = 0;
  627. res11 = 0;
  628. for(k=0; k<bk; k+=1){
  629. loadb0 = ptrbb[0];
  630. loadb1 = ptrbb[1];
  631. load0 = ptrba[0];
  632. load1 = ptrba[1];
  633. load2 = ptrba[2];
  634. load3 = ptrba[3];
  635. res0 = res0 + load0 * loadb0;
  636. res1 = res1 + load1 * loadb0;
  637. res2 = res2 + load2 * loadb0;
  638. res3 = res3 + load3 * loadb0;
  639. res8 = res8 + load0 * loadb1;
  640. res9 = res9 + load1 * loadb1;
  641. res10 = res10 + load2 * loadb1;
  642. res11 = res11 + load3 * loadb1;
  643. ptrba += 4;
  644. ptrbb += 2;
  645. }
  646. res0 = res0 * alpha;
  647. res1 = res1 * alpha;
  648. res2 = res2 * alpha;
  649. res3 = res3 * alpha;
  650. res8 = res8 * alpha;
  651. res9 = res9 * alpha;
  652. res10 = res10 * alpha;
  653. res11 = res11 * alpha;
  654. C0[0] += res0;
  655. C0[1] += res1;
  656. C0[2] += res2;
  657. C0[3] += res3;
  658. C1[0] += res8;
  659. C1[1] += res9;
  660. C1[2] += res10;
  661. C1[3] += res11;
  662. C0 += 4;
  663. C1 += 4;
  664. }
  665. if(bm&2){
  666. ptrbb = bb;
  667. res0 = 0;
  668. res1 = 0;
  669. res8 = 0;
  670. res9 = 0;
  671. for(k=0; k<bk; k+=1){
  672. loadb0 = ptrbb[0];
  673. loadb1 = ptrbb[1];
  674. load0 = ptrba[0];
  675. load1 = ptrba[1];
  676. res0 = res0 + load0 * loadb0;
  677. res1 = res1 + load1 * loadb0;
  678. res8 = res8 + load0 * loadb1;
  679. res9 = res9 + load1 * loadb1;
  680. ptrba += 2;
  681. ptrbb += 2;
  682. }
  683. res0 = res0 * alpha;
  684. res1 = res1 * alpha;
  685. res8 = res8 * alpha;
  686. res9 = res9 * alpha;
  687. C0[0] += res0;
  688. C0[1] += res1;
  689. C1[0] += res8;
  690. C1[1] += res9;
  691. C0 += 2;
  692. C1 += 2;
  693. }
  694. if(bm&1){
  695. ptrbb = bb;
  696. res0 = 0;
  697. res8 = 0;
  698. for(k=0; k<bk; k+=1){
  699. loadb0 = ptrbb[0];
  700. loadb1 = ptrbb[1];
  701. load0 = ptrba[0];
  702. res0 = res0 + load0 * loadb0;
  703. res8 = res8 + load0 * loadb1;
  704. ptrba += 1;
  705. ptrbb += 2;
  706. }
  707. res0 = res0 * alpha;
  708. res8 = res8 * alpha;
  709. C0[0] += res0;
  710. C1[0] += res8;
  711. C0 += 1;
  712. C1 += 1;
  713. }
  714. k = bk<<1;
  715. bb = bb+k;
  716. i = ldc<<1;
  717. C = C+i;
  718. }
  719. if (bn&1){
  720. C0 = C;
  721. ptrba = ba;
  722. for(i=0; i<bm/8; i+=1){
  723. ptrbb = bb;
  724. res0 = 0;
  725. res1 = 0;
  726. res2 = 0;
  727. res3 = 0;
  728. res4 = 0;
  729. res5 = 0;
  730. res6 = 0;
  731. res7 = 0;
  732. for(k=0; k<bk; k+=1){
  733. loadb0 = ptrbb[0];
  734. res0 = res0 + ptrba[0] * loadb0;
  735. res1 = res1 + ptrba[1] * loadb0;
  736. res2 = res2 + ptrba[2] * loadb0;
  737. res3 = res3 + ptrba[3] * loadb0;
  738. res4 = res4 + ptrba[4] * loadb0;
  739. res5 = res5 + ptrba[5] * loadb0;
  740. res6 = res6 + ptrba[6] * loadb0;
  741. res7 = res7 + ptrba[7] * loadb0;
  742. ptrba += 8;
  743. ptrbb += 1;
  744. }
  745. res0 = res0 * alpha;
  746. res1 = res1 * alpha;
  747. res2 = res2 * alpha;
  748. res3 = res3 * alpha;
  749. res4 = res4 * alpha;
  750. res5 = res5 * alpha;
  751. res6 = res6 * alpha;
  752. res7 = res7 * alpha;
  753. C0[0] += res0;
  754. C0[1] += res1;
  755. C0[2] += res2;
  756. C0[3] += res3;
  757. C0[4] += res4;
  758. C0[5] += res5;
  759. C0[6] += res6;
  760. C0[7] += res7;
  761. C0 += 8;
  762. }
  763. if(bm&4){
  764. ptrbb = bb;
  765. res0 = 0;
  766. res1 = 0;
  767. res2 = 0;
  768. res3 = 0;
  769. for(k=0; k<bk; k+=1){
  770. loadb0 = ptrbb[0];
  771. res0 = res0 + ptrba[0] * loadb0;
  772. res1 = res1 + ptrba[1] * loadb0;
  773. res2 = res2 + ptrba[2] * loadb0;
  774. res3 = res3 + ptrba[3] * loadb0;
  775. ptrba += 4;
  776. ptrbb += 1;
  777. }
  778. res0 = res0 * alpha;
  779. res1 = res1 * alpha;
  780. res2 = res2 * alpha;
  781. res3 = res3 * alpha;
  782. C0[0] += res0;
  783. C0[1] += res1;
  784. C0[2] += res2;
  785. C0[3] += res3;
  786. C0 += 4;
  787. }
  788. if(bm&2){
  789. ptrbb = bb;
  790. res0 = 0;
  791. res1 = 0;
  792. for(k=0; k<bk; k+=1){
  793. loadb0 = ptrbb[0];
  794. res0 = res0 + ptrba[0] * loadb0;
  795. res1 = res1 + ptrba[1] * loadb0;
  796. ptrba += 2;
  797. ptrbb += 1;
  798. }
  799. res0 = res0 * alpha;
  800. res1 = res1 * alpha;
  801. C0[0] += res0;
  802. C0[1] += res1;
  803. C0 += 2;
  804. }
  805. if(bm&1){
  806. ptrbb = bb;
  807. res0 = 0;
  808. for(k=0; k<bk; k+=1){
  809. loadb0 = ptrbb[0];
  810. res0 = res0 + ptrba[0] * loadb0;
  811. ptrba += 1;
  812. ptrbb += 1;
  813. }
  814. res0 = res0 * alpha;
  815. C0[0] += res0;
  816. C0 += 1;
  817. }
  818. k = bk;
  819. bb = bb+k;
  820. C = C+ldc;
  821. }
  822. return 0;
  823. }