
sgemm_kernel_16x4_c910v.c

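/*
 * Single-precision GEMM microkernel for the T-Head C910 vector unit (per the
 * file name). The C910 implements the draft RVV 0.7.1 spec, hence the
 * pre-ratification vle.v/vse.v mnemonics; with e32,m1 and the C910's 128-bit
 * vector registers each vector load/store moves 4 floats, so a 16-row strip
 * occupies four vector registers.
 *
 * The KERNEL<M>x<N>_{I,M1,M2,E} macros below form a two-stage software
 * pipeline: _I is the prologue (first loads plus first FMAs), _M1 and _M2 are
 * the steady-state stages that ping-pong between two register banks (A in
 * v0-v3 vs v4-v7, broadcast B in v8-v11 vs v12-v15) so the loads of one stage
 * overlap the multiply-accumulates of the other, and _E drains the pipeline
 * without issuing further loads.
 */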
#include "common.h"
#include <riscv_vector.h>

#define KERNEL16x4_I \
  "addi t1, %[PB], 1*4 \n\t"\
  "addi t2, %[PB], 2*4 \n\t"\
  "addi t3, %[PB], 3*4 \n\t"\
  "flw ft0, (%[PB]) \n\t"\
  "flw ft1, (t1) \n\t"\
  "flw ft2, (t2) \n\t"\
  "flw ft3, (t3) \n\t"\
  "vle.v v0, (%[PA]) \n\t"\
  "addi t4, %[PA], 4*4 \n\t"\
  "addi t5, %[PA], 8*4 \n\t"\
  "vfmv.v.f v8, ft0 \n\t"\
  "addi t6, %[PA], 12*4 \n\t"\
  "addi %[PA], %[PA], 16*4 \n\t"\
  "vle.v v1, (t4) \n\t"\
  "addi t4, t4, 16*4 \n\t"\
  "vfmv.v.f v9, ft1 \n\t"\
  "vle.v v2, (t5) \n\t"\
  "addi t5, t5, 16*4 \n\t"\
  "vle.v v3, (t6) \n\t"\
  "addi t6, t6, 16*4 \n\t"\
  "vfmv.v.f v10, ft2 \n\t"\
  "addi %[PB], %[PB], 4*4 \n\t"\
  "vle.v v4, (%[PA]) \n\t"\
  "addi %[PA], %[PA], 16*4 \n\t"\
  "vfmv.v.f v11, ft3 \n\t"\
  "vfmacc.vv v16, v8, v0 \n\t"\
  "addi t1, t1, 4*4 \n\t"\
  "vle.v v5, (t4) \n\t"\
  "addi t4, t4, 16*4 \n\t"\
  "vfmacc.vv v17, v8, v1 \n\t"\
  "addi t2, t2, 4*4 \n\t"\
  "vle.v v6, (t5) \n\t"\
  "addi t5, t5, 16*4 \n\t"\
  "vfmacc.vv v18, v8, v2 \n\t"\
  "addi t3, t3, 4*4 \n\t"\
  "vle.v v7, (t6) \n\t"\
  "addi t6, t6, 16*4 \n\t"\
  "vfmacc.vv v19, v8, v3 \n\t"\
  "flw ft4, (%[PB]) \n\t"\
  "vfmacc.vv v20, v9, v0 \n\t"\
  "flw ft5, (t1) \n\t"\
  "vfmacc.vv v21, v9, v1 \n\t"\
  "flw ft6, (t2) \n\t"\
  "vfmacc.vv v22, v9, v2 \n\t"\
  "flw ft7, (t3) \n\t"\
  "vfmacc.vv v23, v9, v3 \n\t"\
  "vfmv.v.f v12, ft4 \n\t"\
  "vfmacc.vv v24, v10, v0 \n\t"\
  "vfmv.v.f v13, ft5 \n\t"\
  "vfmacc.vv v25, v10, v1 \n\t"\
  "vfmv.v.f v14, ft6 \n\t"\
  "vfmacc.vv v26, v10, v2 \n\t"\
  "vfmv.v.f v15, ft7 \n\t"\
  "vfmacc.vv v27, v10, v3 \n\t"\
  "addi %[PB], %[PB], 4*4 \n\t"\
  "vfmacc.vv v28, v11, v0 \n\t"\
  "addi t1, t1, 4*4 \n\t"\
  "vfmacc.vv v29, v11, v1 \n\t"\
  "addi t2, t2, 4*4 \n\t"\
  "vfmacc.vv v30, v11, v2 \n\t"\
  "addi t3, t3, 4*4 \n\t"\
  "vfmacc.vv v31, v11, v3 \n\t"
#define KERNEL16x4_M1 \
  "vfmacc.vv v16, v8, v0 \n\t"\
  "vle.v v4, (%[PA]) \n\t"\
  "addi %[PA], %[PA], 16*4 \n\t"\
  "vfmacc.vv v17, v8, v1 \n\t"\
  "vle.v v5, (t4) \n\t"\
  "addi t4, t4, 16*4 \n\t"\
  "vfmacc.vv v18, v8, v2 \n\t"\
  "vle.v v6, (t5) \n\t"\
  "addi t5, t5, 16*4 \n\t"\
  "vfmacc.vv v19, v8, v3 \n\t"\
  "vle.v v7, (t6) \n\t"\
  "addi t6, t6, 16*4 \n\t"\
  "vfmacc.vv v20, v9, v0 \n\t"\
  "flw ft4, (%[PB]) \n\t"\
  "vfmacc.vv v21, v9, v1 \n\t"\
  "flw ft5, (t1) \n\t"\
  "vfmacc.vv v22, v9, v2 \n\t"\
  "flw ft6, (t2) \n\t"\
  "vfmacc.vv v23, v9, v3 \n\t"\
  "flw ft7, (t3) \n\t"\
  "addi %[PB], %[PB], 4*4 \n\t"\
  "vfmacc.vv v24, v10, v0 \n\t"\
  "addi t1, t1, 4*4 \n\t"\
  "vfmacc.vv v25, v10, v1 \n\t"\
  "vfmv.v.f v12, ft4 \n\t"\
  "vfmacc.vv v26, v10, v2 \n\t"\
  "addi t2, t2, 4*4 \n\t"\
  "vfmacc.vv v27, v10, v3 \n\t"\
  "vfmv.v.f v13, ft5 \n\t"\
  "vfmacc.vv v28, v11, v0 \n\t"\
  "addi t3, t3, 4*4 \n\t"\
  "vfmacc.vv v29, v11, v1 \n\t"\
  "vfmv.v.f v14, ft6 \n\t"\
  "vfmacc.vv v30, v11, v2 \n\t"\
  "vfmacc.vv v31, v11, v3 \n\t"\
  "vfmv.v.f v15, ft7 \n\t"
#define KERNEL16x4_M2 \
  "vfmacc.vv v16, v12, v4 \n\t"\
  "vle.v v0, (%[PA]) \n\t"\
  "addi %[PA], %[PA], 16*4 \n\t"\
  "vfmacc.vv v17, v12, v5 \n\t"\
  "vle.v v1, (t4) \n\t"\
  "addi t4, t4, 16*4 \n\t"\
  "vfmacc.vv v18, v12, v6 \n\t"\
  "vle.v v2, (t5) \n\t"\
  "addi t5, t5, 16*4 \n\t"\
  "vfmacc.vv v19, v12, v7 \n\t"\
  "vle.v v3, (t6) \n\t"\
  "addi t6, t6, 16*4 \n\t"\
  "vfmacc.vv v20, v13, v4 \n\t"\
  "flw ft0, (%[PB]) \n\t"\
  "vfmacc.vv v21, v13, v5 \n\t"\
  "flw ft1, (t1) \n\t"\
  "vfmacc.vv v22, v13, v6 \n\t"\
  "flw ft2, (t2) \n\t"\
  "vfmacc.vv v23, v13, v7 \n\t"\
  "flw ft3, (t3) \n\t"\
  "addi %[PB], %[PB], 4*4 \n\t"\
  "vfmacc.vv v24, v14, v4 \n\t"\
  "addi t1, t1, 4*4 \n\t"\
  "vfmacc.vv v25, v14, v5 \n\t"\
  "vfmv.v.f v8, ft0 \n\t"\
  "vfmacc.vv v26, v14, v6 \n\t"\
  "addi t2, t2, 4*4 \n\t"\
  "vfmacc.vv v27, v14, v7 \n\t"\
  "vfmv.v.f v9, ft1 \n\t"\
  "vfmacc.vv v28, v15, v4 \n\t"\
  "addi t3, t3, 4*4 \n\t"\
  "vfmacc.vv v29, v15, v5 \n\t"\
  "vfmv.v.f v10, ft2 \n\t"\
  "vfmacc.vv v30, v15, v6 \n\t"\
  "vfmacc.vv v31, v15, v7 \n\t"\
  "vfmv.v.f v11, ft3 \n\t"
#define KERNEL16x4_E \
  "vfmacc.vv v16, v12, v4 \n\t"\
  "vfmacc.vv v17, v12, v5 \n\t"\
  "vfmacc.vv v18, v12, v6 \n\t"\
  "vfmacc.vv v19, v12, v7 \n\t"\
  "vfmacc.vv v20, v13, v4 \n\t"\
  "vfmacc.vv v21, v13, v5 \n\t"\
  "vfmacc.vv v22, v13, v6 \n\t"\
  "vfmacc.vv v23, v13, v7 \n\t"\
  "vfmacc.vv v24, v14, v4 \n\t"\
  "vfmacc.vv v25, v14, v5 \n\t"\
  "vfmacc.vv v26, v14, v6 \n\t"\
  "vfmacc.vv v27, v14, v7 \n\t"\
  "vfmacc.vv v28, v15, v4 \n\t"\
  "vfmacc.vv v29, v15, v5 \n\t"\
  "vfmacc.vv v30, v15, v6 \n\t"\
  "vfmacc.vv v31, v15, v7 \n\t"
#define KERNEL8x4_I \
  "addi t1, %[PB], 1*4 \n\t"\
  "addi t2, %[PB], 2*4 \n\t"\
  "addi t3, %[PB], 3*4 \n\t"\
  "flw ft0, (%[PB]) \n\t"\
  "flw ft1, (t1) \n\t"\
  "flw ft2, (t2) \n\t"\
  "flw ft3, (t3) \n\t"\
  "vle.v v0, (%[PA]) \n\t"\
  "addi t4, %[PA], 4*4 \n\t"\
  "vfmv.v.f v8, ft0 \n\t"\
  "addi %[PA], %[PA], 8*4 \n\t"\
  "vle.v v1, (t4) \n\t"\
  "addi t4, t4, 8*4 \n\t"\
  "vfmv.v.f v9, ft1 \n\t"\
  "vfmv.v.f v10, ft2 \n\t"\
  "addi %[PB], %[PB], 4*4 \n\t"\
  "vle.v v4, (%[PA]) \n\t"\
  "addi %[PA], %[PA], 8*4 \n\t"\
  "vfmv.v.f v11, ft3 \n\t"\
  "vfmacc.vv v16, v8, v0 \n\t"\
  "addi t1, t1, 4*4 \n\t"\
  "vle.v v5, (t4) \n\t"\
  "addi t4, t4, 8*4 \n\t"\
  "vfmacc.vv v17, v8, v1 \n\t"\
  "addi t2, t2, 4*4 \n\t"\
  "flw ft4, (%[PB]) \n\t"\
  "addi t3, t3, 4*4 \n\t"\
  "vfmacc.vv v20, v9, v0 \n\t"\
  "flw ft5, (t1) \n\t"\
  "vfmacc.vv v21, v9, v1 \n\t"\
  "flw ft6, (t2) \n\t"\
  "vfmv.v.f v12, ft4 \n\t"\
  "flw ft7, (t3) \n\t"\
  "vfmacc.vv v24, v10, v0 \n\t"\
  "vfmv.v.f v13, ft5 \n\t"\
  "vfmacc.vv v25, v10, v1 \n\t"\
  "vfmv.v.f v14, ft6 \n\t"\
  "addi %[PB], %[PB], 4*4 \n\t"\
  "vfmv.v.f v15, ft7 \n\t"\
  "addi t1, t1, 4*4 \n\t"\
  "vfmacc.vv v28, v11, v0 \n\t"\
  "addi t2, t2, 4*4 \n\t"\
  "vfmacc.vv v29, v11, v1 \n\t"\
  "addi t3, t3, 4*4 \n\t"
#define KERNEL8x4_M1 \
  "vfmacc.vv v16, v8, v0 \n\t"\
  "vle.v v4, (%[PA]) \n\t"\
  "addi %[PA], %[PA], 8*4 \n\t"\
  "vfmacc.vv v17, v8, v1 \n\t"\
  "vle.v v5, (t4) \n\t"\
  "addi t4, t4, 8*4 \n\t"\
  "vfmacc.vv v20, v9, v0 \n\t"\
  "flw ft4, (%[PB]) \n\t"\
  "vfmacc.vv v21, v9, v1 \n\t"\
  "flw ft5, (t1) \n\t"\
  "addi %[PB], %[PB], 4*4 \n\t"\
  "flw ft6, (t2) \n\t"\
  "vfmacc.vv v24, v10, v0 \n\t"\
  "flw ft7, (t3) \n\t"\
  "addi t1, t1, 4*4 \n\t"\
  "vfmacc.vv v25, v10, v1 \n\t"\
  "vfmv.v.f v12, ft4 \n\t"\
  "addi t2, t2, 4*4 \n\t"\
  "vfmv.v.f v13, ft5 \n\t"\
  "vfmacc.vv v28, v11, v0 \n\t"\
  "addi t3, t3, 4*4 \n\t"\
  "vfmacc.vv v29, v11, v1 \n\t"\
  "vfmv.v.f v14, ft6 \n\t"\
  "vfmv.v.f v15, ft7 \n\t"
#define KERNEL8x4_M2 \
  "vfmacc.vv v16, v12, v4 \n\t"\
  "vle.v v0, (%[PA]) \n\t"\
  "addi %[PA], %[PA], 8*4 \n\t"\
  "vfmacc.vv v17, v12, v5 \n\t"\
  "vle.v v1, (t4) \n\t"\
  "addi t4, t4, 8*4 \n\t"\
  "vfmacc.vv v20, v13, v4 \n\t"\
  "flw ft0, (%[PB]) \n\t"\
  "vfmacc.vv v21, v13, v5 \n\t"\
  "flw ft1, (t1) \n\t"\
  "addi %[PB], %[PB], 4*4 \n\t"\
  "flw ft2, (t2) \n\t"\
  "vfmacc.vv v24, v14, v4 \n\t"\
  "flw ft3, (t3) \n\t"\
  "addi t1, t1, 4*4 \n\t"\
  "vfmacc.vv v25, v14, v5 \n\t"\
  "vfmv.v.f v8, ft0 \n\t"\
  "addi t2, t2, 4*4 \n\t"\
  "vfmv.v.f v9, ft1 \n\t"\
  "vfmacc.vv v28, v15, v4 \n\t"\
  "addi t3, t3, 4*4 \n\t"\
  "vfmacc.vv v29, v15, v5 \n\t"\
  "vfmv.v.f v10, ft2 \n\t"\
  "vfmv.v.f v11, ft3 \n\t"
#define KERNEL8x4_E \
  "vfmacc.vv v16, v12, v4 \n\t"\
  "vfmacc.vv v17, v12, v5 \n\t"\
  "vfmacc.vv v20, v13, v4 \n\t"\
  "vfmacc.vv v21, v13, v5 \n\t"\
  "vfmacc.vv v24, v14, v4 \n\t"\
  "vfmacc.vv v25, v14, v5 \n\t"\
  "vfmacc.vv v28, v15, v4 \n\t"\
  "vfmacc.vv v29, v15, v5 \n\t"
#define KERNEL16x2_I \
  "addi t1, %[PB], 1*4 \n\t"\
  "flw ft0, (%[PB]) \n\t"\
  "flw ft1, (t1) \n\t"\
  "vle.v v0, (%[PA]) \n\t"\
  "addi t4, %[PA], 4*4 \n\t"\
  "addi t5, %[PA], 8*4 \n\t"\
  "vfmv.v.f v8, ft0 \n\t"\
  "addi t6, %[PA], 12*4 \n\t"\
  "addi %[PA], %[PA], 16*4 \n\t"\
  "vle.v v1, (t4) \n\t"\
  "addi t4, t4, 16*4 \n\t"\
  "vfmv.v.f v9, ft1 \n\t"\
  "vle.v v2, (t5) \n\t"\
  "addi t5, t5, 16*4 \n\t"\
  "vle.v v3, (t6) \n\t"\
  "addi t6, t6, 16*4 \n\t"\
  "addi %[PB], %[PB], 2*4 \n\t"\
  "vle.v v4, (%[PA]) \n\t"\
  "addi %[PA], %[PA], 16*4 \n\t"\
  "vfmacc.vv v16, v8, v0 \n\t"\
  "addi t1, t1, 2*4 \n\t"\
  "vle.v v5, (t4) \n\t"\
  "addi t4, t4, 16*4 \n\t"\
  "vfmacc.vv v17, v8, v1 \n\t"\
  "vle.v v6, (t5) \n\t"\
  "addi t5, t5, 16*4 \n\t"\
  "vfmacc.vv v18, v8, v2 \n\t"\
  "vle.v v7, (t6) \n\t"\
  "addi t6, t6, 16*4 \n\t"\
  "vfmacc.vv v19, v8, v3 \n\t"\
  "flw ft4, (%[PB]) \n\t"\
  "vfmacc.vv v20, v9, v0 \n\t"\
  "flw ft5, (t1) \n\t"\
  "vfmacc.vv v21, v9, v1 \n\t"\
  "addi %[PB], %[PB], 2*4 \n\t"\
  "vfmacc.vv v22, v9, v2 \n\t"\
  "addi t1, t1, 2*4 \n\t"\
  "vfmacc.vv v23, v9, v3 \n\t"\
  "vfmv.v.f v12, ft4 \n\t"\
  "vfmv.v.f v13, ft5 \n\t"
#define KERNEL16x2_M1 \
  "vfmacc.vv v16, v8, v0 \n\t"\
  "vle.v v4, (%[PA]) \n\t"\
  "addi %[PA], %[PA], 16*4 \n\t"\
  "vfmacc.vv v17, v8, v1 \n\t"\
  "vle.v v5, (t4) \n\t"\
  "addi t4, t4, 16*4 \n\t"\
  "vfmacc.vv v18, v8, v2 \n\t"\
  "vle.v v6, (t5) \n\t"\
  "addi t5, t5, 16*4 \n\t"\
  "vfmacc.vv v19, v8, v3 \n\t"\
  "vle.v v7, (t6) \n\t"\
  "addi t6, t6, 16*4 \n\t"\
  "flw ft4, (%[PB]) \n\t"\
  "vfmacc.vv v20, v9, v0 \n\t"\
  "flw ft5, (t1) \n\t"\
  "vfmacc.vv v21, v9, v1 \n\t"\
  "vfmv.v.f v12, ft4 \n\t"\
  "vfmacc.vv v22, v9, v2 \n\t"\
  "addi t1, t1, 2*4 \n\t"\
  "vfmacc.vv v23, v9, v3 \n\t"\
  "addi %[PB], %[PB], 2*4 \n\t"\
  "vfmv.v.f v13, ft5 \n\t"
#define KERNEL16x2_M2 \
  "vfmacc.vv v16, v12, v4 \n\t"\
  "vle.v v0, (%[PA]) \n\t"\
  "addi %[PA], %[PA], 16*4 \n\t"\
  "vfmacc.vv v17, v12, v5 \n\t"\
  "vle.v v1, (t4) \n\t"\
  "addi t4, t4, 16*4 \n\t"\
  "vfmacc.vv v18, v12, v6 \n\t"\
  "vle.v v2, (t5) \n\t"\
  "addi t5, t5, 16*4 \n\t"\
  "vfmacc.vv v19, v12, v7 \n\t"\
  "vle.v v3, (t6) \n\t"\
  "addi t6, t6, 16*4 \n\t"\
  "vfmacc.vv v20, v13, v4 \n\t"\
  "flw ft0, (%[PB]) \n\t"\
  "vfmacc.vv v21, v13, v5 \n\t"\
  "flw ft1, (t1) \n\t"\
  "vfmacc.vv v22, v13, v6 \n\t"\
  "vfmv.v.f v8, ft0 \n\t"\
  "vfmacc.vv v23, v13, v7 \n\t"\
  "addi %[PB], %[PB], 2*4 \n\t"\
  "addi t1, t1, 2*4 \n\t"\
  "vfmv.v.f v9, ft1 \n\t"
#define KERNEL16x2_E \
  "vfmacc.vv v16, v12, v4 \n\t"\
  "vfmacc.vv v17, v12, v5 \n\t"\
  "vfmacc.vv v18, v12, v6 \n\t"\
  "vfmacc.vv v19, v12, v7 \n\t"\
  "vfmacc.vv v20, v13, v4 \n\t"\
  "vfmacc.vv v21, v13, v5 \n\t"\
  "vfmacc.vv v22, v13, v6 \n\t"\
  "vfmacc.vv v23, v13, v7 \n\t"
int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha, FLOAT* ba, FLOAT* bb, FLOAT* C, BLASLONG ldc
#ifdef TRMMKERNEL
          , BLASLONG offset
#endif
          )
{
  BLASLONG i, j, k;
  FLOAT *C0, *C1, *C2, *C3;
  FLOAT *ptrba, *ptrbb, *tmpc;
  FLOAT loadb0, loadb1, loadb2, loadb3;
  FLOAT load0, load1, load2, load3, load4, load5, load6, load7;
  FLOAT res0, res1, res2, res3;
  FLOAT res4, res5, res6, res7;
  FLOAT res8, res9, res10, res11;
  FLOAT res12, res13, res14, res15;
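  /*
   * Scalar reference for the vectorized blocks below (a sketch of the
   * intended semantics, not compiled): for a 16x4 block,
   *   for (k = 0; k < bk; k++)
   *     for (jj = 0; jj < 4; jj++)
   *       for (ii = 0; ii < 16; ii++)
   *         acc[jj][ii] += ptrba[k*16 + ii] * ptrbb[k*4 + jj];
   * followed by C<jj>[ii] += alpha * acc[jj][ii].
   */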
  for (j = 0; j < bn/4; j += 1) {
    C0 = C;
    C1 = C0 + ldc;
    C2 = C1 + ldc;
    C3 = C2 + ldc;
    ptrba = ba;
    for (i = 0; i < bm/16; i += 1) {
      ptrbb = bb;
      // t0 for k
      // ft0-ft3,ft4-ft7,v8-v15 for B, t1-t3 for PB1-3
      // v0-v3,v4-v7 for A, t4-t6 for PA1-3
      // v16-v31 for temp C
      asm volatile(
        "vsetvli zero, zero, e32,m1 \n\t"
        "fmv.w.x ft11, zero \n\t"
        "mv t0, %[BK] \n\t"
        "vfmv.v.f v16, ft11 \n\t"
        "vfmv.v.f v17, ft11 \n\t"
        "vfmv.v.f v18, ft11 \n\t"
        "vfmv.v.f v19, ft11 \n\t"
        "vfmv.v.f v20, ft11 \n\t"
        "vfmv.v.f v21, ft11 \n\t"
        "vfmv.v.f v22, ft11 \n\t"
        "vfmv.v.f v23, ft11 \n\t"
        "vfmv.v.f v24, ft11 \n\t"
        "vfmv.v.f v25, ft11 \n\t"
        "vfmv.v.f v26, ft11 \n\t"
        "vfmv.v.f v27, ft11 \n\t"
        "vfmv.v.f v28, ft11 \n\t"
        "vfmv.v.f v29, ft11 \n\t"
        "vfmv.v.f v30, ft11 \n\t"
        "vfmv.v.f v31, ft11 \n\t"
        // unroll by 8
        "srli t0, %[BK], 3 \n\t"
        "blez t0, M16x4_TAIL \n\t"
        // pipeline prologue
        KERNEL16x4_I
        KERNEL16x4_M2
        KERNEL16x4_M1
        KERNEL16x4_M2
        "addi t0, t0, -1 \n\t"
        "blez t0, M16x4_MAINLOOP_TAIL \n\t"
        ".align 4 \n\t"
        "M16x4_MAINLOOP: \n\t"
        KERNEL16x4_M1
        KERNEL16x4_M2
        KERNEL16x4_M1
        KERNEL16x4_M2
        KERNEL16x4_M1
        KERNEL16x4_M2
        KERNEL16x4_M1
        KERNEL16x4_M2
        "addi t0, t0, -1 \n\t"
        "bgtz t0, M16x4_MAINLOOP \n\t"
        "M16x4_MAINLOOP_TAIL: \n\t"
        KERNEL16x4_M1
        KERNEL16x4_M2
        KERNEL16x4_M1
        KERNEL16x4_E
        // tail
        "M16x4_TAIL: \n\t"
        "andi t0, %[BK], 7 \n\t"
        "blez t0, M16x4_SAVERESULT \n\t"
        "addi t4, %[PA], 4*4 \n\t"
        "addi t5, %[PA], 8*4 \n\t"
        "addi t6, %[PA], 12*4 \n\t"
        "addi t1, %[PB], 1*4 \n\t"
        "addi t2, %[PB], 2*4 \n\t"
        "addi t3, %[PB], 3*4 \n\t"
        ".align 4 \n\t"
        "M16x4_TAILLOOP: \n\t"
        "flw ft0, (%[PB]) \n\t"
        "addi %[PB], %[PB], 4*4 \n\t"
        "vle.v v0, (%[PA]) \n\t"
        "add %[PA], %[PA], 16*4 \n\t"
        "vle.v v1, (t4) \n\t"
        "addi t4, t4, 16*4 \n\t"
        "vfmv.v.f v8, ft0 \n\t"
        "flw ft1, (t1) \n\t"
        "addi t1, t1, 4*4 \n\t"
        "vle.v v2, (t5) \n\t"
        "addi t5, t5, 16*4 \n\t"
        "vle.v v3, (t6) \n\t"
        "addi t6, t6, 16*4 \n\t"
        "vfmacc.vv v16, v8, v0 \n\t"
        "flw ft2, (t2) \n\t"
        "addi t2, t2, 4*4 \n\t"
        "vfmacc.vv v17, v8, v1 \n\t"
        "vfmacc.vv v18, v8, v2 \n\t"
        "vfmv.v.f v9, ft1 \n\t"
        "vfmacc.vv v19, v8, v3 \n\t"
        "vfmacc.vv v20, v9, v0 \n\t"
        "flw ft3, (t3) \n\t"
        "addi t3, t3, 4*4 \n\t"
        "vfmacc.vv v21, v9, v1 \n\t"
        "vfmacc.vv v22, v9, v2 \n\t"
        "vfmv.v.f v10, ft2 \n\t"
        "vfmacc.vv v23, v9, v3 \n\t"
        "vfmv.v.f v11, ft3 \n\t"
        "vfmacc.vv v24, v10, v0 \n\t"
        "vfmacc.vv v25, v10, v1 \n\t"
        "vfmacc.vv v26, v10, v2 \n\t"
        "vfmacc.vv v27, v10, v3 \n\t"
        "vfmacc.vv v28, v11, v0 \n\t"
        "vfmacc.vv v29, v11, v1 \n\t"
        "vfmacc.vv v30, v11, v2 \n\t"
        "vfmacc.vv v31, v11, v3 \n\t"
        "addi t0, t0, -1 \n\t"
        "bgtz t0, M16x4_TAILLOOP \n\t"
        // save result: load C, scale accumulators by alpha, store back
        "M16x4_SAVERESULT: \n\t"
        // use v8 to store alpha
        "vfmv.v.f v8, %[ALPHA] \n\t"
        "vle.v v0, (%[C0]) \n\t"
        "addi t4, %[C0], 4*4 \n\t"
        "vle.v v1, (%[C1]) \n\t"
        "addi t5, %[C1], 4*4 \n\t"
        "vle.v v2, (%[C2]) \n\t"
        "addi t6, %[C2], 4*4 \n\t"
        "vle.v v3, (%[C3]) \n\t"
        "addi t3, %[C3], 4*4 \n\t"
        // multiply by alpha
        "vfmacc.vv v0, v8, v16 \n\t"
        "vle.v v4, (t4) \n\t"
        "vfmacc.vv v1, v8, v20 \n\t"
        "vle.v v5, (t5) \n\t"
        "vfmacc.vv v2, v8, v24 \n\t"
        "vle.v v6, (t6) \n\t"
        "vfmacc.vv v3, v8, v28 \n\t"
        "vle.v v7, (t3) \n\t"
        "vfmacc.vv v4, v8, v17 \n\t"
        "vse.v v0, (%[C0]) \n\t"
        "add %[C0], %[C0], 8*4 \n\t"
        "vfmacc.vv v5, v8, v21 \n\t"
        "vse.v v1, (%[C1]) \n\t"
        "add %[C1], %[C1], 8*4 \n\t"
        "vfmacc.vv v6, v8, v25 \n\t"
        "vse.v v2, (%[C2]) \n\t"
        "add %[C2], %[C2], 8*4 \n\t"
        "vfmacc.vv v7, v8, v29 \n\t"
        "vse.v v3, (%[C3]) \n\t"
        "add %[C3], %[C3], 8*4 \n\t"
        "vle.v v0, (%[C0]) \n\t"
        "vse.v v4, (t4) \n\t"
        "add t4, t4, 8*4 \n\t"
        "vle.v v1, (%[C1]) \n\t"
        "vse.v v5, (t5) \n\t"
        "add t5, t5, 8*4 \n\t"
        "vle.v v2, (%[C2]) \n\t"
        "vse.v v6, (t6) \n\t"
        "add t6, t6, 8*4 \n\t"
        "vle.v v3, (%[C3]) \n\t"
        "vse.v v7, (t3) \n\t"
        "add t3, t3, 8*4 \n\t"
        "vfmacc.vv v0, v8, v18 \n\t"
        "vle.v v4, (t4) \n\t"
        "vfmacc.vv v1, v8, v22 \n\t"
        "vle.v v5, (t5) \n\t"
        "vfmacc.vv v2, v8, v26 \n\t"
        "vle.v v6, (t6) \n\t"
        "vfmacc.vv v3, v8, v30 \n\t"
        "vle.v v7, (t3) \n\t"
        "vfmacc.vv v4, v8, v19 \n\t"
        "vse.v v0, (%[C0]) \n\t"
        "add %[C0], %[C0], 8*4 \n\t"
        "vfmacc.vv v5, v8, v23 \n\t"
        "vse.v v1, (%[C1]) \n\t"
        "add %[C1], %[C1], 8*4 \n\t"
        "vfmacc.vv v6, v8, v27 \n\t"
        "vse.v v2, (%[C2]) \n\t"
        "add %[C2], %[C2], 8*4 \n\t"
        "vfmacc.vv v7, v8, v31 \n\t"
        "vse.v v3, (%[C3]) \n\t"
        "add %[C3], %[C3], 8*4 \n\t"
        "vse.v v4, (t4) \n\t"
        "vse.v v5, (t5) \n\t"
        "vse.v v6, (t6) \n\t"
        "vse.v v7, (t3) \n\t"
        "M16x4_END: \n\t"
        : [C0]"+r"(C0), [C1]"+r"(C1), [C2]"+r"(C2), [C3]"+r"(C3),
          [PA]"+r"(ptrba), [PB]"+r"(ptrbb)
        : [ALPHA]"f"(alpha), [BK]"r"(bk)
        : "cc", "t0", "t4", "t5", "t6", "t3", "t1", "t2",
          "ft11", "ft0", "ft1", "ft2", "ft3", "ft4", "ft5", "ft6", "ft7",
          "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
          "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
          "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
          "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
    }
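    /* Row remainders of this 4-column panel: an 8-row block reuses the
     * KERNEL8x4_* pipeline, the 4- and 2-row tails are plain C, and the
     * 1-row tail uses a vrgather-based vector loop. */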
  562. if(bm&8){
  563. ptrbb = bb;
  564. //t0 for k
  565. //ft0-ft3,ft4-ft7,v8-v15 for B, t1-t3 for PB1-3
  566. //v0-v3,v4-v7 for A, t4-t6 for PA1-3
  567. //v16-v31 for temp C
  568. asm volatile(
  569. "vsetvli zero, zero, e32,m1 \n\t"
  570. "fmv.w.x ft11, zero \n\t"
  571. "mv t0, %[BK] \n\t"
  572. "vfmv.v.f v16, ft11 \n\t"
  573. "vfmv.v.f v17, ft11 \n\t"
  574. "vfmv.v.f v20, ft11 \n\t"
  575. "vfmv.v.f v21, ft11 \n\t"
  576. "vfmv.v.f v24, ft11 \n\t"
  577. "vfmv.v.f v25, ft11 \n\t"
  578. "vfmv.v.f v28, ft11 \n\t"
  579. "vfmv.v.f v29, ft11 \n\t"
  580. //unloop 8
  581. "srli t0, %[BK], 3 \n\t"
  582. "blez t0, M8x4_TAIL \n\t"
  583. //preloop
  584. KERNEL8x4_I
  585. KERNEL8x4_M2
  586. KERNEL8x4_M1
  587. KERNEL8x4_M2
  588. "addi t0, t0, -1 \n\t"
  589. "blez t0, M8x4_MAINLOOP_TAIL \n\t"
  590. ".align 4 \n\t"
  591. "M8x4_MAINLOOP: \n\t"
  592. KERNEL8x4_M1
  593. KERNEL8x4_M2
  594. KERNEL8x4_M1
  595. KERNEL8x4_M2
  596. KERNEL8x4_M1
  597. KERNEL8x4_M2
  598. KERNEL8x4_M1
  599. KERNEL8x4_M2
  600. "addi t0, t0, -1 \n\t"
  601. "bgtz t0, M8x4_MAINLOOP \n\t"
  602. "M8x4_MAINLOOP_TAIL: \n\t"
  603. KERNEL8x4_M1
  604. KERNEL8x4_M2
  605. KERNEL8x4_M1
  606. KERNEL8x4_E
  607. //tail
  608. "M8x4_TAIL: \n\t"
  609. "andi t0, %[BK], 7 \n\t"
  610. "blez t0, M8x4_SAVERESULT \n\t"
  611. "addi t4, %[PA], 4*4 \n\t"
  612. "addi t1, %[PB], 1*4 \n\t"
  613. "addi t2, %[PB], 2*4 \n\t"
  614. "addi t3, %[PB], 3*4 \n\t"
  615. ".align 4 \n\t"
  616. "M8x4_TAILLOOP: \n\t"
  617. "flw ft0, (%[PB]) \n\t"
  618. "addi %[PB], %[PB], 4*4 \n\t"
  619. "vle.v v0, (%[PA]) \n\t"
  620. "add %[PA], %[PA], 8*4 \n\t"
  621. "vle.v v1, (t4) \n\t"
  622. "addi t4, t4, 8*4 \n\t"
  623. "vfmv.v.f v8, ft0 \n\t"
  624. "flw ft1, (t1) \n\t"
  625. "addi t1, t1, 4*4 \n\t"
  626. "vfmacc.vv v16, v8, v0 \n\t"
  627. "flw ft2, (t2) \n\t"
  628. "addi t2, t2, 4*4 \n\t"
  629. "vfmacc.vv v17, v8, v1 \n\t"
  630. "vfmv.v.f v9, ft1 \n\t"
  631. "vfmacc.vv v20, v9, v0 \n\t"
  632. "flw ft3, (t3) \n\t"
  633. "addi t3, t3, 4*4 \n\t"
  634. "vfmacc.vv v21, v9, v1 \n\t"
  635. "vfmv.v.f v10, ft2 \n\t"
  636. "vfmv.v.f v11, ft3 \n\t"
  637. "vfmacc.vv v24, v10, v0 \n\t"
  638. "vfmacc.vv v25, v10, v1 \n\t"
  639. "vfmacc.vv v28, v11, v0 \n\t"
  640. "vfmacc.vv v29, v11, v1 \n\t"
  641. "addi t0, t0, -1 \n\t"
  642. "bgtz t0, M8x4_TAILLOOP \n\t"
  643. //Save result
  644. //load C
  645. "M8x4_SAVERESULT: \n\t"
  646. //use v8 to store alpha
  647. "vfmv.v.f v8, %[ALPHA] \n\t"
  648. "vle.v v0, (%[C0]) \n\t"
  649. "addi t4, %[C0], 4*4 \n\t"
  650. "vle.v v1, (%[C1]) \n\t"
  651. "addi t5, %[C1], 4*4 \n\t"
  652. "vle.v v2, (%[C2]) \n\t"
  653. "addi t6, %[C2], 4*4 \n\t"
  654. "vle.v v3, (%[C3]) \n\t"
  655. "addi t3, %[C3], 4*4 \n\t"
  656. //Multiply Alpha
  657. "vfmacc.vv v0, v8, v16 \n\t"
  658. "vle.v v4, (t4) \n\t"
  659. "vfmacc.vv v1, v8, v20 \n\t"
  660. "vle.v v5, (t5) \n\t"
  661. "vfmacc.vv v2, v8, v24 \n\t"
  662. "vle.v v6, (t6) \n\t"
  663. "vfmacc.vv v3, v8, v28 \n\t"
  664. "vle.v v7, (t3) \n\t"
  665. "vfmacc.vv v4, v8, v17 \n\t"
  666. "vse.v v0, (%[C0]) \n\t"
  667. "add %[C0], %[C0], 8*4 \n\t"
  668. "vfmacc.vv v5, v8, v21 \n\t"
  669. "vse.v v1, (%[C1]) \n\t"
  670. "add %[C1], %[C1], 8*4 \n\t"
  671. "vfmacc.vv v6, v8, v25 \n\t"
  672. "vse.v v2, (%[C2]) \n\t"
  673. "add %[C2], %[C2], 8*4 \n\t"
  674. "vfmacc.vv v7, v8, v29 \n\t"
  675. "vse.v v3, (%[C3]) \n\t"
  676. "add %[C3], %[C3], 8*4 \n\t"
  677. "vse.v v4, (t4) \n\t"
  678. "vse.v v5, (t5) \n\t"
  679. "vse.v v6, (t6) \n\t"
  680. "vse.v v7, (t3) \n\t"
  681. "M8x4_END: \n\t"
  682. :[C0]"+r"(C0),[C1]"+r"(C1),[C2]"+r"(C2),[C3]"+r"(C3),
  683. [PA]"+r"(ptrba), [PB]"+r"(ptrbb)
  684. :[ALPHA]"f"(alpha), [BK]"r"(bk)
  685. :"cc", "t0", "t4","t5","t6","t3","t1","t2",
  686. "ft11", "ft0", "ft1", "ft2","ft3","ft4", "ft5", "ft6","ft7",
  687. "v0", "v1", "v2", "v3","v4", "v5", "v6", "v7",
  688. "v8", "v9", "v10", "v11","v12", "v13", "v14", "v15",
  689. "v16", "v17", "v20", "v21",
  690. "v24", "v25", "v28", "v29");
  691. }
  692. if(bm&4){
  693. ptrbb = bb;
  694. res0 = 0;
  695. res1 = 0;
  696. res2 = 0;
  697. res3 = 0;
  698. res4 = 0;
  699. res5 = 0;
  700. res6 = 0;
  701. res7 = 0;
  702. res8 = 0;
  703. res9 = 0;
  704. res10 = 0;
  705. res11 = 0;
  706. res12 = 0;
  707. res13 = 0;
  708. res14 = 0;
  709. res15 = 0;
  710. for(k=0; k<bk; k+=1){
  711. loadb0 = ptrbb[0];
  712. loadb1 = ptrbb[1];
  713. load0 = ptrba[0];
  714. load1 = ptrba[1];
  715. load2 = ptrba[2];
  716. load3 = ptrba[3];
  717. res0 = res0 + load0 * loadb0;
  718. res1 = res1 + load1 * loadb0;
  719. res2 = res2 + load2 * loadb0;
  720. res3 = res3 + load3 * loadb0;
  721. res4 = res4 + load0 * loadb1;
  722. res5 = res5 + load1 * loadb1;
  723. res6 = res6 + load2 * loadb1;
  724. res7 = res7 + load3 * loadb1;
  725. loadb2 = ptrbb[2];
  726. loadb3 = ptrbb[3];
  727. res8 = res8 + load0 * loadb2;
  728. res9 = res9 + load1 * loadb2;
  729. res10 = res10 + load2 * loadb2;
  730. res11 = res11 + load3 * loadb2;
  731. res12 = res12 + load0 * loadb3;
  732. res13 = res13 + load1 * loadb3;
  733. res14 = res14 + load2 * loadb3;
  734. res15 = res15 + load3 * loadb3;
  735. ptrba += 4;
  736. ptrbb += 4;
  737. }
  738. res0 = res0 * alpha;
  739. res1 = res1 * alpha;
  740. res2 = res2 * alpha;
  741. res3 = res3 * alpha;
  742. res4 = res4 * alpha;
  743. res5 = res5 * alpha;
  744. res6 = res6 * alpha;
  745. res7 = res7 * alpha;
  746. res8 = res8 * alpha;
  747. res9 = res9 * alpha;
  748. res10 = res10 * alpha;
  749. res11 = res11 * alpha;
  750. res12 = res12 * alpha;
  751. res13 = res13 * alpha;
  752. res14 = res14 * alpha;
  753. res15 = res15 * alpha;
  754. C0[0] += res0;
  755. C0[1] += res1;
  756. C0[2] += res2;
  757. C0[3] += res3;
  758. C1[0] += res4;
  759. C1[1] += res5;
  760. C1[2] += res6;
  761. C1[3] += res7;
  762. C2[0] += res8;
  763. C2[1] += res9;
  764. C2[2] += res10;
  765. C2[3] += res11;
  766. C3[0] += res12;
  767. C3[1] += res13;
  768. C3[2] += res14;
  769. C3[3] += res15;
  770. C0 += 4;
  771. C1 += 4;
  772. C2 += 4;
  773. C3 += 4;
  774. }
  775. if(bm&2){
  776. ptrbb = bb;
  777. res0 = 0;
  778. res1 = 0;
  779. res4 = 0;
  780. res5 = 0;
  781. res8 = 0;
  782. res9 = 0;
  783. res12 = 0;
  784. res13 = 0;
  785. for(k=0; k<bk; k+=1){
  786. loadb0 = ptrbb[0];
  787. loadb1 = ptrbb[1];
  788. load0 = ptrba[0];
  789. load1 = ptrba[1];
  790. res0 = res0 + load0 * loadb0;
  791. res1 = res1 + load1 * loadb0;
  792. res4 = res4 + load0 * loadb1;
  793. res5 = res5 + load1 * loadb1;
  794. loadb2 = ptrbb[2];
  795. loadb3 = ptrbb[3];
  796. res8 = res8 + load0 * loadb2;
  797. res9 = res9 + load1 * loadb2;
  798. res12 = res12 + load0 * loadb3;
  799. res13 = res13 + load1 * loadb3;
  800. ptrba += 2;
  801. ptrbb += 4;
  802. }
  803. res0 = res0 * alpha;
  804. res1 = res1 * alpha;
  805. res4 = res4 * alpha;
  806. res5 = res5 * alpha;
  807. res8 = res8 * alpha;
  808. res9 = res9 * alpha;
  809. res12 = res12 * alpha;
  810. res13 = res13 * alpha;
  811. C0[0] += res0;
  812. C0[1] += res1;
  813. C1[0] += res4;
  814. C1[1] += res5;
  815. C2[0] += res8;
  816. C2[1] += res9;
  817. C3[0] += res12;
  818. C3[1] += res13;
  819. C0 += 2;
  820. C1 += 2;
  821. C2 += 2;
  822. C3 += 2;
  823. }
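    /* 1 row x 4 columns: the roles flip, so B is loaded as vectors and A is
     * broadcast. vrgather.vi v<j>, v4, <k> splats element k of the four A
     * values held in v4; four partial products accumulate in v16-v19, which
     * are reduced with vfadd, scaled by alpha, and spilled through tmp[]
     * because the four results belong to four different C columns. */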
  824. if(bm&1){
  825. ptrbb = bb;
  826. //t0 for k
  827. //ft0-ft3,ft4-ft7,v8-v15 for B, t1-t3 for PB1-3
  828. //v0-v3,v4-v7 for A, t4-t6 for PA1-3
  829. //v16-v31 for temp C
  830. FLOAT tmp[4];
  831. tmpc=tmp;
  832. //t1-t3 for PB
  833. //v0-v4 for A, v8-v11 for B
  834. //v16-v19 for C
  835. asm volatile(
  836. "vsetvli zero, zero, e32,m1 \n\t"
  837. "fmv.w.x ft11, zero \n\t"
  838. "vfmv.v.f v16, ft11 \n\t"
  839. "vfmv.v.f v17, ft11 \n\t"
  840. "vfmv.v.f v18, ft11 \n\t"
  841. "vfmv.v.f v19, ft11 \n\t"
  842. //unloop 4
  843. "srli t0, %[BK], 2 \n\t"
  844. "blez t0, M1x4_TAIL \n\t"
  845. "addi t1, %[PB], 4*4 \n\t"
  846. "addi t2, %[PB], 8*4 \n\t"
  847. "addi t3, %[PB], 12*4 \n\t"
  848. ".align 4 \n\t"
  849. "M1x4_MAINLOOP: \n\t"
  850. "vle.v v4, (%[PA]) \n\t"
  851. "addi %[PA], %[PA], 4*4 \n\t"
  852. "vrgather.vi v0, v4, 0 \n\t"
  853. "vle.v v8, (%[PB]) \n\t"
  854. "addi %[PB], %[PB], 16*4 \n\t"
  855. "vrgather.vi v1, v4, 1 \n\t"
  856. "vle.v v9, (t1) \n\t"
  857. "addi t1, t1, 16*4 \n\t"
  858. "vrgather.vi v2, v4, 2 \n\t"
  859. "vle.v v10, (t2) \n\t"
  860. "addi t2, t2, 16*4 \n\t"
  861. "vrgather.vi v3, v4, 3 \n\t"
  862. "vle.v v11, (t3) \n\t"
  863. "addi t3, t3, 16*4 \n\t"
  864. "vfmacc.vv v16, v8, v0 \n\t"
  865. "vfmacc.vv v17, v9, v1 \n\t"
  866. "vfmacc.vv v18, v10, v2 \n\t"
  867. "vfmacc.vv v19, v11, v3 \n\t"
  868. "addi t0, t0, -1 \n\t"
  869. "bgtz t0, M1x4_MAINLOOP \n\t"
  870. "M1x4_TAIL: \n\t"
  871. "andi t0, %[BK], 3 \n\t"
  872. "blez t0, M1x4_SAVERESULT \n\t"
  873. "M1x4_TAILLOOP: \n\t"
  874. "flw ft0, (%[PA]) \n\t"
  875. "addi %[PA], %[PA], 1*4 \n\t"
  876. "vle.v v8, (%[PB]) \n\t"
  877. "addi %[PB], %[PB], 4*4 \n\t"
  878. "vfmv.v.f v0, ft0 \n\t"
  879. "vfmacc.vv v16, v8, v0 \n\t"
  880. "addi t0, t0, -1 \n\t"
  881. "bgtz t0, M1x4_TAILLOOP \n\t"
  882. "M1x4_SAVERESULT: \n\t"
  883. //merge v16-v19
  884. "vfadd.vv v16, v16, v17 \n\t"
  885. "vfadd.vv v18, v18, v19 \n\t"
  886. "vfadd.vv v16, v16, v18 \n\t"
  887. "vfmv.v.f v8, %[ALPHA] \n\t"
  888. "vfmul.vv v16, v8, v16 \n\t"
  889. "vse.v v16, (%[TMP_C]) \n\t"
  890. "M1x4_END: \n\t"
  891. :[TMP_C]"+r"(tmpc),
  892. [PA]"+r"(ptrba), [PB]"+r"(ptrbb)
  893. :[ALPHA]"f"(alpha), [BK]"r"(bk)
  894. :"cc", "t0", "t3","t1","t2",
  895. "ft0", "ft11",
  896. "v0", "v1", "v2", "v3","v4",
  897. "v8", "v9", "v10", "v11",
  898. "v16", "v17","v18", "v19"
  899. );
  900. C0[0] += tmp[0];
  901. C1[0] += tmp[1];
  902. C2[0] += tmp[2];
  903. C3[0] += tmp[3];
  904. /* don't need move c point
  905. C0 += 1;
  906. C1 += 1;
  907. C2 += 1;
  908. C3 += 1;
  909. */
  910. }
  911. k = bk<<2;
  912. bb = bb+k;
  913. i = ldc<<2;
  914. C = C+i;
  915. }
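  /* Two remaining columns: 16-row blocks run the KERNEL16x2_* pipeline;
   * the 8/4/2/1-row remainders are handled in plain C. */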
  916. if(bn&2){
  917. C0 = C;
  918. C1 = C0+ldc;
  919. ptrba = ba;
  920. for(i=0; i<bm/16; i+=1){
  921. ptrbb = bb;
  922. asm volatile(
  923. "vsetvli zero, zero, e32,m1 \n\t"
  924. "fmv.w.x ft11, zero \n\t"
  925. "mv t0, %[BK] \n\t"
  926. "vfmv.v.f v16, ft11 \n\t"
  927. "vfmv.v.f v17, ft11 \n\t"
  928. "vfmv.v.f v18, ft11 \n\t"
  929. "vfmv.v.f v19, ft11 \n\t"
  930. "vfmv.v.f v20, ft11 \n\t"
  931. "vfmv.v.f v21, ft11 \n\t"
  932. "vfmv.v.f v22, ft11 \n\t"
  933. "vfmv.v.f v23, ft11 \n\t"
  934. //unloop 8
  935. "srli t0, %[BK], 3 \n\t"
  936. "blez t0, M16x2_TAIL \n\t"
  937. //preloop
  938. KERNEL16x2_I
  939. KERNEL16x2_M2
  940. KERNEL16x2_M1
  941. KERNEL16x2_M2
  942. "addi t0, t0, -1 \n\t"
  943. "blez t0, M16x2_MAINLOOP_TAIL \n\t"
  944. ".align 4 \n\t"
  945. "M16x2_MAINLOOP: \n\t"
  946. KERNEL16x2_M1
  947. KERNEL16x2_M2
  948. KERNEL16x2_M1
  949. KERNEL16x2_M2
  950. KERNEL16x2_M1
  951. KERNEL16x2_M2
  952. KERNEL16x2_M1
  953. KERNEL16x2_M2
  954. "addi t0, t0, -1 \n\t"
  955. "bgtz t0, M16x2_MAINLOOP \n\t"
  956. "M16x2_MAINLOOP_TAIL: \n\t"
  957. KERNEL16x2_M1
  958. KERNEL16x2_M2
  959. KERNEL16x2_M1
  960. KERNEL16x2_E
  961. //tail
  962. "M16x2_TAIL: \n\t"
  963. "andi t0, %[BK], 7 \n\t"
  964. "blez t0, M16x2_SAVERESULT \n\t"
  965. "addi t4, %[PA], 4*4 \n\t"
  966. "addi t5, %[PA], 8*4 \n\t"
  967. "addi t6, %[PA], 12*4 \n\t"
  968. "addi t1, %[PB], 1*4 \n\t"
  969. ".align 4 \n\t"
  970. "M16x2_TAILLOOP: \n\t"
  971. "flw ft0, (%[PB]) \n\t"
  972. "addi %[PB], %[PB], 2*4 \n\t"
  973. "vle.v v0, (%[PA]) \n\t"
  974. "add %[PA], %[PA], 16*4 \n\t"
  975. "vle.v v1, (t4) \n\t"
  976. "addi t4, t4, 16*4 \n\t"
  977. "vfmv.v.f v8, ft0 \n\t"
  978. "flw ft1, (t1) \n\t"
  979. "addi t1, t1, 2*4 \n\t"
  980. "vle.v v2, (t5) \n\t"
  981. "addi t5, t5, 16*4 \n\t"
  982. "vle.v v3, (t6) \n\t"
  983. "addi t6, t6, 16*4 \n\t"
  984. "vfmv.v.f v9, ft1 \n\t"
  985. "vfmacc.vv v16, v8, v0 \n\t"
  986. "vfmacc.vv v17, v8, v1 \n\t"
  987. "vfmacc.vv v18, v8, v2 \n\t"
  988. "vfmacc.vv v19, v8, v3 \n\t"
  989. "vfmacc.vv v20, v9, v0 \n\t"
  990. "vfmacc.vv v21, v9, v1 \n\t"
  991. "vfmacc.vv v22, v9, v2 \n\t"
  992. "vfmacc.vv v23, v9, v3 \n\t"
  993. "addi t0, t0, -1 \n\t"
  994. "bgtz t0, M16x2_TAILLOOP \n\t"
  995. //Save result
  996. //load C
  997. "M16x2_SAVERESULT: \n\t"
  998. //use v8 to store alpha
  999. "vfmv.v.f v8, %[ALPHA] \n\t"
  1000. "vle.v v0, (%[C0]) \n\t"
  1001. "addi t4, %[C0], 4*4 \n\t"
  1002. "vle.v v1, (%[C1]) \n\t"
  1003. "addi t5, %[C1], 4*4 \n\t"
  1004. //Multiply Alpha
  1005. "vfmacc.vv v0, v8, v16 \n\t"
  1006. "vle.v v4, (t4) \n\t"
  1007. "vfmacc.vv v1, v8, v20 \n\t"
  1008. "vle.v v5, (t5) \n\t"
  1009. "vfmacc.vv v4, v8, v17 \n\t"
  1010. "vse.v v0, (%[C0]) \n\t"
  1011. "add %[C0], %[C0], 8*4 \n\t"
  1012. "vfmacc.vv v5, v8, v21 \n\t"
  1013. "vse.v v1, (%[C1]) \n\t"
  1014. "add %[C1], %[C1], 8*4 \n\t"
  1015. "vle.v v0, (%[C0]) \n\t"
  1016. "vse.v v4, (t4) \n\t"
  1017. "add t4, t4, 8*4 \n\t"
  1018. "vle.v v1, (%[C1]) \n\t"
  1019. "vse.v v5, (t5) \n\t"
  1020. "add t5, t5, 8*4 \n\t"
  1021. "vfmacc.vv v0, v8, v18 \n\t"
  1022. "vle.v v4, (t4) \n\t"
  1023. "vfmacc.vv v1, v8, v22 \n\t"
  1024. "vle.v v5, (t5) \n\t"
  1025. "vfmacc.vv v4, v8, v19 \n\t"
  1026. "vse.v v0, (%[C0]) \n\t"
  1027. "add %[C0], %[C0], 8*4 \n\t"
  1028. "vfmacc.vv v5, v8, v23 \n\t"
  1029. "vse.v v1, (%[C1]) \n\t"
  1030. "add %[C1], %[C1], 8*4 \n\t"
  1031. "vse.v v4, (t4) \n\t"
  1032. "vse.v v5, (t5) \n\t"
  1033. "M16x2_END: \n\t"
  1034. :[C0]"+r"(C0),[C1]"+r"(C1),
  1035. [PA]"+r"(ptrba), [PB]"+r"(ptrbb)
  1036. :[ALPHA]"f"(alpha), [BK]"r"(bk)
  1037. :"cc", "t0", "t4","t5","t6","t3","t1","t2",
  1038. "ft11", "ft0", "ft1", "ft2","ft3","ft4", "ft5", "ft6","ft7",
  1039. "v0", "v1", "v2", "v3","v4", "v5", "v6", "v7",
  1040. "v8", "v9", "v10", "v11","v12", "v13", "v14", "v15",
  1041. "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
  1042. }
  1043. if(bm&8){
  1044. ptrbb = bb;
  1045. res0 = 0;
  1046. res1 = 0;
  1047. res2 = 0;
  1048. res3 = 0;
  1049. res4 = 0;
  1050. res5 = 0;
  1051. res6 = 0;
  1052. res7 = 0;
  1053. res8 = 0;
  1054. res9 = 0;
  1055. res10 = 0;
  1056. res11 = 0;
  1057. res12 = 0;
  1058. res13 = 0;
  1059. res14 = 0;
  1060. res15 = 0;
  1061. for(k=0; k<bk; k+=1){
  1062. loadb0 = ptrbb[0];
  1063. loadb1 = ptrbb[1];
  1064. load0 = ptrba[0];
  1065. load1 = ptrba[1];
  1066. load2 = ptrba[2];
  1067. load3 = ptrba[3];
  1068. load4 = ptrba[4];
  1069. load5 = ptrba[5];
  1070. load6 = ptrba[6];
  1071. load7 = ptrba[7];
  1072. res0 = res0 + load0 * loadb0;
  1073. res1 = res1 + load1 * loadb0;
  1074. res2 = res2 + load2 * loadb0;
  1075. res3 = res3 + load3 * loadb0;
  1076. res4 = res4 + load4 * loadb0;
  1077. res5 = res5 + load5 * loadb0;
  1078. res6 = res6 + load6 * loadb0;
  1079. res7 = res7 + load7 * loadb0;
  1080. res8 = res8 + load0 * loadb1;
  1081. res9 = res9 + load1 * loadb1;
  1082. res10 = res10 + load2 * loadb1;
  1083. res11 = res11 + load3 * loadb1;
  1084. res12 = res12 + load4 * loadb1;
  1085. res13 = res13 + load5 * loadb1;
  1086. res14 = res14 + load6 * loadb1;
  1087. res15 = res15 + load7 * loadb1;
  1088. ptrba += 8;
  1089. ptrbb += 2;
  1090. }
  1091. res0 = res0 * alpha;
  1092. res1 = res1 * alpha;
  1093. res2 = res2 * alpha;
  1094. res3 = res3 * alpha;
  1095. res4 = res4 * alpha;
  1096. res5 = res5 * alpha;
  1097. res6 = res6 * alpha;
  1098. res7 = res7 * alpha;
  1099. res8 = res8 * alpha;
  1100. res9 = res9 * alpha;
  1101. res10 = res10 * alpha;
  1102. res11 = res11 * alpha;
  1103. res12 = res12 * alpha;
  1104. res13 = res13 * alpha;
  1105. res14 = res14 * alpha;
  1106. res15 = res15 * alpha;
  1107. C0[0] += res0;
  1108. C0[1] += res1;
  1109. C0[2] += res2;
  1110. C0[3] += res3;
  1111. C0[4] += res4;
  1112. C0[5] += res5;
  1113. C0[6] += res6;
  1114. C0[7] += res7;
  1115. C1[0] += res8;
  1116. C1[1] += res9;
  1117. C1[2] += res10;
  1118. C1[3] += res11;
  1119. C1[4] += res12;
  1120. C1[5] += res13;
  1121. C1[6] += res14;
  1122. C1[7] += res15;
  1123. C0 += 8;
  1124. C1 += 8;
  1125. }
  1126. if(bm&4){
  1127. ptrbb = bb;
  1128. res0 = 0;
  1129. res1 = 0;
  1130. res2 = 0;
  1131. res3 = 0;
  1132. res8 = 0;
  1133. res9 = 0;
  1134. res10 = 0;
  1135. res11 = 0;
  1136. for(k=0; k<bk; k+=1){
  1137. loadb0 = ptrbb[0];
  1138. loadb1 = ptrbb[1];
  1139. load0 = ptrba[0];
  1140. load1 = ptrba[1];
  1141. load2 = ptrba[2];
  1142. load3 = ptrba[3];
  1143. res0 = res0 + load0 * loadb0;
  1144. res1 = res1 + load1 * loadb0;
  1145. res2 = res2 + load2 * loadb0;
  1146. res3 = res3 + load3 * loadb0;
  1147. res8 = res8 + load0 * loadb1;
  1148. res9 = res9 + load1 * loadb1;
  1149. res10 = res10 + load2 * loadb1;
  1150. res11 = res11 + load3 * loadb1;
  1151. ptrba += 4;
  1152. ptrbb += 2;
  1153. }
  1154. res0 = res0 * alpha;
  1155. res1 = res1 * alpha;
  1156. res2 = res2 * alpha;
  1157. res3 = res3 * alpha;
  1158. res8 = res8 * alpha;
  1159. res9 = res9 * alpha;
  1160. res10 = res10 * alpha;
  1161. res11 = res11 * alpha;
  1162. C0[0] += res0;
  1163. C0[1] += res1;
  1164. C0[2] += res2;
  1165. C0[3] += res3;
  1166. C1[0] += res8;
  1167. C1[1] += res9;
  1168. C1[2] += res10;
  1169. C1[3] += res11;
  1170. C0 += 4;
  1171. C1 += 4;
  1172. }
  1173. if(bm&2){
  1174. ptrbb = bb;
  1175. res0 = 0;
  1176. res1 = 0;
  1177. res8 = 0;
  1178. res9 = 0;
  1179. for(k=0; k<bk; k+=1){
  1180. loadb0 = ptrbb[0];
  1181. loadb1 = ptrbb[1];
  1182. load0 = ptrba[0];
  1183. load1 = ptrba[1];
  1184. res0 = res0 + load0 * loadb0;
  1185. res1 = res1 + load1 * loadb0;
  1186. res8 = res8 + load0 * loadb1;
  1187. res9 = res9 + load1 * loadb1;
  1188. ptrba += 2;
  1189. ptrbb += 2;
  1190. }
  1191. res0 = res0 * alpha;
  1192. res1 = res1 * alpha;
  1193. res8 = res8 * alpha;
  1194. res9 = res9 * alpha;
  1195. C0[0] += res0;
  1196. C0[1] += res1;
  1197. C1[0] += res8;
  1198. C1[1] += res9;
  1199. C0 += 2;
  1200. C1 += 2;
  1201. }
  1202. if(bm&1){
  1203. ptrbb = bb;
  1204. res0 = 0;
  1205. res8 = 0;
  1206. for(k=0; k<bk; k+=1){
  1207. loadb0 = ptrbb[0];
  1208. loadb1 = ptrbb[1];
  1209. load0 = ptrba[0];
  1210. res0 = res0 + load0 * loadb0;
  1211. res8 = res8 + load0 * loadb1;
  1212. ptrba += 1;
  1213. ptrbb += 2;
  1214. }
  1215. res0 = res0 * alpha;
  1216. res8 = res8 * alpha;
  1217. C0[0] += res0;
  1218. C1[0] += res8;
  1219. C0 += 1;
  1220. C1 += 1;
  1221. }
  1222. k = bk<<1;
  1223. bb = bb+k;
  1224. i = ldc<<1;
  1225. C = C+i;
  1226. }
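  /* Final single column: plain-C dot-product loops, blocked by
   * 16/8/4/2/1 rows. */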
  1227. if (bn&1){
  1228. C0 = C;
  1229. ptrba = ba;
  1230. for(i=0; i<bm/16; i+=1){
  1231. ptrbb = bb;
  1232. res0 = 0;
  1233. res1 = 0;
  1234. res2 = 0;
  1235. res3 = 0;
  1236. res4 = 0;
  1237. res5 = 0;
  1238. res6 = 0;
  1239. res7 = 0;
  1240. res8 = 0;
  1241. res9 = 0;
  1242. res10 = 0;
  1243. res11 = 0;
  1244. res12 = 0;
  1245. res13 = 0;
  1246. res14 = 0;
  1247. res15 = 0;
  1248. for(k=0; k<bk; k+=1){
  1249. loadb0 = ptrbb[0];
  1250. res0 = res0 + ptrba[0] * loadb0;
  1251. res1 = res1 + ptrba[1] * loadb0;
  1252. res2 = res2 + ptrba[2] * loadb0;
  1253. res3 = res3 + ptrba[3] * loadb0;
  1254. res4 = res4 + ptrba[4] * loadb0;
  1255. res5 = res5 + ptrba[5] * loadb0;
  1256. res6 = res6 + ptrba[6] * loadb0;
  1257. res7 = res7 + ptrba[7] * loadb0;
  1258. res8 = res8 + ptrba[8] * loadb0;
  1259. res9 = res9 + ptrba[9] * loadb0;
  1260. res10 = res10 + ptrba[10] * loadb0;
  1261. res11 = res11 + ptrba[11] * loadb0;
  1262. res12 = res12 + ptrba[12] * loadb0;
  1263. res13 = res13 + ptrba[13] * loadb0;
  1264. res14 = res14 + ptrba[14] * loadb0;
  1265. res15 = res15 + ptrba[15] * loadb0;
  1266. ptrba += 16;
  1267. ptrbb += 1;
  1268. }
  1269. res0 = res0 * alpha;
  1270. res1 = res1 * alpha;
  1271. res2 = res2 * alpha;
  1272. res3 = res3 * alpha;
  1273. res4 = res4 * alpha;
  1274. res5 = res5 * alpha;
  1275. res6 = res6 * alpha;
  1276. res7 = res7 * alpha;
  1277. res8 = res8 * alpha;
  1278. res9 = res9 * alpha;
  1279. res10 = res10 * alpha;
  1280. res11 = res11 * alpha;
  1281. res12 = res12 * alpha;
  1282. res13 = res13 * alpha;
  1283. res14 = res14 * alpha;
  1284. res15 = res15 * alpha;
  1285. C0[0] += res0;
  1286. C0[1] += res1;
  1287. C0[2] += res2;
  1288. C0[3] += res3;
  1289. C0[4] += res4;
  1290. C0[5] += res5;
  1291. C0[6] += res6;
  1292. C0[7] += res7;
  1293. C0[8] += res8;
  1294. C0[9] += res9;
  1295. C0[10] += res10;
  1296. C0[11] += res11;
  1297. C0[12] += res12;
  1298. C0[13] += res13;
  1299. C0[14] += res14;
  1300. C0[15] += res15;
  1301. C0 += 16;
  1302. }
  1303. if(bm&8){
  1304. ptrbb = bb;
  1305. res0 = 0;
  1306. res1 = 0;
  1307. res2 = 0;
  1308. res3 = 0;
  1309. res4 = 0;
  1310. res5 = 0;
  1311. res6 = 0;
  1312. res7 = 0;
  1313. for(k=0; k<bk; k+=1){
  1314. loadb0 = ptrbb[0];
  1315. res0 = res0 + ptrba[0] * loadb0;
  1316. res1 = res1 + ptrba[1] * loadb0;
  1317. res2 = res2 + ptrba[2] * loadb0;
  1318. res3 = res3 + ptrba[3] * loadb0;
  1319. res4 = res4 + ptrba[4] * loadb0;
  1320. res5 = res5 + ptrba[5] * loadb0;
  1321. res6 = res6 + ptrba[6] * loadb0;
  1322. res7 = res7 + ptrba[7] * loadb0;
  1323. ptrba += 8;
  1324. ptrbb += 1;
  1325. }
  1326. res0 = res0 * alpha;
  1327. res1 = res1 * alpha;
  1328. res2 = res2 * alpha;
  1329. res3 = res3 * alpha;
  1330. res4 = res4 * alpha;
  1331. res5 = res5 * alpha;
  1332. res6 = res6 * alpha;
  1333. res7 = res7 * alpha;
  1334. C0[0] += res0;
  1335. C0[1] += res1;
  1336. C0[2] += res2;
  1337. C0[3] += res3;
  1338. C0[4] += res4;
  1339. C0[5] += res5;
  1340. C0[6] += res6;
  1341. C0[7] += res7;
  1342. C0 += 8;
  1343. }
  1344. if(bm&4){
  1345. ptrbb = bb;
  1346. res0 = 0;
  1347. res1 = 0;
  1348. res2 = 0;
  1349. res3 = 0;
  1350. for(k=0; k<bk; k+=1){
  1351. loadb0 = ptrbb[0];
  1352. res0 = res0 + ptrba[0] * loadb0;
  1353. res1 = res1 + ptrba[1] * loadb0;
  1354. res2 = res2 + ptrba[2] * loadb0;
  1355. res3 = res3 + ptrba[3] * loadb0;
  1356. ptrba += 4;
  1357. ptrbb += 1;
  1358. }
  1359. res0 = res0 * alpha;
  1360. res1 = res1 * alpha;
  1361. res2 = res2 * alpha;
  1362. res3 = res3 * alpha;
  1363. C0[0] += res0;
  1364. C0[1] += res1;
  1365. C0[2] += res2;
  1366. C0[3] += res3;
  1367. C0 += 4;
  1368. }
  1369. if(bm&2){
  1370. ptrbb = bb;
  1371. res0 = 0;
  1372. res1 = 0;
  1373. for(k=0; k<bk; k+=1){
  1374. loadb0 = ptrbb[0];
  1375. res0 = res0 + ptrba[0] * loadb0;
  1376. res1 = res1 + ptrba[1] * loadb0;
  1377. ptrba += 2;
  1378. ptrbb += 1;
  1379. }
  1380. res0 = res0 * alpha;
  1381. res1 = res1 * alpha;
  1382. C0[0] += res0;
  1383. C0[1] += res1;
  1384. C0 += 2;
  1385. }
  1386. if(bm&1){
  1387. ptrbb = bb;
  1388. res0 = 0;
  1389. for(k=0; k<bk; k+=1){
  1390. loadb0 = ptrbb[0];
  1391. res0 = res0 + ptrba[0] * loadb0;
  1392. ptrba += 1;
  1393. ptrbb += 1;
  1394. }
  1395. res0 = res0 * alpha;
  1396. C0[0] += res0;
  1397. C0 += 1;
  1398. }
    k = bk;
    bb = bb + k;
    C = C + ldc;
  }
  return 0;
}