You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

cgemm_kernel_8x4_msa.c 77 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154
/*******************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
  27. #include "common.h"
  28. #include "macros_msa.h"
  29. #define CGEMM_KERNEL_8X4_MSA(OP0, OP1, OP2, OP3, OP4) \
  30. { \
  31. LD_SP4_INC(pa0, 4, src_a0, src_a1, src_a2, src_a3); \
  32. LD_SP2_INC(pb0, 4, src_b0, src_b1); \
  33. \
  34. PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \
  35. PCKEVOD_W2_SP(src_a3, src_a2, src_a1r, src_a1i); \
  36. \
  37. /* 0th col */ \
  38. SPLATI_W2_SP(src_b0, 0, src_br, src_bi); \
  39. res0_r OP0## = src_a0r * src_br; \
  40. res0_r OP1## = src_a0i * src_bi; \
  41. res0_i OP2## = (OP4 src_a0r) * src_bi; \
  42. res0_i OP3## = src_a0i * src_br; \
  43. \
  44. res1_r OP0## = src_a1r * src_br; \
  45. res1_r OP1## = src_a1i * src_bi; \
  46. res1_i OP2## = (OP4 src_a1r) * src_bi; \
  47. res1_i OP3## = src_a1i * src_br; \
  48. \
  49. /* 1st col */ \
  50. SPLATI_W2_SP(src_b0, 2, src_br, src_bi); \
  51. res2_r OP0## = src_a0r * src_br; \
  52. res2_r OP1## = src_a0i * src_bi; \
  53. res2_i OP2## = (OP4 src_a0r) * src_bi; \
  54. res2_i OP3## = src_a0i * src_br; \
  55. \
  56. res3_r OP0## = src_a1r * src_br; \
  57. res3_r OP1## = src_a1i * src_bi; \
  58. res3_i OP2## = (OP4 src_a1r) * src_bi; \
  59. res3_i OP3## = src_a1i * src_br; \
  60. \
  61. /* 2nd col */ \
  62. SPLATI_W2_SP(src_b1, 0, src_br, src_bi); \
  63. res4_r OP0## = src_a0r * src_br; \
  64. res4_r OP1## = src_a0i * src_bi; \
  65. res4_i OP2## = (OP4 src_a0r) * src_bi; \
  66. res4_i OP3## = src_a0i * src_br; \
  67. \
  68. res5_r OP0## = src_a1r * src_br; \
  69. res5_r OP1## = src_a1i * src_bi; \
  70. res5_i OP2## = (OP4 src_a1r) * src_bi; \
  71. res5_i OP3## = src_a1i * src_br; \
  72. \
  73. /* 3rd col */ \
  74. SPLATI_W2_SP(src_b1, 2, src_br, src_bi); \
  75. res6_r OP0## = src_a0r * src_br; \
  76. res6_r OP1## = src_a0i * src_bi; \
  77. res6_i OP2## = (OP4 src_a0r) * src_bi; \
  78. res6_i OP3## = src_a0i * src_br; \
  79. \
  80. res7_r OP0## = src_a1r * src_br; \
  81. res7_r OP1## = src_a1i * src_bi; \
  82. res7_i OP2## = (OP4 src_a1r) * src_bi; \
  83. res7_i OP3## = src_a1i * src_br; \
  84. }
  85. #define CGEMM_KERNEL_8X2_MSA(OP0, OP1, OP2, OP3, OP4) \
  86. { \
  87. LD_SP4_INC(pa0, 4, src_a0, src_a1, src_a2, src_a3); \
  88. src_b0 = LD_SP(pb0); \
  89. \
  90. PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \
  91. PCKEVOD_W2_SP(src_a3, src_a2, src_a1r, src_a1i); \
  92. \
  93. /* 0th col */ \
  94. SPLATI_W2_SP(src_b0, 0, src_br, src_bi); \
  95. res0_r OP0## = src_a0r * src_br; \
  96. res0_r OP1## = src_a0i * src_bi; \
  97. res0_i OP2## = (OP4 src_a0r) * src_bi; \
  98. res0_i OP3## = src_a0i * src_br; \
  99. \
  100. res1_r OP0## = src_a1r * src_br; \
  101. res1_r OP1## = src_a1i * src_bi; \
  102. res1_i OP2## = (OP4 src_a1r) * src_bi; \
  103. res1_i OP3## = src_a1i * src_br; \
  104. \
  105. /* 1st col */ \
  106. SPLATI_W2_SP(src_b0, 2, src_br, src_bi); \
  107. res2_r OP0## = src_a0r * src_br; \
  108. res2_r OP1## = src_a0i * src_bi; \
  109. res2_i OP2## = (OP4 src_a0r) * src_bi; \
  110. res2_i OP3## = src_a0i * src_br; \
  111. \
  112. res3_r OP0## = src_a1r * src_br; \
  113. res3_r OP1## = src_a1i * src_bi; \
  114. res3_i OP2## = (OP4 src_a1r) * src_bi; \
  115. res3_i OP3## = src_a1i * src_br; \
  116. }
  117. #define CGEMM_KERNEL_8X1_MSA(OP0, OP1, OP2, OP3, OP4) \
  118. { \
  119. LD_SP4_INC(pa0, 4, src_a0, src_a1, src_a2, src_a3); \
  120. src_bi = (v4f32) __msa_cast_to_vector_double(*((double *) pb0)); \
  121. SPLATI_W2_SP(src_bi, 0, src_br, src_bi); \
  122. \
  123. PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \
  124. PCKEVOD_W2_SP(src_a3, src_a2, src_a1r, src_a1i); \
  125. \
  126. /* 0th col */ \
  127. res0_r OP0## = src_a0r * src_br; \
  128. res0_r OP1## = src_a0i * src_bi; \
  129. res0_i OP2## = (OP4 src_a0r) * src_bi; \
  130. res0_i OP3## = src_a0i * src_br; \
  131. \
  132. res1_r OP0## = src_a1r * src_br; \
  133. res1_r OP1## = src_a1i * src_bi; \
  134. res1_i OP2## = (OP4 src_a1r) * src_bi; \
  135. res1_i OP3## = src_a1i * src_br; \
  136. }
  137. #define CGEMM_KERNEL_4X4_MSA(OP0, OP1, OP2, OP3, OP4) \
  138. { \
  139. LD_SP2_INC(pa0, 4, src_a0, src_a1); \
  140. LD_SP2_INC(pb0, 4, src_b0, src_b1); \
  141. \
  142. PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \
  143. \
  144. /* 0th col */ \
  145. SPLATI_W2_SP(src_b0, 0, src_br, src_bi); \
  146. res0_r OP0## = src_a0r * src_br; \
  147. res0_r OP1## = src_a0i * src_bi; \
  148. res0_i OP2## = OP4 src_a0r * src_bi; \
  149. res0_i OP3## = src_a0i * src_br; \
  150. \
  151. /* 1st col */ \
  152. SPLATI_W2_SP(src_b0, 2, src_br, src_bi); \
  153. res2_r OP0## = src_a0r * src_br; \
  154. res2_r OP1## = src_a0i * src_bi; \
  155. res2_i OP2## = OP4 src_a0r * src_bi; \
  156. res2_i OP3## = src_a0i * src_br; \
  157. \
  158. /* 2nd col */ \
  159. SPLATI_W2_SP(src_b1, 0, src_br, src_bi); \
  160. res4_r OP0## = src_a0r * src_br; \
  161. res4_r OP1## = src_a0i * src_bi; \
  162. res4_i OP2## = OP4 src_a0r * src_bi; \
  163. res4_i OP3## = src_a0i * src_br; \
  164. \
  165. /* 3rd col */ \
  166. SPLATI_W2_SP(src_b1, 2, src_br, src_bi); \
  167. res6_r OP0## = src_a0r * src_br; \
  168. res6_r OP1## = src_a0i * src_bi; \
  169. res6_i OP2## = OP4 src_a0r * src_bi; \
  170. res6_i OP3## = src_a0i * src_br; \
  171. }
  172. #define CGEMM_KERNEL_4X2_MSA(OP0, OP1, OP2, OP3, OP4) \
  173. { \
  174. LD_SP2_INC(pa0, 4, src_a0, src_a1); \
  175. src_b0 = LD_SP(pb0); \
  176. \
  177. PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \
  178. \
  179. /* 0th col */ \
  180. SPLATI_W2_SP(src_b0, 0, src_br, src_bi); \
  181. res0_r OP0## = src_a0r * src_br; \
  182. res0_r OP1## = src_a0i * src_bi; \
  183. res0_i OP2## = OP4 src_a0r * src_bi; \
  184. res0_i OP3## = src_a0i * src_br; \
  185. \
  186. /* 1st col */ \
  187. SPLATI_W2_SP(src_b0, 2, src_br, src_bi); \
  188. res2_r OP0## = src_a0r * src_br; \
  189. res2_r OP1## = src_a0i * src_bi; \
  190. res2_i OP2## = OP4 src_a0r * src_bi; \
  191. res2_i OP3## = src_a0i * src_br; \
  192. }
  193. #define CGEMM_KERNEL_4X1_MSA(OP0, OP1, OP2, OP3, OP4) \
  194. { \
  195. LD_SP2_INC(pa0, 4, src_a0, src_a1); \
  196. src_bi = (v4f32) __msa_cast_to_vector_double(*((double *) pb0)); \
  197. SPLATI_W2_SP(src_bi, 0, src_br, src_bi); \
  198. \
  199. PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \
  200. \
  201. /* 0th col */ \
  202. res0_r OP0## = src_a0r * src_br; \
  203. res0_r OP1## = src_a0i * src_bi; \
  204. res0_i OP2## = OP4 src_a0r * src_bi; \
  205. res0_i OP3## = src_a0i * src_br; \
  206. }
  207. #define CGEMM_KERNEL_2X4(OP0, OP1, OP2, OP3, OP4) \
  208. { \
  209. a0_r = pa0[0]; \
  210. a0_i = pa0[1]; \
  211. b0_r = pb0[0]; \
  212. b0_i = pb0[1]; \
  213. \
  214. res0 OP0## = a0_r * b0_r; \
  215. res0 OP1## = a0_i * b0_i; \
  216. res1 OP2## = OP4 a0_r * b0_i; \
  217. res1 OP3## = a0_i * b0_r; \
  218. \
  219. a1_r = pa0[2]; \
  220. a1_i = pa0[3]; \
  221. res2 OP0## = a1_r * b0_r; \
  222. res2 OP1## = a1_i * b0_i; \
  223. res3 OP2## = OP4 a1_r * b0_i; \
  224. res3 OP3## = a1_i * b0_r; \
  225. \
  226. /* 1st col */ \
  227. b1_r = pb0[2]; \
  228. b1_i = pb0[3]; \
  229. res4 OP0## = a0_r * b1_r; \
  230. res4 OP1## = a0_i * b1_i; \
  231. res5 OP2## = OP4 a0_r * b1_i; \
  232. res5 OP3## = a0_i * b1_r; \
  233. \
  234. res6 OP0## = a1_r * b1_r; \
  235. res6 OP1## = a1_i * b1_i; \
  236. res7 OP2## = OP4 a1_r * b1_i; \
  237. res7 OP3## = a1_i * b1_r; \
  238. \
  239. /* 2nd col */ \
  240. b2_r = pb0[4]; \
  241. b2_i = pb0[5]; \
  242. res8 OP0## = a0_r * b2_r; \
  243. res8 OP1## = a0_i * b2_i; \
  244. res9 OP2## = OP4 a0_r * b2_i; \
  245. res9 OP3## = a0_i * b2_r; \
  246. \
  247. res10 OP0## = a1_r * b2_r; \
  248. res10 OP1## = a1_i * b2_i; \
  249. res11 OP2## = OP4 a1_r * b2_i; \
  250. res11 OP3## = a1_i * b2_r; \
  251. \
  252. /* 3rd col */ \
  253. b3_r = pb0[6]; \
  254. b3_i = pb0[7]; \
  255. res12 OP0## = a0_r * b3_r; \
  256. res12 OP1## = a0_i * b3_i; \
  257. res13 OP2## = OP4 a0_r * b3_i; \
  258. res13 OP3## = a0_i * b3_r; \
  259. \
  260. res14 OP0## = a1_r * b3_r; \
  261. res14 OP1## = a1_i * b3_i; \
  262. res15 OP2## = OP4 a1_r * b3_i; \
  263. res15 OP3## = a1_i * b3_r; \
  264. }
  265. #define CGEMM_KERNEL_2X2(OP0, OP1, OP2, OP3, OP4) \
  266. { \
  267. /* 0th col */ \
  268. a0_r = pa0[0]; \
  269. a0_i = pa0[1]; \
  270. b0_r = pb0[0]; \
  271. b0_i = pb0[1]; \
  272. \
  273. res0 OP0## = a0_r * b0_r; \
  274. res0 OP1## = a0_i * b0_i; \
  275. res1 OP2## = OP4 a0_r * b0_i; \
  276. res1 OP3## = a0_i * b0_r; \
  277. \
  278. a1_r = pa0[2]; \
  279. a1_i = pa0[3]; \
  280. res2 OP0## = a1_r * b0_r; \
  281. res2 OP1## = a1_i * b0_i; \
  282. res3 OP2## = OP4 a1_r * b0_i; \
  283. res3 OP3## = a1_i * b0_r; \
  284. \
  285. /* 1st col */ \
  286. b1_r = pb0[2]; \
  287. b1_i = pb0[3]; \
  288. res4 OP0## = a0_r * b1_r; \
  289. res4 OP1## = a0_i * b1_i; \
  290. res5 OP2## = OP4 a0_r * b1_i; \
  291. res5 OP3## = a0_i * b1_r; \
  292. \
  293. res6 OP0## = a1_r * b1_r; \
  294. res6 OP1## = a1_i * b1_i; \
  295. res7 OP2## = OP4 a1_r * b1_i; \
  296. res7 OP3## = a1_i * b1_r; \
  297. }
  298. #define CGEMM_KERNEL_2X1(OP0, OP1, OP2, OP3, OP4) \
  299. { \
  300. /* 0th col */ \
  301. a0_r = pa0[0]; \
  302. a0_i = pa0[1]; \
  303. b0_r = pb0[0]; \
  304. b0_i = pb0[1]; \
  305. \
  306. res0 OP0## = a0_r * b0_r; \
  307. res0 OP1## = a0_i * b0_i; \
  308. res1 OP2## = OP4 a0_r * b0_i; \
  309. res1 OP3## = a0_i * b0_r; \
  310. \
  311. a1_r = pa0[2]; \
  312. a1_i = pa0[3]; \
  313. res2 OP0## = a1_r * b0_r; \
  314. res2 OP1## = a1_i * b0_i; \
  315. res3 OP2## = OP4 a1_r * b0_i; \
  316. res3 OP3## = a1_i * b0_r; \
  317. }
  318. #define CGEMM_KERNEL_1X4(OP0, OP1, OP2, OP3, OP4) \
  319. { \
  320. /* 0th col */ \
  321. a0_r = pa0[0]; \
  322. a0_i = pa0[1]; \
  323. b0_r = pb0[0]; \
  324. b0_i = pb0[1]; \
  325. \
  326. res0 OP0## = a0_r * b0_r; \
  327. res0 OP1## = a0_i * b0_i; \
  328. res1 OP2## = OP4 a0_r * b0_i; \
  329. res1 OP3## = a0_i * b0_r; \
  330. \
  331. /* 1st col */ \
  332. b1_r = pb0[2]; \
  333. b1_i = pb0[3]; \
  334. res2 OP0## = a0_r * b1_r; \
  335. res2 OP1## = a0_i * b1_i; \
  336. res3 OP2## = OP4 a0_r * b1_i; \
  337. res3 OP3## = a0_i * b1_r; \
  338. \
  339. /* 2nd col */ \
  340. b2_r = pb0[4]; \
  341. b2_i = pb0[5]; \
  342. res4 OP0## = a0_r * b2_r; \
  343. res4 OP1## = a0_i * b2_i; \
  344. res5 OP2## = OP4 a0_r * b2_i; \
  345. res5 OP3## = a0_i * b2_r; \
  346. \
  347. /* 3rd col */ \
  348. b3_r = pb0[6]; \
  349. b3_i = pb0[7]; \
  350. res6 OP0## = a0_r * b3_r; \
  351. res6 OP1## = a0_i * b3_i; \
  352. res7 OP2## = OP4 a0_r * b3_i; \
  353. res7 OP3## = a0_i * b3_r; \
  354. }
  355. #define CGEMM_KERNEL_1X2(OP0, OP1, OP2, OP3, OP4) \
  356. { \
  357. /* 0th col */ \
  358. a0_r = pa0[0]; \
  359. a0_i = pa0[1]; \
  360. b0_r = pb0[0]; \
  361. b0_i = pb0[1]; \
  362. \
  363. res0 OP0## = a0_r * b0_r; \
  364. res0 OP1## = a0_i * b0_i; \
  365. res1 OP2## = OP4 a0_r * b0_i; \
  366. res1 OP3## = a0_i * b0_r; \
  367. \
  368. /* 1st col */ \
  369. b1_r = pb0[2]; \
  370. b1_i = pb0[3]; \
  371. res2 OP0## = a0_r * b1_r; \
  372. res2 OP1## = a0_i * b1_i; \
  373. res3 OP2## = OP4 a0_r * b1_i; \
  374. res3 OP3## = a0_i * b1_r; \
  375. }
  376. #define CGEMM_KERNEL_1X1(OP0, OP1, OP2, OP3, OP4) \
  377. { \
  378. /* 0th col */ \
  379. a0_r = pa0[0]; \
  380. a0_i = pa0[1]; \
  381. b0_r = pb0[0]; \
  382. b0_i = pb0[1]; \
  383. \
  384. res0 OP0## = a0_r * b0_r; \
  385. res0 OP1## = a0_i * b0_i; \
  386. res1 OP2## = OP4 a0_r * b0_i; \
  387. res1 OP3## = a0_i * b0_r; \
  388. }
  389. #define CGEMM_SCALE_8X4_MSA \
  390. { \
  391. LD_SP4(pc0, 4, dst0, dst1, dst2, dst3); \
  392. \
  393. PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \
  394. PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i); \
  395. \
  396. dst0_r += alpha_r * res0_r; \
  397. dst0_r -= alpha_i * res0_i; \
  398. dst0_i += alpha_r * res0_i; \
  399. dst0_i += alpha_i * res0_r; \
  400. \
  401. dst1_r += alpha_r * res1_r; \
  402. dst1_r -= alpha_i * res1_i; \
  403. dst1_i += alpha_r * res1_i; \
  404. dst1_i += alpha_i * res1_r; \
  405. \
  406. ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
  407. ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \
  408. \
  409. ST_SP4_INC(dst0, dst1, dst2, dst3, pc0, 4); \
  410. \
  411. LD_SP4(pc1, 4, dst0, dst1, dst2, dst3); \
  412. \
  413. PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \
  414. PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i); \
  415. \
  416. dst0_r += alpha_r * res2_r; \
  417. dst0_r -= alpha_i * res2_i; \
  418. dst0_i += alpha_r * res2_i; \
  419. dst0_i += alpha_i * res2_r; \
  420. \
  421. dst1_r += alpha_r * res3_r; \
  422. dst1_r -= alpha_i * res3_i; \
  423. dst1_i += alpha_r * res3_i; \
  424. dst1_i += alpha_i * res3_r; \
  425. \
  426. ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
  427. ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \
  428. \
  429. ST_SP4_INC(dst0, dst1, dst2, dst3, pc1, 4); \
  430. \
  431. LD_SP4(pc2, 4, dst0, dst1, dst2, dst3); \
  432. \
  433. PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \
  434. PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i); \
  435. \
  436. dst0_r += alpha_r * res4_r; \
  437. dst0_r -= alpha_i * res4_i; \
  438. dst0_i += alpha_r * res4_i; \
  439. dst0_i += alpha_i * res4_r; \
  440. \
  441. dst1_r += alpha_r * res5_r; \
  442. dst1_r -= alpha_i * res5_i; \
  443. dst1_i += alpha_r * res5_i; \
  444. dst1_i += alpha_i * res5_r; \
  445. \
  446. ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
  447. ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \
  448. \
  449. ST_SP4_INC(dst0, dst1, dst2, dst3, pc2, 4); \
  450. \
  451. LD_SP4(pc3, 4, dst0, dst1, dst2, dst3); \
  452. \
  453. PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \
  454. PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i); \
  455. \
  456. dst0_r += alpha_r * res6_r; \
  457. dst0_r -= alpha_i * res6_i; \
  458. dst0_i += alpha_r * res6_i; \
  459. dst0_i += alpha_i * res6_r; \
  460. \
  461. dst1_r += alpha_r * res7_r; \
  462. dst1_r -= alpha_i * res7_i; \
  463. dst1_i += alpha_r * res7_i; \
  464. dst1_i += alpha_i * res7_r; \
  465. \
  466. ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
  467. ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \
  468. \
  469. ST_SP4_INC(dst0, dst1, dst2, dst3, pc3, 4); \
  470. }
  471. #define CGEMM_SCALE_8X2_MSA \
  472. { \
  473. LD_SP4(pc0, 4, dst0, dst1, dst2, dst3); \
  474. \
  475. PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \
  476. PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i); \
  477. \
  478. dst0_r += alpha_r * res0_r; \
  479. dst0_r -= alpha_i * res0_i; \
  480. dst0_i += alpha_r * res0_i; \
  481. dst0_i += alpha_i * res0_r; \
  482. \
  483. dst1_r += alpha_r * res1_r; \
  484. dst1_r -= alpha_i * res1_i; \
  485. dst1_i += alpha_r * res1_i; \
  486. dst1_i += alpha_i * res1_r; \
  487. \
  488. ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
  489. ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \
  490. \
  491. ST_SP4_INC(dst0, dst1, dst2, dst3, pc0, 4); \
  492. \
  493. LD_SP4(pc1, 4, dst0, dst1, dst2, dst3); \
  494. \
  495. PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \
  496. PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i); \
  497. \
  498. dst0_r += alpha_r * res2_r; \
  499. dst0_r -= alpha_i * res2_i; \
  500. dst0_i += alpha_r * res2_i; \
  501. dst0_i += alpha_i * res2_r; \
  502. \
  503. dst1_r += alpha_r * res3_r; \
  504. dst1_r -= alpha_i * res3_i; \
  505. dst1_i += alpha_r * res3_i; \
  506. dst1_i += alpha_i * res3_r; \
  507. \
  508. ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
  509. ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \
  510. \
  511. ST_SP4_INC(dst0, dst1, dst2, dst3, pc1, 4); \
  512. }
  513. #define CGEMM_SCALE_8X1_MSA \
  514. { \
  515. LD_SP4(pc0, 4, dst0, dst1, dst2, dst3); \
  516. \
  517. PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \
  518. PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i); \
  519. \
  520. dst0_r += alpha_r * res0_r; \
  521. dst0_r -= alpha_i * res0_i; \
  522. dst0_i += alpha_r * res0_i; \
  523. dst0_i += alpha_i * res0_r; \
  524. \
  525. dst1_r += alpha_r * res1_r; \
  526. dst1_r -= alpha_i * res1_i; \
  527. dst1_i += alpha_r * res1_i; \
  528. dst1_i += alpha_i * res1_r; \
  529. \
  530. ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
  531. ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \
  532. \
  533. ST_SP4_INC(dst0, dst1, dst2, dst3, pc0, 4); \
  534. }
  535. #define CGEMM_SCALE_4X4_MSA \
  536. { \
  537. LD_SP2(pc0, 4, dst0, dst1); \
  538. \
  539. PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \
  540. \
  541. dst0_r += alpha_r * res0_r; \
  542. dst0_r -= alpha_i * res0_i; \
  543. dst0_i += alpha_r * res0_i; \
  544. dst0_i += alpha_i * res0_r; \
  545. \
  546. ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
  547. \
  548. ST_SP2_INC(dst0, dst1, pc0, 4); \
  549. \
  550. LD_SP2(pc1, 4, dst0, dst1); \
  551. \
  552. PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \
  553. \
  554. dst0_r += alpha_r * res2_r; \
  555. dst0_r -= alpha_i * res2_i; \
  556. dst0_i += alpha_r * res2_i; \
  557. dst0_i += alpha_i * res2_r; \
  558. \
  559. ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
  560. \
  561. ST_SP2_INC(dst0, dst1, pc1, 4); \
  562. \
  563. LD_SP2(pc2, 4, dst0, dst1); \
  564. \
  565. PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \
  566. \
  567. dst0_r += alpha_r * res4_r; \
  568. dst0_r -= alpha_i * res4_i; \
  569. dst0_i += alpha_r * res4_i; \
  570. dst0_i += alpha_i * res4_r; \
  571. \
  572. ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
  573. \
  574. ST_SP2_INC(dst0, dst1, pc2, 4); \
  575. \
  576. LD_SP2(pc3, 4, dst0, dst1); \
  577. \
  578. PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \
  579. \
  580. dst0_r += alpha_r * res6_r; \
  581. dst0_r -= alpha_i * res6_i; \
  582. dst0_i += alpha_r * res6_i; \
  583. dst0_i += alpha_i * res6_r; \
  584. \
  585. ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
  586. \
  587. ST_SP2_INC(dst0, dst1, pc3, 4); \
  588. }
  589. #define CGEMM_SCALE_4X2_MSA \
  590. { \
  591. LD_SP2(pc0, 4, dst0, dst1); \
  592. \
  593. PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \
  594. \
  595. dst0_r += alpha_r * res0_r; \
  596. dst0_r -= alpha_i * res0_i; \
  597. dst0_i += alpha_r * res0_i; \
  598. dst0_i += alpha_i * res0_r; \
  599. \
  600. ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
  601. \
  602. ST_SP2_INC(dst0, dst1, pc0, 4); \
  603. \
  604. LD_SP2(pc1, 4, dst0, dst1); \
  605. \
  606. PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \
  607. \
  608. dst0_r += alpha_r * res2_r; \
  609. dst0_r -= alpha_i * res2_i; \
  610. dst0_i += alpha_r * res2_i; \
  611. dst0_i += alpha_i * res2_r; \
  612. \
  613. ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
  614. \
  615. ST_SP2_INC(dst0, dst1, pc1, 4); \
  616. }
  617. #define CGEMM_SCALE_4X1_MSA \
  618. { \
  619. LD_SP2(pc0, 4, dst0, dst1); \
  620. \
  621. PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \
  622. \
  623. dst0_r += alpha_r * res0_r; \
  624. dst0_r -= alpha_i * res0_i; \
  625. dst0_i += alpha_r * res0_i; \
  626. dst0_i += alpha_i * res0_r; \
  627. \
  628. ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
  629. \
  630. ST_SP2_INC(dst0, dst1, pc0, 4); \
  631. }
  632. #define CGEMM_SCALE_2X4 \
  633. { \
  634. /* 0th col */ \
  635. pc0[0] += alphar * res0; \
  636. pc0[0] -= alphai * res1; \
  637. pc0[1] += alphar * res1; \
  638. pc0[1] += alphai * res0; \
  639. pc0[2] += alphar * res2; \
  640. pc0[2] -= alphai * res3; \
  641. pc0[3] += alphar * res3; \
  642. pc0[3] += alphai * res2; \
  643. \
  644. /* 1st col */ \
  645. pc1[0] += alphar * res4; \
  646. pc1[0] -= alphai * res5; \
  647. pc1[1] += alphar * res5; \
  648. pc1[1] += alphai * res4; \
  649. pc1[2] += alphar * res6; \
  650. pc1[2] -= alphai * res7; \
  651. pc1[3] += alphar * res7; \
  652. pc1[3] += alphai * res6; \
  653. \
  654. /* 2nd col */ \
  655. pc2[0] += alphar * res8; \
  656. pc2[0] -= alphai * res9; \
  657. pc2[1] += alphar * res9; \
  658. pc2[1] += alphai * res8; \
  659. pc2[2] += alphar * res10; \
  660. pc2[2] -= alphai * res11; \
  661. pc2[3] += alphar * res11; \
  662. pc2[3] += alphai * res10; \
  663. \
  664. /* 3rd col */ \
  665. pc3[0] += alphar * res12; \
  666. pc3[0] -= alphai * res13; \
  667. pc3[1] += alphar * res13; \
  668. pc3[1] += alphai * res12; \
  669. pc3[2] += alphar * res14; \
  670. pc3[2] -= alphai * res15; \
  671. pc3[3] += alphar * res15; \
  672. pc3[3] += alphai * res14; \
  673. }
  674. #define CGEMM_SCALE_2X2 \
  675. { \
  676. /* 0th col */ \
  677. pc0[0] += alphar * res0; \
  678. pc0[0] -= alphai * res1; \
  679. pc0[1] += alphar * res1; \
  680. pc0[1] += alphai * res0; \
  681. pc0[2] += alphar * res2; \
  682. pc0[2] -= alphai * res3; \
  683. pc0[3] += alphar * res3; \
  684. pc0[3] += alphai * res2; \
  685. \
  686. /* 1st col */ \
  687. pc1[0] += alphar * res4; \
  688. pc1[0] -= alphai * res5; \
  689. pc1[1] += alphar * res5; \
  690. pc1[1] += alphai * res4; \
  691. pc1[2] += alphar * res6; \
  692. pc1[2] -= alphai * res7; \
  693. pc1[3] += alphar * res7; \
  694. pc1[3] += alphai * res6; \
  695. }
  696. #define CGEMM_SCALE_2X1 \
  697. { \
  698. pc0[0] += alphar * res0; \
  699. pc0[0] -= alphai * res1; \
  700. pc0[1] += alphar * res1; \
  701. pc0[1] += alphai * res0; \
  702. \
  703. pc0[2] += alphar * res2; \
  704. pc0[2] -= alphai * res3; \
  705. pc0[3] += alphar * res3; \
  706. pc0[3] += alphai * res2; \
  707. }
  708. #define CGEMM_SCALE_1X4 \
  709. { \
  710. pc0[0] += alphar * res0; \
  711. pc0[0] -= alphai * res1; \
  712. pc0[1] += alphar * res1; \
  713. pc0[1] += alphai * res0; \
  714. \
  715. pc1[0] += alphar * res2; \
  716. pc1[0] -= alphai * res3; \
  717. pc1[1] += alphar * res3; \
  718. pc1[1] += alphai * res2; \
  719. \
  720. pc2[0] += alphar * res4; \
  721. pc2[0] -= alphai * res5; \
  722. pc2[1] += alphar * res5; \
  723. pc2[1] += alphai * res4; \
  724. \
  725. pc3[0] += alphar * res6; \
  726. pc3[0] -= alphai * res7; \
  727. pc3[1] += alphar * res7; \
  728. pc3[1] += alphai * res6; \
  729. }
  730. #define CGEMM_SCALE_1X2 \
  731. { \
  732. pc0[0] += alphar * res0; \
  733. pc0[0] -= alphai * res1; \
  734. pc0[1] += alphar * res1; \
  735. pc0[1] += alphai * res0; \
  736. \
  737. pc1[2] += alphar * res2; \
  738. pc1[2] -= alphai * res3; \
  739. pc1[3] += alphar * res3; \
  740. pc1[3] += alphai * res2; \
  741. }
  /*
   * CGEMM_SCALE_1X1
   *
   * Accumulates alpha * result into a single interleaved complex value:
   *     pc0[0..1] += (alphar + i*alphai) * (res0 + i*res1)
   *
   * Uses pc0, alphar, alphai, res0 and res1 from the expanding scope.
   * Expands to a brace-enclosed block, so it is used without a trailing ';'.
   */
  #define CGEMM_SCALE_1X1 \
  { \
      pc0[0] = pc0[0] + alphar * res0; \
      pc0[1] = pc0[1] + alphar * res1; \
      pc0[0] = pc0[0] - alphai * res1; \
      pc0[1] = pc0[1] + alphai * res0; \
  }
  /*
   * CGEMM_TRMM_SCALE_8X4_MSA
   *
   * TRMM variant of the 8x4 scale step: computes alpha * result and stores
   * it (the previous contents of C are not read by this macro's own
   * arithmetic). Each column pcN is produced from the accumulator pair
   * res(2N) / res(2N+1), each kept as split real (_r) and imaginary (_i)
   * vectors:
   *     real = alpha_r * re - alpha_i * im
   *     imag = alpha_r * im + alpha_i * re
   *
   * Uses alpha_r, alpha_i, res0..res7 (_r/_i), dst0..dst3, dst0_r/_i,
   * dst1_r/_i and pc0..pc3 from the expanding scope. Expands to a
   * brace-enclosed block, so it is used without a trailing ';'.
   *
   * NOTE(review): ILVRL_W2_SP and ST_SP4_INC are defined outside this
   * chunk; presumably the former interleaves the real/imag lanes back into
   * (re, im) pairs and the latter stores four vectors while advancing the
   * destination pointer -- confirm against the MSA helper header.
   */
  #define CGEMM_TRMM_SCALE_8X4_MSA \
  { \
      /* column 0: res0 / res1 -> pc0 */ \
      dst0_r = alpha_r * res0_r; \
      dst0_i = alpha_r * res0_i; \
      dst0_r = dst0_r - alpha_i * res0_i; \
      dst0_i = dst0_i + alpha_i * res0_r; \
      \
      dst1_r = alpha_r * res1_r; \
      dst1_i = alpha_r * res1_i; \
      dst1_r = dst1_r - alpha_i * res1_i; \
      dst1_i = dst1_i + alpha_i * res1_r; \
      \
      ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
      ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \
      ST_SP4_INC(dst0, dst1, dst2, dst3, pc0, 4); \
      \
      /* column 1: res2 / res3 -> pc1 */ \
      dst0_r = alpha_r * res2_r; \
      dst0_i = alpha_r * res2_i; \
      dst0_r = dst0_r - alpha_i * res2_i; \
      dst0_i = dst0_i + alpha_i * res2_r; \
      \
      dst1_r = alpha_r * res3_r; \
      dst1_i = alpha_r * res3_i; \
      dst1_r = dst1_r - alpha_i * res3_i; \
      dst1_i = dst1_i + alpha_i * res3_r; \
      \
      ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
      ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \
      ST_SP4_INC(dst0, dst1, dst2, dst3, pc1, 4); \
      \
      /* column 2: res4 / res5 -> pc2 */ \
      dst0_r = alpha_r * res4_r; \
      dst0_i = alpha_r * res4_i; \
      dst0_r = dst0_r - alpha_i * res4_i; \
      dst0_i = dst0_i + alpha_i * res4_r; \
      \
      dst1_r = alpha_r * res5_r; \
      dst1_i = alpha_r * res5_i; \
      dst1_r = dst1_r - alpha_i * res5_i; \
      dst1_i = dst1_i + alpha_i * res5_r; \
      \
      ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
      ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \
      ST_SP4_INC(dst0, dst1, dst2, dst3, pc2, 4); \
      \
      /* column 3: res6 / res7 -> pc3 */ \
      dst0_r = alpha_r * res6_r; \
      dst0_i = alpha_r * res6_i; \
      dst0_r = dst0_r - alpha_i * res6_i; \
      dst0_i = dst0_i + alpha_i * res6_r; \
      \
      dst1_r = alpha_r * res7_r; \
      dst1_i = alpha_r * res7_i; \
      dst1_r = dst1_r - alpha_i * res7_i; \
      dst1_i = dst1_i + alpha_i * res7_r; \
      \
      ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
      ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \
      ST_SP4_INC(dst0, dst1, dst2, dst3, pc3, 4); \
  }
  /*
   * CGEMM_TRMM_SCALE_8X2_MSA
   *
   * TRMM variant of the 8x2 scale step: computes alpha * result and stores
   * it. Column pc0 is produced from res0 / res1 and column pc1 from
   * res2 / res3, each kept as split real (_r) and imaginary (_i) vectors:
   *     real = alpha_r * re - alpha_i * im
   *     imag = alpha_r * im + alpha_i * re
   *
   * Uses alpha_r, alpha_i, res0..res3 (_r/_i), dst0..dst3, dst0_r/_i,
   * dst1_r/_i, pc0 and pc1 from the expanding scope. Expands to a
   * brace-enclosed block, so it is used without a trailing ';'.
   *
   * NOTE(review): ILVRL_W2_SP and ST_SP4_INC are defined outside this
   * chunk; presumably interleave-to-(re, im) and store-with-pointer-advance
   * respectively -- confirm against the MSA helper header.
   */
  #define CGEMM_TRMM_SCALE_8X2_MSA \
  { \
      /* column 0: res0 / res1 -> pc0 */ \
      dst0_r = alpha_r * res0_r; \
      dst0_i = alpha_r * res0_i; \
      dst0_r = dst0_r - alpha_i * res0_i; \
      dst0_i = dst0_i + alpha_i * res0_r; \
      \
      dst1_r = alpha_r * res1_r; \
      dst1_i = alpha_r * res1_i; \
      dst1_r = dst1_r - alpha_i * res1_i; \
      dst1_i = dst1_i + alpha_i * res1_r; \
      \
      ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
      ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \
      ST_SP4_INC(dst0, dst1, dst2, dst3, pc0, 4); \
      \
      /* column 1: res2 / res3 -> pc1 */ \
      dst0_r = alpha_r * res2_r; \
      dst0_i = alpha_r * res2_i; \
      dst0_r = dst0_r - alpha_i * res2_i; \
      dst0_i = dst0_i + alpha_i * res2_r; \
      \
      dst1_r = alpha_r * res3_r; \
      dst1_i = alpha_r * res3_i; \
      dst1_r = dst1_r - alpha_i * res3_i; \
      dst1_i = dst1_i + alpha_i * res3_r; \
      \
      ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
      ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \
      ST_SP4_INC(dst0, dst1, dst2, dst3, pc1, 4); \
  }
  /*
   * CGEMM_TRMM_SCALE_8X1_MSA
   *
   * TRMM variant of the 8x1 scale step: computes alpha * result from the
   * accumulator pair res0 / res1 (split real/imag vectors) and stores it
   * through pc0:
   *     real = alpha_r * re - alpha_i * im
   *     imag = alpha_r * im + alpha_i * re
   *
   * Uses alpha_r, alpha_i, res0/res1 (_r/_i), dst0..dst3, dst0_r/_i,
   * dst1_r/_i and pc0 from the expanding scope. Expands to a brace-enclosed
   * block, so it is used without a trailing ';'.
   *
   * NOTE(review): ILVRL_W2_SP and ST_SP4_INC are defined outside this
   * chunk; presumably interleave-to-(re, im) and store-with-pointer-advance
   * respectively -- confirm against the MSA helper header.
   */
  #define CGEMM_TRMM_SCALE_8X1_MSA \
  { \
      dst0_r = alpha_r * res0_r; \
      dst0_i = alpha_r * res0_i; \
      dst0_r = dst0_r - alpha_i * res0_i; \
      dst0_i = dst0_i + alpha_i * res0_r; \
      \
      dst1_r = alpha_r * res1_r; \
      dst1_i = alpha_r * res1_i; \
      dst1_r = dst1_r - alpha_i * res1_i; \
      dst1_i = dst1_i + alpha_i * res1_r; \
      \
      ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
      ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \
      ST_SP4_INC(dst0, dst1, dst2, dst3, pc0, 4); \
  }
  /*
   * CGEMM_TRMM_SCALE_4X4_MSA
   *
   * TRMM variant of the 4x4 scale step: computes alpha * result and stores
   * it. Columns pc0, pc1, pc2, pc3 are produced from the even-numbered
   * accumulators res0, res2, res4, res6 respectively (split real/imag
   * vectors):
   *     real = alpha_r * re - alpha_i * im
   *     imag = alpha_r * im + alpha_i * re
   *
   * Uses alpha_r, alpha_i, res0/res2/res4/res6 (_r/_i), dst0, dst1,
   * dst0_r/_i and pc0..pc3 from the expanding scope. Expands to a
   * brace-enclosed block, so it is used without a trailing ';'.
   *
   * NOTE(review): ILVRL_W2_SP and ST_SP2_INC are defined outside this
   * chunk; presumably interleave-to-(re, im) and two-vector
   * store-with-pointer-advance respectively -- confirm against the MSA
   * helper header.
   */
  #define CGEMM_TRMM_SCALE_4X4_MSA \
  { \
      /* column 0: res0 -> pc0 */ \
      dst0_r = alpha_r * res0_r; \
      dst0_i = alpha_r * res0_i; \
      dst0_r = dst0_r - alpha_i * res0_i; \
      dst0_i = dst0_i + alpha_i * res0_r; \
      ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
      ST_SP2_INC(dst0, dst1, pc0, 4); \
      \
      /* column 1: res2 -> pc1 */ \
      dst0_r = alpha_r * res2_r; \
      dst0_i = alpha_r * res2_i; \
      dst0_r = dst0_r - alpha_i * res2_i; \
      dst0_i = dst0_i + alpha_i * res2_r; \
      ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
      ST_SP2_INC(dst0, dst1, pc1, 4); \
      \
      /* column 2: res4 -> pc2 */ \
      dst0_r = alpha_r * res4_r; \
      dst0_i = alpha_r * res4_i; \
      dst0_r = dst0_r - alpha_i * res4_i; \
      dst0_i = dst0_i + alpha_i * res4_r; \
      ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
      ST_SP2_INC(dst0, dst1, pc2, 4); \
      \
      /* column 3: res6 -> pc3 */ \
      dst0_r = alpha_r * res6_r; \
      dst0_i = alpha_r * res6_i; \
      dst0_r = dst0_r - alpha_i * res6_i; \
      dst0_i = dst0_i + alpha_i * res6_r; \
      ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
      ST_SP2_INC(dst0, dst1, pc3, 4); \
  }
  /*
   * CGEMM_TRMM_SCALE_4X2_MSA
   *
   * TRMM variant of the 4x2 scale step: computes alpha * result and stores
   * it. Column pc0 is produced from res0 and column pc1 from res2 (split
   * real/imag vectors):
   *     real = alpha_r * re - alpha_i * im
   *     imag = alpha_r * im + alpha_i * re
   *
   * Uses alpha_r, alpha_i, res0/res2 (_r/_i), dst0, dst1, dst0_r/_i, pc0
   * and pc1 from the expanding scope. Expands to a brace-enclosed block, so
   * it is used without a trailing ';'.
   *
   * NOTE(review): ILVRL_W2_SP and ST_SP2_INC are defined outside this
   * chunk; presumably interleave-to-(re, im) and two-vector
   * store-with-pointer-advance respectively -- confirm against the MSA
   * helper header.
   */
  #define CGEMM_TRMM_SCALE_4X2_MSA \
  { \
      /* column 0: res0 -> pc0 */ \
      dst0_r = alpha_r * res0_r; \
      dst0_i = alpha_r * res0_i; \
      dst0_r = dst0_r - alpha_i * res0_i; \
      dst0_i = dst0_i + alpha_i * res0_r; \
      ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
      ST_SP2_INC(dst0, dst1, pc0, 4); \
      \
      /* column 1: res2 -> pc1 */ \
      dst0_r = alpha_r * res2_r; \
      dst0_i = alpha_r * res2_i; \
      dst0_r = dst0_r - alpha_i * res2_i; \
      dst0_i = dst0_i + alpha_i * res2_r; \
      ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
      ST_SP2_INC(dst0, dst1, pc1, 4); \
  }
  /*
   * CGEMM_TRMM_SCALE_4X1_MSA
   *
   * TRMM variant of the 4x1 scale step: computes alpha * result from the
   * accumulator res0 (split real/imag vectors) and stores it through pc0:
   *     real = alpha_r * re - alpha_i * im
   *     imag = alpha_r * im + alpha_i * re
   *
   * Uses alpha_r, alpha_i, res0_r, res0_i, dst0, dst1, dst0_r, dst0_i and
   * pc0 from the expanding scope. Expands to a brace-enclosed block, so it
   * is used without a trailing ';'.
   *
   * NOTE(review): ILVRL_W2_SP and ST_SP2_INC are defined outside this
   * chunk; presumably interleave-to-(re, im) and two-vector
   * store-with-pointer-advance respectively -- confirm against the MSA
   * helper header.
   */
  #define CGEMM_TRMM_SCALE_4X1_MSA \
  { \
      dst0_r = alpha_r * res0_r; \
      dst0_i = alpha_r * res0_i; \
      dst0_r = dst0_r - alpha_i * res0_i; \
      dst0_i = dst0_i + alpha_i * res0_r; \
      ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
      ST_SP2_INC(dst0, dst1, pc0, 4); \
  }
  /*
   * CGEMM_TRMM_SCALE_2X4
   *
   * TRMM variant of the 2x4 scale step: overwrites two consecutive
   * interleaved complex values in each of four columns with alpha * result.
   * Column pcN takes res(4N)..res(4N+3) as (re, im, re, im):
   *     out_re = alphar * re - alphai * im
   *     out_im = alphar * im + alphai * re
   *
   * Uses pc0..pc3, alphar, alphai and res0..res15 from the expanding scope.
   * Expands to a brace-enclosed block, so it is used without a trailing ';'.
   * None of the column pointers is advanced here.
   */
  #define CGEMM_TRMM_SCALE_2X4 \
  { \
      /* column 0 */ \
      pc0[0] = alphar * res0; \
      pc0[1] = alphar * res1; \
      pc0[0] = pc0[0] - alphai * res1; \
      pc0[1] = pc0[1] + alphai * res0; \
      pc0[2] = alphar * res2; \
      pc0[3] = alphar * res3; \
      pc0[2] = pc0[2] - alphai * res3; \
      pc0[3] = pc0[3] + alphai * res2; \
      \
      /* column 1 */ \
      pc1[0] = alphar * res4; \
      pc1[1] = alphar * res5; \
      pc1[0] = pc1[0] - alphai * res5; \
      pc1[1] = pc1[1] + alphai * res4; \
      pc1[2] = alphar * res6; \
      pc1[3] = alphar * res7; \
      pc1[2] = pc1[2] - alphai * res7; \
      pc1[3] = pc1[3] + alphai * res6; \
      \
      /* column 2 */ \
      pc2[0] = alphar * res8; \
      pc2[1] = alphar * res9; \
      pc2[0] = pc2[0] - alphai * res9; \
      pc2[1] = pc2[1] + alphai * res8; \
      pc2[2] = alphar * res10; \
      pc2[3] = alphar * res11; \
      pc2[2] = pc2[2] - alphai * res11; \
      pc2[3] = pc2[3] + alphai * res10; \
      \
      /* column 3 */ \
      pc3[0] = alphar * res12; \
      pc3[1] = alphar * res13; \
      pc3[0] = pc3[0] - alphai * res13; \
      pc3[1] = pc3[1] + alphai * res12; \
      pc3[2] = alphar * res14; \
      pc3[3] = alphar * res15; \
      pc3[2] = pc3[2] - alphai * res15; \
      pc3[3] = pc3[3] + alphai * res14; \
  }
  /*
   * CGEMM_TRMM_SCALE_2X2
   *
   * TRMM variant of the 2x2 scale step: overwrites two consecutive
   * interleaved complex values in each of two columns with alpha * result.
   * pc0 takes res0..res3 and pc1 takes res4..res7 as (re, im, re, im):
   *     out_re = alphar * re - alphai * im
   *     out_im = alphar * im + alphai * re
   *
   * Uses pc0, pc1, alphar, alphai and res0..res7 from the expanding scope.
   * Expands to a brace-enclosed block, so it is used without a trailing ';'.
   * Neither column pointer is advanced here.
   */
  #define CGEMM_TRMM_SCALE_2X2 \
  { \
      /* column 0 */ \
      pc0[0] = alphar * res0; \
      pc0[1] = alphar * res1; \
      pc0[0] = pc0[0] - alphai * res1; \
      pc0[1] = pc0[1] + alphai * res0; \
      pc0[2] = alphar * res2; \
      pc0[3] = alphar * res3; \
      pc0[2] = pc0[2] - alphai * res3; \
      pc0[3] = pc0[3] + alphai * res2; \
      \
      /* column 1 */ \
      pc1[0] = alphar * res4; \
      pc1[1] = alphar * res5; \
      pc1[0] = pc1[0] - alphai * res5; \
      pc1[1] = pc1[1] + alphai * res4; \
      pc1[2] = alphar * res6; \
      pc1[3] = alphar * res7; \
      pc1[2] = pc1[2] - alphai * res7; \
      pc1[3] = pc1[3] + alphai * res6; \
  }
  /*
   * CGEMM_TRMM_SCALE_2X1
   *
   * TRMM variant of the 2x1 scale step: overwrites two consecutive
   * interleaved complex values at pc0 with alpha * result, taking
   * res0..res3 as (re, im, re, im):
   *     out_re = alphar * re - alphai * im
   *     out_im = alphar * im + alphai * re
   *
   * Uses pc0, alphar, alphai and res0..res3 from the expanding scope.
   * Expands to a brace-enclosed block, so it is used without a trailing ';'.
   * pc0 itself is not advanced here.
   */
  #define CGEMM_TRMM_SCALE_2X1 \
  { \
      /* first complex element */ \
      pc0[0] = alphar * res0; \
      pc0[1] = alphar * res1; \
      pc0[0] = pc0[0] - alphai * res1; \
      pc0[1] = pc0[1] + alphai * res0; \
      \
      /* second complex element */ \
      pc0[2] = alphar * res2; \
      pc0[3] = alphar * res3; \
      pc0[2] = pc0[2] - alphai * res3; \
      pc0[3] = pc0[3] + alphai * res2; \
  }
  /*
   * CGEMM_TRMM_SCALE_1X4
   *
   * TRMM variant of the 1x4 scale step: overwrites one interleaved complex
   * value in each of four columns with alpha * result. Column pcN takes
   * res(2N) as the real part and res(2N+1) as the imaginary part:
   *     out_re = alphar * re - alphai * im
   *     out_im = alphar * im + alphai * re
   *
   * Uses pc0..pc3, alphar, alphai and res0..res7 from the expanding scope.
   * Expands to a brace-enclosed block, so it is used without a trailing ';'.
   * None of the column pointers is advanced here.
   */
  #define CGEMM_TRMM_SCALE_1X4 \
  { \
      /* column 0 */ \
      pc0[0] = alphar * res0; \
      pc0[1] = alphar * res1; \
      pc0[0] = pc0[0] - alphai * res1; \
      pc0[1] = pc0[1] + alphai * res0; \
      \
      /* column 1 */ \
      pc1[0] = alphar * res2; \
      pc1[1] = alphar * res3; \
      pc1[0] = pc1[0] - alphai * res3; \
      pc1[1] = pc1[1] + alphai * res2; \
      \
      /* column 2 */ \
      pc2[0] = alphar * res4; \
      pc2[1] = alphar * res5; \
      pc2[0] = pc2[0] - alphai * res5; \
      pc2[1] = pc2[1] + alphai * res4; \
      \
      /* column 3 */ \
      pc3[0] = alphar * res6; \
      pc3[1] = alphar * res7; \
      pc3[0] = pc3[0] - alphai * res7; \
      pc3[1] = pc3[1] + alphai * res6; \
  }
  /*
   * CGEMM_TRMM_SCALE_1X2
   *
   * TRMM variant of the 1x2 scale step: overwrites one interleaved complex
   * value in each of two columns with alpha * result:
   *     pc0[0..1] = (alphar + i*alphai) * (res0 + i*res1)
   *     pc1[0..1] = (alphar + i*alphai) * (res2 + i*res3)
   *
   * Uses pc0, pc1, alphar, alphai and res0..res3 from the expanding scope.
   * Expands to a brace-enclosed block, so it is used without a trailing ';'.
   * Neither column pointer is advanced here.
   *
   * Fix: the second column used to be stored at pc1[2]/pc1[3], i.e. one
   * complex element below the single row this tile covers. That left the
   * intended output element unwritten and clobbered its neighbour. The
   * store now targets pc1[0]/pc1[1], matching CGEMM_TRMM_SCALE_1X4 and the
   * caller, which advances pc1 by only two floats after this tile.
   * NOTE(review): assumes CGEMM_KERNEL_1X2 leaves the column-1 product in
   * res2/res3, as the 1X4 variant does -- confirm against the kernel macro.
   */
  #define CGEMM_TRMM_SCALE_1X2 \
  { \
      /* column 0 */ \
      pc0[0] = alphar * res0; \
      pc0[0] -= alphai * res1; \
      pc0[1] = alphar * res1; \
      pc0[1] += alphai * res0; \
      \
      /* column 1 (same row as column 0) */ \
      pc1[0] = alphar * res2; \
      pc1[0] -= alphai * res3; \
      pc1[1] = alphar * res3; \
      pc1[1] += alphai * res2; \
  }
  /*
   * CGEMM_TRMM_SCALE_1X1
   *
   * TRMM variant of the 1x1 scale step: overwrites a single interleaved
   * complex value at pc0 with alpha * result:
   *     pc0[0] = alphar * res0 - alphai * res1
   *     pc0[1] = alphar * res1 + alphai * res0
   *
   * Uses pc0, alphar, alphai, res0 and res1 from the expanding scope.
   * Expands to a brace-enclosed block, so it is used without a trailing ';'.
   */
  #define CGEMM_TRMM_SCALE_1X1 \
  { \
      pc0[0] = alphar * res0; \
      pc0[1] = alphar * res1; \
      pc0[0] = pc0[0] - alphai * res1; \
      pc0[1] = pc0[1] + alphai * res0; \
  }
  1046. int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai,
  1047. FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc
  1048. #ifdef TRMMKERNEL
  1049. , BLASLONG offset
  1050. #endif
  1051. )
  1052. {
  1053. BLASLONG i, j, l, temp;
  1054. #if defined(TRMMKERNEL)
  1055. BLASLONG off;
  1056. #endif
  1057. FLOAT *pc0, *pc1, *pc2, *pc3;
  1058. FLOAT *pa0, *pb0;
  1059. FLOAT res0, res1, res2, res3, res4, res5, res6, res7;
  1060. FLOAT res8, res9, res10, res11, res12, res13, res14, res15;
  1061. FLOAT a0_r, a1_r;
  1062. FLOAT a0_i, a1_i;
  1063. FLOAT b0_r, b1_r, b2_r, b3_r;
  1064. FLOAT b0_i, b1_i, b2_i, b3_i;
  1065. v4f32 src_a0, src_a1, src_a2, src_a3, src_b0, src_b1;
  1066. v4f32 src_a0r, src_a0i, src_a1r, src_a1i, src_br, src_bi;
  1067. v4f32 dst0, dst1, dst2, dst3;
  1068. v4f32 alpha_r, alpha_i;
  1069. v4f32 res0_r, res0_i, res1_r, res1_i, res2_r, res2_i, res3_r, res3_i;
  1070. v4f32 res4_r, res4_i, res5_r, res5_i, res6_r, res6_i, res7_r, res7_i;
  1071. v4f32 dst0_r, dst0_i, dst1_r, dst1_i;
  1072. alpha_r = COPY_FLOAT_TO_VECTOR(alphar);
  1073. alpha_i = COPY_FLOAT_TO_VECTOR(alphai);
  1074. #if defined(TRMMKERNEL) && !defined(LEFT)
  1075. off = -offset;
  1076. #endif
  1077. for (j = (n >> 2); j--;)
  1078. {
  1079. pc0 = C;
  1080. pc1 = pc0 + 2 * ldc;
  1081. pc2 = pc1 + 2 * ldc;
  1082. pc3 = pc2 + 2 * ldc;
  1083. pa0 = A;
  1084. #if defined(TRMMKERNEL) && defined(LEFT)
  1085. off = offset;
  1086. #endif
  1087. for (i = (m >> 3); i--;)
  1088. {
  1089. #if defined(TRMMKERNEL)
  1090. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1091. pb0 = B;
  1092. #else
  1093. pa0 += off * 2 * 8;
  1094. pb0 = B + off * 2 * 4;
  1095. #endif
  1096. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1097. temp = k - off;
  1098. #elif defined(LEFT)
  1099. temp = off + 8; // number of values in A
  1100. #else
  1101. temp = off + 4; // number of values in B
  1102. #endif
  1103. #else
  1104. pb0 = B;
  1105. temp = k;
  1106. #endif
  1107. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  1108. CGEMM_KERNEL_8X4_MSA(, -, , +, +);
  1109. #endif
  1110. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1111. CGEMM_KERNEL_8X4_MSA(, +, , +, -);
  1112. #endif
  1113. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1114. CGEMM_KERNEL_8X4_MSA(, +, , -, +);
  1115. #endif
  1116. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1117. CGEMM_KERNEL_8X4_MSA(, -, , -, -);
  1118. #endif
  1119. for (l = (temp - 1); l--;)
  1120. {
  1121. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  1122. CGEMM_KERNEL_8X4_MSA(+, -, +, +,);
  1123. #endif
  1124. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1125. CGEMM_KERNEL_8X4_MSA(+, +, -, +,);
  1126. #endif
  1127. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1128. CGEMM_KERNEL_8X4_MSA(+, +, +, -,);
  1129. #endif
  1130. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1131. CGEMM_KERNEL_8X4_MSA(+, -, -, -,);
  1132. #endif
  1133. }
  1134. #if defined(TRMMKERNEL)
  1135. CGEMM_TRMM_SCALE_8X4_MSA
  1136. #else
  1137. CGEMM_SCALE_8X4_MSA
  1138. #endif
  1139. #if defined(TRMMKERNEL)
  1140. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1141. temp = k - off;
  1142. #ifdef LEFT
  1143. temp -= 8; // number of values in A
  1144. #else
  1145. temp -= 4; // number of values in B
  1146. #endif
  1147. pa0 += temp * 2 * 8;
  1148. pb0 += temp * 2 * 4;
  1149. #endif
  1150. #ifdef LEFT
  1151. off += 8; // number of values in A
  1152. #endif
  1153. #endif
  1154. }
  1155. if (m & 4)
  1156. {
  1157. #if defined(TRMMKERNEL)
  1158. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1159. pb0 = B;
  1160. #else
  1161. pa0 += off * 2 * 4;
  1162. pb0 = B + off * 2 * 4;
  1163. #endif
  1164. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1165. temp = k - off;
  1166. #elif defined(LEFT)
  1167. temp = off + 4; // number of values in A
  1168. #else
  1169. temp = off + 4; // number of values in B
  1170. #endif
  1171. #else
  1172. pb0 = B;
  1173. temp = k;
  1174. #endif
  1175. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  1176. CGEMM_KERNEL_4X4_MSA(, -, , +, +);
  1177. #endif
  1178. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1179. CGEMM_KERNEL_4X4_MSA(, +, , +, -);
  1180. #endif
  1181. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1182. CGEMM_KERNEL_4X4_MSA(, +, , -, +);
  1183. #endif
  1184. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1185. CGEMM_KERNEL_4X4_MSA(, -, , -, -);
  1186. #endif
  1187. for (l = (temp - 1); l--;)
  1188. {
  1189. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  1190. CGEMM_KERNEL_4X4_MSA(+, -, +, +,);
  1191. #endif
  1192. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1193. CGEMM_KERNEL_4X4_MSA(+, +, -, +,);
  1194. #endif
  1195. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1196. CGEMM_KERNEL_4X4_MSA(+, +, +, -,);
  1197. #endif
  1198. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1199. CGEMM_KERNEL_4X4_MSA(+, -, -, -,);
  1200. #endif
  1201. }
  1202. #if defined(TRMMKERNEL)
  1203. CGEMM_TRMM_SCALE_4X4_MSA
  1204. #else
  1205. CGEMM_SCALE_4X4_MSA
  1206. #endif
  1207. #if defined(TRMMKERNEL)
  1208. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1209. temp = k - off;
  1210. #ifdef LEFT
  1211. temp -= 4; // number of values in A
  1212. #else
  1213. temp -= 4; // number of values in B
  1214. #endif
  1215. pa0 += temp * 2 * 4;
  1216. pb0 += temp * 2 * 4;
  1217. #endif
  1218. #ifdef LEFT
  1219. off += 4; // number of values in A
  1220. #endif
  1221. #endif
  1222. }
  1223. if (m & 2)
  1224. {
  1225. #if defined(TRMMKERNEL)
  1226. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1227. pb0 = B;
  1228. #else
  1229. pa0 += off * 2 * 2;
  1230. pb0 = B + off * 2 * 4;
  1231. #endif
  1232. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1233. temp = k - off;
  1234. #elif defined(LEFT)
  1235. temp = off + 2; // number of values in A
  1236. #else
  1237. temp = off + 4; // number of values in B
  1238. #endif
  1239. #else
  1240. pb0 = B;
  1241. temp = k;
  1242. #endif
  1243. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  1244. CGEMM_KERNEL_2X4(, -, , +, +);
  1245. #endif
  1246. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1247. CGEMM_KERNEL_2X4(, +, , +, -);
  1248. #endif
  1249. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1250. CGEMM_KERNEL_2X4(, +, , -, +);
  1251. #endif
  1252. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1253. CGEMM_KERNEL_2X4(, -, , -, -);
  1254. #endif
  1255. pa0 += 4;
  1256. pb0 += 8;
  1257. for (l = (temp - 1); l--;)
  1258. {
  1259. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  1260. CGEMM_KERNEL_2X4(+, -, +, +,);
  1261. #endif
  1262. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1263. CGEMM_KERNEL_2X4(+, +, -, +,);
  1264. #endif
  1265. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1266. CGEMM_KERNEL_2X4(+, +, +, -,);
  1267. #endif
  1268. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1269. CGEMM_KERNEL_2X4(+, -, -, -,);
  1270. #endif
  1271. pa0 += 4;
  1272. pb0 += 8;
  1273. }
  1274. #if defined(TRMMKERNEL)
  1275. CGEMM_TRMM_SCALE_2X4
  1276. #else
  1277. CGEMM_SCALE_2X4
  1278. #endif
  1279. #if defined(TRMMKERNEL)
  1280. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1281. temp = k - off;
  1282. #ifdef LEFT
  1283. temp -= 2; // number of values in A
  1284. #else
  1285. temp -= 4; // number of values in B
  1286. #endif
  1287. pa0 += temp * 2 * 2;
  1288. pb0 += temp * 2 * 4;
  1289. #endif
  1290. #ifdef LEFT
  1291. off += 2; // number of values in A
  1292. #endif
  1293. #endif
  1294. pc0 += 4;
  1295. pc1 += 4;
  1296. pc2 += 4;
  1297. pc3 += 4;
  1298. }
  1299. if (m & 1)
  1300. {
  1301. #if defined(TRMMKERNEL)
  1302. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1303. pb0 = B;
  1304. #else
  1305. pa0 += off * 2 * 1;
  1306. pb0 = B + off * 2 * 4;
  1307. #endif
  1308. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1309. temp = k - off;
  1310. #elif defined(LEFT)
  1311. temp = off + 1; // number of values in A
  1312. #else
  1313. temp = off + 4; // number of values in B
  1314. #endif
  1315. #else
  1316. pb0 = B;
  1317. temp = k;
  1318. #endif
  1319. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  1320. CGEMM_KERNEL_1X4(, -, , +, +);
  1321. #endif
  1322. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1323. CGEMM_KERNEL_1X4(, +, , +, -);
  1324. #endif
  1325. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1326. CGEMM_KERNEL_1X4(, +, , -, +);
  1327. #endif
  1328. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1329. CGEMM_KERNEL_1X4(, -, , -, -);
  1330. #endif
  1331. pa0 += 2;
  1332. pb0 += 8;
  1333. for (l = (temp - 1); l--;)
  1334. {
  1335. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  1336. CGEMM_KERNEL_1X4(+, -, +, +,);
  1337. #endif
  1338. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1339. CGEMM_KERNEL_1X4(+, +, -, +,);
  1340. #endif
  1341. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1342. CGEMM_KERNEL_1X4(+, +, +, -,);
  1343. #endif
  1344. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1345. CGEMM_KERNEL_1X4(+, -, -, -,);
  1346. #endif
  1347. pa0 += 2;
  1348. pb0 += 8;
  1349. }
  1350. #if defined(TRMMKERNEL)
  1351. CGEMM_TRMM_SCALE_1X4
  1352. #else
  1353. CGEMM_SCALE_1X4
  1354. #endif
  1355. #if defined(TRMMKERNEL)
  1356. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1357. temp = k - off;
  1358. #ifdef LEFT
  1359. temp -= 1; // number of values in A
  1360. #else
  1361. temp -= 4; // number of values in B
  1362. #endif
  1363. pa0 += temp * 2 * 1;
  1364. pb0 += temp * 2 * 4;
  1365. #endif
  1366. #ifdef LEFT
  1367. off += 1; // number of values in A
  1368. #endif
  1369. #endif
  1370. pc0 += 2;
  1371. pc1 += 2;
  1372. pc2 += 2;
  1373. pc3 += 2;
  1374. }
  1375. #if defined(TRMMKERNEL) && !defined(LEFT)
  1376. off += 4; // number of values in A
  1377. #endif
  1378. l = k << 3;
  1379. B = B + l;
  1380. i = ldc << 3;
  1381. C = C + i;
  1382. }
  1383. if (n & 2)
  1384. {
  1385. pc0 = C;
  1386. pc1 = pc0 + 2 * ldc;
  1387. pa0 = A;
  1388. #if defined(TRMMKERNEL) && defined(LEFT)
  1389. off = offset;
  1390. #endif
  1391. for (i = (m >> 3); i--;)
  1392. {
  1393. #if defined(TRMMKERNEL)
  1394. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1395. pb0 = B;
  1396. #else
  1397. pa0 += off * 2 * 8;
  1398. pb0 = B + off * 2 * 2;
  1399. #endif
  1400. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1401. temp = k - off;
  1402. #elif defined(LEFT)
  1403. temp = off + 8; // number of values in A
  1404. #else
  1405. temp = off + 2; // number of values in B
  1406. #endif
  1407. #else
  1408. pb0 = B;
  1409. temp = k;
  1410. #endif
  1411. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  1412. CGEMM_KERNEL_8X2_MSA(, -, , +, +);
  1413. #endif
  1414. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1415. CGEMM_KERNEL_8X2_MSA(, +, , +, -);
  1416. #endif
  1417. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1418. CGEMM_KERNEL_8X2_MSA(, +, , -, +);
  1419. #endif
  1420. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1421. CGEMM_KERNEL_8X2_MSA(, -, , -, -);
  1422. #endif
  1423. pb0 += 4;
  1424. for (l = (temp - 1); l--;)
  1425. {
  1426. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  1427. CGEMM_KERNEL_8X2_MSA(+, -, +, +,);
  1428. #endif
  1429. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1430. CGEMM_KERNEL_8X2_MSA(+, +, -, +,);
  1431. #endif
  1432. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1433. CGEMM_KERNEL_8X2_MSA(+, +, +, -,);
  1434. #endif
  1435. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1436. CGEMM_KERNEL_8X2_MSA(+, -, -, -,);
  1437. #endif
  1438. pb0 += 4;
  1439. }
  1440. #if defined(TRMMKERNEL)
  1441. CGEMM_TRMM_SCALE_8X2_MSA
  1442. #else
  1443. CGEMM_SCALE_8X2_MSA
  1444. #endif
  1445. #if defined(TRMMKERNEL)
  1446. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1447. temp = k - off;
  1448. #ifdef LEFT
  1449. temp -= 8; // number of values in A
  1450. #else
  1451. temp -= 2; // number of values in B
  1452. #endif
  1453. pa0 += temp * 2 * 8;
  1454. pb0 += temp * 2 * 2;
  1455. #endif
  1456. #ifdef LEFT
  1457. off += 8; // number of values in A
  1458. #endif
  1459. #endif
  1460. }
  1461. if (m & 4)
  1462. {
  1463. #if defined(TRMMKERNEL)
  1464. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1465. pb0 = B;
  1466. #else
  1467. pa0 += off * 2 * 4;
  1468. pb0 = B + off * 2 * 2;
  1469. #endif
  1470. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1471. temp = k - off;
  1472. #elif defined(LEFT)
  1473. temp = off + 4; // number of values in A
  1474. #else
  1475. temp = off + 2; // number of values in B
  1476. #endif
  1477. #else
  1478. pb0 = B;
  1479. temp = k;
  1480. #endif
  1481. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  1482. CGEMM_KERNEL_4X2_MSA(, -, , +, +);
  1483. #endif
  1484. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1485. CGEMM_KERNEL_4X2_MSA(, +, , +, -);
  1486. #endif
  1487. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1488. CGEMM_KERNEL_4X2_MSA(, +, , -, +);
  1489. #endif
  1490. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1491. CGEMM_KERNEL_4X2_MSA(, -, , -, -);
  1492. #endif
  1493. pb0 += 4;
  1494. for (l = (temp - 1); l--;)
  1495. {
  1496. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  1497. CGEMM_KERNEL_4X2_MSA(+, -, +, +,);
  1498. #endif
  1499. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1500. CGEMM_KERNEL_4X2_MSA(+, +, -, +,);
  1501. #endif
  1502. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1503. CGEMM_KERNEL_4X2_MSA(+, +, +, -,);
  1504. #endif
  1505. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1506. CGEMM_KERNEL_4X2_MSA(+, -, -, -,);
  1507. #endif
  1508. pb0 += 4;
  1509. }
  1510. #if defined(TRMMKERNEL)
  1511. CGEMM_TRMM_SCALE_4X2_MSA
  1512. #else
  1513. CGEMM_SCALE_4X2_MSA
  1514. #endif
  1515. #if defined(TRMMKERNEL)
  1516. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1517. temp = k - off;
  1518. #ifdef LEFT
  1519. temp -= 4; // number of values in A
  1520. #else
  1521. temp -= 2; // number of values in B
  1522. #endif
  1523. pa0 += temp * 2 * 4;
  1524. pb0 += temp * 2 * 2;
  1525. #endif
  1526. #ifdef LEFT
  1527. off += 4; // number of values in A
  1528. #endif
  1529. #endif
  1530. }
  1531. if (m & 2)
  1532. {
  1533. #if defined(TRMMKERNEL)
  1534. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1535. pb0 = B;
  1536. #else
  1537. pa0 += off * 2 * 2;
  1538. pb0 = B + off * 2 * 2;
  1539. #endif
  1540. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1541. temp = k - off;
  1542. #elif defined(LEFT)
  1543. temp = off + 2; // number of values in A
  1544. #else
  1545. temp = off + 2; // number of values in B
  1546. #endif
  1547. #else
  1548. pb0 = B;
  1549. temp = k;
  1550. #endif
  1551. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  1552. CGEMM_KERNEL_2X2(, -, , +, +);
  1553. #endif
  1554. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1555. CGEMM_KERNEL_2X2(, +, , +, -);
  1556. #endif
  1557. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1558. CGEMM_KERNEL_2X2(, +, , -, +);
  1559. #endif
  1560. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1561. CGEMM_KERNEL_2X2(, -, , -, -);
  1562. #endif
  1563. pa0 += 4;
  1564. pb0 += 4;
  1565. for (l = (temp - 1); l--;)
  1566. {
  1567. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  1568. CGEMM_KERNEL_2X2(+, -, +, +,);
  1569. #endif
  1570. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1571. CGEMM_KERNEL_2X2(+, +, -, +,);
  1572. #endif
  1573. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1574. CGEMM_KERNEL_2X2(+, +, +, -,);
  1575. #endif
  1576. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1577. CGEMM_KERNEL_2X2(+, -, -, -,);
  1578. #endif
  1579. pa0 += 4;
  1580. pb0 += 4;
  1581. }
  1582. #if defined(TRMMKERNEL)
  1583. CGEMM_TRMM_SCALE_2X2
  1584. #else
  1585. CGEMM_SCALE_2X2
  1586. #endif
  1587. #if defined(TRMMKERNEL)
  1588. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1589. temp = k - off;
  1590. #ifdef LEFT
  1591. temp -= 2; // number of values in A
  1592. #else
  1593. temp -= 2; // number of values in B
  1594. #endif
  1595. pa0 += temp * 2 * 2;
  1596. pb0 += temp * 2 * 2;
  1597. #endif
  1598. #ifdef LEFT
  1599. off += 2; // number of values in A
  1600. #endif
  1601. #endif
  1602. pc0 += 4;
  1603. pc1 += 4;
  1604. }
  1605. if (m & 1)
  1606. {
  1607. #if defined(TRMMKERNEL)
  1608. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1609. pb0 = B;
  1610. #else
  1611. pa0 += off * 2 * 1;
  1612. pb0 = B + off * 2 * 2;
  1613. #endif
  1614. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1615. temp = k - off;
  1616. #elif defined(LEFT)
  1617. temp = off + 1; // number of values in A
  1618. #else
  1619. temp = off + 2; // number of values in B
  1620. #endif
  1621. #else
  1622. pb0 = B;
  1623. temp = k;
  1624. #endif
  1625. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  1626. CGEMM_KERNEL_1X2(, -, , +, +);
  1627. #endif
  1628. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1629. CGEMM_KERNEL_1X2(, +, , +, -);
  1630. #endif
  1631. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1632. CGEMM_KERNEL_1X2(, +, , -, +);
  1633. #endif
  1634. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1635. CGEMM_KERNEL_1X2(, -, , -, -);
  1636. #endif
  1637. pa0 += 2;
  1638. pb0 += 4;
  1639. for (l = (temp - 1); l--;)
  1640. {
  1641. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  1642. CGEMM_KERNEL_1X2(+, -, +, +,);
  1643. #endif
  1644. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1645. CGEMM_KERNEL_1X2(+, +, -, +,);
  1646. #endif
  1647. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1648. CGEMM_KERNEL_1X2(+, +, +, -,);
  1649. #endif
  1650. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1651. CGEMM_KERNEL_1X2(+, -, -, -,);
  1652. #endif
  1653. pa0 += 2;
  1654. pb0 += 4;
  1655. }
  1656. #if defined(TRMMKERNEL)
  1657. CGEMM_TRMM_SCALE_1X2
  1658. #else
  1659. CGEMM_SCALE_1X2
  1660. #endif
  1661. #if defined(TRMMKERNEL)
  1662. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1663. temp = k - off;
  1664. #ifdef LEFT
  1665. temp -= 1; // number of values in A
  1666. #else
  1667. temp -= 2; // number of values in B
  1668. #endif
  1669. pa0 += temp * 2 * 1;
  1670. pb0 += temp * 2 * 2;
  1671. #endif
  1672. #ifdef LEFT
  1673. off += 1; // number of values in A
  1674. #endif
  1675. #endif
  1676. pc0 += 2;
  1677. pc1 += 2;
  1678. }
  1679. #if defined(TRMMKERNEL) && !defined(LEFT)
  1680. off += 2; // number of values in A
  1681. #endif
  1682. l = k << 2;
  1683. B = B + l;
  1684. i = ldc << 2;
  1685. C = C + i;
  1686. }
  1687. if (n & 1)
  1688. {
  1689. pc0 = C;
  1690. pa0 = A;
  1691. #if defined(TRMMKERNEL) && defined(LEFT)
  1692. off = offset;
  1693. #endif
  1694. for (i = (m >> 3); i--;)
  1695. {
  1696. #if defined(TRMMKERNEL)
  1697. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1698. pb0 = B;
  1699. #else
  1700. pa0 += off * 2 * 8;
  1701. pb0 = B + off * 2 * 1;
  1702. #endif
  1703. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1704. temp = k - off;
  1705. #elif defined(LEFT)
  1706. temp = off + 8; // number of values in A
  1707. #else
  1708. temp = off + 1; // number of values in B
  1709. #endif
  1710. #else
  1711. pb0 = B;
  1712. temp = k;
  1713. #endif
  1714. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  1715. CGEMM_KERNEL_8X1_MSA(, -, , +, +);
  1716. #endif
  1717. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1718. CGEMM_KERNEL_8X1_MSA(, +, , +, -);
  1719. #endif
  1720. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1721. CGEMM_KERNEL_8X1_MSA(, +, , -, +);
  1722. #endif
  1723. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1724. CGEMM_KERNEL_8X1_MSA(, -, , -, -);
  1725. #endif
  1726. pb0 += 2;
  1727. for (l = (temp - 1); l--;)
  1728. {
  1729. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  1730. CGEMM_KERNEL_8X1_MSA(+, -, +, +,);
  1731. #endif
  1732. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1733. CGEMM_KERNEL_8X1_MSA(+, +, -, +,);
  1734. #endif
  1735. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1736. CGEMM_KERNEL_8X1_MSA(+, +, +, -,);
  1737. #endif
  1738. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1739. CGEMM_KERNEL_8X1_MSA(+, -, -, -,);
  1740. #endif
  1741. pb0 += 2;
  1742. }
  1743. #if defined(TRMMKERNEL)
  1744. CGEMM_TRMM_SCALE_8X1_MSA
  1745. #else
  1746. CGEMM_SCALE_8X1_MSA
  1747. #endif
  1748. #if defined(TRMMKERNEL)
  1749. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1750. temp = k - off;
  1751. #ifdef LEFT
  1752. temp -= 8; // number of values in A
  1753. #else
  1754. temp -= 1; // number of values in B
  1755. #endif
  1756. pa0 += temp * 2 * 8;
  1757. pb0 += temp * 2 * 1;
  1758. #endif
  1759. #ifdef LEFT
  1760. off += 8; // number of values in A
  1761. #endif
  1762. #endif
  1763. }
  1764. if (m & 4)
  1765. {
  1766. #if defined(TRMMKERNEL)
  1767. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1768. pb0 = B;
  1769. #else
  1770. pa0 += off * 2 * 4;
  1771. pb0 = B + off * 2 * 1;
  1772. #endif
  1773. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1774. temp = k - off;
  1775. #elif defined(LEFT)
  1776. temp = off + 4; // number of values in A
  1777. #else
  1778. temp = off + 1; // number of values in B
  1779. #endif
  1780. #else
  1781. pb0 = B;
  1782. temp = k;
  1783. #endif
  1784. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  1785. CGEMM_KERNEL_4X1_MSA(, -, , +, +);
  1786. #endif
  1787. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1788. CGEMM_KERNEL_4X1_MSA(, +, , +, -);
  1789. #endif
  1790. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1791. CGEMM_KERNEL_4X1_MSA(, +, , -, +);
  1792. #endif
  1793. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1794. CGEMM_KERNEL_4X1_MSA(, -, , -, -);
  1795. #endif
  1796. pb0 += 2;
  1797. for (l = (temp - 1); l--;)
  1798. {
  1799. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  1800. CGEMM_KERNEL_4X1_MSA(+, -, +, +,);
  1801. #endif
  1802. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1803. CGEMM_KERNEL_4X1_MSA(+, +, -, +,);
  1804. #endif
  1805. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1806. CGEMM_KERNEL_4X1_MSA(+, +, +, -,);
  1807. #endif
  1808. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1809. CGEMM_KERNEL_4X1_MSA(+, -, -, -,);
  1810. #endif
  1811. pb0 += 2;
  1812. }
  1813. #if defined(TRMMKERNEL)
  1814. CGEMM_TRMM_SCALE_4X1_MSA
  1815. #else
  1816. CGEMM_SCALE_4X1_MSA
  1817. #endif
  1818. #if defined(TRMMKERNEL)
  1819. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1820. temp = k - off;
  1821. #ifdef LEFT
  1822. temp -= 4; // number of values in A
  1823. #else
  1824. temp -= 1; // number of values in B
  1825. #endif
  1826. pa0 += temp * 2 * 4;
  1827. pb0 += temp * 2 * 1;
  1828. #endif
  1829. #ifdef LEFT
  1830. off += 4; // number of values in A
  1831. #endif
  1832. #endif
  1833. }
  1834. if (m & 2)
  1835. {
  1836. #if defined(TRMMKERNEL)
  1837. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1838. pb0 = B;
  1839. #else
  1840. pa0 += off * 2 * 2;
  1841. pb0 = B + off * 2 * 1;
  1842. #endif
  1843. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1844. temp = k - off;
  1845. #elif defined(LEFT)
  1846. temp = off + 2; // number of values in A
  1847. #else
  1848. temp = off + 1; // number of values in B
  1849. #endif
  1850. #else
  1851. pb0 = B;
  1852. temp = k;
  1853. #endif
  1854. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  1855. CGEMM_KERNEL_2X1(, -, , +, +);
  1856. #endif
  1857. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1858. CGEMM_KERNEL_2X1(, +, , +, -);
  1859. #endif
  1860. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1861. CGEMM_KERNEL_2X1(, +, , -, +);
  1862. #endif
  1863. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1864. CGEMM_KERNEL_2X1(, -, , -, -);
  1865. #endif
  1866. pa0 += 4;
  1867. pb0 += 2;
  1868. for (l = (temp - 1); l--;)
  1869. {
  1870. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  1871. CGEMM_KERNEL_2X1(+, -, +, +,);
  1872. #endif
  1873. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1874. CGEMM_KERNEL_2X1(+, +, -, +,);
  1875. #endif
  1876. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1877. CGEMM_KERNEL_2X1(+, +, +, -,);
  1878. #endif
  1879. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1880. CGEMM_KERNEL_2X1(+, -, -, -,);
  1881. #endif
  1882. pa0 += 4;
  1883. pb0 += 2;
  1884. }
  1885. #if defined(TRMMKERNEL)
  1886. CGEMM_TRMM_SCALE_2X1
  1887. #else
  1888. CGEMM_SCALE_2X1
  1889. #endif
  1890. #if defined(TRMMKERNEL)
  1891. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1892. temp = k - off;
  1893. #ifdef LEFT
  1894. temp -= 2; // number of values in A
  1895. #else
  1896. temp -= 1; // number of values in B
  1897. #endif
  1898. pa0 += temp * 2 * 2;
  1899. pb0 += temp * 2 * 1;
  1900. #endif
  1901. #ifdef LEFT
  1902. off += 2; // number of values in A
  1903. #endif
  1904. #endif
  1905. pc0 += 4;
  1906. }
  1907. if (m & 1)
  1908. {
  1909. #if defined(TRMMKERNEL)
  1910. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1911. pb0 = B;
  1912. #else
  1913. pa0 += off * 2 * 1;
  1914. pb0 = B + off * 2 * 1;
  1915. #endif
  1916. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1917. temp = k - off;
  1918. #elif defined(LEFT)
  1919. temp = off + 1; // number of values in A
  1920. #else
  1921. temp = off + 1; // number of values in B
  1922. #endif
  1923. #else
  1924. pb0 = B;
  1925. temp = k;
  1926. #endif
  1927. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  1928. CGEMM_KERNEL_1X1(, -, , +, +);
  1929. #endif
  1930. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1931. CGEMM_KERNEL_1X1(, +, , +, -);
  1932. #endif
  1933. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1934. CGEMM_KERNEL_1X1(, +, , -, +);
  1935. #endif
  1936. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1937. CGEMM_KERNEL_1X1(, -, , -, -);
  1938. #endif
  1939. pa0 += 2;
  1940. pb0 += 2;
  1941. for (l = (temp - 1); l--;)
  1942. {
  1943. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  1944. CGEMM_KERNEL_1X1(+, -, +, +,);
  1945. #endif
  1946. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1947. CGEMM_KERNEL_1X1(+, +, -, +,);
  1948. #endif
  1949. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1950. CGEMM_KERNEL_1X1(+, +, +, -,);
  1951. #endif
  1952. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1953. CGEMM_KERNEL_1X1(+, -, -, -,);
  1954. #endif
  1955. pa0 += 2;
  1956. pb0 += 2;
  1957. }
  1958. #if defined(TRMMKERNEL)
  1959. CGEMM_TRMM_SCALE_1X1
  1960. #else
  1961. CGEMM_SCALE_1X1
  1962. #endif
  1963. #if defined(TRMMKERNEL)
  1964. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1965. temp = k - off;
  1966. #ifdef LEFT
  1967. temp -= 1; // number of values in A
  1968. #else
  1969. temp -= 1; // number of values in B
  1970. #endif
  1971. pa0 += temp * 2 * 1;
  1972. pb0 += temp * 2 * 1;
  1973. #endif
  1974. #ifdef LEFT
  1975. off += 1; // number of values in A
  1976. #endif
  1977. #endif
  1978. pc0 += 2;
  1979. }
  1980. #if defined(TRMMKERNEL) && !defined(LEFT)
  1981. off += 1; // number of values in B
  1982. #endif
  1983. l = k << 1;
  1984. B = B + l;
  1985. i = ldc << 1;
  1986. C = C + i;
  1987. }
  1988. return 0;
  1989. }