
ctrmm_kernel_8x8_zvl256b.c 86 kB

  1. /*
  2. AUTOGENERATED KERNEL
  3. Settings:
  4. LMUL=1
  5. M=8
  6. M_tail_scalar_from=1
  7. N=8
  8. __riscv_='__riscv_'
  9. complex=True
  10. conjugate=False
  11. cpu='zvl256b'
  12. force_acc_double=False
  13. index_type='BLASLONG'
  14. op='trmm'
  15. param_precision='float'
  16. reg_width_bits=256
  17. tail_policy=''
  18. trace=False
  19. Derived:
  20. ELEN_ACC=32
  21. ELEN_PARAM=32
  22. LMUL_ACC=1
  23. VFMACC='__riscv_vfmacc_vf_f32m1'
  24. VFMUL='__riscv_vfmul_vf_f32m1'
  25. VLEV='__riscv_vle32_v_f32m1'
  26. VLSEV='__riscv_vlse32_v_f32m1'
  27. VMACC_TO_ACC='__riscv_vfmacc_vf_f32m1'
  28. VMUL_TO_ACC='__riscv_vfmul_vf_f32m1'
  29. VSETVL='__riscv_vsetvl_e32m1'
  30. VSEV='__riscv_vse32_v_f32m1'
  31. VSSEV='__riscv_vsse32_v_f32m1'
  32. acc_vector_t='vfloat32m1_t'
  33. output='ctrmm_kernel_8x8_zvl256b.c'
  34. param_scalar_t='float'
  35. param_vector_t='vfloat32m1_t'
  36. */
  37. #include "common.h"
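// The four preprocessor groups below select the sign constants S0..S3 (used in the
// scalar one-row tail) and the fused multiply-add variants VFMACC_RR / VFMACC_RI so
// that the complex product applies the conjugation implied by the kernel-name suffix:
//   NN/NT/TN/TT: re = Ar*Br - Ai*Bi,  im = Ar*Bi + Ai*Br    (no conjugation)
//   NR/NC/TR/TC: re = Ar*Br + Ai*Bi,  im = Ai*Br - Ar*Bi    (B conjugated)
//   RN/RT/CN/CT: re = Ar*Br + Ai*Bi,  im = Ar*Bi - Ai*Br    (A conjugated)
//   RR/RC/CR/CC: re = Ar*Br - Ai*Bi,  im = -(Ar*Bi + Ai*Br) (both conjugated)
// VFMACC_RR folds the Ar*Br term into an accumulator preloaded with Ai*Bi, and
// VFMACC_RI folds the Ai*Br term into one preloaded with Ar*Bi.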
  38. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  39. #define S0 1
  40. #define S1 -1
  41. #define S2 1
  42. #define S3 1
  43. #define VFMACC_RR __riscv_vfmsac
  44. #define VFMACC_RI __riscv_vfmacc
  45. #endif
  46. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  47. #define S0 1
  48. #define S1 1
  49. #define S2 1
  50. #define S3 -1
  51. #define VFMACC_RR __riscv_vfmacc
  52. #define VFMACC_RI __riscv_vfmsac
  53. #endif
  54. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  55. #define S0 1
  56. #define S1 1
  57. #define S2 -1
  58. #define S3 1
  59. #define VFMACC_RR __riscv_vfmacc
  60. #define VFMACC_RI __riscv_vfnmsac
  61. #endif
  62. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  63. #define S0 1
  64. #define S1 -1
  65. #define S2 -1
  66. #define S3 -1
  67. #define VFMACC_RR __riscv_vfmsac
  68. #define VFMACC_RI __riscv_vfnmacc
  69. #endif
  70. #if defined(LEFT) != defined(TRANSA)
  71. #define BACKWARDS
  72. #endif
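// TRMM multiplies by a triangular matrix, so for each output tile only part of the
// K range contributes. Each block below computes 'off' (the position of the diagonal
// relative to the tile, derived from 'offset' and the tile's starting row or column)
// and either skips the leading 'off' iterations (BACKWARDS: advance ai/bi and shrink
// pass_K) or caps pass_K at 'off' plus the tile width, so the zero triangle is never read.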
  73. int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc, BLASLONG offset)
  74. {
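// Computes one M x N block of C = alpha * op(A) * op(B) with a triangular operand.
// A and B are expected to be the packed panels produced by the corresponding copy
// routines, stored as interleaved real/imaginary float pairs; C is column-major with
// leading dimension ldc counted in complex elements; 'offset' locates the diagonal
// of the triangular operand for this block.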
  75. BLASLONG gvl = 0;
  76. BLASLONG m_top = 0;
  77. BLASLONG n_top = 0;
  78. // -- MAIN PASS
  79. for (BLASLONG j=0; j<N/8; j+=1) {
  80. m_top = 0;
  81. BLASLONG gvl = __riscv_vsetvl_e32m1(8);
  82. for (BLASLONG i=0; i<M/8; i+=1) {
  83. BLASLONG ai=m_top*K*2;
  84. BLASLONG bi=n_top*K*2;
  85. BLASLONG pass_K = K;
  86. #ifdef LEFT
  87. BLASLONG off = offset + m_top;
  88. #else
  89. BLASLONG off = -offset + n_top;
  90. #endif
  91. #ifdef BACKWARDS
  92. ai += off*8*2;
  93. bi += off*8*2;
  94. pass_K -= off;
  95. #else
  96. #ifdef LEFT
  97. pass_K = off + 8;
  98. #else
  99. pass_K = off + 8;
  100. #endif
  101. #endif
  102. float B0r = B[bi+0*2+0];
  103. float B0i = B[bi+0*2+1];
  104. float B1r = B[bi+1*2+0];
  105. float B1i = B[bi+1*2+1];
  106. float B2r = B[bi+2*2+0];
  107. float B2i = B[bi+2*2+1];
  108. float B3r = B[bi+3*2+0];
  109. float B3i = B[bi+3*2+1];
  110. float B4r = B[bi+4*2+0];
  111. float B4i = B[bi+4*2+1];
  112. float B5r = B[bi+5*2+0];
  113. float B5i = B[bi+5*2+1];
  114. float B6r = B[bi+6*2+0];
  115. float B6i = B[bi+6*2+1];
  116. float B7r = B[bi+7*2+0];
  117. float B7i = B[bi+7*2+1];
  118. bi += 8*2;
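// A is packed with real/imaginary parts interleaved, so a strided load with a stride
// of one complex element (sizeof(FLOAT)*2) splits one packed column of A into a
// vector of 8 real parts (A0r) and a vector of 8 imaginary parts (A0i).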
  119. vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
  120. vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
  121. ai += 8*2;
  122. // 2 vector regs to hold A array contents, 16 regs to hold values accumulated over k
  123. // leaving 14 vector registers for temporaries
  124. // performing 4 operations between reuses of temporaries
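// Complex multiply-accumulate per column of B: each accumulator is preloaded with the
// Ai*Bi (resp. Ar*Bi) term via vfmul, then VFMACC_RR / VFMACC_RI fold in Ar*Br
// (resp. Ai*Br) with the sign chosen by the conjugation macros above; e.g. for the
// non-conjugated case: tmp_r = Br*Ar - Ai*Bi = re(A*B), tmp_i = Br*Ai + Ar*Bi = im(A*B).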
  125. vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
  126. vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
  127. vfloat32m1_t tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
  128. vfloat32m1_t tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
  129. vfloat32m1_t tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
  130. vfloat32m1_t tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
  131. vfloat32m1_t tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
  132. vfloat32m1_t tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
  133. tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
  134. tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
  135. tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
  136. tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
  137. tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
  138. tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
  139. tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
  140. tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
  141. vfloat32m1_t ACC0r = tmp0r;
  142. vfloat32m1_t ACC0i = tmp0i;
  143. vfloat32m1_t ACC1r = tmp1r;
  144. vfloat32m1_t ACC1i = tmp1i;
  145. vfloat32m1_t ACC2r = tmp2r;
  146. vfloat32m1_t ACC2i = tmp2i;
  147. vfloat32m1_t ACC3r = tmp3r;
  148. vfloat32m1_t ACC3i = tmp3i;
  149. tmp0r = __riscv_vfmul_vf_f32m1( A0i, B4i, gvl);
  150. tmp0i = __riscv_vfmul_vf_f32m1( A0r, B4i, gvl);
  151. tmp1r = __riscv_vfmul_vf_f32m1( A0i, B5i, gvl);
  152. tmp1i = __riscv_vfmul_vf_f32m1( A0r, B5i, gvl);
  153. tmp2r = __riscv_vfmul_vf_f32m1( A0i, B6i, gvl);
  154. tmp2i = __riscv_vfmul_vf_f32m1( A0r, B6i, gvl);
  155. tmp3r = __riscv_vfmul_vf_f32m1( A0i, B7i, gvl);
  156. tmp3i = __riscv_vfmul_vf_f32m1( A0r, B7i, gvl);
  157. tmp0r = VFMACC_RR( tmp0r, B4r, A0r, gvl);
  158. tmp0i = VFMACC_RI( tmp0i, B4r, A0i, gvl);
  159. tmp1r = VFMACC_RR( tmp1r, B5r, A0r, gvl);
  160. tmp1i = VFMACC_RI( tmp1i, B5r, A0i, gvl);
  161. tmp2r = VFMACC_RR( tmp2r, B6r, A0r, gvl);
  162. tmp2i = VFMACC_RI( tmp2i, B6r, A0i, gvl);
  163. tmp3r = VFMACC_RR( tmp3r, B7r, A0r, gvl);
  164. tmp3i = VFMACC_RI( tmp3i, B7r, A0i, gvl);
  165. vfloat32m1_t ACC4r = tmp0r;
  166. vfloat32m1_t ACC4i = tmp0i;
  167. vfloat32m1_t ACC5r = tmp1r;
  168. vfloat32m1_t ACC5i = tmp1i;
  169. vfloat32m1_t ACC6r = tmp2r;
  170. vfloat32m1_t ACC6i = tmp2i;
  171. vfloat32m1_t ACC7r = tmp3r;
  172. vfloat32m1_t ACC7i = tmp3i;
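// k = 0 was peeled off above to initialize the accumulators directly from the first
// product; the remaining pass_K-1 iterations compute the same products into the tmp
// registers and add them into ACC0..ACC7 with vfadd.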
  173. for(BLASLONG k=1; k<pass_K; k++) {
  174. B0r = B[bi+0*2+0];
  175. B0i = B[bi+0*2+1];
  176. B1r = B[bi+1*2+0];
  177. B1i = B[bi+1*2+1];
  178. B2r = B[bi+2*2+0];
  179. B2i = B[bi+2*2+1];
  180. B3r = B[bi+3*2+0];
  181. B3i = B[bi+3*2+1];
  182. B4r = B[bi+4*2+0];
  183. B4i = B[bi+4*2+1];
  184. B5r = B[bi+5*2+0];
  185. B5i = B[bi+5*2+1];
  186. B6r = B[bi+6*2+0];
  187. B6i = B[bi+6*2+1];
  188. B7r = B[bi+7*2+0];
  189. B7i = B[bi+7*2+1];
  190. bi += 8*2;
  191. A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
  192. A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
  193. ai += 8*2;
  194. tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
  195. tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
  196. tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
  197. tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
  198. tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
  199. tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
  200. tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
  201. tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
  202. tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
  203. tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
  204. tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
  205. tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
  206. tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
  207. tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
  208. tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
  209. tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
  210. ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
  211. ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
  212. ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
  213. ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
  214. ACC2r = __riscv_vfadd( ACC2r, tmp2r, gvl);
  215. ACC2i = __riscv_vfadd( ACC2i, tmp2i, gvl);
  216. ACC3r = __riscv_vfadd( ACC3r, tmp3r, gvl);
  217. ACC3i = __riscv_vfadd( ACC3i, tmp3i, gvl);
  218. tmp0r = __riscv_vfmul_vf_f32m1( A0i, B4i, gvl);
  219. tmp0i = __riscv_vfmul_vf_f32m1( A0r, B4i, gvl);
  220. tmp1r = __riscv_vfmul_vf_f32m1( A0i, B5i, gvl);
  221. tmp1i = __riscv_vfmul_vf_f32m1( A0r, B5i, gvl);
  222. tmp2r = __riscv_vfmul_vf_f32m1( A0i, B6i, gvl);
  223. tmp2i = __riscv_vfmul_vf_f32m1( A0r, B6i, gvl);
  224. tmp3r = __riscv_vfmul_vf_f32m1( A0i, B7i, gvl);
  225. tmp3i = __riscv_vfmul_vf_f32m1( A0r, B7i, gvl);
  226. tmp0r = VFMACC_RR( tmp0r, B4r, A0r, gvl);
  227. tmp0i = VFMACC_RI( tmp0i, B4r, A0i, gvl);
  228. tmp1r = VFMACC_RR( tmp1r, B5r, A0r, gvl);
  229. tmp1i = VFMACC_RI( tmp1i, B5r, A0i, gvl);
  230. tmp2r = VFMACC_RR( tmp2r, B6r, A0r, gvl);
  231. tmp2i = VFMACC_RI( tmp2i, B6r, A0i, gvl);
  232. tmp3r = VFMACC_RR( tmp3r, B7r, A0r, gvl);
  233. tmp3i = VFMACC_RI( tmp3i, B7r, A0i, gvl);
  234. ACC4r = __riscv_vfadd( ACC4r, tmp0r, gvl);
  235. ACC4i = __riscv_vfadd( ACC4i, tmp0i, gvl);
  236. ACC5r = __riscv_vfadd( ACC5r, tmp1r, gvl);
  237. ACC5i = __riscv_vfadd( ACC5i, tmp1i, gvl);
  238. ACC6r = __riscv_vfadd( ACC6r, tmp2r, gvl);
  239. ACC6i = __riscv_vfadd( ACC6i, tmp2i, gvl);
  240. ACC7r = __riscv_vfadd( ACC7r, tmp3r, gvl);
  241. ACC7i = __riscv_vfadd( ACC7i, tmp3i, gvl);
  242. }
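// Scale the accumulated tile by the complex alpha and write it back:
//   Cr = alphar*ACCr - alphai*ACCi,  Ci = alphar*ACCi + alphai*ACCr
// (vfmul followed by vfnmsac/vfmacc). TRMM overwrites C, so the old contents are not
// read; the strided stores re-interleave real and imaginary parts, one column of C
// (ldc complex elements apart) at a time.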
  243. BLASLONG ci=n_top*ldc+m_top;
  244. vfloat32m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
  245. vfloat32m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
  246. vfloat32m1_t C1r = __riscv_vfmul( ACC1r, alphar, gvl );
  247. vfloat32m1_t C1i = __riscv_vfmul( ACC1i, alphar, gvl );
  248. vfloat32m1_t C2r = __riscv_vfmul( ACC2r, alphar, gvl );
  249. vfloat32m1_t C2i = __riscv_vfmul( ACC2i, alphar, gvl );
  250. vfloat32m1_t C3r = __riscv_vfmul( ACC3r, alphar, gvl );
  251. vfloat32m1_t C3i = __riscv_vfmul( ACC3i, alphar, gvl );
  252. vfloat32m1_t C4r = __riscv_vfmul( ACC4r, alphar, gvl );
  253. vfloat32m1_t C4i = __riscv_vfmul( ACC4i, alphar, gvl );
  254. vfloat32m1_t C5r = __riscv_vfmul( ACC5r, alphar, gvl );
  255. vfloat32m1_t C5i = __riscv_vfmul( ACC5i, alphar, gvl );
  256. vfloat32m1_t C6r = __riscv_vfmul( ACC6r, alphar, gvl );
  257. vfloat32m1_t C6i = __riscv_vfmul( ACC6i, alphar, gvl );
  258. vfloat32m1_t C7r = __riscv_vfmul( ACC7r, alphar, gvl );
  259. vfloat32m1_t C7i = __riscv_vfmul( ACC7i, alphar, gvl );
  260. C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
  261. C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
  262. C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
  263. C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
  264. C2r = __riscv_vfnmsac( C2r, alphai, ACC2i, gvl );
  265. C2i = __riscv_vfmacc ( C2i, alphai, ACC2r, gvl );
  266. C3r = __riscv_vfnmsac( C3r, alphai, ACC3i, gvl );
  267. C3i = __riscv_vfmacc ( C3i, alphai, ACC3r, gvl );
  268. C4r = __riscv_vfnmsac( C4r, alphai, ACC4i, gvl );
  269. C4i = __riscv_vfmacc ( C4i, alphai, ACC4r, gvl );
  270. C5r = __riscv_vfnmsac( C5r, alphai, ACC5i, gvl );
  271. C5i = __riscv_vfmacc ( C5i, alphai, ACC5r, gvl );
  272. C6r = __riscv_vfnmsac( C6r, alphai, ACC6i, gvl );
  273. C6i = __riscv_vfmacc ( C6i, alphai, ACC6r, gvl );
  274. C7r = __riscv_vfnmsac( C7r, alphai, ACC7i, gvl );
  275. C7i = __riscv_vfmacc ( C7i, alphai, ACC7r, gvl );
  276. __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
  277. __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
  278. ci += ldc-gvl*0;
  279. __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
  280. __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
  281. ci += ldc-gvl*0;
  282. __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C2r, gvl);
  283. __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C2i, gvl);
  284. ci += ldc-gvl*0;
  285. __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C3r, gvl);
  286. __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C3i, gvl);
  287. ci += ldc-gvl*0;
  288. __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C4r, gvl);
  289. __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C4i, gvl);
  290. ci += ldc-gvl*0;
  291. __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C5r, gvl);
  292. __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C5i, gvl);
  293. ci += ldc-gvl*0;
  294. __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C6r, gvl);
  295. __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C6i, gvl);
  296. ci += ldc-gvl*0;
  297. __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C7r, gvl);
  298. __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C7i, gvl);
  299. m_top += 8;
  300. }
  301. // -- tails for main pass
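// The remainder of M is handled the same way: blocks of 4 and 2 rows reuse the code
// above with a shorter vector length, and a final single row falls back to scalar
// arithmetic.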
  302. if( M & 4 ) {
  303. gvl = __riscv_vsetvl_e32m1(4);
  304. BLASLONG ai=m_top*K*2;
  305. BLASLONG bi=n_top*K*2;
  306. BLASLONG pass_K = K;
  307. #ifdef LEFT
  308. BLASLONG off = offset + m_top;
  309. #else
  310. BLASLONG off = -offset + n_top;
  311. #endif
  312. #ifdef BACKWARDS
  313. ai += off*4*2;
  314. bi += off*8*2;
  315. pass_K -= off;
  316. #else
  317. #ifdef LEFT
  318. pass_K = off + 4;
  319. #else
  320. pass_K = off + 8;
  321. #endif
  322. #endif
  323. float B0r = B[bi+0*2+0];
  324. float B0i = B[bi+0*2+1];
  325. float B1r = B[bi+1*2+0];
  326. float B1i = B[bi+1*2+1];
  327. float B2r = B[bi+2*2+0];
  328. float B2i = B[bi+2*2+1];
  329. float B3r = B[bi+3*2+0];
  330. float B3i = B[bi+3*2+1];
  331. float B4r = B[bi+4*2+0];
  332. float B4i = B[bi+4*2+1];
  333. float B5r = B[bi+5*2+0];
  334. float B5i = B[bi+5*2+1];
  335. float B6r = B[bi+6*2+0];
  336. float B6i = B[bi+6*2+1];
  337. float B7r = B[bi+7*2+0];
  338. float B7i = B[bi+7*2+1];
  339. bi += 8*2;
  340. vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
  341. vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
  342. ai += 4*2;
  343. // 2 vector regs to hold A array contents, 16 regs to hold values accumulated over k
  344. // leaving 14 vector registers for temporaries
  345. // performing 4 operations between reuses of temporaries
  346. vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
  347. vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
  348. vfloat32m1_t tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
  349. vfloat32m1_t tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
  350. vfloat32m1_t tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
  351. vfloat32m1_t tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
  352. vfloat32m1_t tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
  353. vfloat32m1_t tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
  354. tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
  355. tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
  356. tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
  357. tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
  358. tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
  359. tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
  360. tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
  361. tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
  362. vfloat32m1_t ACC0r = tmp0r;
  363. vfloat32m1_t ACC0i = tmp0i;
  364. vfloat32m1_t ACC1r = tmp1r;
  365. vfloat32m1_t ACC1i = tmp1i;
  366. vfloat32m1_t ACC2r = tmp2r;
  367. vfloat32m1_t ACC2i = tmp2i;
  368. vfloat32m1_t ACC3r = tmp3r;
  369. vfloat32m1_t ACC3i = tmp3i;
  370. tmp0r = __riscv_vfmul_vf_f32m1( A0i, B4i, gvl);
  371. tmp0i = __riscv_vfmul_vf_f32m1( A0r, B4i, gvl);
  372. tmp1r = __riscv_vfmul_vf_f32m1( A0i, B5i, gvl);
  373. tmp1i = __riscv_vfmul_vf_f32m1( A0r, B5i, gvl);
  374. tmp2r = __riscv_vfmul_vf_f32m1( A0i, B6i, gvl);
  375. tmp2i = __riscv_vfmul_vf_f32m1( A0r, B6i, gvl);
  376. tmp3r = __riscv_vfmul_vf_f32m1( A0i, B7i, gvl);
  377. tmp3i = __riscv_vfmul_vf_f32m1( A0r, B7i, gvl);
  378. tmp0r = VFMACC_RR( tmp0r, B4r, A0r, gvl);
  379. tmp0i = VFMACC_RI( tmp0i, B4r, A0i, gvl);
  380. tmp1r = VFMACC_RR( tmp1r, B5r, A0r, gvl);
  381. tmp1i = VFMACC_RI( tmp1i, B5r, A0i, gvl);
  382. tmp2r = VFMACC_RR( tmp2r, B6r, A0r, gvl);
  383. tmp2i = VFMACC_RI( tmp2i, B6r, A0i, gvl);
  384. tmp3r = VFMACC_RR( tmp3r, B7r, A0r, gvl);
  385. tmp3i = VFMACC_RI( tmp3i, B7r, A0i, gvl);
  386. vfloat32m1_t ACC4r = tmp0r;
  387. vfloat32m1_t ACC4i = tmp0i;
  388. vfloat32m1_t ACC5r = tmp1r;
  389. vfloat32m1_t ACC5i = tmp1i;
  390. vfloat32m1_t ACC6r = tmp2r;
  391. vfloat32m1_t ACC6i = tmp2i;
  392. vfloat32m1_t ACC7r = tmp3r;
  393. vfloat32m1_t ACC7i = tmp3i;
  394. for(BLASLONG k=1; k<pass_K; k++) {
  395. B0r = B[bi+0*2+0];
  396. B0i = B[bi+0*2+1];
  397. B1r = B[bi+1*2+0];
  398. B1i = B[bi+1*2+1];
  399. B2r = B[bi+2*2+0];
  400. B2i = B[bi+2*2+1];
  401. B3r = B[bi+3*2+0];
  402. B3i = B[bi+3*2+1];
  403. B4r = B[bi+4*2+0];
  404. B4i = B[bi+4*2+1];
  405. B5r = B[bi+5*2+0];
  406. B5i = B[bi+5*2+1];
  407. B6r = B[bi+6*2+0];
  408. B6i = B[bi+6*2+1];
  409. B7r = B[bi+7*2+0];
  410. B7i = B[bi+7*2+1];
  411. bi += 8*2;
  412. A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
  413. A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
  414. ai += 4*2;
  415. tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
  416. tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
  417. tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
  418. tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
  419. tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
  420. tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
  421. tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
  422. tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
  423. tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
  424. tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
  425. tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
  426. tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
  427. tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
  428. tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
  429. tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
  430. tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
  431. ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
  432. ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
  433. ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
  434. ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
  435. ACC2r = __riscv_vfadd( ACC2r, tmp2r, gvl);
  436. ACC2i = __riscv_vfadd( ACC2i, tmp2i, gvl);
  437. ACC3r = __riscv_vfadd( ACC3r, tmp3r, gvl);
  438. ACC3i = __riscv_vfadd( ACC3i, tmp3i, gvl);
  439. tmp0r = __riscv_vfmul_vf_f32m1( A0i, B4i, gvl);
  440. tmp0i = __riscv_vfmul_vf_f32m1( A0r, B4i, gvl);
  441. tmp1r = __riscv_vfmul_vf_f32m1( A0i, B5i, gvl);
  442. tmp1i = __riscv_vfmul_vf_f32m1( A0r, B5i, gvl);
  443. tmp2r = __riscv_vfmul_vf_f32m1( A0i, B6i, gvl);
  444. tmp2i = __riscv_vfmul_vf_f32m1( A0r, B6i, gvl);
  445. tmp3r = __riscv_vfmul_vf_f32m1( A0i, B7i, gvl);
  446. tmp3i = __riscv_vfmul_vf_f32m1( A0r, B7i, gvl);
  447. tmp0r = VFMACC_RR( tmp0r, B4r, A0r, gvl);
  448. tmp0i = VFMACC_RI( tmp0i, B4r, A0i, gvl);
  449. tmp1r = VFMACC_RR( tmp1r, B5r, A0r, gvl);
  450. tmp1i = VFMACC_RI( tmp1i, B5r, A0i, gvl);
  451. tmp2r = VFMACC_RR( tmp2r, B6r, A0r, gvl);
  452. tmp2i = VFMACC_RI( tmp2i, B6r, A0i, gvl);
  453. tmp3r = VFMACC_RR( tmp3r, B7r, A0r, gvl);
  454. tmp3i = VFMACC_RI( tmp3i, B7r, A0i, gvl);
  455. ACC4r = __riscv_vfadd( ACC4r, tmp0r, gvl);
  456. ACC4i = __riscv_vfadd( ACC4i, tmp0i, gvl);
  457. ACC5r = __riscv_vfadd( ACC5r, tmp1r, gvl);
  458. ACC5i = __riscv_vfadd( ACC5i, tmp1i, gvl);
  459. ACC6r = __riscv_vfadd( ACC6r, tmp2r, gvl);
  460. ACC6i = __riscv_vfadd( ACC6i, tmp2i, gvl);
  461. ACC7r = __riscv_vfadd( ACC7r, tmp3r, gvl);
  462. ACC7i = __riscv_vfadd( ACC7i, tmp3i, gvl);
  463. }
  464. BLASLONG ci=n_top*ldc+m_top;
  465. vfloat32m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
  466. vfloat32m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
  467. vfloat32m1_t C1r = __riscv_vfmul( ACC1r, alphar, gvl );
  468. vfloat32m1_t C1i = __riscv_vfmul( ACC1i, alphar, gvl );
  469. vfloat32m1_t C2r = __riscv_vfmul( ACC2r, alphar, gvl );
  470. vfloat32m1_t C2i = __riscv_vfmul( ACC2i, alphar, gvl );
  471. vfloat32m1_t C3r = __riscv_vfmul( ACC3r, alphar, gvl );
  472. vfloat32m1_t C3i = __riscv_vfmul( ACC3i, alphar, gvl );
  473. vfloat32m1_t C4r = __riscv_vfmul( ACC4r, alphar, gvl );
  474. vfloat32m1_t C4i = __riscv_vfmul( ACC4i, alphar, gvl );
  475. vfloat32m1_t C5r = __riscv_vfmul( ACC5r, alphar, gvl );
  476. vfloat32m1_t C5i = __riscv_vfmul( ACC5i, alphar, gvl );
  477. vfloat32m1_t C6r = __riscv_vfmul( ACC6r, alphar, gvl );
  478. vfloat32m1_t C6i = __riscv_vfmul( ACC6i, alphar, gvl );
  479. vfloat32m1_t C7r = __riscv_vfmul( ACC7r, alphar, gvl );
  480. vfloat32m1_t C7i = __riscv_vfmul( ACC7i, alphar, gvl );
  481. C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
  482. C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
  483. C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
  484. C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
  485. C2r = __riscv_vfnmsac( C2r, alphai, ACC2i, gvl );
  486. C2i = __riscv_vfmacc ( C2i, alphai, ACC2r, gvl );
  487. C3r = __riscv_vfnmsac( C3r, alphai, ACC3i, gvl );
  488. C3i = __riscv_vfmacc ( C3i, alphai, ACC3r, gvl );
  489. C4r = __riscv_vfnmsac( C4r, alphai, ACC4i, gvl );
  490. C4i = __riscv_vfmacc ( C4i, alphai, ACC4r, gvl );
  491. C5r = __riscv_vfnmsac( C5r, alphai, ACC5i, gvl );
  492. C5i = __riscv_vfmacc ( C5i, alphai, ACC5r, gvl );
  493. C6r = __riscv_vfnmsac( C6r, alphai, ACC6i, gvl );
  494. C6i = __riscv_vfmacc ( C6i, alphai, ACC6r, gvl );
  495. C7r = __riscv_vfnmsac( C7r, alphai, ACC7i, gvl );
  496. C7i = __riscv_vfmacc ( C7i, alphai, ACC7r, gvl );
  497. __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
  498. __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
  499. ci += ldc-gvl*0;
  500. __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
  501. __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
  502. ci += ldc-gvl*0;
  503. __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C2r, gvl);
  504. __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C2i, gvl);
  505. ci += ldc-gvl*0;
  506. __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C3r, gvl);
  507. __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C3i, gvl);
  508. ci += ldc-gvl*0;
  509. __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C4r, gvl);
  510. __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C4i, gvl);
  511. ci += ldc-gvl*0;
  512. __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C5r, gvl);
  513. __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C5i, gvl);
  514. ci += ldc-gvl*0;
  515. __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C6r, gvl);
  516. __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C6i, gvl);
  517. ci += ldc-gvl*0;
  518. __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C7r, gvl);
  519. __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C7i, gvl);
  520. m_top += 4;
  521. }
  522. if( M & 2 ) {
  523. gvl = __riscv_vsetvl_e32m1(2);
  524. BLASLONG ai=m_top*K*2;
  525. BLASLONG bi=n_top*K*2;
  526. BLASLONG pass_K = K;
  527. #ifdef LEFT
  528. BLASLONG off = offset + m_top;
  529. #else
  530. BLASLONG off = -offset + n_top;
  531. #endif
  532. #ifdef BACKWARDS
  533. ai += off*2*2;
  534. bi += off*8*2;
  535. pass_K -= off;
  536. #else
  537. #ifdef LEFT
  538. pass_K = off + 2;
  539. #else
  540. pass_K = off + 8;
  541. #endif
  542. #endif
  543. float B0r = B[bi+0*2+0];
  544. float B0i = B[bi+0*2+1];
  545. float B1r = B[bi+1*2+0];
  546. float B1i = B[bi+1*2+1];
  547. float B2r = B[bi+2*2+0];
  548. float B2i = B[bi+2*2+1];
  549. float B3r = B[bi+3*2+0];
  550. float B3i = B[bi+3*2+1];
  551. float B4r = B[bi+4*2+0];
  552. float B4i = B[bi+4*2+1];
  553. float B5r = B[bi+5*2+0];
  554. float B5i = B[bi+5*2+1];
  555. float B6r = B[bi+6*2+0];
  556. float B6i = B[bi+6*2+1];
  557. float B7r = B[bi+7*2+0];
  558. float B7i = B[bi+7*2+1];
  559. bi += 8*2;
  560. vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
  561. vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
  562. ai += 2*2;
  563. // 2 vector regs to hold A array contents, 16 regs to hold values accumulated over k
  564. // leaving 14 vector registers for temporaries
  565. // performing 4 operations between reuses of temporaries
  566. vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
  567. vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
  568. vfloat32m1_t tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
  569. vfloat32m1_t tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
  570. vfloat32m1_t tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
  571. vfloat32m1_t tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
  572. vfloat32m1_t tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
  573. vfloat32m1_t tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
  574. tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
  575. tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
  576. tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
  577. tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
  578. tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
  579. tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
  580. tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
  581. tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
  582. vfloat32m1_t ACC0r = tmp0r;
  583. vfloat32m1_t ACC0i = tmp0i;
  584. vfloat32m1_t ACC1r = tmp1r;
  585. vfloat32m1_t ACC1i = tmp1i;
  586. vfloat32m1_t ACC2r = tmp2r;
  587. vfloat32m1_t ACC2i = tmp2i;
  588. vfloat32m1_t ACC3r = tmp3r;
  589. vfloat32m1_t ACC3i = tmp3i;
  590. tmp0r = __riscv_vfmul_vf_f32m1( A0i, B4i, gvl);
  591. tmp0i = __riscv_vfmul_vf_f32m1( A0r, B4i, gvl);
  592. tmp1r = __riscv_vfmul_vf_f32m1( A0i, B5i, gvl);
  593. tmp1i = __riscv_vfmul_vf_f32m1( A0r, B5i, gvl);
  594. tmp2r = __riscv_vfmul_vf_f32m1( A0i, B6i, gvl);
  595. tmp2i = __riscv_vfmul_vf_f32m1( A0r, B6i, gvl);
  596. tmp3r = __riscv_vfmul_vf_f32m1( A0i, B7i, gvl);
  597. tmp3i = __riscv_vfmul_vf_f32m1( A0r, B7i, gvl);
  598. tmp0r = VFMACC_RR( tmp0r, B4r, A0r, gvl);
  599. tmp0i = VFMACC_RI( tmp0i, B4r, A0i, gvl);
  600. tmp1r = VFMACC_RR( tmp1r, B5r, A0r, gvl);
  601. tmp1i = VFMACC_RI( tmp1i, B5r, A0i, gvl);
  602. tmp2r = VFMACC_RR( tmp2r, B6r, A0r, gvl);
  603. tmp2i = VFMACC_RI( tmp2i, B6r, A0i, gvl);
  604. tmp3r = VFMACC_RR( tmp3r, B7r, A0r, gvl);
  605. tmp3i = VFMACC_RI( tmp3i, B7r, A0i, gvl);
  606. vfloat32m1_t ACC4r = tmp0r;
  607. vfloat32m1_t ACC4i = tmp0i;
  608. vfloat32m1_t ACC5r = tmp1r;
  609. vfloat32m1_t ACC5i = tmp1i;
  610. vfloat32m1_t ACC6r = tmp2r;
  611. vfloat32m1_t ACC6i = tmp2i;
  612. vfloat32m1_t ACC7r = tmp3r;
  613. vfloat32m1_t ACC7i = tmp3i;
  614. for(BLASLONG k=1; k<pass_K; k++) {
  615. B0r = B[bi+0*2+0];
  616. B0i = B[bi+0*2+1];
  617. B1r = B[bi+1*2+0];
  618. B1i = B[bi+1*2+1];
  619. B2r = B[bi+2*2+0];
  620. B2i = B[bi+2*2+1];
  621. B3r = B[bi+3*2+0];
  622. B3i = B[bi+3*2+1];
  623. B4r = B[bi+4*2+0];
  624. B4i = B[bi+4*2+1];
  625. B5r = B[bi+5*2+0];
  626. B5i = B[bi+5*2+1];
  627. B6r = B[bi+6*2+0];
  628. B6i = B[bi+6*2+1];
  629. B7r = B[bi+7*2+0];
  630. B7i = B[bi+7*2+1];
  631. bi += 8*2;
  632. A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
  633. A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
  634. ai += 2*2;
  635. tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
  636. tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
  637. tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
  638. tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
  639. tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
  640. tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
  641. tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
  642. tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
  643. tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
  644. tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
  645. tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
  646. tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
  647. tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
  648. tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
  649. tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
  650. tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
  651. ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
  652. ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
  653. ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
  654. ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
  655. ACC2r = __riscv_vfadd( ACC2r, tmp2r, gvl);
  656. ACC2i = __riscv_vfadd( ACC2i, tmp2i, gvl);
  657. ACC3r = __riscv_vfadd( ACC3r, tmp3r, gvl);
  658. ACC3i = __riscv_vfadd( ACC3i, tmp3i, gvl);
  659. tmp0r = __riscv_vfmul_vf_f32m1( A0i, B4i, gvl);
  660. tmp0i = __riscv_vfmul_vf_f32m1( A0r, B4i, gvl);
  661. tmp1r = __riscv_vfmul_vf_f32m1( A0i, B5i, gvl);
  662. tmp1i = __riscv_vfmul_vf_f32m1( A0r, B5i, gvl);
  663. tmp2r = __riscv_vfmul_vf_f32m1( A0i, B6i, gvl);
  664. tmp2i = __riscv_vfmul_vf_f32m1( A0r, B6i, gvl);
  665. tmp3r = __riscv_vfmul_vf_f32m1( A0i, B7i, gvl);
  666. tmp3i = __riscv_vfmul_vf_f32m1( A0r, B7i, gvl);
  667. tmp0r = VFMACC_RR( tmp0r, B4r, A0r, gvl);
  668. tmp0i = VFMACC_RI( tmp0i, B4r, A0i, gvl);
  669. tmp1r = VFMACC_RR( tmp1r, B5r, A0r, gvl);
  670. tmp1i = VFMACC_RI( tmp1i, B5r, A0i, gvl);
  671. tmp2r = VFMACC_RR( tmp2r, B6r, A0r, gvl);
  672. tmp2i = VFMACC_RI( tmp2i, B6r, A0i, gvl);
  673. tmp3r = VFMACC_RR( tmp3r, B7r, A0r, gvl);
  674. tmp3i = VFMACC_RI( tmp3i, B7r, A0i, gvl);
  675. ACC4r = __riscv_vfadd( ACC4r, tmp0r, gvl);
  676. ACC4i = __riscv_vfadd( ACC4i, tmp0i, gvl);
  677. ACC5r = __riscv_vfadd( ACC5r, tmp1r, gvl);
  678. ACC5i = __riscv_vfadd( ACC5i, tmp1i, gvl);
  679. ACC6r = __riscv_vfadd( ACC6r, tmp2r, gvl);
  680. ACC6i = __riscv_vfadd( ACC6i, tmp2i, gvl);
  681. ACC7r = __riscv_vfadd( ACC7r, tmp3r, gvl);
  682. ACC7i = __riscv_vfadd( ACC7i, tmp3i, gvl);
  683. }
  684. BLASLONG ci=n_top*ldc+m_top;
  685. vfloat32m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
  686. vfloat32m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
  687. vfloat32m1_t C1r = __riscv_vfmul( ACC1r, alphar, gvl );
  688. vfloat32m1_t C1i = __riscv_vfmul( ACC1i, alphar, gvl );
  689. vfloat32m1_t C2r = __riscv_vfmul( ACC2r, alphar, gvl );
  690. vfloat32m1_t C2i = __riscv_vfmul( ACC2i, alphar, gvl );
  691. vfloat32m1_t C3r = __riscv_vfmul( ACC3r, alphar, gvl );
  692. vfloat32m1_t C3i = __riscv_vfmul( ACC3i, alphar, gvl );
  693. vfloat32m1_t C4r = __riscv_vfmul( ACC4r, alphar, gvl );
  694. vfloat32m1_t C4i = __riscv_vfmul( ACC4i, alphar, gvl );
  695. vfloat32m1_t C5r = __riscv_vfmul( ACC5r, alphar, gvl );
  696. vfloat32m1_t C5i = __riscv_vfmul( ACC5i, alphar, gvl );
  697. vfloat32m1_t C6r = __riscv_vfmul( ACC6r, alphar, gvl );
  698. vfloat32m1_t C6i = __riscv_vfmul( ACC6i, alphar, gvl );
  699. vfloat32m1_t C7r = __riscv_vfmul( ACC7r, alphar, gvl );
  700. vfloat32m1_t C7i = __riscv_vfmul( ACC7i, alphar, gvl );
  701. C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
  702. C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
  703. C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
  704. C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
  705. C2r = __riscv_vfnmsac( C2r, alphai, ACC2i, gvl );
  706. C2i = __riscv_vfmacc ( C2i, alphai, ACC2r, gvl );
  707. C3r = __riscv_vfnmsac( C3r, alphai, ACC3i, gvl );
  708. C3i = __riscv_vfmacc ( C3i, alphai, ACC3r, gvl );
  709. C4r = __riscv_vfnmsac( C4r, alphai, ACC4i, gvl );
  710. C4i = __riscv_vfmacc ( C4i, alphai, ACC4r, gvl );
  711. C5r = __riscv_vfnmsac( C5r, alphai, ACC5i, gvl );
  712. C5i = __riscv_vfmacc ( C5i, alphai, ACC5r, gvl );
  713. C6r = __riscv_vfnmsac( C6r, alphai, ACC6i, gvl );
  714. C6i = __riscv_vfmacc ( C6i, alphai, ACC6r, gvl );
  715. C7r = __riscv_vfnmsac( C7r, alphai, ACC7i, gvl );
  716. C7i = __riscv_vfmacc ( C7i, alphai, ACC7r, gvl );
  717. __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
  718. __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
  719. ci += ldc-gvl*0;
  720. __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
  721. __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
  722. ci += ldc-gvl*0;
  723. __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C2r, gvl);
  724. __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C2i, gvl);
  725. ci += ldc-gvl*0;
  726. __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C3r, gvl);
  727. __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C3i, gvl);
  728. ci += ldc-gvl*0;
  729. __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C4r, gvl);
  730. __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C4i, gvl);
  731. ci += ldc-gvl*0;
  732. __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C5r, gvl);
  733. __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C5i, gvl);
  734. ci += ldc-gvl*0;
  735. __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C6r, gvl);
  736. __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C6i, gvl);
  737. ci += ldc-gvl*0;
  738. __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C7r, gvl);
  739. __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C7i, gvl);
  740. m_top += 2;
  741. }
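// Last row (M odd): scalar complex arithmetic using the S0..S3 signs defined at the
// top, accumulating one real/imaginary pair per column of B.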
  742. if( M & 1 ) {
  743. float result0 = 0;
  744. float result1 = 0;
  745. float result2 = 0;
  746. float result3 = 0;
  747. float result4 = 0;
  748. float result5 = 0;
  749. float result6 = 0;
  750. float result7 = 0;
  751. float result8 = 0;
  752. float result9 = 0;
  753. float result10 = 0;
  754. float result11 = 0;
  755. float result12 = 0;
  756. float result13 = 0;
  757. float result14 = 0;
  758. float result15 = 0;
  759. BLASLONG ai=m_top*K*2;
  760. BLASLONG bi=n_top*K*2;
  761. BLASLONG pass_K = K;
  762. #ifdef LEFT
  763. BLASLONG off = offset + m_top;
  764. #else
  765. BLASLONG off = -offset + n_top;
  766. #endif
  767. #ifdef BACKWARDS
  768. ai += off*1*2;
  769. bi += off*8*2;
  770. pass_K -= off;
  771. #else
  772. #ifdef LEFT
  773. pass_K = off + 1;
  774. #else
  775. pass_K = off + 8;
  776. #endif
  777. #endif
  778. for(BLASLONG k=0; k<pass_K; k++) {
  779. result0+=S0*A[ai+0+0]*B[bi+0+0] + S1*A[ai+0+1]*B[bi+0+1];
  780. result1+=S2*A[ai+0+1]*B[bi+0+0] + S3*A[ai+0+0]*B[bi+0+1];
  781. result2+=S0*A[ai+0+0]*B[bi+2+0] + S1*A[ai+0+1]*B[bi+2+1];
  782. result3+=S2*A[ai+0+1]*B[bi+2+0] + S3*A[ai+0+0]*B[bi+2+1];
  783. result4+=S0*A[ai+0+0]*B[bi+4+0] + S1*A[ai+0+1]*B[bi+4+1];
  784. result5+=S2*A[ai+0+1]*B[bi+4+0] + S3*A[ai+0+0]*B[bi+4+1];
  785. result6+=S0*A[ai+0+0]*B[bi+6+0] + S1*A[ai+0+1]*B[bi+6+1];
  786. result7+=S2*A[ai+0+1]*B[bi+6+0] + S3*A[ai+0+0]*B[bi+6+1];
  787. result8+=S0*A[ai+0+0]*B[bi+8+0] + S1*A[ai+0+1]*B[bi+8+1];
  788. result9+=S2*A[ai+0+1]*B[bi+8+0] + S3*A[ai+0+0]*B[bi+8+1];
  789. result10+=S0*A[ai+0+0]*B[bi+10+0] + S1*A[ai+0+1]*B[bi+10+1];
  790. result11+=S2*A[ai+0+1]*B[bi+10+0] + S3*A[ai+0+0]*B[bi+10+1];
  791. result12+=S0*A[ai+0+0]*B[bi+12+0] + S1*A[ai+0+1]*B[bi+12+1];
  792. result13+=S2*A[ai+0+1]*B[bi+12+0] + S3*A[ai+0+0]*B[bi+12+1];
  793. result14+=S0*A[ai+0+0]*B[bi+14+0] + S1*A[ai+0+1]*B[bi+14+1];
  794. result15+=S2*A[ai+0+1]*B[bi+14+0] + S3*A[ai+0+0]*B[bi+14+1];
  795. ai+=1*2;
  796. bi+=8*2;
  797. }
  798. BLASLONG ci=n_top*ldc+m_top;
  799. float Cr, Ci;
  800. Cr = result0*alphar;
  801. Ci = result1*alphar;
  802. Cr -= result1*alphai;
  803. Ci += result0*alphai;
  804. C[(ci+0*ldc+0)*2+0] = Cr;
  805. C[(ci+0*ldc+0)*2+1] = Ci;
  806. Cr = result2*alphar;
  807. Ci = result3*alphar;
  808. Cr -= result3*alphai;
  809. Ci += result2*alphai;
  810. C[(ci+1*ldc+0)*2+0] = Cr;
  811. C[(ci+1*ldc+0)*2+1] = Ci;
  812. Cr = result4*alphar;
  813. Ci = result5*alphar;
  814. Cr -= result5*alphai;
  815. Ci += result4*alphai;
  816. C[(ci+2*ldc+0)*2+0] = Cr;
  817. C[(ci+2*ldc+0)*2+1] = Ci;
  818. Cr = result6*alphar;
  819. Ci = result7*alphar;
  820. Cr -= result7*alphai;
  821. Ci += result6*alphai;
  822. C[(ci+3*ldc+0)*2+0] = Cr;
  823. C[(ci+3*ldc+0)*2+1] = Ci;
  824. Cr = result8*alphar;
  825. Ci = result9*alphar;
  826. Cr -= result9*alphai;
  827. Ci += result8*alphai;
  828. C[(ci+4*ldc+0)*2+0] = Cr;
  829. C[(ci+4*ldc+0)*2+1] = Ci;
  830. Cr = result10*alphar;
  831. Ci = result11*alphar;
  832. Cr -= result11*alphai;
  833. Ci += result10*alphai;
  834. C[(ci+5*ldc+0)*2+0] = Cr;
  835. C[(ci+5*ldc+0)*2+1] = Ci;
  836. Cr = result12*alphar;
  837. Ci = result13*alphar;
  838. Cr -= result13*alphai;
  839. Ci += result12*alphai;
  840. C[(ci+6*ldc+0)*2+0] = Cr;
  841. C[(ci+6*ldc+0)*2+1] = Ci;
  842. Cr = result14*alphar;
  843. Ci = result15*alphar;
  844. Cr -= result15*alphai;
  845. Ci += result14*alphai;
  846. C[(ci+7*ldc+0)*2+0] = Cr;
  847. C[(ci+7*ldc+0)*2+1] = Ci;
  848. m_top+=1;
  849. }
  850. n_top += 8;
  851. }
  852. // -- tails for N=4
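// Remaining block of 4 columns of B: same structure as the main pass, but with only
// four B values live per iteration, so just 8 accumulator registers are needed.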
  853. if( N & 4 ) {
  854. gvl = __riscv_vsetvl_e32m1(8);
  855. m_top = 0;
  856. for (BLASLONG i=0; i<M/8; i+=1) {
  857. BLASLONG ai=m_top*K*2;
  858. BLASLONG bi=n_top*K*2;
  859. BLASLONG pass_K = K;
  860. #ifdef LEFT
  861. BLASLONG off = offset + m_top;
  862. #else
  863. BLASLONG off = -offset + n_top;
  864. #endif
  865. #ifdef BACKWARDS
  866. ai += off*8*2;
  867. bi += off*4*2;
  868. pass_K -= off;
  869. #else
  870. #ifdef LEFT
  871. pass_K = off + 8;
  872. #else
  873. pass_K = off + 4;
  874. #endif
  875. #endif
  876. float B0r = B[bi+0*2+0];
  877. float B0i = B[bi+0*2+1];
  878. float B1r = B[bi+1*2+0];
  879. float B1i = B[bi+1*2+1];
  880. float B2r = B[bi+2*2+0];
  881. float B2i = B[bi+2*2+1];
  882. float B3r = B[bi+3*2+0];
  883. float B3i = B[bi+3*2+1];
  884. bi += 4*2;
  885. vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
  886. vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
  887. ai += 8*2;
  888. // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k
  889. // leaving 22 vector registers for temporaries
  890. vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
  891. vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
  892. vfloat32m1_t tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
  893. vfloat32m1_t tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
  894. vfloat32m1_t tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
  895. vfloat32m1_t tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
  896. vfloat32m1_t tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
  897. vfloat32m1_t tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
  898. tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
  899. tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
  900. tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
  901. tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
  902. tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
  903. tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
  904. tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
  905. tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
  906. vfloat32m1_t ACC0r = tmp0r;
  907. vfloat32m1_t ACC0i = tmp0i;
  908. vfloat32m1_t ACC1r = tmp1r;
  909. vfloat32m1_t ACC1i = tmp1i;
  910. vfloat32m1_t ACC2r = tmp2r;
  911. vfloat32m1_t ACC2i = tmp2i;
  912. vfloat32m1_t ACC3r = tmp3r;
  913. vfloat32m1_t ACC3i = tmp3i;
  914. for(BLASLONG k=1; k<pass_K; k++) {
  915. B0r = B[bi+0*2+0];
  916. B0i = B[bi+0*2+1];
  917. B1r = B[bi+1*2+0];
  918. B1i = B[bi+1*2+1];
  919. B2r = B[bi+2*2+0];
  920. B2i = B[bi+2*2+1];
  921. B3r = B[bi+3*2+0];
  922. B3i = B[bi+3*2+1];
  923. bi += 4*2;
  924. A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
  925. A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
  926. ai += 8*2;
  927. tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
  928. tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
  929. tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
  930. tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
  931. tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
  932. tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
  933. tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
  934. tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
  935. tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
  936. tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
  937. tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
  938. tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
  939. tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
  940. tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
  941. tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
  942. tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
  943. ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
  944. ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
  945. ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
  946. ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
  947. ACC2r = __riscv_vfadd( ACC2r, tmp2r, gvl);
  948. ACC2i = __riscv_vfadd( ACC2i, tmp2i, gvl);
  949. ACC3r = __riscv_vfadd( ACC3r, tmp3r, gvl);
  950. ACC3i = __riscv_vfadd( ACC3i, tmp3i, gvl);
  951. }
  952. BLASLONG ci=n_top*ldc+m_top;
  953. vfloat32m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
  954. vfloat32m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
  955. vfloat32m1_t C1r = __riscv_vfmul( ACC1r, alphar, gvl );
  956. vfloat32m1_t C1i = __riscv_vfmul( ACC1i, alphar, gvl );
  957. vfloat32m1_t C2r = __riscv_vfmul( ACC2r, alphar, gvl );
  958. vfloat32m1_t C2i = __riscv_vfmul( ACC2i, alphar, gvl );
  959. vfloat32m1_t C3r = __riscv_vfmul( ACC3r, alphar, gvl );
  960. vfloat32m1_t C3i = __riscv_vfmul( ACC3i, alphar, gvl );
  961. C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
  962. C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
  963. C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
  964. C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
  965. C2r = __riscv_vfnmsac( C2r, alphai, ACC2i, gvl );
  966. C2i = __riscv_vfmacc ( C2i, alphai, ACC2r, gvl );
  967. C3r = __riscv_vfnmsac( C3r, alphai, ACC3i, gvl );
  968. C3i = __riscv_vfmacc ( C3i, alphai, ACC3r, gvl );
  969. __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
  970. __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
  971. ci += ldc-gvl*0;
  972. __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
  973. __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
  974. ci += ldc-gvl*0;
  975. __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C2r, gvl);
  976. __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C2i, gvl);
  977. ci += ldc-gvl*0;
  978. __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C3r, gvl);
  979. __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C3i, gvl);
  980. m_top += 8;
  981. }
  982. if( M & 4 ) {
  983. gvl = __riscv_vsetvl_e32m1(4);
  984. BLASLONG ai=m_top*K*2;
  985. BLASLONG bi=n_top*K*2;
  986. BLASLONG pass_K = K;
  987. #ifdef LEFT
  988. BLASLONG off = offset + m_top;
  989. #else
  990. BLASLONG off = -offset + n_top;
  991. #endif
  992. #ifdef BACKWARDS
  993. ai += off*4*2;
  994. bi += off*4*2;
  995. pass_K -= off;
  996. #else
  997. #ifdef LEFT
  998. pass_K = off + 4;
  999. #else
  1000. pass_K = off + 4;
  1001. #endif
  1002. #endif
  1003. float B0r = B[bi+0*2+0];
  1004. float B0i = B[bi+0*2+1];
  1005. float B1r = B[bi+1*2+0];
  1006. float B1i = B[bi+1*2+1];
  1007. float B2r = B[bi+2*2+0];
  1008. float B2i = B[bi+2*2+1];
  1009. float B3r = B[bi+3*2+0];
  1010. float B3i = B[bi+3*2+1];
  1011. bi += 4*2;
  1012. vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
  1013. vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
  1014. ai += 4*2;
  1015. // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k
  1016. // leaving 22 vector registers for temporaries
  1017. vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
  1018. vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
  1019. vfloat32m1_t tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
  1020. vfloat32m1_t tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
  1021. vfloat32m1_t tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
  1022. vfloat32m1_t tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
  1023. vfloat32m1_t tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
  1024. vfloat32m1_t tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
  1025. tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
  1026. tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
  1027. tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
  1028. tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
  1029. tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
  1030. tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
  1031. tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
  1032. tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
  1033. vfloat32m1_t ACC0r = tmp0r;
  1034. vfloat32m1_t ACC0i = tmp0i;
  1035. vfloat32m1_t ACC1r = tmp1r;
  1036. vfloat32m1_t ACC1i = tmp1i;
  1037. vfloat32m1_t ACC2r = tmp2r;
  1038. vfloat32m1_t ACC2i = tmp2i;
  1039. vfloat32m1_t ACC3r = tmp3r;
  1040. vfloat32m1_t ACC3i = tmp3i;
  1041. for(BLASLONG k=1; k<pass_K; k++) {
  1042. B0r = B[bi+0*2+0];
  1043. B0i = B[bi+0*2+1];
  1044. B1r = B[bi+1*2+0];
  1045. B1i = B[bi+1*2+1];
  1046. B2r = B[bi+2*2+0];
  1047. B2i = B[bi+2*2+1];
  1048. B3r = B[bi+3*2+0];
  1049. B3i = B[bi+3*2+1];
  1050. bi += 4*2;
  1051. A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
  1052. A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
  1053. ai += 4*2;
  1054. tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
  1055. tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
  1056. tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
  1057. tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
  1058. tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
  1059. tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
  1060. tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
  1061. tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
  1062. tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
  1063. tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
  1064. tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
  1065. tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
  1066. tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
  1067. tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
  1068. tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
  1069. tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
  1070. ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
  1071. ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
  1072. ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
  1073. ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
  1074. ACC2r = __riscv_vfadd( ACC2r, tmp2r, gvl);
  1075. ACC2i = __riscv_vfadd( ACC2i, tmp2i, gvl);
  1076. ACC3r = __riscv_vfadd( ACC3r, tmp3r, gvl);
  1077. ACC3i = __riscv_vfadd( ACC3i, tmp3i, gvl);
  1078. }
  1079. BLASLONG ci=n_top*ldc+m_top;
  1080. vfloat32m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
  1081. vfloat32m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
  1082. vfloat32m1_t C1r = __riscv_vfmul( ACC1r, alphar, gvl );
  1083. vfloat32m1_t C1i = __riscv_vfmul( ACC1i, alphar, gvl );
  1084. vfloat32m1_t C2r = __riscv_vfmul( ACC2r, alphar, gvl );
  1085. vfloat32m1_t C2i = __riscv_vfmul( ACC2i, alphar, gvl );
  1086. vfloat32m1_t C3r = __riscv_vfmul( ACC3r, alphar, gvl );
  1087. vfloat32m1_t C3i = __riscv_vfmul( ACC3i, alphar, gvl );
  1088. C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
  1089. C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
  1090. C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
  1091. C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
  1092. C2r = __riscv_vfnmsac( C2r, alphai, ACC2i, gvl );
  1093. C2i = __riscv_vfmacc ( C2i, alphai, ACC2r, gvl );
  1094. C3r = __riscv_vfnmsac( C3r, alphai, ACC3i, gvl );
  1095. C3i = __riscv_vfmacc ( C3i, alphai, ACC3r, gvl );
  1096. __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
  1097. __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
  1098. ci += ldc-gvl*0;
  1099. __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
  1100. __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
  1101. ci += ldc-gvl*0;
  1102. __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C2r, gvl);
  1103. __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C2i, gvl);
  1104. ci += ldc-gvl*0;
  1105. __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C3r, gvl);
  1106. __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C3i, gvl);
  1107. m_top += 4;
  1108. }
  1109. if( M & 2 ) {
  1110. gvl = __riscv_vsetvl_e32m1(2);
  1111. BLASLONG ai=m_top*K*2;
  1112. BLASLONG bi=n_top*K*2;
  1113. BLASLONG pass_K = K;
  1114. #ifdef LEFT
  1115. BLASLONG off = offset + m_top;
  1116. #else
  1117. BLASLONG off = -offset + n_top;
  1118. #endif
  1119. #ifdef BACKWARDS
  1120. ai += off*2*2;
  1121. bi += off*4*2;
  1122. pass_K -= off;
  1123. #else
  1124. #ifdef LEFT
  1125. pass_K = off + 2;
  1126. #else
  1127. pass_K = off + 4;
  1128. #endif
  1129. #endif
  1130. float B0r = B[bi+0*2+0];
  1131. float B0i = B[bi+0*2+1];
  1132. float B1r = B[bi+1*2+0];
  1133. float B1i = B[bi+1*2+1];
  1134. float B2r = B[bi+2*2+0];
  1135. float B2i = B[bi+2*2+1];
  1136. float B3r = B[bi+3*2+0];
  1137. float B3i = B[bi+3*2+1];
  1138. bi += 4*2;
  1139. vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
  1140. vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
  1141. ai += 2*2;
  1142. // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k
  1143. // leaving 22 vector registers for temporaries
            vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
            vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
            vfloat32m1_t tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
            vfloat32m1_t tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
            vfloat32m1_t tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
            vfloat32m1_t tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
            vfloat32m1_t tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
            vfloat32m1_t tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
            tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
            tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
            tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
            tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
            tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
            tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
            vfloat32m1_t ACC0r = tmp0r;
            vfloat32m1_t ACC0i = tmp0i;
            vfloat32m1_t ACC1r = tmp1r;
            vfloat32m1_t ACC1i = tmp1i;
            vfloat32m1_t ACC2r = tmp2r;
            vfloat32m1_t ACC2i = tmp2i;
            vfloat32m1_t ACC3r = tmp3r;
            vfloat32m1_t ACC3i = tmp3i;
            for(BLASLONG k=1; k<pass_K; k++) {
                B0r = B[bi+0*2+0];
                B0i = B[bi+0*2+1];
                B1r = B[bi+1*2+0];
                B1i = B[bi+1*2+1];
                B2r = B[bi+2*2+0];
                B2i = B[bi+2*2+1];
                B3r = B[bi+3*2+0];
                B3i = B[bi+3*2+1];
                bi += 4*2;
                A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
                A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
                ai += 2*2;
                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
                tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
                tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
                tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
                tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
                tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
                tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
                tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
                tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
                tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
                tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
                tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
                tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
                ACC2r = __riscv_vfadd( ACC2r, tmp2r, gvl);
                ACC2i = __riscv_vfadd( ACC2i, tmp2i, gvl);
                ACC3r = __riscv_vfadd( ACC3r, tmp3r, gvl);
                ACC3i = __riscv_vfadd( ACC3i, tmp3i, gvl);
            }
            BLASLONG ci=n_top*ldc+m_top;
            vfloat32m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
            vfloat32m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
            vfloat32m1_t C1r = __riscv_vfmul( ACC1r, alphar, gvl );
            vfloat32m1_t C1i = __riscv_vfmul( ACC1i, alphar, gvl );
            vfloat32m1_t C2r = __riscv_vfmul( ACC2r, alphar, gvl );
            vfloat32m1_t C2i = __riscv_vfmul( ACC2i, alphar, gvl );
            vfloat32m1_t C3r = __riscv_vfmul( ACC3r, alphar, gvl );
            vfloat32m1_t C3i = __riscv_vfmul( ACC3i, alphar, gvl );
            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
            C2r = __riscv_vfnmsac( C2r, alphai, ACC2i, gvl );
            C2i = __riscv_vfmacc ( C2i, alphai, ACC2r, gvl );
            C3r = __riscv_vfnmsac( C3r, alphai, ACC3i, gvl );
            C3i = __riscv_vfmacc ( C3i, alphai, ACC3r, gvl );
            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
            ci += ldc-gvl*0;
            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
            ci += ldc-gvl*0;
            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C2r, gvl);
            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C2i, gvl);
            ci += ldc-gvl*0;
            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C3r, gvl);
            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C3i, gvl);
            m_top += 2;
        }
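        // Scalar tail for a single remaining row of A: each k step accumulates the real and
        // imaginary parts of the four complex dot products directly, with S0..S3 carrying the
        // conjugation signs. In the plain (non-conjugated) case S0 = S2 = S3 = 1 and S1 = -1,
        // so the first pair reduces to
        //   result0 += Ar*Br - Ai*Bi;   result1 += Ai*Br + Ar*Bi;
        // and the alpha scaling afterwards mirrors the vector path:
        //   Cr = alphar*result0 - alphai*result1;   Ci = alphar*result1 + alphai*result0;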
        if( M & 1 ) {
            float result0 = 0;
            float result1 = 0;
            float result2 = 0;
            float result3 = 0;
            float result4 = 0;
            float result5 = 0;
            float result6 = 0;
            float result7 = 0;
            BLASLONG ai=m_top*K*2;
            BLASLONG bi=n_top*K*2;
            BLASLONG pass_K = K;
#ifdef LEFT
            BLASLONG off = offset + m_top;
#else
            BLASLONG off = -offset + n_top;
#endif
#ifdef BACKWARDS
            ai += off*1*2;
            bi += off*4*2;
            pass_K -= off;
#else
#ifdef LEFT
            pass_K = off + 1;
#else
            pass_K = off + 4;
#endif
#endif
            for(BLASLONG k=0; k<pass_K; k++) {
                result0+=S0*A[ai+0+0]*B[bi+0+0] + S1*A[ai+0+1]*B[bi+0+1];
                result1+=S2*A[ai+0+1]*B[bi+0+0] + S3*A[ai+0+0]*B[bi+0+1];
                result2+=S0*A[ai+0+0]*B[bi+2+0] + S1*A[ai+0+1]*B[bi+2+1];
                result3+=S2*A[ai+0+1]*B[bi+2+0] + S3*A[ai+0+0]*B[bi+2+1];
                result4+=S0*A[ai+0+0]*B[bi+4+0] + S1*A[ai+0+1]*B[bi+4+1];
                result5+=S2*A[ai+0+1]*B[bi+4+0] + S3*A[ai+0+0]*B[bi+4+1];
                result6+=S0*A[ai+0+0]*B[bi+6+0] + S1*A[ai+0+1]*B[bi+6+1];
                result7+=S2*A[ai+0+1]*B[bi+6+0] + S3*A[ai+0+0]*B[bi+6+1];
                ai+=1*2;
                bi+=4*2;
            }
            BLASLONG ci=n_top*ldc+m_top;
            float Cr, Ci;
            Cr = result0*alphar;
            Ci = result1*alphar;
            Cr -= result1*alphai;
            Ci += result0*alphai;
            C[(ci+0*ldc+0)*2+0] = Cr;
            C[(ci+0*ldc+0)*2+1] = Ci;
            Cr = result2*alphar;
            Ci = result3*alphar;
            Cr -= result3*alphai;
            Ci += result2*alphai;
            C[(ci+1*ldc+0)*2+0] = Cr;
            C[(ci+1*ldc+0)*2+1] = Ci;
            Cr = result4*alphar;
            Ci = result5*alphar;
            Cr -= result5*alphai;
            Ci += result4*alphai;
            C[(ci+2*ldc+0)*2+0] = Cr;
            C[(ci+2*ldc+0)*2+1] = Ci;
            Cr = result6*alphar;
            Ci = result7*alphar;
            Cr -= result7*alphai;
            Ci += result6*alphai;
            C[(ci+3*ldc+0)*2+0] = Cr;
            C[(ci+3*ldc+0)*2+1] = Ci;
            m_top+=1;
        }
        n_top += 4;
    }

    // -- tails for N=2
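    // Same structure as the N=4 tails above, but each pass reads two columns of B (B0, B1),
    // keeps two accumulator pairs, and writes two columns of C.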
    if( N & 2 ) {
        gvl = __riscv_vsetvl_e32m1(8);
        m_top = 0;
        for (BLASLONG i=0; i<M/8; i+=1) {
            BLASLONG ai=m_top*K*2;
            BLASLONG bi=n_top*K*2;
            BLASLONG pass_K = K;
#ifdef LEFT
            BLASLONG off = offset + m_top;
#else
            BLASLONG off = -offset + n_top;
#endif
#ifdef BACKWARDS
            ai += off*8*2;
            bi += off*2*2;
            pass_K -= off;
#else
#ifdef LEFT
            pass_K = off + 8;
#else
            pass_K = off + 2;
#endif
#endif
            float B0r = B[bi+0*2+0];
            float B0i = B[bi+0*2+1];
            float B1r = B[bi+1*2+0];
            float B1i = B[bi+1*2+1];
            bi += 2*2;
            vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
            vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
            ai += 8*2;
            // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k
            // leaving 26 vector registers for temporaries
            vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
            vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
            vfloat32m1_t tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
            vfloat32m1_t tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
            tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
            tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
            vfloat32m1_t ACC0r = tmp0r;
            vfloat32m1_t ACC0i = tmp0i;
            vfloat32m1_t ACC1r = tmp1r;
            vfloat32m1_t ACC1i = tmp1i;
            for(BLASLONG k=1; k<pass_K; k++) {
                B0r = B[bi+0*2+0];
                B0i = B[bi+0*2+1];
                B1r = B[bi+1*2+0];
                B1i = B[bi+1*2+1];
                bi += 2*2;
                A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
                A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
                ai += 8*2;
                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
                tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
                tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
                tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
                tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
            }
            BLASLONG ci=n_top*ldc+m_top;
            vfloat32m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
            vfloat32m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
            vfloat32m1_t C1r = __riscv_vfmul( ACC1r, alphar, gvl );
            vfloat32m1_t C1i = __riscv_vfmul( ACC1i, alphar, gvl );
            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
            ci += ldc-gvl*0;
            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
            m_top += 8;
        }
        if( M & 4 ) {
            gvl = __riscv_vsetvl_e32m1(4);
            BLASLONG ai=m_top*K*2;
            BLASLONG bi=n_top*K*2;
            BLASLONG pass_K = K;
#ifdef LEFT
            BLASLONG off = offset + m_top;
#else
            BLASLONG off = -offset + n_top;
#endif
#ifdef BACKWARDS
            ai += off*4*2;
            bi += off*2*2;
            pass_K -= off;
#else
#ifdef LEFT
            pass_K = off + 4;
#else
            pass_K = off + 2;
#endif
#endif
            float B0r = B[bi+0*2+0];
            float B0i = B[bi+0*2+1];
            float B1r = B[bi+1*2+0];
            float B1i = B[bi+1*2+1];
            bi += 2*2;
            vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
            vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
            ai += 4*2;
            // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k
            // leaving 26 vector registers for temporaries
            vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
            vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
            vfloat32m1_t tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
            vfloat32m1_t tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
            tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
            tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
            vfloat32m1_t ACC0r = tmp0r;
            vfloat32m1_t ACC0i = tmp0i;
            vfloat32m1_t ACC1r = tmp1r;
            vfloat32m1_t ACC1i = tmp1i;
            for(BLASLONG k=1; k<pass_K; k++) {
                B0r = B[bi+0*2+0];
                B0i = B[bi+0*2+1];
                B1r = B[bi+1*2+0];
                B1i = B[bi+1*2+1];
                bi += 2*2;
                A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
                A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
                ai += 4*2;
                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
                tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
                tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
                tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
                tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
            }
            BLASLONG ci=n_top*ldc+m_top;
            vfloat32m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
            vfloat32m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
            vfloat32m1_t C1r = __riscv_vfmul( ACC1r, alphar, gvl );
            vfloat32m1_t C1i = __riscv_vfmul( ACC1i, alphar, gvl );
            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
            ci += ldc-gvl*0;
            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
            m_top += 4;
        }
        if( M & 2 ) {
            gvl = __riscv_vsetvl_e32m1(2);
            BLASLONG ai=m_top*K*2;
            BLASLONG bi=n_top*K*2;
            BLASLONG pass_K = K;
#ifdef LEFT
            BLASLONG off = offset + m_top;
#else
            BLASLONG off = -offset + n_top;
#endif
#ifdef BACKWARDS
            ai += off*2*2;
            bi += off*2*2;
            pass_K -= off;
#else
#ifdef LEFT
            pass_K = off + 2;
#else
            pass_K = off + 2;
#endif
#endif
            float B0r = B[bi+0*2+0];
            float B0i = B[bi+0*2+1];
            float B1r = B[bi+1*2+0];
            float B1i = B[bi+1*2+1];
            bi += 2*2;
            vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
            vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
            ai += 2*2;
            // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k
            // leaving 26 vector registers for temporaries
            vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
            vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
            vfloat32m1_t tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
            vfloat32m1_t tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
            tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
            tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
            vfloat32m1_t ACC0r = tmp0r;
            vfloat32m1_t ACC0i = tmp0i;
            vfloat32m1_t ACC1r = tmp1r;
            vfloat32m1_t ACC1i = tmp1i;
            for(BLASLONG k=1; k<pass_K; k++) {
                B0r = B[bi+0*2+0];
                B0i = B[bi+0*2+1];
                B1r = B[bi+1*2+0];
                B1i = B[bi+1*2+1];
                bi += 2*2;
                A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
                A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
                ai += 2*2;
                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
                tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
                tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
                tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
                tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
            }
            BLASLONG ci=n_top*ldc+m_top;
            vfloat32m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
            vfloat32m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
            vfloat32m1_t C1r = __riscv_vfmul( ACC1r, alphar, gvl );
            vfloat32m1_t C1i = __riscv_vfmul( ACC1i, alphar, gvl );
            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
            ci += ldc-gvl*0;
            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
            m_top += 2;
        }
        if( M & 1 ) {
            float result0 = 0;
            float result1 = 0;
            float result2 = 0;
            float result3 = 0;
            BLASLONG ai=m_top*K*2;
            BLASLONG bi=n_top*K*2;
            BLASLONG pass_K = K;
#ifdef LEFT
            BLASLONG off = offset + m_top;
#else
            BLASLONG off = -offset + n_top;
#endif
#ifdef BACKWARDS
            ai += off*1*2;
            bi += off*2*2;
            pass_K -= off;
#else
#ifdef LEFT
            pass_K = off + 1;
#else
            pass_K = off + 2;
#endif
#endif
            for(BLASLONG k=0; k<pass_K; k++) {
                result0+=S0*A[ai+0+0]*B[bi+0+0] + S1*A[ai+0+1]*B[bi+0+1];
                result1+=S2*A[ai+0+1]*B[bi+0+0] + S3*A[ai+0+0]*B[bi+0+1];
                result2+=S0*A[ai+0+0]*B[bi+2+0] + S1*A[ai+0+1]*B[bi+2+1];
                result3+=S2*A[ai+0+1]*B[bi+2+0] + S3*A[ai+0+0]*B[bi+2+1];
                ai+=1*2;
                bi+=2*2;
            }
            BLASLONG ci=n_top*ldc+m_top;
            float Cr, Ci;
            Cr = result0*alphar;
            Ci = result1*alphar;
            Cr -= result1*alphai;
            Ci += result0*alphai;
            C[(ci+0*ldc+0)*2+0] = Cr;
            C[(ci+0*ldc+0)*2+1] = Ci;
            Cr = result2*alphar;
            Ci = result3*alphar;
            Cr -= result3*alphai;
            Ci += result2*alphai;
            C[(ci+1*ldc+0)*2+0] = Cr;
            C[(ci+1*ldc+0)*2+1] = Ci;
            m_top+=1;
        }
        n_top += 2;
    }

    // -- tails for N=1
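    // Final single-column tail: one column of B per pass and a single accumulator pair per
    // block; the M & 1 case collapses to one complex dot product.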
    if( N & 1 ) {
        gvl = __riscv_vsetvl_e32m1(8);
        m_top = 0;
        for (BLASLONG i=0; i<M/8; i+=1) {
            BLASLONG ai=m_top*K*2;
            BLASLONG bi=n_top*K*2;
            BLASLONG pass_K = K;
#ifdef LEFT
            BLASLONG off = offset + m_top;
#else
            BLASLONG off = -offset + n_top;
#endif
#ifdef BACKWARDS
            ai += off*8*2;
            bi += off*1*2;
            pass_K -= off;
#else
#ifdef LEFT
            pass_K = off + 8;
#else
            pass_K = off + 1;
#endif
#endif
            float B0r = B[bi+0*2+0];
            float B0i = B[bi+0*2+1];
            bi += 1*2;
            vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
            vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
            ai += 8*2;
            // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k
            // leaving 28 vector registers for temporaries
            vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
            vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
            vfloat32m1_t ACC0r = tmp0r;
            vfloat32m1_t ACC0i = tmp0i;
            for(BLASLONG k=1; k<pass_K; k++) {
                B0r = B[bi+0*2+0];
                B0i = B[bi+0*2+1];
                bi += 1*2;
                A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
                A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
                ai += 8*2;
                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
            }
            BLASLONG ci=n_top*ldc+m_top;
            vfloat32m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
            vfloat32m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
            m_top += 8;
        }
        if( M & 4 ) {
            gvl = __riscv_vsetvl_e32m1(4);
            BLASLONG ai=m_top*K*2;
            BLASLONG bi=n_top*K*2;
            BLASLONG pass_K = K;
#ifdef LEFT
            BLASLONG off = offset + m_top;
#else
            BLASLONG off = -offset + n_top;
#endif
#ifdef BACKWARDS
            ai += off*4*2;
            bi += off*1*2;
            pass_K -= off;
#else
#ifdef LEFT
            pass_K = off + 4;
#else
            pass_K = off + 1;
#endif
#endif
            float B0r = B[bi+0*2+0];
            float B0i = B[bi+0*2+1];
            bi += 1*2;
            vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
            vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
            ai += 4*2;
            // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k
            // leaving 28 vector registers for temporaries
            vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
            vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
            vfloat32m1_t ACC0r = tmp0r;
            vfloat32m1_t ACC0i = tmp0i;
            for(BLASLONG k=1; k<pass_K; k++) {
                B0r = B[bi+0*2+0];
                B0i = B[bi+0*2+1];
                bi += 1*2;
                A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
                A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
                ai += 4*2;
                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
            }
            BLASLONG ci=n_top*ldc+m_top;
            vfloat32m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
            vfloat32m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
            m_top += 4;
        }
        if( M & 2 ) {
            gvl = __riscv_vsetvl_e32m1(2);
            BLASLONG ai=m_top*K*2;
            BLASLONG bi=n_top*K*2;
            BLASLONG pass_K = K;
#ifdef LEFT
            BLASLONG off = offset + m_top;
#else
            BLASLONG off = -offset + n_top;
#endif
#ifdef BACKWARDS
            ai += off*2*2;
            bi += off*1*2;
            pass_K -= off;
#else
#ifdef LEFT
            pass_K = off + 2;
#else
            pass_K = off + 1;
#endif
#endif
            float B0r = B[bi+0*2+0];
            float B0i = B[bi+0*2+1];
            bi += 1*2;
            vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
            vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
            ai += 2*2;
            // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k
            // leaving 28 vector registers for temporaries
            vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
            vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
            vfloat32m1_t ACC0r = tmp0r;
            vfloat32m1_t ACC0i = tmp0i;
            for(BLASLONG k=1; k<pass_K; k++) {
                B0r = B[bi+0*2+0];
                B0i = B[bi+0*2+1];
                bi += 1*2;
                A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
                A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
                ai += 2*2;
                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
            }
            BLASLONG ci=n_top*ldc+m_top;
            vfloat32m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
            vfloat32m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
            m_top += 2;
        }
        if( M & 1 ) {
            float result0 = 0;
            float result1 = 0;
            BLASLONG ai=m_top*K*2;
            BLASLONG bi=n_top*K*2;
            BLASLONG pass_K = K;
#ifdef LEFT
            BLASLONG off = offset + m_top;
#else
            BLASLONG off = -offset + n_top;
#endif
#ifdef BACKWARDS
            ai += off*1*2;
            bi += off*1*2;
            pass_K -= off;
#else
#ifdef LEFT
            pass_K = off + 1;
#else
            pass_K = off + 1;
#endif
#endif
            for(BLASLONG k=0; k<pass_K; k++) {
                result0+=S0*A[ai+0+0]*B[bi+0+0] + S1*A[ai+0+1]*B[bi+0+1];
                result1+=S2*A[ai+0+1]*B[bi+0+0] + S3*A[ai+0+0]*B[bi+0+1];
                ai+=1*2;
                bi+=1*2;
            }
            BLASLONG ci=n_top*ldc+m_top;
            float Cr, Ci;
            Cr = result0*alphar;
            Ci = result1*alphar;
            Cr -= result1*alphai;
            Ci += result0*alphai;
            C[(ci+0*ldc+0)*2+0] = Cr;
            C[(ci+0*ldc+0)*2+1] = Ci;
            m_top+=1;
        }
        n_top += 1;
    }
    return 0;
}