- #include "common.h"
- #include <riscv_vector.h>
-
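- // Single-precision GEMM micro-kernel written with draft (pre-ratification)
- // RISC-V vector mnemonics (vle.v/vse.v, vfmv.v.f).  The pointer arithmetic
- // assumes e32/m1 with 4 floats per vector register, so a 16-row strip of A
- // occupies four vector registers (v0-v3 or v4-v7).
- // The KERNEL16x4_{I,M1,M2,E} macros form a software pipeline: _I is the
- // prologue (first iteration plus prefetch of the second), _M1/_M2 are the
- // steady-state halves that ping-pong between the {v0-v3, v8-v11} and
- // {v4-v7, v12-v15} register sets, and _E drains the pipeline.  The 8x4 and
- // 16x2 macros follow the same scheme with fewer registers.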
- #define KERNEL16x4_I \
- "addi t1, %[PB], 1*4 \n\t"\
- "addi t2, %[PB], 2*4 \n\t"\
- "addi t3, %[PB], 3*4 \n\t"\
- "flw ft0, (%[PB]) \n\t"\
- "flw ft1, (t1) \n\t"\
- "flw ft2, (t2) \n\t"\
- "flw ft3, (t3) \n\t"\
- "vle.v v0, (%[PA]) \n\t"\
- "addi t4, %[PA], 4*4 \n\t"\
- "addi t5, %[PA], 8*4 \n\t"\
- "vfmv.v.f v8, ft0 \n\t"\
- "addi t6, %[PA], 12*4 \n\t"\
- "addi %[PA], %[PA], 16*4 \n\t"\
- "vle.v v1, (t4) \n\t"\
- "addi t4, t4, 16*4 \n\t"\
- "vfmv.v.f v9, ft1 \n\t"\
- "vle.v v2, (t5) \n\t"\
- "addi t5, t5, 16*4 \n\t"\
- "vle.v v3, (t6) \n\t"\
- "addi t6, t6, 16*4 \n\t"\
- "vfmv.v.f v10, ft2 \n\t"\
- "addi %[PB], %[PB], 4*4 \n\t"\
- "vle.v v4, (%[PA]) \n\t"\
- "addi %[PA], %[PA], 16*4 \n\t"\
- "vfmv.v.f v11, ft3 \n\t"\
- "vfmacc.vv v16, v8, v0 \n\t"\
- "addi t1, t1, 4*4 \n\t"\
- "vle.v v5, (t4) \n\t"\
- "addi t4, t4, 16*4 \n\t"\
- "vfmacc.vv v17, v8, v1 \n\t"\
- "addi t2, t2, 4*4 \n\t"\
- "vle.v v6, (t5) \n\t"\
- "addi t5, t5, 16*4 \n\t"\
- "vfmacc.vv v18, v8, v2 \n\t"\
- "addi t3, t3, 4*4 \n\t"\
- "vle.v v7, (t6) \n\t"\
- "addi t6, t6, 16*4 \n\t"\
- "vfmacc.vv v19, v8, v3 \n\t"\
- "flw ft4, (%[PB]) \n\t"\
- "vfmacc.vv v20, v9, v0 \n\t"\
- "flw ft5, (t1) \n\t"\
- "vfmacc.vv v21, v9, v1 \n\t"\
- "flw ft6, (t2) \n\t"\
- "vfmacc.vv v22, v9, v2 \n\t"\
- "flw ft7, (t3) \n\t"\
- "vfmacc.vv v23, v9, v3 \n\t"\
- "vfmv.v.f v12, ft4 \n\t"\
- "vfmacc.vv v24, v10, v0 \n\t"\
- "vfmv.v.f v13, ft5 \n\t"\
- "vfmacc.vv v25, v10, v1 \n\t"\
- "vfmv.v.f v14, ft6 \n\t"\
- "vfmacc.vv v26, v10, v2 \n\t"\
- "vfmv.v.f v15, ft7 \n\t"\
- "vfmacc.vv v27, v10, v3 \n\t"\
- "addi %[PB], %[PB], 4*4 \n\t"\
- "vfmacc.vv v28, v11, v0 \n\t"\
- "addi t1, t1, 4*4 \n\t"\
- "vfmacc.vv v29, v11, v1 \n\t"\
- "addi t2, t2, 4*4 \n\t"\
- "vfmacc.vv v30, v11, v2 \n\t"\
- "addi t3, t3, 4*4 \n\t"\
- "vfmacc.vv v31, v11, v3 \n\t"
-
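- // Steady-state iteration A: multiply the A vectors v0-v3 by the B broadcasts
- // v8-v11 while loading the next A block into v4-v7 and the next four B
- // values into ft4-ft7 / v12-v15.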
- #define KERNEL16x4_M1 \
- "vfmacc.vv v16, v8, v0 \n\t"\
- "vle.v v4, (%[PA]) \n\t"\
- "addi %[PA], %[PA], 16*4 \n\t"\
- "vfmacc.vv v17, v8, v1 \n\t"\
- "vle.v v5, (t4) \n\t"\
- "addi t4, t4, 16*4 \n\t"\
- "vfmacc.vv v18, v8, v2 \n\t"\
- "vle.v v6, (t5) \n\t"\
- "addi t5, t5, 16*4 \n\t"\
- "vfmacc.vv v19, v8, v3 \n\t"\
- "vle.v v7, (t6) \n\t"\
- "addi t6, t6, 16*4 \n\t"\
- "vfmacc.vv v20, v9, v0 \n\t"\
- "flw ft4, (%[PB]) \n\t"\
- "vfmacc.vv v21, v9, v1 \n\t"\
- "flw ft5, (t1) \n\t"\
- "vfmacc.vv v22, v9, v2 \n\t"\
- "flw ft6, (t2) \n\t"\
- "vfmacc.vv v23, v9, v3 \n\t"\
- "flw ft7, (t3) \n\t"\
- "addi %[PB], %[PB], 4*4 \n\t"\
- "vfmacc.vv v24, v10, v0 \n\t"\
- "addi t1, t1, 4*4 \n\t"\
- "vfmacc.vv v25, v10, v1 \n\t"\
- "vfmv.v.f v12, ft4 \n\t"\
- "vfmacc.vv v26, v10, v2 \n\t"\
- "addi t2, t2, 4*4 \n\t"\
- "vfmacc.vv v27, v10, v3 \n\t"\
- "vfmv.v.f v13, ft5 \n\t"\
- "vfmacc.vv v28, v11, v0 \n\t"\
- "addi t3, t3, 4*4 \n\t"\
- "vfmacc.vv v29, v11, v1 \n\t"\
- "vfmv.v.f v14, ft6 \n\t"\
- "vfmacc.vv v30, v11, v2 \n\t"\
- "vfmacc.vv v31, v11, v3 \n\t"\
- "vfmv.v.f v15, ft7 \n\t"
-
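- // Steady-state iteration B: mirror of _M1, consuming v4-v7 / v12-v15 while
- // refilling v0-v3 / v8-v11.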
- #define KERNEL16x4_M2 \
- "vfmacc.vv v16, v12, v4 \n\t"\
- "vle.v v0, (%[PA]) \n\t"\
- "addi %[PA], %[PA], 16*4 \n\t"\
- "vfmacc.vv v17, v12, v5 \n\t"\
- "vle.v v1, (t4) \n\t"\
- "addi t4, t4, 16*4 \n\t"\
- "vfmacc.vv v18, v12, v6 \n\t"\
- "vle.v v2, (t5) \n\t"\
- "addi t5, t5, 16*4 \n\t"\
- "vfmacc.vv v19, v12, v7 \n\t"\
- "vle.v v3, (t6) \n\t"\
- "addi t6, t6, 16*4 \n\t"\
- "vfmacc.vv v20, v13, v4 \n\t"\
- "flw ft0, (%[PB]) \n\t"\
- "vfmacc.vv v21, v13, v5 \n\t"\
- "flw ft1, (t1) \n\t"\
- "vfmacc.vv v22, v13, v6 \n\t"\
- "flw ft2, (t2) \n\t"\
- "vfmacc.vv v23, v13, v7 \n\t"\
- "flw ft3, (t3) \n\t"\
- "addi %[PB], %[PB], 4*4 \n\t"\
- "vfmacc.vv v24, v14, v4 \n\t"\
- "addi t1, t1, 4*4 \n\t"\
- "vfmacc.vv v25, v14, v5 \n\t"\
- "vfmv.v.f v8, ft0 \n\t"\
- "vfmacc.vv v26, v14, v6 \n\t"\
- "addi t2, t2, 4*4 \n\t"\
- "vfmacc.vv v27, v14, v7 \n\t"\
- "vfmv.v.f v9, ft1 \n\t"\
- "vfmacc.vv v28, v15, v4 \n\t"\
- "addi t3, t3, 4*4 \n\t"\
- "vfmacc.vv v29, v15, v5 \n\t"\
- "vfmv.v.f v10, ft2 \n\t"\
- "vfmacc.vv v30, v15, v6 \n\t"\
- "vfmacc.vv v31, v15, v7 \n\t"\
- "vfmv.v.f v11, ft3 \n\t"
-
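- // Pipeline epilogue: consume the operands prefetched by the last _M1
- // (v4-v7, v12-v15) without issuing any further loads.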
- #define KERNEL16x4_E \
- "vfmacc.vv v16, v12, v4 \n\t"\
- "vfmacc.vv v17, v12, v5 \n\t"\
- "vfmacc.vv v18, v12, v6 \n\t"\
- "vfmacc.vv v19, v12, v7 \n\t"\
- "vfmacc.vv v20, v13, v4 \n\t"\
- "vfmacc.vv v21, v13, v5 \n\t"\
- "vfmacc.vv v22, v13, v6 \n\t"\
- "vfmacc.vv v23, v13, v7 \n\t"\
- "vfmacc.vv v24, v14, v4 \n\t"\
- "vfmacc.vv v25, v14, v5 \n\t"\
- "vfmacc.vv v26, v14, v6 \n\t"\
- "vfmacc.vv v27, v14, v7 \n\t"\
- "vfmacc.vv v28, v15, v4 \n\t"\
- "vfmacc.vv v29, v15, v5 \n\t"\
- "vfmacc.vv v30, v15, v6 \n\t"\
- "vfmacc.vv v31, v15, v7 \n\t"
-
-
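- // 8x4 variant of the same pipeline: two A vectors per iteration
- // (v0-v1 / v4-v5) and accumulators v16, v17, v20, v21, v24, v25, v28, v29.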
- #define KERNEL8x4_I \
- "addi t1, %[PB], 1*4 \n\t"\
- "addi t2, %[PB], 2*4 \n\t"\
- "addi t3, %[PB], 3*4 \n\t"\
- "flw ft0, (%[PB]) \n\t"\
- "flw ft1, (t1) \n\t"\
- "flw ft2, (t2) \n\t"\
- "flw ft3, (t3) \n\t"\
- "vle.v v0, (%[PA]) \n\t"\
- "addi t4, %[PA], 4*4 \n\t"\
- "vfmv.v.f v8, ft0 \n\t"\
- "addi %[PA], %[PA], 8*4 \n\t"\
- "vle.v v1, (t4) \n\t"\
- "addi t4, t4, 8*4 \n\t"\
- "vfmv.v.f v9, ft1 \n\t"\
- "vfmv.v.f v10, ft2 \n\t"\
- "addi %[PB], %[PB], 4*4 \n\t"\
- "vle.v v4, (%[PA]) \n\t"\
- "addi %[PA], %[PA], 8*4 \n\t"\
- "vfmv.v.f v11, ft3 \n\t"\
- "vfmacc.vv v16, v8, v0 \n\t"\
- "addi t1, t1, 4*4 \n\t"\
- "vle.v v5, (t4) \n\t"\
- "addi t4, t4, 8*4 \n\t"\
- "vfmacc.vv v17, v8, v1 \n\t"\
- "addi t2, t2, 4*4 \n\t"\
- "flw ft4, (%[PB]) \n\t"\
- "addi t3, t3, 4*4 \n\t"\
- "vfmacc.vv v20, v9, v0 \n\t"\
- "flw ft5, (t1) \n\t"\
- "vfmacc.vv v21, v9, v1 \n\t"\
- "flw ft6, (t2) \n\t"\
- "vfmv.v.f v12, ft4 \n\t"\
- "flw ft7, (t3) \n\t"\
- "vfmacc.vv v24, v10, v0 \n\t"\
- "vfmv.v.f v13, ft5 \n\t"\
- "vfmacc.vv v25, v10, v1 \n\t"\
- "vfmv.v.f v14, ft6 \n\t"\
- "addi %[PB], %[PB], 4*4 \n\t"\
- "vfmv.v.f v15, ft7 \n\t"\
- "addi t1, t1, 4*4 \n\t"\
- "vfmacc.vv v28, v11, v0 \n\t"\
- "addi t2, t2, 4*4 \n\t"\
- "vfmacc.vv v29, v11, v1 \n\t"\
- "addi t3, t3, 4*4 \n\t"
-
-
- #define KERNEL8x4_M1 \
- "vfmacc.vv v16, v8, v0 \n\t"\
- "vle.v v4, (%[PA]) \n\t"\
- "addi %[PA], %[PA], 8*4 \n\t"\
- "vfmacc.vv v17, v8, v1 \n\t"\
- "vle.v v5, (t4) \n\t"\
- "addi t4, t4, 8*4 \n\t"\
- "vfmacc.vv v20, v9, v0 \n\t"\
- "flw ft4, (%[PB]) \n\t"\
- "vfmacc.vv v21, v9, v1 \n\t"\
- "flw ft5, (t1) \n\t"\
- "addi %[PB], %[PB], 4*4 \n\t"\
- "flw ft6, (t2) \n\t"\
- "vfmacc.vv v24, v10, v0 \n\t"\
- "flw ft7, (t3) \n\t"\
- "addi t1, t1, 4*4 \n\t"\
- "vfmacc.vv v25, v10, v1 \n\t"\
- "vfmv.v.f v12, ft4 \n\t"\
- "addi t2, t2, 4*4 \n\t"\
- "vfmv.v.f v13, ft5 \n\t"\
- "vfmacc.vv v28, v11, v0 \n\t"\
- "addi t3, t3, 4*4 \n\t"\
- "vfmacc.vv v29, v11, v1 \n\t"\
- "vfmv.v.f v14, ft6 \n\t"\
- "vfmv.v.f v15, ft7 \n\t"
-
- #define KERNEL8x4_M2 \
- "vfmacc.vv v16, v12, v4 \n\t"\
- "vle.v v0, (%[PA]) \n\t"\
- "addi %[PA], %[PA], 8*4 \n\t"\
- "vfmacc.vv v17, v12, v5 \n\t"\
- "vle.v v1, (t4) \n\t"\
- "addi t4, t4, 8*4 \n\t"\
- "vfmacc.vv v20, v13, v4 \n\t"\
- "flw ft0, (%[PB]) \n\t"\
- "vfmacc.vv v21, v13, v5 \n\t"\
- "flw ft1, (t1) \n\t"\
- "addi %[PB], %[PB], 4*4 \n\t"\
- "flw ft2, (t2) \n\t"\
- "vfmacc.vv v24, v14, v4 \n\t"\
- "flw ft3, (t3) \n\t"\
- "addi t1, t1, 4*4 \n\t"\
- "vfmacc.vv v25, v14, v5 \n\t"\
- "vfmv.v.f v8, ft0 \n\t"\
- "addi t2, t2, 4*4 \n\t"\
- "vfmv.v.f v9, ft1 \n\t"\
- "vfmacc.vv v28, v15, v4 \n\t"\
- "addi t3, t3, 4*4 \n\t"\
- "vfmacc.vv v29, v15, v5 \n\t"\
- "vfmv.v.f v10, ft2 \n\t"\
- "vfmv.v.f v11, ft3 \n\t"
-
- #define KERNEL8x4_E \
- "vfmacc.vv v16, v12, v4 \n\t"\
- "vfmacc.vv v17, v12, v5 \n\t"\
- "vfmacc.vv v20, v13, v4 \n\t"\
- "vfmacc.vv v21, v13, v5 \n\t"\
- "vfmacc.vv v24, v14, v4 \n\t"\
- "vfmacc.vv v25, v14, v5 \n\t"\
- "vfmacc.vv v28, v15, v4 \n\t"\
- "vfmacc.vv v29, v15, v5 \n\t"
-
-
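- // 16x2 variant: four A vectors per iteration but only two B broadcasts
- // (v8/v9 on even steps, v12/v13 on odd steps), accumulators v16-v23.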
- #define KERNEL16x2_I \
- "addi t1, %[PB], 1*4 \n\t"\
- "flw ft0, (%[PB]) \n\t"\
- "flw ft1, (t1) \n\t"\
- "vle.v v0, (%[PA]) \n\t"\
- "addi t4, %[PA], 4*4 \n\t"\
- "addi t5, %[PA], 8*4 \n\t"\
- "vfmv.v.f v8, ft0 \n\t"\
- "addi t6, %[PA], 12*4 \n\t"\
- "addi %[PA], %[PA], 16*4 \n\t"\
- "vle.v v1, (t4) \n\t"\
- "addi t4, t4, 16*4 \n\t"\
- "vfmv.v.f v9, ft1 \n\t"\
- "vle.v v2, (t5) \n\t"\
- "addi t5, t5, 16*4 \n\t"\
- "vle.v v3, (t6) \n\t"\
- "addi t6, t6, 16*4 \n\t"\
- "addi %[PB], %[PB], 2*4 \n\t"\
- "vle.v v4, (%[PA]) \n\t"\
- "addi %[PA], %[PA], 16*4 \n\t"\
- "vfmacc.vv v16, v8, v0 \n\t"\
- "addi t1, t1, 2*4 \n\t"\
- "vle.v v5, (t4) \n\t"\
- "addi t4, t4, 16*4 \n\t"\
- "vfmacc.vv v17, v8, v1 \n\t"\
- "vle.v v6, (t5) \n\t"\
- "addi t5, t5, 16*4 \n\t"\
- "vfmacc.vv v18, v8, v2 \n\t"\
- "vle.v v7, (t6) \n\t"\
- "addi t6, t6, 16*4 \n\t"\
- "vfmacc.vv v19, v8, v3 \n\t"\
- "flw ft4, (%[PB]) \n\t"\
- "vfmacc.vv v20, v9, v0 \n\t"\
- "flw ft5, (t1) \n\t"\
- "vfmacc.vv v21, v9, v1 \n\t"\
- "addi %[PB], %[PB], 2*4 \n\t"\
- "vfmacc.vv v22, v9, v2 \n\t"\
- "addi t1, t1, 2*4 \n\t"\
- "vfmacc.vv v23, v9, v3 \n\t"\
- "vfmv.v.f v12, ft4 \n\t"\
- "vfmv.v.f v13, ft5 \n\t"
-
-
- #define KERNEL16x2_M1 \
- "vfmacc.vv v16, v8, v0 \n\t"\
- "vle.v v4, (%[PA]) \n\t"\
- "addi %[PA], %[PA], 16*4 \n\t"\
- "vfmacc.vv v17, v8, v1 \n\t"\
- "vle.v v5, (t4) \n\t"\
- "addi t4, t4, 16*4 \n\t"\
- "vfmacc.vv v18, v8, v2 \n\t"\
- "vle.v v6, (t5) \n\t"\
- "addi t5, t5, 16*4 \n\t"\
- "vfmacc.vv v19, v8, v3 \n\t"\
- "vle.v v7, (t6) \n\t"\
- "addi t6, t6, 16*4 \n\t"\
- "flw ft4, (%[PB]) \n\t"\
- "vfmacc.vv v20, v9, v0 \n\t"\
- "flw ft5, (t1) \n\t"\
- "vfmacc.vv v21, v9, v1 \n\t"\
- "vfmv.v.f v12, ft4 \n\t"\
- "vfmacc.vv v22, v9, v2 \n\t"\
- "addi t1, t1, 2*4 \n\t"\
- "vfmacc.vv v23, v9, v3 \n\t"\
- "addi %[PB], %[PB], 2*4 \n\t"\
- "vfmv.v.f v13, ft5 \n\t"
-
-
- #define KERNEL16x2_M2 \
- "vfmacc.vv v16, v12, v4 \n\t"\
- "vle.v v0, (%[PA]) \n\t"\
- "addi %[PA], %[PA], 16*4 \n\t"\
- "vfmacc.vv v17, v12, v5 \n\t"\
- "vle.v v1, (t4) \n\t"\
- "addi t4, t4, 16*4 \n\t"\
- "vfmacc.vv v18, v12, v6 \n\t"\
- "vle.v v2, (t5) \n\t"\
- "addi t5, t5, 16*4 \n\t"\
- "vfmacc.vv v19, v12, v7 \n\t"\
- "vle.v v3, (t6) \n\t"\
- "addi t6, t6, 16*4 \n\t"\
- "vfmacc.vv v20, v13, v4 \n\t"\
- "flw ft0, (%[PB]) \n\t"\
- "vfmacc.vv v21, v13, v5 \n\t"\
- "flw ft1, (t1) \n\t"\
- "vfmacc.vv v22, v13, v6 \n\t"\
- "vfmv.v.f v8, ft0 \n\t"\
- "vfmacc.vv v23, v13, v7 \n\t"\
- "addi %[PB], %[PB], 2*4 \n\t"\
- "addi t1, t1, 2*4 \n\t"\
- "vfmv.v.f v9, ft1 \n\t"
-
-
- #define KERNEL16x2_E \
- "vfmacc.vv v16, v12, v4 \n\t"\
- "vfmacc.vv v17, v12, v5 \n\t"\
- "vfmacc.vv v18, v12, v6 \n\t"\
- "vfmacc.vv v19, v12, v7 \n\t"\
- "vfmacc.vv v20, v13, v4 \n\t"\
- "vfmacc.vv v21, v13, v5 \n\t"\
- "vfmacc.vv v22, v13, v6 \n\t"\
- "vfmacc.vv v23, v13, v7 \n\t"
-
-
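- // Computes a bm x bn block of C += alpha * A * B for one packed panel pair:
- // ba points at packed A (strips of 16/8/4/2/1 rows, contiguous per k step),
- // bb points at packed B (strips of 4/2/1 columns, contiguous per k step),
- // C is the output tile with leading dimension ldc.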
- int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc
- #ifdef TRMMKERNEL
- ,BLASLONG offset
- #endif
- )
- {
- BLASLONG i,j,k;
- FLOAT *C0,*C1,*C2,*C3;
- FLOAT *ptrba,*ptrbb, *tmpc;
-
- FLOAT loadb0,loadb1,loadb2,loadb3;
- FLOAT load0,load1,load2,load3,load4,load5,load6,load7;
-
- FLOAT res0,res1,res2,res3;
- FLOAT res4,res5,res6,res7;
- FLOAT res8,res9,res10,res11;
- FLOAT res12,res13,res14,res15;
-
-
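- // Process the panel four columns of C at a time.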
- for (j=0; j<bn/4; j+=1){
- C0 = C;
- C1 = C0+ldc;
- C2 = C1+ldc;
- C3 = C2+ldc;
-
- ptrba = ba;
- for(i=0; i<bm/16; i+=1){
- ptrbb = bb;
- //t0 for k
- //ft0-ft3,ft4-ft7,v8-v15 for B, t1-t3 for PB1-3
- //v0-v3,v4-v7 for A, t4-t6 for PA1-3
- //v16-v31 for temp C
-
- asm volatile(
- "vsetvli zero, zero, e32,m1 \n\t"
- "fmv.w.x ft11, zero \n\t"
- "mv t0, %[BK] \n\t"
-
- "vfmv.v.f v16, ft11 \n\t"
- "vfmv.v.f v17, ft11 \n\t"
- "vfmv.v.f v18, ft11 \n\t"
- "vfmv.v.f v19, ft11 \n\t"
-
- "vfmv.v.f v20, ft11 \n\t"
- "vfmv.v.f v21, ft11 \n\t"
- "vfmv.v.f v22, ft11 \n\t"
- "vfmv.v.f v23, ft11 \n\t"
-
- "vfmv.v.f v24, ft11 \n\t"
- "vfmv.v.f v25, ft11 \n\t"
- "vfmv.v.f v26, ft11 \n\t"
- "vfmv.v.f v27, ft11 \n\t"
-
- "vfmv.v.f v28, ft11 \n\t"
- "vfmv.v.f v29, ft11 \n\t"
- "vfmv.v.f v30, ft11 \n\t"
- "vfmv.v.f v31, ft11 \n\t"
- //unroll the k loop by 8
- "srli t0, %[BK], 3 \n\t"
- "blez t0, M16x4_TAIL \n\t"
-
- //pipeline prologue: I, M2, M1, M2 cover the first 4 k iterations
- KERNEL16x4_I
- KERNEL16x4_M2
- KERNEL16x4_M1
- KERNEL16x4_M2
- "addi t0, t0, -1 \n\t"
- "blez t0, M16x4_MAINLOOP_TAIL \n\t"
- ".align 4 \n\t"
- "M16x4_MAINLOOP: \n\t"
- KERNEL16x4_M1
- KERNEL16x4_M2
- KERNEL16x4_M1
- KERNEL16x4_M2
- KERNEL16x4_M1
- KERNEL16x4_M2
- KERNEL16x4_M1
- KERNEL16x4_M2
- "addi t0, t0, -1 \n\t"
- "bgtz t0, M16x4_MAINLOOP \n\t"
-
- "M16x4_MAINLOOP_TAIL: \n\t"
- KERNEL16x4_M1
- KERNEL16x4_M2
- KERNEL16x4_M1
- KERNEL16x4_E
-
- //tail
- "M16x4_TAIL: \n\t"
- "andi t0, %[BK], 7 \n\t"
- "blez t0, M16x4_SAVERESULT \n\t"
-
- "addi t4, %[PA], 4*4 \n\t"
- "addi t5, %[PA], 8*4 \n\t"
- "addi t6, %[PA], 12*4 \n\t"
- "addi t1, %[PB], 1*4 \n\t"
- "addi t2, %[PB], 2*4 \n\t"
- "addi t3, %[PB], 3*4 \n\t"
-
- ".align 4 \n\t"
- "M16x4_TAILLOOP: \n\t"
- "flw ft0, (%[PB]) \n\t"
- "addi %[PB], %[PB], 4*4 \n\t"
- "vle.v v0, (%[PA]) \n\t"
- "add %[PA], %[PA], 16*4 \n\t"
- "vle.v v1, (t4) \n\t"
- "addi t4, t4, 16*4 \n\t"
-
- "vfmv.v.f v8, ft0 \n\t"
- "flw ft1, (t1) \n\t"
- "addi t1, t1, 4*4 \n\t"
- "vle.v v2, (t5) \n\t"
- "addi t5, t5, 16*4 \n\t"
- "vle.v v3, (t6) \n\t"
- "addi t6, t6, 16*4 \n\t"
-
- "vfmacc.vv v16, v8, v0 \n\t"
- "flw ft2, (t2) \n\t"
- "addi t2, t2, 4*4 \n\t"
- "vfmacc.vv v17, v8, v1 \n\t"
- "vfmacc.vv v18, v8, v2 \n\t"
- "vfmv.v.f v9, ft1 \n\t"
- "vfmacc.vv v19, v8, v3 \n\t"
-
-
- "vfmacc.vv v20, v9, v0 \n\t"
- "flw ft3, (t3) \n\t"
- "addi t3, t3, 4*4 \n\t"
- "vfmacc.vv v21, v9, v1 \n\t"
- "vfmacc.vv v22, v9, v2 \n\t"
- "vfmv.v.f v10, ft2 \n\t"
- "vfmacc.vv v23, v9, v3 \n\t"
-
- "vfmv.v.f v11, ft3 \n\t"
- "vfmacc.vv v24, v10, v0 \n\t"
- "vfmacc.vv v25, v10, v1 \n\t"
- "vfmacc.vv v26, v10, v2 \n\t"
- "vfmacc.vv v27, v10, v3 \n\t"
-
- "vfmacc.vv v28, v11, v0 \n\t"
- "vfmacc.vv v29, v11, v1 \n\t"
- "vfmacc.vv v30, v11, v2 \n\t"
- "vfmacc.vv v31, v11, v3 \n\t"
-
- "addi t0, t0, -1 \n\t"
- "bgtz t0, M16x4_TAILLOOP \n\t"
-
- //Save result
- //load C
- "M16x4_SAVERESULT: \n\t"
- //use v8 to store alpha
- "vfmv.v.f v8, %[ALPHA] \n\t"
- "vle.v v0, (%[C0]) \n\t"
- "addi t4, %[C0], 4*4 \n\t"
- "vle.v v1, (%[C1]) \n\t"
- "addi t5, %[C1], 4*4 \n\t"
- "vle.v v2, (%[C2]) \n\t"
- "addi t6, %[C2], 4*4 \n\t"
- "vle.v v3, (%[C3]) \n\t"
- "addi t3, %[C3], 4*4 \n\t"
-
- //multiply the accumulators by alpha and add into C
- "vfmacc.vv v0, v8, v16 \n\t"
- "vle.v v4, (t4) \n\t"
- "vfmacc.vv v1, v8, v20 \n\t"
- "vle.v v5, (t5) \n\t"
- "vfmacc.vv v2, v8, v24 \n\t"
- "vle.v v6, (t6) \n\t"
- "vfmacc.vv v3, v8, v28 \n\t"
- "vle.v v7, (t3) \n\t"
-
- "vfmacc.vv v4, v8, v17 \n\t"
- "vse.v v0, (%[C0]) \n\t"
- "add %[C0], %[C0], 8*4 \n\t"
- "vfmacc.vv v5, v8, v21 \n\t"
- "vse.v v1, (%[C1]) \n\t"
- "add %[C1], %[C1], 8*4 \n\t"
-
- "vfmacc.vv v6, v8, v25 \n\t"
- "vse.v v2, (%[C2]) \n\t"
- "add %[C2], %[C2], 8*4 \n\t"
-
- "vfmacc.vv v7, v8, v29 \n\t"
- "vse.v v3, (%[C3]) \n\t"
- "add %[C3], %[C3], 8*4 \n\t"
-
- "vle.v v0, (%[C0]) \n\t"
- "vse.v v4, (t4) \n\t"
- "add t4, t4, 8*4 \n\t"
-
- "vle.v v1, (%[C1]) \n\t"
- "vse.v v5, (t5) \n\t"
- "add t5, t5, 8*4 \n\t"
-
- "vle.v v2, (%[C2]) \n\t"
- "vse.v v6, (t6) \n\t"
- "add t6, t6, 8*4 \n\t"
-
- "vle.v v3, (%[C3]) \n\t"
- "vse.v v7, (t3) \n\t"
- "add t3, t3, 8*4 \n\t"
-
-
- "vfmacc.vv v0, v8, v18 \n\t"
- "vle.v v4, (t4) \n\t"
- "vfmacc.vv v1, v8, v22 \n\t"
- "vle.v v5, (t5) \n\t"
- "vfmacc.vv v2, v8, v26 \n\t"
- "vle.v v6, (t6) \n\t"
- "vfmacc.vv v3, v8, v30 \n\t"
- "vle.v v7, (t3) \n\t"
-
- "vfmacc.vv v4, v8, v19 \n\t"
- "vse.v v0, (%[C0]) \n\t"
- "add %[C0], %[C0], 8*4 \n\t"
-
- "vfmacc.vv v5, v8, v23 \n\t"
- "vse.v v1, (%[C1]) \n\t"
- "add %[C1], %[C1], 8*4 \n\t"
-
- "vfmacc.vv v6, v8, v27 \n\t"
- "vse.v v2, (%[C2]) \n\t"
- "add %[C2], %[C2], 8*4 \n\t"
-
- "vfmacc.vv v7, v8, v31 \n\t"
- "vse.v v3, (%[C3]) \n\t"
- "add %[C3], %[C3], 8*4 \n\t"
-
- "vse.v v4, (t4) \n\t"
- "vse.v v5, (t5) \n\t"
- "vse.v v6, (t6) \n\t"
- "vse.v v7, (t3) \n\t"
- "M16x4_END: \n\t"
-
- :[C0]"+r"(C0),[C1]"+r"(C1),[C2]"+r"(C2),[C3]"+r"(C3),
- [PA]"+r"(ptrba), [PB]"+r"(ptrbb)
- :[ALPHA]"f"(alpha), [BK]"r"(bk)
- :"cc", "t0", "t4","t5","t6","t3","t1","t2",
- "ft11", "ft0", "ft1", "ft2","ft3","ft4", "ft5", "ft6","ft7",
- "v0", "v1", "v2", "v3","v4", "v5", "v6", "v7",
- "v8", "v9", "v10", "v11","v12", "v13", "v14", "v15",
- "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
- "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
- }
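- // Row remainders for this 4-column strip: 8 rows in vector code, 4 and 2
- // rows in scalar code, then a final single row in vector code (M1x4 below).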
- if(bm&8){
- ptrbb = bb;
- //t0 for k
- //ft0-ft3,ft4-ft7,v8-v15 for B, t1-t3 for PB1-3
- //v0-v1, v4-v5 for A, t4 for PA1
- //v16,v17,v20,v21,v24,v25,v28,v29 for temp C
-
- asm volatile(
- "vsetvli zero, zero, e32,m1 \n\t"
- "fmv.w.x ft11, zero \n\t"
- "mv t0, %[BK] \n\t"
-
- "vfmv.v.f v16, ft11 \n\t"
- "vfmv.v.f v17, ft11 \n\t"
-
- "vfmv.v.f v20, ft11 \n\t"
- "vfmv.v.f v21, ft11 \n\t"
-
- "vfmv.v.f v24, ft11 \n\t"
- "vfmv.v.f v25, ft11 \n\t"
-
- "vfmv.v.f v28, ft11 \n\t"
- "vfmv.v.f v29, ft11 \n\t"
-
- //unroll the k loop by 8
- "srli t0, %[BK], 3 \n\t"
- "blez t0, M8x4_TAIL \n\t"
-
- //pipeline prologue: I, M2, M1, M2 cover the first 4 k iterations
- KERNEL8x4_I
- KERNEL8x4_M2
- KERNEL8x4_M1
- KERNEL8x4_M2
- "addi t0, t0, -1 \n\t"
- "blez t0, M8x4_MAINLOOP_TAIL \n\t"
- ".align 4 \n\t"
- "M8x4_MAINLOOP: \n\t"
- KERNEL8x4_M1
- KERNEL8x4_M2
- KERNEL8x4_M1
- KERNEL8x4_M2
- KERNEL8x4_M1
- KERNEL8x4_M2
- KERNEL8x4_M1
- KERNEL8x4_M2
- "addi t0, t0, -1 \n\t"
- "bgtz t0, M8x4_MAINLOOP \n\t"
-
- "M8x4_MAINLOOP_TAIL: \n\t"
- KERNEL8x4_M1
- KERNEL8x4_M2
- KERNEL8x4_M1
- KERNEL8x4_E
-
- //tail
- "M8x4_TAIL: \n\t"
- "andi t0, %[BK], 7 \n\t"
- "blez t0, M8x4_SAVERESULT \n\t"
-
- "addi t4, %[PA], 4*4 \n\t"
-
- "addi t1, %[PB], 1*4 \n\t"
- "addi t2, %[PB], 2*4 \n\t"
- "addi t3, %[PB], 3*4 \n\t"
-
- ".align 4 \n\t"
- "M8x4_TAILLOOP: \n\t"
- "flw ft0, (%[PB]) \n\t"
- "addi %[PB], %[PB], 4*4 \n\t"
- "vle.v v0, (%[PA]) \n\t"
- "add %[PA], %[PA], 8*4 \n\t"
- "vle.v v1, (t4) \n\t"
- "addi t4, t4, 8*4 \n\t"
-
- "vfmv.v.f v8, ft0 \n\t"
- "flw ft1, (t1) \n\t"
- "addi t1, t1, 4*4 \n\t"
-
- "vfmacc.vv v16, v8, v0 \n\t"
- "flw ft2, (t2) \n\t"
- "addi t2, t2, 4*4 \n\t"
- "vfmacc.vv v17, v8, v1 \n\t"
- "vfmv.v.f v9, ft1 \n\t"
-
- "vfmacc.vv v20, v9, v0 \n\t"
- "flw ft3, (t3) \n\t"
- "addi t3, t3, 4*4 \n\t"
- "vfmacc.vv v21, v9, v1 \n\t"
- "vfmv.v.f v10, ft2 \n\t"
-
- "vfmv.v.f v11, ft3 \n\t"
- "vfmacc.vv v24, v10, v0 \n\t"
- "vfmacc.vv v25, v10, v1 \n\t"
-
- "vfmacc.vv v28, v11, v0 \n\t"
- "vfmacc.vv v29, v11, v1 \n\t"
-
- "addi t0, t0, -1 \n\t"
- "bgtz t0, M8x4_TAILLOOP \n\t"
-
- //Save result
- //load C
- "M8x4_SAVERESULT: \n\t"
- //use v8 to store alpha
- "vfmv.v.f v8, %[ALPHA] \n\t"
- "vle.v v0, (%[C0]) \n\t"
- "addi t4, %[C0], 4*4 \n\t"
- "vle.v v1, (%[C1]) \n\t"
- "addi t5, %[C1], 4*4 \n\t"
- "vle.v v2, (%[C2]) \n\t"
- "addi t6, %[C2], 4*4 \n\t"
- "vle.v v3, (%[C3]) \n\t"
- "addi t3, %[C3], 4*4 \n\t"
-
- //multiply the accumulators by alpha and add into C
- "vfmacc.vv v0, v8, v16 \n\t"
- "vle.v v4, (t4) \n\t"
- "vfmacc.vv v1, v8, v20 \n\t"
- "vle.v v5, (t5) \n\t"
- "vfmacc.vv v2, v8, v24 \n\t"
- "vle.v v6, (t6) \n\t"
- "vfmacc.vv v3, v8, v28 \n\t"
- "vle.v v7, (t3) \n\t"
-
- "vfmacc.vv v4, v8, v17 \n\t"
- "vse.v v0, (%[C0]) \n\t"
- "add %[C0], %[C0], 8*4 \n\t"
- "vfmacc.vv v5, v8, v21 \n\t"
- "vse.v v1, (%[C1]) \n\t"
- "add %[C1], %[C1], 8*4 \n\t"
-
- "vfmacc.vv v6, v8, v25 \n\t"
- "vse.v v2, (%[C2]) \n\t"
- "add %[C2], %[C2], 8*4 \n\t"
-
- "vfmacc.vv v7, v8, v29 \n\t"
- "vse.v v3, (%[C3]) \n\t"
- "add %[C3], %[C3], 8*4 \n\t"
-
- "vse.v v4, (t4) \n\t"
- "vse.v v5, (t5) \n\t"
- "vse.v v6, (t6) \n\t"
- "vse.v v7, (t3) \n\t"
- "M8x4_END: \n\t"
-
- :[C0]"+r"(C0),[C1]"+r"(C1),[C2]"+r"(C2),[C3]"+r"(C3),
- [PA]"+r"(ptrba), [PB]"+r"(ptrbb)
- :[ALPHA]"f"(alpha), [BK]"r"(bk)
- :"cc", "t0", "t4","t5","t6","t3","t1","t2",
- "ft11", "ft0", "ft1", "ft2","ft3","ft4", "ft5", "ft6","ft7",
- "v0", "v1", "v2", "v3","v4", "v5", "v6", "v7",
- "v8", "v9", "v10", "v11","v12", "v13", "v14", "v15",
- "v16", "v17", "v20", "v21",
- "v24", "v25", "v28", "v29");
- }
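- // 4-row remainder: plain scalar accumulation over k.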
- if(bm&4){
- ptrbb = bb;
- res0 = 0;
- res1 = 0;
- res2 = 0;
- res3 = 0;
- res4 = 0;
- res5 = 0;
- res6 = 0;
- res7 = 0;
- res8 = 0;
- res9 = 0;
- res10 = 0;
- res11 = 0;
- res12 = 0;
- res13 = 0;
- res14 = 0;
- res15 = 0;
-
- for(k=0; k<bk; k+=1){
- loadb0 = ptrbb[0];
- loadb1 = ptrbb[1];
-
- load0 = ptrba[0];
- load1 = ptrba[1];
- load2 = ptrba[2];
- load3 = ptrba[3];
-
- res0 = res0 + load0 * loadb0;
- res1 = res1 + load1 * loadb0;
- res2 = res2 + load2 * loadb0;
- res3 = res3 + load3 * loadb0;
-
- res4 = res4 + load0 * loadb1;
- res5 = res5 + load1 * loadb1;
- res6 = res6 + load2 * loadb1;
- res7 = res7 + load3 * loadb1;
-
- loadb2 = ptrbb[2];
- loadb3 = ptrbb[3];
-
- res8 = res8 + load0 * loadb2;
- res9 = res9 + load1 * loadb2;
- res10 = res10 + load2 * loadb2;
- res11 = res11 + load3 * loadb2;
-
- res12 = res12 + load0 * loadb3;
- res13 = res13 + load1 * loadb3;
- res14 = res14 + load2 * loadb3;
- res15 = res15 + load3 * loadb3;
-
- ptrba += 4;
- ptrbb += 4;
- }
-
- res0 = res0 * alpha;
- res1 = res1 * alpha;
- res2 = res2 * alpha;
- res3 = res3 * alpha;
- res4 = res4 * alpha;
- res5 = res5 * alpha;
- res6 = res6 * alpha;
- res7 = res7 * alpha;
-
- res8 = res8 * alpha;
- res9 = res9 * alpha;
- res10 = res10 * alpha;
- res11 = res11 * alpha;
- res12 = res12 * alpha;
- res13 = res13 * alpha;
- res14 = res14 * alpha;
- res15 = res15 * alpha;
-
- C0[0] += res0;
- C0[1] += res1;
- C0[2] += res2;
- C0[3] += res3;
-
- C1[0] += res4;
- C1[1] += res5;
- C1[2] += res6;
- C1[3] += res7;
-
- C2[0] += res8;
- C2[1] += res9;
- C2[2] += res10;
- C2[3] += res11;
-
- C3[0] += res12;
- C3[1] += res13;
- C3[2] += res14;
- C3[3] += res15;
-
- C0 += 4;
- C1 += 4;
- C2 += 4;
- C3 += 4;
- }
- if(bm&2){
- ptrbb = bb;
-
- res0 = 0;
- res1 = 0;
-
- res4 = 0;
- res5 = 0;
-
- res8 = 0;
- res9 = 0;
-
- res12 = 0;
- res13 = 0;
-
- for(k=0; k<bk; k+=1){
- loadb0 = ptrbb[0];
- loadb1 = ptrbb[1];
-
- load0 = ptrba[0];
- load1 = ptrba[1];
-
- res0 = res0 + load0 * loadb0;
- res1 = res1 + load1 * loadb0;
-
- res4 = res4 + load0 * loadb1;
- res5 = res5 + load1 * loadb1;
-
- loadb2 = ptrbb[2];
- loadb3 = ptrbb[3];
-
- res8 = res8 + load0 * loadb2;
- res9 = res9 + load1 * loadb2;
-
- res12 = res12 + load0 * loadb3;
- res13 = res13 + load1 * loadb3;
-
- ptrba += 2;
- ptrbb += 4;
- }
-
- res0 = res0 * alpha;
- res1 = res1 * alpha;
-
- res4 = res4 * alpha;
- res5 = res5 * alpha;
-
- res8 = res8 * alpha;
- res9 = res9 * alpha;
-
- res12 = res12 * alpha;
- res13 = res13 * alpha;
-
- C0[0] += res0;
- C0[1] += res1;
-
- C1[0] += res4;
- C1[1] += res5;
-
- C2[0] += res8;
- C2[1] += res9;
-
- C3[0] += res12;
- C3[1] += res13;
-
- C0 += 2;
- C1 += 2;
- C2 += 2;
- C3 += 2;
- }
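- // Single-row remainder: vectorize over k instead.  Four consecutive A values
- // are splatted with vrgather.vi and multiplied with four rows of B, giving
- // partial sums in v16-v19 that are reduced after the loop.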
- if(bm&1){
- ptrbb = bb;
- //t0 for k
-
- FLOAT tmp[4];
- tmpc=tmp;
- //t1-t3 for PB
- //v0-v4 for A, v8-v11 for B
- //v16-v19 for C
- asm volatile(
- "vsetvli zero, zero, e32,m1 \n\t"
- "fmv.w.x ft11, zero \n\t"
-
- "vfmv.v.f v16, ft11 \n\t"
- "vfmv.v.f v17, ft11 \n\t"
- "vfmv.v.f v18, ft11 \n\t"
- "vfmv.v.f v19, ft11 \n\t"
- //unroll the k loop by 4
-
- "srli t0, %[BK], 2 \n\t"
- "blez t0, M1x4_TAIL \n\t"
-
- "addi t1, %[PB], 4*4 \n\t"
- "addi t2, %[PB], 8*4 \n\t"
- "addi t3, %[PB], 12*4 \n\t"
-
- ".align 4 \n\t"
- "M1x4_MAINLOOP: \n\t"
-
- "vle.v v4, (%[PA]) \n\t"
- "addi %[PA], %[PA], 4*4 \n\t"
- "vrgather.vi v0, v4, 0 \n\t"
-
- "vle.v v8, (%[PB]) \n\t"
- "addi %[PB], %[PB], 16*4 \n\t"
- "vrgather.vi v1, v4, 1 \n\t"
-
- "vle.v v9, (t1) \n\t"
- "addi t1, t1, 16*4 \n\t"
- "vrgather.vi v2, v4, 2 \n\t"
-
- "vle.v v10, (t2) \n\t"
- "addi t2, t2, 16*4 \n\t"
- "vrgather.vi v3, v4, 3 \n\t"
-
- "vle.v v11, (t3) \n\t"
- "addi t3, t3, 16*4 \n\t"
-
- "vfmacc.vv v16, v8, v0 \n\t"
- "vfmacc.vv v17, v9, v1 \n\t"
- "vfmacc.vv v18, v10, v2 \n\t"
- "vfmacc.vv v19, v11, v3 \n\t"
-
- "addi t0, t0, -1 \n\t"
- "bgtz t0, M1x4_MAINLOOP \n\t"
-
- "M1x4_TAIL: \n\t"
- "andi t0, %[BK], 3 \n\t"
- "blez t0, M1x4_SAVERESULT \n\t"
-
- "M1x4_TAILLOOP: \n\t"
- "flw ft0, (%[PA]) \n\t"
- "addi %[PA], %[PA], 1*4 \n\t"
- "vle.v v8, (%[PB]) \n\t"
- "addi %[PB], %[PB], 4*4 \n\t"
- "vfmv.v.f v0, ft0 \n\t"
- "vfmacc.vv v16, v8, v0 \n\t"
-
- "addi t0, t0, -1 \n\t"
- "bgtz t0, M1x4_TAILLOOP \n\t"
-
- "M1x4_SAVERESULT: \n\t"
- //merge v16-v19
- "vfadd.vv v16, v16, v17 \n\t"
- "vfadd.vv v18, v18, v19 \n\t"
- "vfadd.vv v16, v16, v18 \n\t"
-
- "vfmv.v.f v8, %[ALPHA] \n\t"
- "vfmul.vv v16, v8, v16 \n\t"
- "vse.v v16, (%[TMP_C]) \n\t"
- "M1x4_END: \n\t"
- :[TMP_C]"+r"(tmpc),
- [PA]"+r"(ptrba), [PB]"+r"(ptrbb)
- :[ALPHA]"f"(alpha), [BK]"r"(bk)
- :"cc", "t0", "t3","t1","t2",
- "ft0", "ft11",
- "v0", "v1", "v2", "v3","v4",
- "v8", "v9", "v10", "v11",
- "v16", "v17","v18", "v19"
- );
-
- C0[0] += tmp[0];
- C1[0] += tmp[1];
- C2[0] += tmp[2];
- C3[0] += tmp[3];
-
- /* no need to advance the C pointers here:
- C0 += 1;
- C1 += 1;
- C2 += 1;
- C3 += 1;
- */
- }
-
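- // Advance packed B by 4*bk floats and C by four columns.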
- k = bk<<2;
- bb = bb+k;
- i = ldc<<2;
- C = C+i;
- }
-
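- // Two-column remainder: 16 rows at a time in vector code, smaller row
- // remainders in scalar code.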
- if(bn&2){
- C0 = C;
- C1 = C0+ldc;
-
- ptrba = ba;
- for(i=0; i<bm/16; i+=1){
- ptrbb = bb;
- asm volatile(
- "vsetvli zero, zero, e32,m1 \n\t"
- "fmv.w.x ft11, zero \n\t"
- "mv t0, %[BK] \n\t"
-
- "vfmv.v.f v16, ft11 \n\t"
- "vfmv.v.f v17, ft11 \n\t"
- "vfmv.v.f v18, ft11 \n\t"
- "vfmv.v.f v19, ft11 \n\t"
-
- "vfmv.v.f v20, ft11 \n\t"
- "vfmv.v.f v21, ft11 \n\t"
- "vfmv.v.f v22, ft11 \n\t"
- "vfmv.v.f v23, ft11 \n\t"
-
- //unroll the k loop by 8
- "srli t0, %[BK], 3 \n\t"
- "blez t0, M16x2_TAIL \n\t"
-
- //pipeline prologue: I, M2, M1, M2 cover the first 4 k iterations
- KERNEL16x2_I
- KERNEL16x2_M2
- KERNEL16x2_M1
- KERNEL16x2_M2
- "addi t0, t0, -1 \n\t"
- "blez t0, M16x2_MAINLOOP_TAIL \n\t"
- ".align 4 \n\t"
- "M16x2_MAINLOOP: \n\t"
- KERNEL16x2_M1
- KERNEL16x2_M2
- KERNEL16x2_M1
- KERNEL16x2_M2
- KERNEL16x2_M1
- KERNEL16x2_M2
- KERNEL16x2_M1
- KERNEL16x2_M2
- "addi t0, t0, -1 \n\t"
- "bgtz t0, M16x2_MAINLOOP \n\t"
-
- "M16x2_MAINLOOP_TAIL: \n\t"
- KERNEL16x2_M1
- KERNEL16x2_M2
- KERNEL16x2_M1
- KERNEL16x2_E
-
- //tail
- "M16x2_TAIL: \n\t"
- "andi t0, %[BK], 7 \n\t"
- "blez t0, M16x2_SAVERESULT \n\t"
-
- "addi t4, %[PA], 4*4 \n\t"
- "addi t5, %[PA], 8*4 \n\t"
- "addi t6, %[PA], 12*4 \n\t"
- "addi t1, %[PB], 1*4 \n\t"
-
- ".align 4 \n\t"
- "M16x2_TAILLOOP: \n\t"
- "flw ft0, (%[PB]) \n\t"
- "addi %[PB], %[PB], 2*4 \n\t"
- "vle.v v0, (%[PA]) \n\t"
- "add %[PA], %[PA], 16*4 \n\t"
- "vle.v v1, (t4) \n\t"
- "addi t4, t4, 16*4 \n\t"
-
- "vfmv.v.f v8, ft0 \n\t"
- "flw ft1, (t1) \n\t"
- "addi t1, t1, 2*4 \n\t"
- "vle.v v2, (t5) \n\t"
- "addi t5, t5, 16*4 \n\t"
- "vle.v v3, (t6) \n\t"
- "addi t6, t6, 16*4 \n\t"
-
- "vfmv.v.f v9, ft1 \n\t"
- "vfmacc.vv v16, v8, v0 \n\t"
- "vfmacc.vv v17, v8, v1 \n\t"
- "vfmacc.vv v18, v8, v2 \n\t"
- "vfmacc.vv v19, v8, v3 \n\t"
-
- "vfmacc.vv v20, v9, v0 \n\t"
- "vfmacc.vv v21, v9, v1 \n\t"
- "vfmacc.vv v22, v9, v2 \n\t"
- "vfmacc.vv v23, v9, v3 \n\t"
-
- "addi t0, t0, -1 \n\t"
- "bgtz t0, M16x2_TAILLOOP \n\t"
-
- //Save result
- //load C
- "M16x2_SAVERESULT: \n\t"
- //use v8 to store alpha
- "vfmv.v.f v8, %[ALPHA] \n\t"
- "vle.v v0, (%[C0]) \n\t"
- "addi t4, %[C0], 4*4 \n\t"
- "vle.v v1, (%[C1]) \n\t"
- "addi t5, %[C1], 4*4 \n\t"
-
- //multiply the accumulators by alpha and add into C
- "vfmacc.vv v0, v8, v16 \n\t"
- "vle.v v4, (t4) \n\t"
- "vfmacc.vv v1, v8, v20 \n\t"
- "vle.v v5, (t5) \n\t"
-
- "vfmacc.vv v4, v8, v17 \n\t"
- "vse.v v0, (%[C0]) \n\t"
- "add %[C0], %[C0], 8*4 \n\t"
- "vfmacc.vv v5, v8, v21 \n\t"
- "vse.v v1, (%[C1]) \n\t"
- "add %[C1], %[C1], 8*4 \n\t"
-
- "vle.v v0, (%[C0]) \n\t"
- "vse.v v4, (t4) \n\t"
- "add t4, t4, 8*4 \n\t"
-
- "vle.v v1, (%[C1]) \n\t"
- "vse.v v5, (t5) \n\t"
- "add t5, t5, 8*4 \n\t"
-
- "vfmacc.vv v0, v8, v18 \n\t"
- "vle.v v4, (t4) \n\t"
- "vfmacc.vv v1, v8, v22 \n\t"
- "vle.v v5, (t5) \n\t"
-
- "vfmacc.vv v4, v8, v19 \n\t"
- "vse.v v0, (%[C0]) \n\t"
- "add %[C0], %[C0], 8*4 \n\t"
-
- "vfmacc.vv v5, v8, v23 \n\t"
- "vse.v v1, (%[C1]) \n\t"
- "add %[C1], %[C1], 8*4 \n\t"
-
- "vse.v v4, (t4) \n\t"
- "vse.v v5, (t5) \n\t"
- "M16x2_END: \n\t"
-
- :[C0]"+r"(C0),[C1]"+r"(C1),
- [PA]"+r"(ptrba), [PB]"+r"(ptrbb)
- :[ALPHA]"f"(alpha), [BK]"r"(bk)
- :"cc", "t0", "t4","t5","t6","t3","t1","t2",
- "ft11", "ft0", "ft1", "ft2","ft3","ft4", "ft5", "ft6","ft7",
- "v0", "v1", "v2", "v3","v4", "v5", "v6", "v7",
- "v8", "v9", "v10", "v11","v12", "v13", "v14", "v15",
- "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
-
- }
- if(bm&8){
- ptrbb = bb;
- res0 = 0;
- res1 = 0;
- res2 = 0;
- res3 = 0;
- res4 = 0;
- res5 = 0;
- res6 = 0;
- res7 = 0;
- res8 = 0;
- res9 = 0;
- res10 = 0;
- res11 = 0;
- res12 = 0;
- res13 = 0;
- res14 = 0;
- res15 = 0;
-
- for(k=0; k<bk; k+=1){
- loadb0 = ptrbb[0];
- loadb1 = ptrbb[1];
-
- load0 = ptrba[0];
- load1 = ptrba[1];
- load2 = ptrba[2];
- load3 = ptrba[3];
- load4 = ptrba[4];
- load5 = ptrba[5];
- load6 = ptrba[6];
- load7 = ptrba[7];
-
- res0 = res0 + load0 * loadb0;
- res1 = res1 + load1 * loadb0;
- res2 = res2 + load2 * loadb0;
- res3 = res3 + load3 * loadb0;
-
- res4 = res4 + load4 * loadb0;
- res5 = res5 + load5 * loadb0;
- res6 = res6 + load6 * loadb0;
- res7 = res7 + load7 * loadb0;
-
- res8 = res8 + load0 * loadb1;
- res9 = res9 + load1 * loadb1;
- res10 = res10 + load2 * loadb1;
- res11 = res11 + load3 * loadb1;
-
- res12 = res12 + load4 * loadb1;
- res13 = res13 + load5 * loadb1;
- res14 = res14 + load6 * loadb1;
- res15 = res15 + load7 * loadb1;
-
- ptrba += 8;
- ptrbb += 2;
- }
-
- res0 = res0 * alpha;
- res1 = res1 * alpha;
- res2 = res2 * alpha;
- res3 = res3 * alpha;
- res4 = res4 * alpha;
- res5 = res5 * alpha;
- res6 = res6 * alpha;
- res7 = res7 * alpha;
-
- res8 = res8 * alpha;
- res9 = res9 * alpha;
- res10 = res10 * alpha;
- res11 = res11 * alpha;
- res12 = res12 * alpha;
- res13 = res13 * alpha;
- res14 = res14 * alpha;
- res15 = res15 * alpha;
-
- C0[0] += res0;
- C0[1] += res1;
- C0[2] += res2;
- C0[3] += res3;
- C0[4] += res4;
- C0[5] += res5;
- C0[6] += res6;
- C0[7] += res7;
-
- C1[0] += res8;
- C1[1] += res9;
- C1[2] += res10;
- C1[3] += res11;
- C1[4] += res12;
- C1[5] += res13;
- C1[6] += res14;
- C1[7] += res15;
-
- C0 += 8;
- C1 += 8;
- }
- if(bm&4){
- ptrbb = bb;
- res0 = 0;
- res1 = 0;
- res2 = 0;
- res3 = 0;
-
- res8 = 0;
- res9 = 0;
- res10 = 0;
- res11 = 0;
-
- for(k=0; k<bk; k+=1){
- loadb0 = ptrbb[0];
- loadb1 = ptrbb[1];
-
- load0 = ptrba[0];
- load1 = ptrba[1];
- load2 = ptrba[2];
- load3 = ptrba[3];
-
- res0 = res0 + load0 * loadb0;
- res1 = res1 + load1 * loadb0;
- res2 = res2 + load2 * loadb0;
- res3 = res3 + load3 * loadb0;
-
- res8 = res8 + load0 * loadb1;
- res9 = res9 + load1 * loadb1;
- res10 = res10 + load2 * loadb1;
- res11 = res11 + load3 * loadb1;
-
- ptrba += 4;
- ptrbb += 2;
- }
-
- res0 = res0 * alpha;
- res1 = res1 * alpha;
- res2 = res2 * alpha;
- res3 = res3 * alpha;
-
- res8 = res8 * alpha;
- res9 = res9 * alpha;
- res10 = res10 * alpha;
- res11 = res11 * alpha;
-
- C0[0] += res0;
- C0[1] += res1;
- C0[2] += res2;
- C0[3] += res3;
-
- C1[0] += res8;
- C1[1] += res9;
- C1[2] += res10;
- C1[3] += res11;
-
- C0 += 4;
- C1 += 4;
- }
- if(bm&2){
- ptrbb = bb;
- res0 = 0;
- res1 = 0;
-
- res8 = 0;
- res9 = 0;
-
- for(k=0; k<bk; k+=1){
- loadb0 = ptrbb[0];
- loadb1 = ptrbb[1];
-
- load0 = ptrba[0];
- load1 = ptrba[1];
-
- res0 = res0 + load0 * loadb0;
- res1 = res1 + load1 * loadb0;
-
- res8 = res8 + load0 * loadb1;
- res9 = res9 + load1 * loadb1;
-
- ptrba += 2;
- ptrbb += 2;
- }
-
- res0 = res0 * alpha;
- res1 = res1 * alpha;
-
- res8 = res8 * alpha;
- res9 = res9 * alpha;
-
- C0[0] += res0;
- C0[1] += res1;
-
- C1[0] += res8;
- C1[1] += res9;
-
- C0 += 2;
- C1 += 2;
- }
- if(bm&1){
- ptrbb = bb;
- res0 = 0;
- res8 = 0;
- for(k=0; k<bk; k+=1){
- loadb0 = ptrbb[0];
- loadb1 = ptrbb[1];
- load0 = ptrba[0];
-
- res0 = res0 + load0 * loadb0;
- res8 = res8 + load0 * loadb1;
- ptrba += 1;
- ptrbb += 2;
- }
-
- res0 = res0 * alpha;
- res8 = res8 * alpha;
-
- C0[0] += res0;
- C1[0] += res8;
-
- C0 += 1;
- C1 += 1;
- }
- k = bk<<1;
- bb = bb+k;
- i = ldc<<1;
- C = C+i;
- }
-
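- // Final single column: scalar code for all row remainders.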
- if (bn&1){
- C0 = C;
- ptrba = ba;
- for(i=0; i<bm/16; i+=1){
- ptrbb = bb;
- res0 = 0;
- res1 = 0;
- res2 = 0;
- res3 = 0;
- res4 = 0;
- res5 = 0;
- res6 = 0;
- res7 = 0;
-
- res8 = 0;
- res9 = 0;
- res10 = 0;
- res11 = 0;
- res12 = 0;
- res13 = 0;
- res14 = 0;
- res15 = 0;
-
- for(k=0; k<bk; k+=1){
- loadb0 = ptrbb[0];
- res0 = res0 + ptrba[0] * loadb0;
- res1 = res1 + ptrba[1] * loadb0;
- res2 = res2 + ptrba[2] * loadb0;
- res3 = res3 + ptrba[3] * loadb0;
-
- res4 = res4 + ptrba[4] * loadb0;
- res5 = res5 + ptrba[5] * loadb0;
- res6 = res6 + ptrba[6] * loadb0;
- res7 = res7 + ptrba[7] * loadb0;
-
- res8 = res8 + ptrba[8] * loadb0;
- res9 = res9 + ptrba[9] * loadb0;
- res10 = res10 + ptrba[10] * loadb0;
- res11 = res11 + ptrba[11] * loadb0;
-
- res12 = res12 + ptrba[12] * loadb0;
- res13 = res13 + ptrba[13] * loadb0;
- res14 = res14 + ptrba[14] * loadb0;
- res15 = res15 + ptrba[15] * loadb0;
-
- ptrba += 16;
- ptrbb += 1;
- }
- res0 = res0 * alpha;
- res1 = res1 * alpha;
- res2 = res2 * alpha;
- res3 = res3 * alpha;
- res4 = res4 * alpha;
- res5 = res5 * alpha;
- res6 = res6 * alpha;
- res7 = res7 * alpha;
-
- res8 = res8 * alpha;
- res9 = res9 * alpha;
- res10 = res10 * alpha;
- res11 = res11 * alpha;
- res12 = res12 * alpha;
- res13 = res13 * alpha;
- res14 = res14 * alpha;
- res15 = res15 * alpha;
-
- C0[0] += res0;
- C0[1] += res1;
- C0[2] += res2;
- C0[3] += res3;
- C0[4] += res4;
- C0[5] += res5;
- C0[6] += res6;
- C0[7] += res7;
-
- C0[8] += res8;
- C0[9] += res9;
- C0[10] += res10;
- C0[11] += res11;
- C0[12] += res12;
- C0[13] += res13;
- C0[14] += res14;
- C0[15] += res15;
-
- C0 += 16;
-
- }
-
- if(bm&8){
- ptrbb = bb;
- res0 = 0;
- res1 = 0;
- res2 = 0;
- res3 = 0;
- res4 = 0;
- res5 = 0;
- res6 = 0;
- res7 = 0;
-
- for(k=0; k<bk; k+=1){
- loadb0 = ptrbb[0];
- res0 = res0 + ptrba[0] * loadb0;
- res1 = res1 + ptrba[1] * loadb0;
- res2 = res2 + ptrba[2] * loadb0;
- res3 = res3 + ptrba[3] * loadb0;
-
- res4 = res4 + ptrba[4] * loadb0;
- res5 = res5 + ptrba[5] * loadb0;
- res6 = res6 + ptrba[6] * loadb0;
- res7 = res7 + ptrba[7] * loadb0;
-
- ptrba += 8;
- ptrbb += 1;
- }
- res0 = res0 * alpha;
- res1 = res1 * alpha;
- res2 = res2 * alpha;
- res3 = res3 * alpha;
- res4 = res4 * alpha;
- res5 = res5 * alpha;
- res6 = res6 * alpha;
- res7 = res7 * alpha;
-
- C0[0] += res0;
- C0[1] += res1;
- C0[2] += res2;
- C0[3] += res3;
- C0[4] += res4;
- C0[5] += res5;
- C0[6] += res6;
- C0[7] += res7;
-
- C0 += 8;
- }
- if(bm&4){
- ptrbb = bb;
- res0 = 0;
- res1 = 0;
- res2 = 0;
- res3 = 0;
- for(k=0; k<bk; k+=1){
- loadb0 = ptrbb[0];
- res0 = res0 + ptrba[0] * loadb0;
- res1 = res1 + ptrba[1] * loadb0;
- res2 = res2 + ptrba[2] * loadb0;
- res3 = res3 + ptrba[3] * loadb0;
-
- ptrba += 4;
- ptrbb += 1;
- }
- res0 = res0 * alpha;
- res1 = res1 * alpha;
- res2 = res2 * alpha;
- res3 = res3 * alpha;
-
- C0[0] += res0;
- C0[1] += res1;
- C0[2] += res2;
- C0[3] += res3;
-
- C0 += 4;
- }
- if(bm&2){
- ptrbb = bb;
- res0 = 0;
- res1 = 0;
- for(k=0; k<bk; k+=1){
- loadb0 = ptrbb[0];
- res0 = res0 + ptrba[0] * loadb0;
- res1 = res1 + ptrba[1] * loadb0;
-
- ptrba += 2;
- ptrbb += 1;
- }
- res0 = res0 * alpha;
- res1 = res1 * alpha;
-
- C0[0] += res0;
- C0[1] += res1;
-
- C0 += 2;
- }
- if(bm&1){
- ptrbb = bb;
- res0 = 0;
- for(k=0; k<bk; k+=1){
- loadb0 = ptrbb[0];
- res0 = res0 + ptrba[0] * loadb0;
- ptrba += 1;
- ptrbb += 1;
- }
- res0 = res0 * alpha;
- C0[0] += res0;
- C0 += 1;
- }
-
- k = bk;
- bb = bb+k;
- C = C+ldc;
- }
- return 0;
- }