/**
 * \file src/gopt/test/inference.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */
#include "megbrain/opr/dnn/local.h"
#include "megbrain/test/helper.h"

#include "megbrain/gopt/basic_arith.h"
#include "megbrain/gopt/gtrans.h"
#include "megbrain/gopt/inference.h"
#include "megbrain/opr/basic_arith_wrapper.h"
#include "megbrain/opr/blas.h"
#include "megbrain/opr/dnn/batch_norm.h"
#include "megbrain/opr/dnn/convolution.h"
#include "megbrain/opr/dnn/pooling.h"
#include "megbrain/opr/imgproc.h"
#include "megbrain/opr/io.h"
#include "megbrain/opr/nn_int.h"
#include "megbrain/opr/tensor_gen.h"
#include "megbrain/opr/tensor_manip.h"
#include "megbrain/opr/utility.h"

#include "./helper.h"
#include "megbrain/comp_node_env.h"
#include "megdnn/tensor_format.h"

#include <random>
#include <vector>

#if MGB_CUDA
#include <cudnn.h>
#endif
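// Helpers local to this test file: two find_opr() overloads that locate an
// operator of a given type (optionally with a given name) on the dependency
// path of an endpoint var, find_opr_num() that counts such operators, and a
// scope guard that switches MegDNN to its naive handle.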
using namespace mgb;

namespace {

//! find the first operator of a specific type; raise an exception if not found
template <typename T>
T& find_opr(SymbolVar endpoint) {
    T* found = nullptr;
    auto cb = [&found](cg::OperatorNodeBase* opr) {
        if (!found && opr->same_type<T>()) {
            found = &opr->cast_final_safe<T>();
        }
    };
    cg::DepOprIter{cb}.add(endpoint.node()->owner_opr());
    mgb_assert(found, "not found opr from %s", endpoint.node()->name().c_str());
    return *found;
}
template <typename T>
T& find_opr(SymbolVar endpoint, const std::string& node_name) {
    T* found = nullptr;
    auto cb = [&found, &node_name](cg::OperatorNodeBase* opr) {
        if (!found && opr->same_type<T>() && opr->name() == node_name) {
            found = &opr->cast_final_safe<T>();
        }
    };
    cg::DepOprIter{cb}.add(endpoint.node()->owner_opr());
    mgb_assert(
            found, "not found opr %s from %s", node_name.c_str(),
            endpoint.node()->name().c_str());
    return *found;
}
template <typename T>
size_t find_opr_num(SymbolVar endpoint) {
    size_t opr_num = 0;
    auto cb = [&opr_num](cg::OperatorNodeBase* opr) {
        if (opr->same_type<T>()) {
            opr_num++;
        }
    };
    cg::DepOprIter{cb}.add(endpoint.node()->owner_opr());
    return opr_num;
}
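// A minimal usage sketch of the helpers above, taken verbatim from the dtype
// tests further down in this file (y_opt stands for any optimized endpoint):
//     ASSERT_EQ(
//             find_opr<opr::ConvBias>(y_opt).param().compute_mode,
//             opr::ConvBias::Param::ConvBias::ComputeMode::FLOAT32);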
class NaiveMegDNNHandleScope {
    int m_orig_level;

public:
    NaiveMegDNNHandleScope()
            : m_orig_level{MegDNNHandle::exchange_default_dbg_level(2)} {
        CompNode::finalize();
    }
    ~NaiveMegDNNHandleScope() {
        auto set = MegDNNHandle::exchange_default_dbg_level(m_orig_level);
        mgb_assert(set == 2);
        CompNode::finalize();
    }
};
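// The guard above raises the default MegDNN debug level to 2, which forces
// the naive reference kernels (hence the class name), and finalizes comp
// nodes so cached handles are rebuilt under the new level; the destructor
// restores the original level the same way.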
#if MGB_CUDA
//! this function is only used in TestGoptInference.EnableCHWN4...
void warp_perspective_mat_gen(HostTensorND& mat, size_t N, size_t INP_H, size_t INP_W) {
    static std::mt19937 rng(next_rand_seed());
    auto rand_real = [&](double lo, double hi) {
        return rng() / (std::mt19937::max() + 1.0) * (hi - lo) + lo;
    };
    auto rand_real2 = [&](double range) { return rand_real(-range, range); };
    auto ptr = mat.ptr<float>();
    for (size_t i = 0; i < N; ++i) {
        auto rot = rand_real(0, M_PI * 2), scale = rand_real(0.8, 1.2),
             sheer = rand_real(0.9, 1.1), dy = rand_real2(INP_H * 0.5),
             dx = rand_real2(INP_W * 0.5), ky = rand_real2(0.1 / INP_H),
             kx = rand_real2(0.1 / INP_W), kb = rand_real2(0.1) + 1;
        ptr[0] = ptr[4] = cos(rot) * scale;
        ptr[1] = -(ptr[3] = sin(rot) * scale);
        ptr[3] *= sheer;
        ptr[4] *= sheer;
        ptr[2] = dx;
        ptr[5] = dy;
        ptr[6] = kx;
        ptr[7] = ky;
        ptr[8] = kb;
        ptr += 9;
    }
    mgb_assert(ptr == mat.ptr<float>() + mat.shape().total_nr_elems());
}
#endif

}  // namespace
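// The tests below share one pattern: build a graph with graph_opt_level = 0
// so nothing is optimized implicitly, apply a gopt pass (or
// gopt::optimize_for_inference) explicitly, then compile both the original
// and the optimized endpoints and compare outputs. The pass-application
// idiom used throughout:
//
//     SymbolVar y_opt;
//     unpack_vector(
//             gopt::GraphOptimizer{}
//                     .add_pass<gopt::ParamFusePass>()
//                     .apply({{y}})
//                     .endpoint_vars(),
//             y_opt);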
TEST(TestGoptInference, ParamFuseConstEndPoint) {
    constexpr size_t SIZE = 23;
    HostTensorGenerator<> gen;
    auto host_x = gen({SIZE}), host_y = gen({1}), host_p = gen({1});
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto x = opr::SharedDeviceTensor::make(*graph, *host_x),
         y = opr::SharedDeviceTensor::make(*graph, *host_y),
         p = opr::Host2DeviceCopy::make(*graph, host_p), q = p + x, a = y + 3,
         z0 = a + q, z1 = a + 4;
    HostTensorND host_z0, host_z1;
    SymbolVar z0_1, z1_1;
    unpack_vector(
            gopt::GraphOptimizer{}
                    .add_pass<gopt::ParamFusePass>()
                    .apply({{z1, z0}})
                    .endpoint_vars(),
            z1_1, z0_1);
    auto func = graph->compile(
            {make_callback_copy(z0_1, host_z0), make_callback_copy(z1_1, host_z1)});
    func->to_json()->writeto_fpath(
            output_file("TestGoptInference.ParamFuseEndPoint.json"));
    func->execute();
    int nr_opr = 0;
    func->iter_opr_seq([&](cg::OperatorNodeBase*) {
        ++nr_opr;
        return true;
    });
    ASSERT_EQ(8, nr_opr);
    auto px = host_x->ptr<float>(), pz0 = host_z0.ptr<float>();
    auto yv = host_y->ptr<float>()[0], pv = host_p->ptr<float>()[0],
         pz1 = host_z1.ptr<float>()[0];
    for (size_t i = 0; i < SIZE; ++i) {
        MGB_ASSERT_FLOAT_EQ(px[i] + yv + 3 + pv, pz0[i]);
    }
    MGB_ASSERT_FLOAT_EQ(yv + 7, pz1);
}
TEST(TestGoptInference, ParamFuse) {
    constexpr size_t SIZE = 23;
    HostTensorGenerator<> gen;
    auto host_x = gen({SIZE}), host_y = gen({1}), host_p = gen({1});
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto x = opr::SharedDeviceTensor::make(*graph, *host_x),
         y = opr::SharedDeviceTensor::make(*graph, *host_y),
         p = opr::Host2DeviceCopy::make(*graph, host_p),
         z = x + y,      // endpoint
         q = x * y + p;  // middle point
    SymbolVar z1, q1;
    unpack_vector(
            gopt::GraphOptimizer{}
                    .add_pass<gopt::ParamFusePass>()
                    .apply({{z, q}})
                    .endpoint_vars(),
            z1, q1);
    ASSERT_TRUE(z1.node()->owner_opr()->same_type<opr::SharedDeviceTensor>());
    ASSERT_NE(q1.node()->owner_opr(), q.node()->owner_opr());
    ASSERT_EQ(
            q1.node()->owner_opr()->dyn_typeinfo(),
            q.node()->owner_opr()->dyn_typeinfo());
    HostTensorND host_z, host_q;
    auto func = graph->compile(
            {make_callback_copy(z1, host_z), make_callback_copy(q1, host_q)});
    func->execute();
    int nr_opr = 0;
    func->iter_opr_seq([&](cg::OperatorNodeBase*) {
        ++nr_opr;
        return true;
    });
    ASSERT_EQ(6, nr_opr);
    auto px = host_x->ptr<float>(), pz = host_z.ptr<float>(), pq = host_q.ptr<float>();
    auto yv = host_y->ptr<float>()[0], pv = host_p->ptr<float>()[0];
    for (size_t i = 0; i < SIZE; ++i) {
        MGB_ASSERT_FLOAT_EQ(px[i] + yv, pz[i]);
        MGB_ASSERT_FLOAT_EQ(px[i] * yv + pv, pq[i]);
    }
}
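// ParamFusePass evaluates subgraphs whose inputs are all parameters
// (SharedDeviceTensor) at optimization time and replaces them with a single
// precomputed SharedDeviceTensor: above, z = x + y is entirely constant and
// collapses to one tensor, while in q = x * y + p only the x * y part can be
// folded because p arrives via Host2DeviceCopy at runtime.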
TEST(TestGoptInference, ParamFuseMultiDeviceTensorHolder) {
    constexpr size_t SIZE = 23;
    HostTensorGenerator<> gen;
    auto host_x = gen({SIZE}), host_y = gen({1}), host_p = gen({1});
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto x = opr::SharedDeviceTensor::make(*graph, *host_x),
         y = opr::SharedDeviceTensor::make(*graph, *host_y),
         p = opr::Host2DeviceCopy::make(*graph, host_p),
         z = x + y,      //! endpoint
         q = x * y + p;  //! middle point
    SymbolVar z1, q1;
    unpack_vector(
            gopt::GraphOptimizer{}
                    .add_pass<gopt::ParamMergePass>()
                    .apply({{z}})
                    .endpoint_vars(),
            z1);
    ASSERT_TRUE(z1.node()
                        ->owner_opr()
                        ->input(0)
                        ->owner_opr()
                        ->same_type<opr::MultipleDeviceTensorHolder>());
    unpack_vector(
            gopt::GraphOptimizer{}
                    .add_pass<gopt::ParamMergePass>()
                    .add_pass<gopt::ParamFusePass>()
                    .apply({{z, q}})
                    .endpoint_vars(),
            z1, q1);
    ASSERT_TRUE(z1.node()->owner_opr()->same_type<opr::SharedDeviceTensor>());
    ASSERT_NE(q1.node()->owner_opr(), q.node()->owner_opr());
    ASSERT_EQ(
            q1.node()->owner_opr()->dyn_typeinfo(),
            q.node()->owner_opr()->dyn_typeinfo());
    HostTensorND host_z, host_q;
    auto func = graph->compile(
            {make_callback_copy(z1, host_z), make_callback_copy(q1, host_q)});
    func->execute();
    int nr_opr = 0;
    func->iter_opr_seq([&](cg::OperatorNodeBase*) {
        ++nr_opr;
        return true;
    });
    ASSERT_EQ(6, nr_opr);
    auto px = host_x->ptr<float>(), pz = host_z.ptr<float>(), pq = host_q.ptr<float>();
    auto yv = host_y->ptr<float>()[0], pv = host_p->ptr<float>()[0];
    for (size_t i = 0; i < SIZE; ++i) {
        MGB_ASSERT_FLOAT_EQ(px[i] + yv, pz[i]);
        MGB_ASSERT_FLOAT_EQ(px[i] * yv + pv, pq[i]);
    }
}
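// ParamMergePass packs the individual SharedDeviceTensor parameters into a
// single MultipleDeviceTensorHolder, as the input(0) assertion after the
// first pass run checks; it composes with ParamFusePass, which afterwards
// still reduces the constant endpoint z to a plain SharedDeviceTensor.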
TEST(TestGoptInference, ParamFuseMultiRead) {
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp)).rename(name);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp)).rename(name);
    };
    auto x = mkvar("x", {23}), p0 = mkcvar("p0", {1}), p1 = mkcvar("p1", {1}),
         z0 = x * (p0 + p1) + x / (p0 + p1);
    SymbolVar z1;
    unpack_vector(
            gopt::GraphOptimizer{}
                    .add_pass<gopt::ParamFusePass>()
                    .apply({{z0}})
                    .endpoint_vars(),
            z1);
    ASSERT_NE(z0.node(), z1.node());
    ASSERT_TRUE(z1.node()
                        ->owner_opr()
                        ->input(0)
                        ->owner_opr()
                        ->input(1)
                        ->owner_opr()
                        ->same_type<opr::SharedDeviceTensor>());
    ASSERT_TRUE(z1.node()
                        ->owner_opr()
                        ->input(1)
                        ->owner_opr()
                        ->input(1)
                        ->owner_opr()
                        ->same_type<opr::SharedDeviceTensor>());
    HostTensorND host_z0, host_z1;
    graph->compile({make_callback_copy(z0, host_z0), make_callback_copy(z1, host_z1)})
            ->execute();
    MGB_ASSERT_TENSOR_EQ(host_z0, host_z1);
}
TEST(TestGoptInference, ParamFuseStaticInfer) {
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp)).rename(name);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp)).rename(name);
    };
    auto a = mkvar("x", {4}),
         b = a.reshape(opr::GetVarShape::make(mkcvar("tshp", {2, 2})));
    SymbolVar b1;
    unpack_vector(
            gopt::GraphOptimizer{}
                    .add_pass<gopt::ParamFusePass>()
                    .apply({{b}})
                    .endpoint_vars(),
            b1);
    ASSERT_EQ(b1, a.reshape({2, 2}));
}
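// Here the folded value is a shape rather than a tensor payload: the
// GetVarShape of a constant parameter is statically inferable, so the
// dynamic reshape reduces to the static a.reshape({2, 2}).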
TEST(TestGoptInference, ParamRedistributeConvMul) {
    constexpr size_t N = 4, IC = 3, IH = 5, IW = 4, OC = 4, KH = 3, KW = 2;
    HostTensorGenerator<> gen;
    auto host_x = gen({N, IC, IH, IW}), host_k = gen({IC}),
         host_w = gen({OC, IC, KH, KW});
    auto graph = ComputingGraph::make();
    auto x = opr::Host2DeviceCopy::make(*graph, host_x),
         k = opr::Dimshuffle::make(
                 opr::SharedDeviceTensor::make(*graph, *host_k), {-1, 0, -1, -1}),
         w = opr::SharedDeviceTensor::make(*graph, *host_w),
         y0 = opr::Convolution::make(x * k, w);
    SymbolVar y1;
    unpack_vector(
            gopt::GraphOptimizer{}
                    .add_pass<gopt::ParamRedistributePass>()
                    .apply({{y0}})
                    .endpoint_vars(),
            y1);
    ASSERT_NE(y0.node(), y1.node());
    HostTensorND host_y0, host_y1;
    auto func = graph->compile(
            {make_callback_copy(y0, host_y0), make_callback_copy(y1, host_y1)});
    func->execute();
    MGB_ASSERT_TENSOR_EQ(host_y0, host_y1);
}
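// ParamRedistributePass moves constant factors across linear operators:
// multiplying the conv input by the per-channel constant k is equivalent to
// scaling the matching kernel channels of w, so the elementwise multiply
// disappears from the runtime graph; later tests combine this pass with
// ParamFusePass to fold the redistributed constants.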
TEST(TestGoptInference, ParamRedistributeConvMulUniqReader) {
    constexpr size_t N = 4, C = 3, IH = 5, IW = 4, KH = 1, KW = 1;
    HostTensorGenerator<> gen;
    auto host_x = gen({N, C, IH, IW}), host_k = gen({C}), host_w = gen({C, C, KH, KW});
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto x = opr::Host2DeviceCopy::make(*graph, host_x),
         k = opr::Dimshuffle::make(
                 opr::SharedDeviceTensor::make(*graph, *host_k) + 2, {-1, 0, -1, -1}),
         w = opr::SharedDeviceTensor::make(*graph, *host_w),
         // y0 should be replaced
         y0 = opr::powf(opr::Convolution::make(x * k, w).rename("y0") + 2, 2),
         y0k = (y0 * k).rename("y0k"),
         // y0k is accessed twice, so it should not be replaced
         y1 = opr::Convolution::make(y0k, w).rename("y1"), z0 = y1 / y0k;
    SymbolVar z1;
    unpack_vector(
            gopt::GraphOptimizer{}
                    .add_pass<gopt::ParamRedistributePass>()
                    .apply({{z0}})
                    .endpoint_vars(),
            z1);
    ASSERT_NE(z0.node(), z1.node());
    auto y1_repl = z1.node()->owner_opr()->input(0)->owner_opr();
    ASSERT_TRUE(y1_repl->same_type<opr::Convolution>());
    ASSERT_EQ(y1_repl->input(0), z1.node()->owner_opr()->input(1));
    HostTensorND host_z0, host_z1;
    auto func = graph->compile(
            {make_callback_copy(z0, host_z0), make_callback_copy(z1, host_z1)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_z0, host_z1, 5e-5);
}
TEST(TestGoptInference, ParamRedistributeMulConvMul) {
    constexpr size_t N = 4, IC = 3, IH = 5, IW = 4, OC = 4, KH = 3, KW = 2;
    HostTensorGenerator<> gen;
    auto host_x = gen({N, IC, IH, IW}), host_k1 = gen({IC}),
         host_k2 = gen({1, OC, 1, 1}), host_w = gen({OC, IC, KH, KW});
    auto graph = ComputingGraph::make();
    auto x = opr::Host2DeviceCopy::make(*graph, host_x),
         k1 = opr::Dimshuffle::make(
                 opr::SharedDeviceTensor::make(*graph, *host_k1), {-1, 0, -1, -1}),
         k2 = opr::SharedDeviceTensor::make(*graph, *host_k2),
         w = opr::SharedDeviceTensor::make(*graph, *host_w),
         y0 = opr::Convolution::make(x * k1, w) * k2;
    SymbolVar y1;
    unpack_vector(
            gopt::GraphOptimizer{}
                    .add_pass<gopt::ParamRedistributePass>()
                    .add_pass<gopt::ParamFusePass>()
                    .apply({{y0}})
                    .endpoint_vars(),
            y1);
    auto y1opr = y1.node()->owner_opr();
    ASSERT_TRUE(y1opr->same_type<opr::Convolution>());
    ASSERT_EQ(y1opr->input(0), x.node());
    HostTensorND host_y0, host_y1;
    auto func = graph->compile(
            {make_callback_copy(y0, host_y0), make_callback_copy(y1, host_y1)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y0, host_y1, 5e-6);
}
TEST(TestGoptInference, ParamRedistributeConvAdd) {
    constexpr size_t N = 4, IC = 3, IH = 5, IW = 4, OC = 4, KH = 3, KW = 2;
    HostTensorGenerator<> gen;
    auto host_x = gen({N, IC, IH, IW}), host_b = gen({IC}),
         host_w = gen({OC, IC, KH, KW});
    auto graph = ComputingGraph::make();
    auto x = opr::Host2DeviceCopy::make(*graph, host_x),
         b = opr::Dimshuffle::make(
                 opr::SharedDeviceTensor::make(*graph, *host_b), {-1, 0, -1, -1}),
         w = opr::SharedDeviceTensor::make(*graph, *host_w),
         y0 = opr::Convolution::make(x + b, w);
    SymbolVar y1;
    unpack_vector(
            gopt::GraphOptimizer{}
                    .add_pass<gopt::ParamRedistributePass>()
                    .add_pass<gopt::ParamFusePass>()
                    .apply({{y0}})
                    .endpoint_vars(),
            y1);
    ASSERT_NE(y0.node(), y1.node());
    HostTensorND host_y0, host_y1;
    auto func = graph->compile(
            {make_callback_copy(y0, host_y0), make_callback_copy(y1, host_y1)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y0, host_y1, 1e-5);
}
TEST(TestGoptInference, ParamRedistributeDistThenReasso) {
    constexpr size_t N = 4, IC0 = 3, IC1 = 6, IH = 5, IW = 4, OC = 4, KH = 3, KW = 2;
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp)).rename(name);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp)).rename(name);
    };
    auto x0 = mkvar("x0", {N, IC0, IH, IW}), x1 = mkvar("x1", {N, IC1, IH, IW}),
         k0 = opr::Dimshuffle::make(mkcvar("x1_", {IC0}), {-1, 0, -1, -1}).rename("x1"),
         w0 = mkcvar("w0", {OC, IC0, KH, KW}), k1 = mkcvar("k1", {1, IC1, 1, 1}),
         w1 = mkcvar("w1", {OC, IC1, KH, KW}), b0 = mkvar("b0", {1, OC, 1, 1}),
         b1 = mkcvar("b1", {1}), k2 = mkcvar("k2", {1}),
         y0 = (opr::Convolution::make(x0 * k0, w0) +
               opr::Convolution::make(x1 + k1, w1) + b0 + b1) *
              k2;
    SymbolVar y1;
    unpack_vector(
            gopt::GraphOptimizer{}
                    .add_pass<gopt::ParamRedistributePass>()
                    .add_pass<gopt::ReorderArithChainPass>(
                            gopt::ConstVarType::IMMUTABLE_AND_PARAM)
                    .add_pass<gopt::ParamFusePass>()
                    .apply({{y0}})
                    .endpoint_vars(),
            y1);
    ASSERT_NE(y0.node(), y1.node());
    HostTensorND host_y0, host_y1;
    auto func = graph->compile(
            {make_callback_copy(y0, host_y0), make_callback_copy(y1, host_y1)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y0, host_y1, 1e-5);
    auto chain = gopt::extract_opr_leaves(y1.node(), [](cg::OperatorNodeBase* opr) {
        return gopt::as_elem_opr(opr, opr::Elemwise::Mode::ADD);
    });
    size_t nr_conv = 0;
    for (auto i : chain) {
        auto opr = i->owner_opr();
        if (opr->same_type<opr::Convolution>()) {
            ++nr_conv;
            ASSERT_TRUE(opr->input(0)->owner_opr()->same_type<opr::Host2DeviceCopy>());
            ASSERT_TRUE(
                    opr->input(1)->owner_opr()->same_type<opr::SharedDeviceTensor>());
        }
    }
    ASSERT_EQ(2u, nr_conv);
    ASSERT_EQ(4u, chain.size());
}
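// ReorderArithChainPass reassociates the ADD chain so that constant terms
// become adjacent and fusable: the final chain has four leaves, and the
// checks above confirm two of them are convolutions, each reading its input
// directly from Host2DeviceCopy and its kernel from a SharedDeviceTensor.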
TEST(TestGoptInference, ParamRedistributeMultiChange) {
    constexpr size_t N = 4, IC = 3, IH = 5, IW = 4, OC = 4, KH = 3, KW = 2;
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp)).rename(name);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp)).rename(name);
    };
    auto x = mkvar("x", {N, IC, IH, IW}), k0 = mkcvar("k0", {1, IC, 1, 1}),
         b0 = mkcvar("b0", {1, IC, 1, 1}), k1 = mkcvar("k1", {1}),
         b1 = mkcvar("b1", {1}), w = mkcvar("w", {OC, IC, KH, KW}),
         y0 = (opr::Convolution::make(x * k0 + b0, w) + b1) * k1;
    SymbolVar y1;
    unpack_vector(
            gopt::GraphOptimizer{}
                    .add_pass<gopt::ParamRedistributePass>()
                    .add_pass<gopt::ParamFusePass>()
                    .apply({{y0}})
                    .endpoint_vars(),
            y1);
    ASSERT_NE(y0.node(), y1.node());
    HostTensorND host_y0, host_y1;
    auto func = graph->compile(
            {make_callback_copy(y0, host_y0), make_callback_copy(y1, host_y1)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y0, host_y1, 1e-5);
    auto y1elem = gopt::as_elem_opr(y1.node(), opr::Elemwise::Mode::ADD);
    ASSERT_TRUE(y1elem);
    auto yconv = y1elem->input(0)->owner_opr();
    if (!yconv->same_type<opr::Convolution>())
        yconv = y1elem->input(1)->owner_opr();
    ASSERT_TRUE(yconv->same_type<opr::Convolution>());
    ASSERT_EQ(x.node(), yconv->input(0));
}
TEST(TestGoptInference, ParamRedistributeMultiReader) {
    constexpr size_t N = 4, IC = 3, IH = 5, IW = 4, OC = 4, KH = 3, KW = 2;
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp)).rename(name);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp)).rename(name);
    };
    auto x = mkvar("x", {N, IC, IH, IW}), k = mkcvar("k", {1, OC, 1, 1}),
         w = mkcvar("w", {OC, IC, KH, KW});
    auto conv = opr::Convolution::make(x, w);
    auto t = conv * k;
    auto y0 = t * 4.2f + t * 2.4f;
    SymbolVar y1;
    unpack_vector(
            gopt::GraphOptimizer{}
                    .add_pass<gopt::ParamRedistributePass>()
                    .add_pass<gopt::ParamFusePass>()
                    .apply({{y0}})
                    .endpoint_vars(),
            y1);
    ASSERT_NE(y0.node(), y1.node());
    HostTensorND host_y0, host_y1;
    auto func = graph->compile(
            {make_callback_copy(y0, host_y0), make_callback_copy(y1, host_y1)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y0, host_y1, 1e-5);
    auto y1elem = gopt::as_elem_opr(y1.node(), opr::Elemwise::Mode::ADD);
    ASSERT_TRUE(y1elem);
    auto ymul0 = gopt::as_elem_opr(y1elem->input(0), opr::Elemwise::Mode::MUL),
         ymul1 = gopt::as_elem_opr(y1elem->input(1), opr::Elemwise::Mode::MUL);
    ASSERT_TRUE(ymul0);
    ASSERT_TRUE(ymul1);
    auto yconv = ymul0->input(0)->owner_opr();
    if (!yconv->same_type<opr::Convolution>()) {
        yconv = ymul0->input(1)->owner_opr();
    }
    ASSERT_TRUE(yconv->same_type<opr::Convolution>());
    if (ymul1->input(0) != yconv->output(0)) {
        ASSERT_EQ(yconv->output(0), ymul1->input(1));
    }
    ASSERT_EQ(x.node(), yconv->input(0));
}
TEST(TestGoptInference, ParamFuseBiasMerge) {
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp)).rename(name);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp)).rename(name);
    };
    auto x = mkvar("x", {6, 3, 8, 8}), w1 = mkcvar("w1", {4, 3, 3, 3}),
         w2 = mkcvar("w2", {4, 3, 3, 3}), b1 = mkcvar("b1", {1, 4, 1, 1}),
         b2 = mkcvar("b2", {1, 4, 1, 1}), y1 = opr::Convolution::make(x, w1) + b1,
         y2 = opr::Convolution::make(x, w2) + b2, y = y1 + y2;
    SymbolVar y_opt;
    unpack_vector(gopt::optimize_for_inference({y}), y_opt);
    HostTensorND host_y, host_y_opt;
    auto func = graph->compile(
            {make_callback_copy(y, host_y), make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(output_file("TestGoptInference.ParamFuseConvMerge.json"));
    auto chain = gopt::extract_opr_leaves(y_opt.node(), [](cg::OperatorNodeBase* opr) {
        return gopt::as_elem_opr(opr, opr::Elemwise::Mode::ADD);
    });
    ASSERT_EQ(3u, chain.size());
}
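// Since both convolutions read the same input x, the two constant bias terms
// can be brought together and fused: the resulting ADD chain has exactly
// three leaves, consistent with two convolution outputs plus one merged bias
// constant.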
TEST(TestGoptInference, Float16IOFloat32Compute) {
    constexpr size_t INP_H = 10, INP_W = 10;
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp)).rename(name);
    };
    graph->options().graph_opt_level = 0;
    auto a = mkvar("a", {1, 4, INP_H, INP_W}), s0 = mkvar("s0", {20, 3, INP_H, INP_W}),
         s1 = mkvar("s1", {4, 3, 1, 1});
    auto b = opr::Convolution::make(s0, s1, {}, {});
    auto y = a + b;
    y = opr::Concat::make({y, -y}, 0);
    y = opr::Reduce::make(y, {}, y.make_scalar(1));
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_f16_io_f32_comp();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    ASSERT_EQ(y_opt.dtype(), dtype::Float32());
    HostTensorND host_y, host_y_opt;
    auto func = graph->compile(
            {make_callback_copy(y, host_y), make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
}
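// enable_f16_io_f32_comp stores tensors and transfers I/O in float16 while
// keeping the arithmetic itself in float32 (see the FLOAT32 compute_mode
// check in the next test); endpoints are converted back to float32, which is
// why y_opt keeps dtype Float32 and matches the reference within ~1e-3.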
TEST(TestGoptInference, Float16IOFloat32ComputeDeConv) {
    constexpr size_t INP_H = 10, INP_W = 10;
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp)).rename(name);
    };
    graph->options().graph_opt_level = 0;
    auto s0 = mkvar("s0", {5, 5, 3, 3}), s1 = mkvar("s1", {1, 5, INP_H, INP_W});
    auto y = opr::ConvolutionBackwardData::make(s0, s1, {}, {});
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_f16_io_f32_comp();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    ASSERT_EQ(
            find_opr<opr::ConvolutionBackwardData>(y_opt).param().compute_mode,
            opr::ConvBias::Param::ConvBias::ComputeMode::FLOAT32);
    ASSERT_EQ(y_opt.dtype(), dtype::Float32());
    HostTensorND host_y, host_y_opt;
    auto func = graph->compile(
            {make_callback_copy(y, host_y), make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-2);
}
TEST(TestGoptInference, Float16IOFloat32ComputeWarpPerspective) {
    constexpr size_t INP_H = 10, INP_W = 10, N = 2;
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp)).rename(name);
    };
    graph->options().graph_opt_level = 0;
    auto a = mkvar("a", {N, 4, INP_H, INP_W});
    float value1 = M_PI, value2 = 0.6;
    auto gen_mat = [&](HostTensorND& mat) {
        auto ptr = mat.ptr<float>();
        for (size_t i = 0; i < N; ++i) {
            auto rot = value1, scale = value2, sheer = value1, dy = value2, dx = value2,
                 ky = value2, kx = value2, kb = value2;
            ptr[0] = ptr[4] = cos(rot) * scale;
            ptr[1] = -(ptr[3] = sin(rot) * scale);
            ptr[3] *= sheer;
            ptr[4] *= sheer;
            ptr[2] = dx;
            ptr[5] = dy;
            ptr[6] = kx;
            ptr[7] = ky;
            ptr[8] = kb;
            ptr += 9;
        }
        mgb_assert(ptr == mat.ptr<float>() + mat.shape().total_nr_elems());
    };
    auto mat_host = std::make_shared<HostTensorND>(
            a.node()->comp_node(), TensorShape{N, 3, 3}, dtype::Float32());
    gen_mat(*mat_host);
    auto mat = opr::Host2DeviceCopy::make(*graph, mat_host).rename("mat");
    TensorShape out_shp{20, 20};
    auto y = opr::WarpPerspective::make(a, mat, out_shp);
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_f16_io_f32_comp();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    ASSERT_EQ(y_opt.dtype(), dtype::Float32());
    HostTensorND host_y, host_y_opt;
    auto func = graph->compile(
            {make_callback_copy(y, host_y), make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
}
TEST(TestGoptInference, Float16IOFloat32ComputeRemap) {
    auto cn = CompNode::load("cpu1");
    constexpr size_t INP_H = 10, INP_W = 10, N = 2;
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name);
    };
    graph->options().graph_opt_level = 0;
    auto a = mkvar("a", {N, 4, INP_H, INP_W});
    auto gen_map = [&](HostTensorND& mat) {
        auto ptr = mat.ptr<float>();
        for (size_t n = 0; n < N; ++n) {
            for (int h = 0; h < 5; ++h) {
                for (int w = 0; w < 5; ++w) {
                    *ptr++ = (h * 5 * 2) + 5 * 2 + 0;
                    *ptr++ = (h * 5 * 2) + 5 * 2 + 1;
                }
            }
        }
        mgb_assert(ptr == mat.ptr<float>() + mat.shape().total_nr_elems());
    };
    auto map_host = std::make_shared<HostTensorND>(
            a.node()->comp_node(), TensorShape{N, 5, 5, 2}, dtype::Float32());
    gen_map(*map_host);
    auto map = opr::Host2DeviceCopy::make(*graph, map_host).rename("map");
    auto y = opr::Remap::make(a, map);
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_f16_io_f32_comp();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    ASSERT_EQ(y_opt.dtype(), dtype::Float32());
    HostTensorND host_y, host_y_opt;
    auto func = graph->compile(
            {make_callback_copy(y, host_y), make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
}
TEST(TestGoptInference, Uint8IOFloat16ComputeWarpPerspective) {
    constexpr size_t INP_H = 10, INP_W = 10, N = 2;
    HostTensorGenerator<dtype::Uint8> gen_uint8;
    auto graph = ComputingGraph::make();
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen_uint8(shp)).rename(name);
    };
    graph->options().graph_opt_level = 0;
    auto a = mkvar("a", {N, 4, INP_H, INP_W});
    float value1 = M_PI, value2 = 0.6;
    auto gen_mat = [&](HostTensorND& mat) {
        auto ptr = mat.ptr<float>();
        for (size_t i = 0; i < N; ++i) {
            auto rot = value1, scale = value2, sheer = value1, dy = value2, dx = value2,
                 ky = value2, kx = value2, kb = value2;
            ptr[0] = ptr[4] = cos(rot) * scale;
            ptr[1] = -(ptr[3] = sin(rot) * scale);
            ptr[3] *= sheer;
            ptr[4] *= sheer;
            ptr[2] = dx;
            ptr[5] = dy;
            ptr[6] = kx;
            ptr[7] = ky;
            ptr[8] = kb;
            ptr += 9;
        }
        mgb_assert(ptr == mat.ptr<float>() + mat.shape().total_nr_elems());
    };
    auto mat_host = std::make_shared<HostTensorND>(
            a.node()->comp_node(), TensorShape{N, 3, 3}, dtype::Float32());
    gen_mat(*mat_host);
    auto mat = opr::Host2DeviceCopy::make(*graph, mat_host).rename("mat");
    TensorShape out_shp{20, 20};
    auto y = opr::WarpPerspective::make(a, mat, out_shp);
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_f16_io_comp();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    ASSERT_EQ(y_opt.dtype(), dtype::Uint8());
    HostTensorND host_y, host_y_opt;
    auto func = graph->compile(
            {make_callback_copy(y, host_y), make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
}
TEST(TestGoptInference, Float32TOFloat16) {
    CompNode cn = CompNode::load("cpu0");
    HostTensorGenerator<> gen(0, 1, 0);
    auto host_x0 = gen({1, 4, 16, 8}, cn), host_x1 = gen({2, 3, 16, 8}, cn),
         host_x2 = gen({4, 3, 1, 1}, cn);
    auto graph = ComputingGraph::make();
    auto make_f32_to_f16_graph = [&]() {
        graph->options().graph_opt_level = 0;
        auto d0 = opr::Host2DeviceCopy::make(*graph, host_x0),
             d1 = opr::Host2DeviceCopy::make(*graph, host_x1),
             d2 = opr::SharedDeviceTensor::make(*graph, *host_x2);
        auto b = opr::Convolution::make(d1, d2, {}, {});
        auto y = d0 + b;
        y = opr::Reduce::make(y, {}, y.make_scalar(1));
        SymbolVar y_opt;
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_f16_io_comp();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
        return y_opt;
    };
    auto make_f16_graph = [&]() {
        auto d0 = opr::TypeCvt::make(
                     opr::Host2DeviceCopy::make(*graph, host_x0), dtype::Float16{}),
             d1 = opr::TypeCvt::make(
                     opr::Host2DeviceCopy::make(*graph, host_x1), dtype::Float16{}),
             d2 = opr::TypeCvt::make(
                     opr::SharedDeviceTensor::make(*graph, *host_x2), dtype::Float16{});
        auto b = opr::Convolution::make(d1, d2, {}, {});
        SymbolVar y = d0 + b;
        y = opr::Reduce::make(y, {}, y.make_scalar(1));
        y = opr::TypeCvt::make(y, dtype::Float32{});
        return y;
    };
    auto y_opt = make_f32_to_f16_graph();
    auto y = make_f16_graph();
    ASSERT_EQ(y_opt.dtype(), dtype::Float32{});
    ASSERT_EQ(y.dtype(), dtype::Float32{});
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile(
            {make_callback_copy(y, host_y), make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
}
TEST(TestGoptInference, Float32TOFloat16C32) {
    CompNode cn = CompNode::load("cpu0");
    HostTensorGenerator<> gen(0, 1, 0);
    auto host_x0 = gen({1, 4, 1, 1}, cn), host_x1 = gen({2, 3, 16, 8}, cn),
         host_x2 = gen({4, 3, 1, 1}, cn);
    auto graph = ComputingGraph::make();
    auto make_f32_to_f16_graph = [&]() {
        graph->options().graph_opt_level = 0;
        auto d0 = opr::Host2DeviceCopy::make(*graph, host_x0),
             d1 = opr::Host2DeviceCopy::make(*graph, host_x1),
             d2 = opr::SharedDeviceTensor::make(*graph, *host_x2);
        auto y = opr::ConvBias::make(d1, d2, d0);
        y = opr::Reduce::make(y, {}, y.make_scalar(1));
        SymbolVar y_opt;
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_f16_io_f32_comp();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
        return y_opt;
    };
    auto make_f16_graph = [&]() {
        auto d0 = opr::TypeCvt::make(
                     opr::TypeCvt::make(
                             opr::Host2DeviceCopy::make(*graph, host_x0),
                             dtype::Float16{}),
                     dtype::Float32{}),
             d1 = opr::TypeCvt::make(
                     opr::TypeCvt::make(
                             opr::Host2DeviceCopy::make(*graph, host_x1),
                             dtype::Float16{}),
                     dtype::Float32{}),
             d2 = opr::TypeCvt::make(
                     opr::TypeCvt::make(
                             opr::SharedDeviceTensor::make(*graph, *host_x2),
                             dtype::Float16{}),
                     dtype::Float32{});
        auto y = opr::ConvBias::make(d1, d2, d0);
        y = opr::Reduce::make(y, {}, y.make_scalar(1));
        y = opr::TypeCvt::make(
                opr::TypeCvt::make(y, dtype::Float16{}), dtype::Float32{});
        return y;
    };
    auto y_opt = make_f32_to_f16_graph();
    auto y = make_f16_graph();
    ASSERT_EQ(
            find_opr<opr::ConvBias>(y_opt).param().compute_mode,
            opr::ConvBias::Param::ConvBias::ComputeMode::FLOAT32);
    ASSERT_EQ(y_opt.dtype(), dtype::Float32{});
    ASSERT_EQ(y.dtype(), dtype::Float32{});
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile(
            {make_callback_copy(y, host_y), make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
}
TEST(TestGoptInference, Float32TOFloat16EndpointElemwise) {
    CompNode cn = CompNode::load("cpu0");
    HostTensorGenerator<> gen(0, 1, 0);
    auto host_x0 = gen({1, 4, 16, 8}, cn), host_x1 = gen({2, 3, 16, 8}, cn),
         host_x2 = gen({4, 3, 1, 1}, cn);
    auto graph = ComputingGraph::make();
    auto make_f32_to_f16_graph = [&]() {
        graph->options().graph_opt_level = 0;
        auto d0 = opr::Host2DeviceCopy::make(*graph, host_x0),
             d1 = opr::Host2DeviceCopy::make(*graph, host_x1),
             d2 = opr::SharedDeviceTensor::make(*graph, *host_x2);
        auto b = opr::Convolution::make(d1, d2, {}, {});
        auto y = d0 + b;
        SymbolVar y_opt;
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_f16_io_comp();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
        return y_opt;
    };
    auto make_f16_graph = [&]() {
        auto d0 = opr::TypeCvt::make(
                     opr::Host2DeviceCopy::make(*graph, host_x0), dtype::Float16{}),
             d1 = opr::TypeCvt::make(
                     opr::Host2DeviceCopy::make(*graph, host_x1), dtype::Float16{}),
             d2 = opr::TypeCvt::make(
                     opr::SharedDeviceTensor::make(*graph, *host_x2), dtype::Float16{});
        auto b = opr::Convolution::make(d1, d2, {}, {});
        SymbolVar y = d0 + b;
        y = opr::TypeCvt::make(y, dtype::Float32{});
        return y;
    };
    auto y_opt = make_f32_to_f16_graph();
    auto y = make_f16_graph();
    ASSERT_EQ(y_opt.dtype(), dtype::Float32{});
    ASSERT_EQ(y.dtype(), dtype::Float32{});
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile(
            {make_callback_copy(y, host_y), make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
}
TEST(TestGoptInference, Float32TOFloat16Linspace) {
    CompNode cn = CompNode::load("cpu0");
    HostTensorGenerator<> gen(0, 1, 0);
    auto host_x = gen({3, 1}, cn);
    auto graph = ComputingGraph::make();
    auto make_f32_to_f16_graph = [&]() {
        graph->options().graph_opt_level = 0;
        auto x = opr::Host2DeviceCopy::make(*graph, host_x);
        auto xshp = opr::GetVarShape::make(x);
        auto cv = [&x](int v) { return x.make_scalar(v); };
        auto sub = [&xshp, &cv](int idx) {
            return opr::IndexAt::make(xshp, {{0, cv(idx)}});
        };
        auto lin = opr::Linspace::make(cv(0), sub(0) - 1, sub(0), {}, {});
        auto shp = opr::Concat::make({sub(1), sub(0)}, 0);
        auto y = opr::Reshape::make(lin, shp);
        auto mm = opr::MatrixMul::make(x, y);
        SymbolVar mm_opt;
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_f16_io_comp();
        unpack_vector(gopt::optimize_for_inference({mm}, options), mm_opt);
        return mm_opt;
    };
    auto make_f16_graph = [&]() {
        auto x = opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, host_x), dtype::Float16());
        auto xshp = opr::GetVarShape::make(x);
        auto cv = [&x](int v) { return x.make_scalar(v); };
        auto sub = [&xshp, &cv](int idx) {
            return opr::IndexAt::make(xshp, {{0, cv(idx)}});
        };
        auto lin = opr::Linspace::make(cv(0), sub(0) - 1, sub(0), {}, {});
        lin = opr::TypeCvt::make(lin, dtype::Float16());
        auto shp = opr::Concat::make({sub(1), sub(0)}, 0);
        auto y = opr::Reshape::make(lin, shp);
        auto mm = opr::MatrixMul::make(x, y);
        mm = opr::TypeCvt::make(mm, dtype::Float32{});
        return mm;
    };
    auto y_opt = make_f32_to_f16_graph();
    auto y = make_f16_graph();
    ASSERT_EQ(y_opt.dtype(), dtype::Float32{});
    ASSERT_EQ(y.dtype(), dtype::Float32{});
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile(
            {make_callback_copy(y, host_y), make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
}
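
// With f16 I/O but f32 compute, endpoints keep f32 dtype while the oprs that
// produce them already consume f16 inputs.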
TEST(TestGoptInference, Float32TOFloat16Endpoints) {
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp)).rename(name);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp)).rename(name);
    };
    graph->options().graph_opt_level = 0;
    opr::Convolution::Param param;
    param.pad_h = param.pad_w = 0;
    auto x = mkvar("x", {8, 8, 8, 8}), y = mkvar("y", {8, 8, 8, 8}),
         w = mkcvar("w", {4, 8, 3, 3}), z = opr::Convolution::make(x + y, w, param);
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_f16_io_f32_comp();
    SymbolVarArray out = gopt::optimize_for_inference({x + y, z}, options);
    ASSERT_EQ(out[0].dtype(), dtype::Float32());
    ASSERT_EQ(out[1].dtype(), dtype::Float32());
    ASSERT_EQ(out[0].node()->owner_opr()->input(0)->dtype(), dtype::Float16());
    ASSERT_EQ(out[1].node()->owner_opr()->input(0)->dtype(), dtype::Float16());
}
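
// NHWCD4 layout conversion: convolutions switch to NHWCD4 while the
// AxisAddRemove endpoint gets a default-format, 4-d input back.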
TEST(TestGoptInference, ConvertFormatNHWCD4) {
    // hwcd4 is only supported in naive handle
    NaiveMegDNNHandleScope naive_megdnn_handle;
    HostTensorGenerator<> gen;
    auto cn = CompNode::load("cpu0");
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name);
    };
    auto host_x = gen({8, 8, 8, 8}, cn);
    auto x = opr::Host2DeviceCopy::make(*graph, host_x);
    opr::Convolution::Param param;
    param.pad_h = param.pad_w = 0;
    auto w1 = mkcvar("w1", {4, 8, 3, 3}), conv = opr::Convolution::make(x, w1, param);
    auto shape_of = opr::GetVarShape::make(conv);
    auto subtensor = opr::Subtensor::make(
            shape_of, {opr::Subtensor::AxisIndexer::make_interval(
                              0, x.make_scalar(2), None, x.make_scalar(1))});
    opr::Resize::Param param_resize;
    param_resize.format = opr::Resize::Param::Format::NCHW;
    auto resize = opr::ResizeForward::make(conv, subtensor * 2, param_resize);
    auto mat = mkcvar("mat", {8, 3, 3}),
         warp = opr::WarpPerspectiveForward::make(
                 resize, mat, nullptr, cg::var_from_tensor_shape(x, {4, 4}));
    auto b = mkvar("b", {1, 4, 1, 1}),
         elem = opr::Elemwise::make({warp + b}, opr::Elemwise::Param::Mode::RELU);
    param.pad_h = param.pad_w = 1;
    auto w2 = mkcvar("w2", {4, 4, 3, 3}), y = opr::Convolution::make(elem, w2, param),
         z = opr::AxisAddRemove::make(y, {opr::AxisAddRemove::AxisDesc::make_add(0)});
    SymbolVar y_opt, z_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_nhwcd4();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    unpack_vector(gopt::optimize_for_inference({z}, options), z_opt);
    ASSERT_EQ(
            opr::Convolution::Param::Format::NHWCD4,
            find_opr<opr::Convolution>(y_opt).param().format);
    ASSERT_EQ(
            TensorFormat::Type::DEFAULT,
            find_opr<opr::AxisAddRemove>(z_opt).input(0)->format().type());
    ASSERT_EQ(4, find_opr<opr::AxisAddRemove>(z_opt).input(0)->shape().ndim);
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(output_file("TestGoptInference.ConvertFormatNHWCD4.json"));
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile(
            {make_callback_copy(y, host_y), make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
    *host_x = *gen({8, 8, 16, 16}, cn);
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
}
#if MGB_OPENCL
#include "megcore_opencl.h"

#define REQUIRE_OPENCL()                                                 \
    do {                                                                 \
        if (!CompNode::get_device_count(CompNode::DeviceType::OPENCL)) { \
            return;                                                      \
        }                                                                \
    } while (0)
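
// Same graph and checks as ConvertFormatNHWCD4, run on an OpenCL device when
// one is available.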
TEST(TestGoptInference, ConvertFormatNHWCD4OpenCL) {
    REQUIRE_OPENCL();
    HostTensorGenerator<> gen;
    auto cn = CompNode::load("openclx");
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name);
    };
    auto host_x = gen({8, 8, 8, 8}, cn);
    auto x = opr::Host2DeviceCopy::make(*graph, host_x);
    opr::Convolution::Param param;
    param.pad_h = param.pad_w = 0;
    auto w1 = mkcvar("w1", {4, 8, 3, 3}), conv = opr::Convolution::make(x, w1, param);
    auto shape_of = opr::GetVarShape::make(conv);
    auto subtensor = opr::Subtensor::make(
            shape_of, {opr::Subtensor::AxisIndexer::make_interval(
                              0, x.make_scalar(2), None, x.make_scalar(1))});
    opr::Resize::Param param_resize;
    param_resize.format = opr::Resize::Param::Format::NCHW;
    auto resize = opr::ResizeForward::make(conv, subtensor * 2, param_resize);
    auto mat = mkcvar("mat", {8, 3, 3}),
         warp = opr::WarpPerspectiveForward::make(
                 resize, mat, nullptr, cg::var_from_tensor_shape(x, {4, 4}));
    auto b = mkvar("b", {1, 4, 1, 1}),
         elem = opr::Elemwise::make({warp + b}, opr::Elemwise::Param::Mode::RELU);
    param.pad_h = param.pad_w = 1;
    auto w2 = mkcvar("w2", {4, 4, 3, 3}), y = opr::Convolution::make(elem, w2, param),
         z = opr::AxisAddRemove::make(y, {opr::AxisAddRemove::AxisDesc::make_add(0)});
    SymbolVar y_opt, z_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_nhwcd4();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    unpack_vector(gopt::optimize_for_inference({z}, options), z_opt);
    ASSERT_EQ(
            opr::Convolution::Param::Format::NHWCD4,
            find_opr<opr::Convolution>(y_opt).param().format);
    ASSERT_EQ(
            TensorFormat::Type::DEFAULT,
            find_opr<opr::AxisAddRemove>(z_opt).input(0)->format().type());
    ASSERT_EQ(4, find_opr<opr::AxisAddRemove>(z_opt).input(0)->shape().ndim);
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile(
            {make_callback_copy(y, host_y), make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
    *host_x = *gen({8, 8, 16, 16}, cn);
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
}
#undef REQUIRE_OPENCL
#endif
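
// NHWCD4 conversion across a chain of elemwise oprs: broadcast bias, scalar
// bias and RELU between convolutions.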
TEST(TestGoptInference, ConvertFormatNHWCD4Elemwise) {
    // hwcd4 is only supported in naive handle
    NaiveMegDNNHandleScope naive_megdnn_handle;
    HostTensorGenerator<> gen;
    auto cn = CompNode::load("cpu0");
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name);
    };
    auto host_x = gen({8, 8, 8, 8}, cn);
    auto x = opr::Host2DeviceCopy::make(*graph, host_x);
    opr::Convolution::Param param;
    param.pad_h = param.pad_w = 0;
    auto w1 = mkcvar("w1", {8, 8, 3, 3}), conv = opr::Convolution::make(x, w1, param);
    auto b = mkvar("b", {1, 1, 1, 1}),
         elem = opr::Elemwise::make({conv + b}, opr::Elemwise::Param::Mode::RELU);
    param.pad_h = param.pad_w = 1;
    auto w2 = mkcvar("w2", {8, 8, 3, 3}),
         conv2 = opr::Convolution::make(elem, w2, param);
    auto b_scalar = mkvar("b", {1}), elem2 = conv2 + b_scalar;
    param.pad_h = param.pad_w = 1;
    auto w3 = mkcvar("w3", {8, 8, 3, 3}), y = opr::Convolution::make(elem2, w3, param);
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_nhwcd4();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    ASSERT_EQ(
            opr::Convolution::Param::Format::NHWCD4,
            find_opr<opr::Convolution>(y_opt).param().format);
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(
                    output_file("TestGoptInference.ConvertFormatNHWCD4Elemwise.json"));
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile(
            {make_callback_copy(y, host_y), make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
    *host_x = *gen({8, 8, 16, 16}, cn);
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
}
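
// TypeCvt to f16 applied to two NHWCD4 convolution results; the converted
// graph must stay bit-exact with the reference.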
TEST(TestGoptInference, ConvertFormatNHWCD4TypeCvt) {
    NaiveMegDNNHandleScope naive_megdnn_handle;
    HostTensorGenerator<> gen;
    auto cn = CompNode::load("cpu0");
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name);
    };
    auto host_x = gen({8, 8, 8, 8}, cn);
    auto x = opr::Host2DeviceCopy::make(*graph, host_x);
    opr::Convolution::Param param;
    param.pad_h = param.pad_w = 0;
    auto w1 = mkcvar("w1", {8, 8, 3, 3}), conv1 = opr::Convolution::make(x, w1, param),
         tcvt1 = opr::TypeCvt::make(conv1, dtype::Float16());
    auto w2 = mkcvar("w2", {8, 8, 3, 3}), conv2 = opr::Convolution::make(x, w2, param),
         tcvt2 = opr::TypeCvt::make(conv2, dtype::Float16());
    auto y = opr::Elemwise::make({tcvt1, tcvt2}, opr::Elemwise::Param::Mode::ADD);
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_nhwcd4();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    ASSERT_EQ(
            opr::Convolution::Param::Format::NHWCD4,
            find_opr<opr::Convolution>(y_opt).param().format);
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(
                    output_file("TestGoptInference.ConvertFormatNHWCD4TypeCvt.json"));
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile(
            {make_callback_copy(y, host_y), make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
    *host_x = *gen({8, 8, 16, 16}, cn);
    func->execute();
    MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
}
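
// Local and GroupLocal have no NHWCD4 implementation: they must stay NCHW
// while the surrounding convolutions convert to NHWCD4.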
TEST(TestGoptInference, ConvertFormatNHWCD4LOCAL) {
    // hwcd4 is only supported in naive handle
    NaiveMegDNNHandleScope naive_megdnn_handle;
    HostTensorGenerator<> gen;
    auto cn = CompNode::load("cpu0");
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name);
    };
    auto host_x = gen({2, 8, 8, 16}, cn);
    auto x = opr::Host2DeviceCopy::make(*graph, host_x);
    opr::Convolution::Param param;
    param.pad_h = param.pad_w = 1;
    auto w1 = mkcvar("w1", {4, 8, 3, 3}), conv1 = opr::Convolution::make(x, w1, param);
    auto w2 = mkcvar("w2", {8, 16, 4, 3, 3, 4}),
         local = opr::Local::make(conv1, w2, param);
    auto w3 = mkcvar("w3", {4, 4, 3, 3}),
         conv2 = opr::Convolution::make(local, w3, param);
    opr::GroupLocal::Param param_group_local;
    param_group_local.pad_h = param_group_local.pad_w = 1;
    auto w4 = mkcvar("w4", {2, 8, 16, 2, 3, 3, 2}),
         group_local = opr::GroupLocal::make(conv2, w4, param_group_local);
    auto w5 = mkcvar("w5", {4, 4, 3, 3}),
         y = opr::Convolution::make(group_local, w5, param);
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_nhwcd4();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    ASSERT_EQ(
            opr::Convolution::Param::Format::NHWCD4,
            find_opr<opr::Convolution>(y_opt).param().format);
    ASSERT_EQ(
            opr::Local::Param::Format::NCHW,
            find_opr<opr::Local>(y_opt).param().format);
    ASSERT_EQ(
            opr::GroupLocal::Param::Format::NCHW,
            find_opr<opr::GroupLocal>(y_opt).param().format);
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(
                    output_file("TestGoptInference.ConvertFormatNHWCD4LOCAL.json"));
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile(
            {make_callback_copy(y, host_y), make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
}
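
// ConvolutionBackwardData (deconv) keeps NCHW; only the preceding forward
// convolution is converted to NHWCD4.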
TEST(TestGoptInference, ConvertFormatNHWCD4Deconv) {
    // hwcd4 is only supported in naive handle
    NaiveMegDNNHandleScope naive_megdnn_handle;
    HostTensorGenerator<> gen;
    auto cn = CompNode::load("cpu0");
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name);
    };
    auto host_x = gen({8, 8, 8, 8}, cn);
    auto x = opr::Host2DeviceCopy::make(*graph, host_x);
    opr::Convolution::Param param;
    param.pad_h = param.pad_w = 0;
    auto w0 = mkcvar("w0", {4, 8, 2, 2}), conv = opr::Convolution::make(x, w0, param);
    auto w1 = mkcvar("w1", {4, 1, 2, 2}),
         y = opr::ConvolutionBackwardData::make(w1, conv, param, {}, {});
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_nhwcd4();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    ASSERT_EQ(
            opr::Convolution::Param::Format::NCHW,
            find_opr<opr::ConvolutionBackwardData>(y_opt).param().format);
    ASSERT_EQ(
            opr::Convolution::Param::Format::NHWCD4,
            find_opr<opr::Convolution>(y_opt).param().format);
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile(
            {make_callback_copy(y, host_y), make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
}
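
// Quantized (QuantizedS8) ConvBias must also convert to NHWCD4; outputs are
// compared after casting both results back to f32.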
TEST(TestGoptInference, ConvertFormatNHWCD4Qint8) {
    // hwcd4 is only supported in naive handle
    NaiveMegDNNHandleScope naive_megdnn_handle;
    HostTensorGenerator<> gen;
    auto cn = CompNode::load("cpu0");
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkcvar = [&](const char* name, const TensorShape& shp, const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name),
                dtype);
    };
    auto host_x = gen({8, 8, 8, 8}, cn);
    auto _x = opr::Host2DeviceCopy::make(*graph, host_x),
         x = opr::TypeCvt::make(_x, dtype::QuantizedS8(0.2f));
    opr::ConvBias::Param param;
    param.pad_h = param.pad_w = 0;
    auto w = mkcvar("w", {4, 8, 3, 3}, dtype::QuantizedS8(0.1f)),
         b = mkcvar("b", {1, 4, 1, 1}, dtype::QuantizedS32(0.02f)),
         y = opr::ConvBias::make(
                 x, w, b, param, {}, OperatorNodeConfig{dtype::QuantizedS8(0.2f)});
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_nhwcd4();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    ASSERT_EQ(
            opr::ConvBias::Param::Format::NHWCD4,
            find_opr<opr::ConvBias>(y_opt).param().format);
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(
                    output_file("TestGoptInference.ConvertFormatNHWCD4Qint8.json"));
    auto float_y = opr::TypeCvt::make(y, dtype::Float32()),
         float_y_opt = opr::TypeCvt::make(y_opt, dtype::Float32());
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile(
            {make_callback_copy(float_y, host_y),
             make_callback_copy(float_y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
}
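
// Conv input channels come from a concat of two 6-channel tensors; exercises
// the input-channel padding path of the NHWCD4 conversion.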
TEST(TestGoptInference, ConvertFormatPadIC) {
    // hwcd4 is only supported in naive handle
    NaiveMegDNNHandleScope naive_megdnn_handle;
    HostTensorGenerator<> gen;
    auto cn = CompNode::load("cpu0");
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name);
    };
    auto host_inp1 = gen({1, 6, 128, 128}, cn), host_inp2 = gen({1, 6, 256, 256}, cn);
    auto inp1 = opr::Host2DeviceCopy::make(*graph, host_inp1),
         inp2 = opr::Host2DeviceCopy::make(*graph, host_inp2);
    auto shape_tmp = mkcvar("tmp", {256, 256});
    auto shape_of = opr::GetVarShape::make(shape_tmp);
    opr::Resize::Param param_resize;
    param_resize.format = opr::Resize::Param::Format::NCHW;
    auto resize = opr::ResizeForward::make(inp1, shape_of, param_resize);
    auto concat = opr::Concat::make({inp2, resize}, 1);
    opr::Convolution::Param param;
    param.pad_h = param.pad_w = 1;
    param.sparse = opr::Convolution::Param::Sparse::DENSE;
    auto w1 = mkcvar("w1", {12, 12, 3, 3});
    auto y = opr::Convolution::make(concat, w1, param);
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_nhwcd4();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile(
            {make_callback_copy(y, host_y), make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
}
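
// Concats that only feed convolutions can bypass CD4 and stay in NCHW, so
// exactly one relayout_format opr is expected among the conv inputs.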
TEST(TestGoptInference, concatbypass) {
    // hwcd4 is only supported in naive handle
    NaiveMegDNNHandleScope naive_megdnn_handle;
    HostTensorGenerator<> gen;
    auto cn = CompNode::load("cpu0");
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name);
    };
    auto host_inp1 = gen({1, 6, 16, 16}, cn), host_inp2 = gen({1, 6, 32, 32}, cn);
    auto inp1 = opr::Host2DeviceCopy::make(*graph, host_inp1),
         inp2 = opr::Host2DeviceCopy::make(*graph, host_inp2);
    auto shape_tmp = mkcvar("tmp", {32, 32});
    auto shape_of = opr::GetVarShape::make(shape_tmp);
    opr::Resize::Param param_resize;
    param_resize.format = opr::Resize::Param::Format::NCHW;
    auto resize = opr::ResizeForward::make(inp1, shape_of, param_resize);
    //! this concat should forward to chw
    auto concat = opr::Concat::make({inp2, resize}, 1);
    opr::Convolution::Param param;
    param.pad_h = param.pad_w = 1;
    param.sparse = opr::Convolution::Param::Sparse::DENSE;
    auto w1 = mkcvar("w1", {12, 12, 3, 3});
    auto w2 = mkcvar("w2", {12, 24, 3, 3});
    auto y = opr::Convolution::make(concat, w1, param);
    //! this concat should bypass CD4
    y = opr::Concat::make({y, y}, 0);
    y = opr::Convolution::make(y, w1, param);
    //! this concat should bypass CD4
    y = opr::Concat::make({y, y}, 1);
    y = opr::Convolution::make(y, w2, param);
    //! this concat should bypass CD4
    y = opr::Concat::make({y, y}, 2);
    y = opr::Convolution::make(y, w1, param);
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_nhwcd4();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile(
            {make_callback_copy(y, host_y), make_callback_copy(y_opt, host_y_opt)});
    size_t relayout_format_nr = 0;
    auto cb = [&](cg::OperatorNodeBase* opr) {
        if (opr->try_cast_final<opr::Convolution>()) {
            auto conv_inputs = opr->input();
            for (auto& input : conv_inputs) {
                if (std::string::npos !=
                    std::string(input->cname()).find("relayout_format")) {
                    relayout_format_nr++;
                }
            }
        }
        return true;
    };
    func->iter_opr_seq(cb);
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
    ASSERT_EQ(
            opr::Convolution::Param::Format::NHWCD4,
            find_opr<opr::Convolution>(y_opt).param().format);
    ASSERT_EQ(1u, relayout_format_nr);
}
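
// Inference-mode BatchNorm, in both DIM_1C11 and DIM_111C layouts, must be
// folded into plain elemwise arithmetic (no BatchNorm opr left).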
TEST(TestGoptInference, ConvertBatchNormPass) {
    auto cn = CompNode::load("cpu0");
    std::vector<TensorShape> shps = {{1, 3, 1, 1}, {1, 1, 1, 3}},
                             xshps = {{2, 3, 16, 24}, {2, 16, 24, 3}};
    for (int t = 0; t < 2; t++) {
        HostTensorGenerator<> gen(0, 1, 0);
        auto graph = ComputingGraph::make();
        graph->options().graph_opt_level = 0;
        auto mkvar = [&](const char* name, const TensorShape& shp) {
            return opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name);
        };
        auto mkcvar = [&](const char* name, const TensorShape& shp) {
            return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name);
        };
        using Param = opr::BatchNorm::Param;
        Param::ParamDim param_dim =
                t == 0 ? Param::ParamDim::DIM_1C11 : Param::ParamDim::DIM_111C;
        Param param(param_dim, Param::FwdMode::INFERENCE);
        TensorShape shp = shps[t], xshp = xshps[t];
        auto x = mkvar("x", xshp), scale = mkcvar("scale", shp),
             bias = mkcvar("bias", shp), mean = mkcvar("mean", shp);
        auto host_variance = gen(shp, cn);
        for (size_t i = 0; i < shp.total_nr_elems(); ++i) {
            host_variance->ptr<float>()[i] =
                    std::abs(host_variance->ptr<float>()[i]);
        }
        auto variance = opr::SharedDeviceTensor::make(*graph, *host_variance)
                                .rename("variance");
        auto y = opr::BatchNorm::make(x, scale, bias, mean, variance, param)[5];
        SymbolVar y_opt;
        unpack_vector(
                gopt::optimize_for_inference({y}, gopt::OptimizeForInferenceOptions{}),
                y_opt);
        ASSERT_EQ(0u, find_opr_num<opr::BatchNorm>(y_opt));
        graph->compile({{y_opt, {}}})
                ->to_json()
                ->writeto_fpath(
                        output_file("TestGoptInference.ConvertBatchNormPass.json"));
        HostTensorND host_y, host_y_opt;
        auto func = graph->compile(
                {make_callback_copy(y, host_y), make_callback_copy(y_opt, host_y_opt)});
        func->execute();
        MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-5);
    }
}
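
// Conv + bias + activation chains fuse into ConvBias (three inputs), even
// when one conv result feeds two different consumers.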
TEST(TestGoptInference, ConvBiasNonlinearityFusePass) {
    // hwcd4 is only supported in naive handle
    NaiveMegDNNHandleScope naive_megdnn_handle;
    auto cn = CompNode::load("cpu0");
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name);
    };
    opr::Convolution::Param param;
    auto x = mkvar("x", {5, 8, 16, 24}), w1 = mkcvar("w1", {4, 8, 1, 1}),
         w2 = mkcvar("w2", {4, 4, 3, 3}), b1 = mkcvar("b1", {1, 4, 1, 1}),
         b2 = mkcvar("b2", {1, 4, 1, 1}), w3 = mkcvar("w3", {8, 4, 1, 1}),
         y_cut = opr::Convolution::make(x, w1, param),
         y1 = opr::Elemwise::make({y_cut + b1}, opr::Elemwise::Param::Mode::RELU);
    param.pad_w = param.pad_h = 1;
    auto y2 = opr::Elemwise::make(
            {opr::Convolution::make(y1, w2, param) + b2},
            opr::Elemwise::Param::Mode::SIGMOID);
    param.pad_w = param.pad_h = 0;
    auto y3 = opr::Convolution::make(y2, w3, param), y_tmp = y3 + x,
         y_expand = opr::Elemwise::make({y_cut}, opr::Elemwise::Param::Mode::RELU),
         y_y = opr::Convolution::make(y_expand, w3, param), y = y_y + y_tmp;
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_nhwcd4().enable_fuse_conv_bias_nonlinearity();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    ASSERT_EQ(3u, find_opr<opr::ConvBias>(y_opt).input().size());
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(
                    output_file("TestGoptInference.FuseConvBiasNonlinPass.json"));
    HostTensorND host_y, host_y_opt;
    auto func = graph->compile(
            {make_callback_copy(y, host_y), make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-4);
}
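
// Fusion with a full-tensor bias ({1, 4, 16, 24}), provided either as a
// SharedDeviceTensor or as an ImmutableTensor.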
TEST(TestGoptInference, ConvBiasNonlinearityFusePass_FullBias) {
    NaiveMegDNNHandleScope naive_megdnn_handle;
    for (int i = 0; i < 2; i++) {
        auto graph = ComputingGraph::make();
        auto cn = CompNode::load("cpu0");
        HostTensorGenerator<> gen;
        auto mkImvar = [&](const char* name, const TensorShape& shp) {
            return opr::ImmutableTensor::make(*graph, *gen(shp, cn)).rename(name);
        };
        graph->options().graph_opt_level = 0;
        auto mkcvar = [&](const char* name, const TensorShape& shp) {
            return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name);
        };
        opr::Convolution::Param param;
        auto host_x = gen({1, 8, 16, 24}, cn);
        auto x = opr::Host2DeviceCopy::make(*graph, host_x),
             w1 = mkcvar("w1", {4, 8, 1, 1}), w2 = mkcvar("w2", {4, 8, 3, 3}),
             w3 = mkcvar("w3", {4, 4, 1, 1}),
             b = i == 0 ? mkcvar("b", {1, 4, 16, 24}) : mkImvar("bias", {1, 4, 16, 24}),
             y_cut0 = opr::Convolution::make(x, w1, param);
        param.pad_w = param.pad_h = 1;
        auto y_cut1 = opr::Convolution::make(x, w2, param);
        auto y1 = opr::Elemwise::make(
                {y_cut0 + y_cut1}, opr::Elemwise::Param::Mode::RELU);
        param.pad_w = param.pad_h = 0;
        auto y2 = opr::Convolution::make(y1, w3, param);
        auto y = opr::Elemwise::make({y2 + b}, opr::Elemwise::Param::Mode::RELU);
        SymbolVar y_opt;
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_fuse_conv_bias_nonlinearity();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
        ASSERT_EQ(3u, find_opr<opr::ConvBias>(y_opt).input().size());
        graph->compile({{y_opt, {}}})
                ->to_json()
                ->writeto_fpath(output_file("TestGoptInference.FuseConvBiasNonlinPass_"
                                            "FullBias.json"));
        HostTensorND host_y, host_y_opt;
        auto func = graph->compile(
                {make_callback_copy(y, host_y), make_callback_copy(y_opt, host_y_opt)});
        func->execute();
        MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-4);
        *host_x = *gen({4, 8, 16, 24}, cn);
        func->execute();
        MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-4);
    }
}
#if (MEGDNN_AARCH64 || MEGDNN_ARMV7) && !MGB_OPENCL && !MGB_CUDA
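
// On ARM builds, TypeCvt(int16 -> f32) followed by FUSE_MUL_ADD3 should fuse
// into a single ElemwiseMultiType with mode FUSE_MUL_ADD3_INT16xF32xF32xF32.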
TEST(TestGoptInference, FuseTypeCvtAndElemwiseCase0) {
    HostTensorGenerator<dtype::Int16, RandomDistribution::UNIFORM> gen(0, 255);
    auto cn = CompNode::load("cpu0");
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    size_t n = 1;
    size_t c = 128;
    size_t h = 16;
    size_t w = 16;
    auto host_x1 = gen({n, h, w, c}, cn);
    auto x = opr::Host2DeviceCopy::make(*graph, host_x1);
    auto x_nchw = opr::Dimshuffle::make(x, {0, 3, 1, 2}, 4, cn);
    auto x_f32 = opr::TypeCvt::make(x_nchw, dtype::Float32(), cn);
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name);
    };
    auto s = mkcvar("s", {1, c, 1, 1});
    auto b = mkcvar("b", {1, c, 1, 1});
    auto result = opr::Elemwise::make(
            {x_f32, s, b}, opr::Elemwise::Param::Mode::FUSE_MUL_ADD3);
    auto y = result;
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    ASSERT_TRUE(y_opt.node()->owner_opr()->same_type<opr::ElemwiseMultiType>());
    ASSERT_EQ(
            opr::ElemwiseMultiType::Param::Mode::FUSE_MUL_ADD3_INT16xF32xF32xF32,
            find_opr<opr::ElemwiseMultiType>(y_opt).param().mode);
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile({make_callback_copy(y, host_y)});
    func->execute();
    graph->options().graph_opt_level = 2;
    auto func_opt = graph->compile({make_callback_copy(y, host_y_opt)});
    func_opt->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-5);
}
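
// Same fusion with a plain MUL: expects mode MUL_INT16xF32xF32.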
TEST(TestGoptInference, FuseTypeCvtAndElemwiseCase1) {
    HostTensorGenerator<dtype::Int16, RandomDistribution::UNIFORM> gen(0, 255);
    auto cn = CompNode::load("cpu0");
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    size_t n = 1;
    size_t c = 128;
    size_t h = 16;
    size_t w = 16;
    auto host_x1 = gen({n, h, w, c}, cn);
    auto x = opr::Host2DeviceCopy::make(*graph, host_x1);
    auto x_nchw = opr::Dimshuffle::make(x, {0, 3, 1, 2}, 4, cn);
    auto x_f32 = opr::TypeCvt::make(x_nchw, dtype::Float32(), cn);
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name);
    };
    auto s = mkcvar("s", {1, c, 1, 1});
    auto result = opr::Elemwise::make({x_f32, s}, opr::Elemwise::Param::Mode::MUL);
    auto y = result;
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    ASSERT_TRUE(y_opt.node()->owner_opr()->same_type<opr::ElemwiseMultiType>());
    ASSERT_EQ(
            opr::ElemwiseMultiType::Param::Mode::MUL_INT16xF32xF32,
            find_opr<opr::ElemwiseMultiType>(y_opt).param().mode);
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile({make_callback_copy(y, host_y)});
    func->execute();
    graph->options().graph_opt_level = 2;
    auto func_opt = graph->compile({make_callback_copy(y, host_y_opt)});
    func_opt->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-5);
}
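
// Uint8 variant of Case0: expects mode FUSE_MUL_ADD3_UINT8xF32xF32xF32.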
TEST(TestGoptInference, FuseTypeCvtAndElemwiseCase2) {
    HostTensorGenerator<dtype::Uint8, RandomDistribution::UNIFORM> gen(0, 255);
    auto cn = CompNode::load("cpu0");
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    size_t n = 1;
    size_t c = 128;
    size_t h = 16;
    size_t w = 16;
    auto host_x1 = gen({n, h, w, c}, cn);
    auto x = opr::Host2DeviceCopy::make(*graph, host_x1);
    auto x_nchw = opr::Dimshuffle::make(x, {0, 3, 1, 2}, 4, cn);
    auto x_f32 = opr::TypeCvt::make(x_nchw, dtype::Float32(), cn);
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name);
    };
    auto s = mkcvar("s", {1, c, 1, 1});
    auto b = mkcvar("b", {1, c, 1, 1});
    auto result = opr::Elemwise::make(
            {x_f32, s, b}, opr::Elemwise::Param::Mode::FUSE_MUL_ADD3);
    auto y = result;
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    ASSERT_TRUE(y_opt.node()->owner_opr()->same_type<opr::ElemwiseMultiType>());
    ASSERT_EQ(
            opr::ElemwiseMultiType::Param::Mode::FUSE_MUL_ADD3_UINT8xF32xF32xF32,
            find_opr<opr::ElemwiseMultiType>(y_opt).param().mode);
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile({make_callback_copy(y, host_y)});
    func->execute();
    graph->options().graph_opt_level = 2;
    auto func_opt = graph->compile({make_callback_copy(y, host_y_opt)});
    func_opt->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-5);
}
#endif
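
// ParamMergePass merges the two SharedDeviceTensor params (on two different
// comp nodes) into a single MultipleDeviceTensorHolder.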
TEST(TestGoptInference, ParamMerge) {
    auto cns = load_multiple_xpus(2);
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    auto var0 = opr::SharedDeviceTensor::make(*graph, *gen({2, 3}, cns[0])),
         var1 = opr::SharedDeviceTensor::make(*graph, *gen({1, 3}, cns[1])),
         y = var0 + opr::Copy::make(var1, {cns[0]});
    HostTensorND y_expected_val;
    graph->compile({make_callback_copy(y, y_expected_val)})->execute();
    SymbolVar y_opt;
    unpack_vector(
            gopt::GraphOptimizer{}
                    .add_pass<gopt::ParamMergePass>()
                    .apply({{y}})
                    .endpoint_vars(),
            y_opt);
    auto opr = y_opt.node()->owner_opr();
    ASSERT_EQ(2u, opr->input().size());
    ASSERT_EQ(2u, find_opr<opr::MultipleDeviceTensorHolder>(y_opt).output().size());
    HostTensorND y_got_val;
    graph->compile({make_callback_copy(y_opt, y_got_val)})->execute();
    MGB_ASSERT_TENSOR_EQ(y_expected_val, y_got_val);
}
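
// Same merge for params that carry a non-default Image2DPack4 tensor format,
// which must end up in a MultipleDeviceTensorWithFormatHolder.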
TEST(TestGoptInference, ParamMergeFormat) {
    auto cns = load_multiple_xpus(2);
    auto make_dv = [](const HostTensorND& hv) {
        TensorLayout layout{
                hv.layout(), hv.layout().dtype,
                megdnn::Image2DPack4TensorFormat::make_raw(1, 64)};
        auto ret = std::make_shared<DeviceTensorND>(hv.comp_node(), layout);
        ret->copy_from_fixlayout(hv).sync();
        return ret;
    };
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    auto var0 = opr::SharedDeviceTensorWithFormat::make(
                 *graph, make_dv(*gen({2, 32}, cns[0]))),
         var1 = opr::SharedDeviceTensorWithFormat::make(
                 *graph, make_dv(*gen({1, 32}, cns[1]))),
         y = var0 + opr::Copy::make(var1, {cns[0]});
    HostTensorND y_expected_val;
    graph->compile({make_callback_copy(y, y_expected_val)})->execute();
    SymbolVar y_opt;
    unpack_vector(
            gopt::GraphOptimizer{}
                    .add_pass<gopt::ParamMergePass>()
                    .apply({{y}})
                    .endpoint_vars(),
            y_opt);
    auto opr = y_opt.node()->owner_opr();
    ASSERT_EQ(2u, opr->input().size());
    ASSERT_EQ(
            2u,
            find_opr<opr::MultipleDeviceTensorWithFormatHolder>(y_opt).output().size());
    HostTensorND y_got_val;
    graph->compile({make_callback_copy(y_opt, y_got_val)})->execute();
    MGB_ASSERT_TENSOR_EQ(y_expected_val, y_got_val);
}
#if MGB_ENABLE_FASTRUN
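
// enable_opr_algo_profiling_inplace switches the convolution's execution
// strategy from the default HEURISTIC to PROFILE.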
TEST(TestGoptInference, AlgoProfile) {
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    auto host_x = gen({4, 3, 8, 9}), host_y = gen({2, 3, 3, 3});
    auto x = opr::Host2DeviceCopy::make(*graph, host_x),
         y = opr::Host2DeviceCopy::make(*graph, host_y),
         z = opr::Convolution::make(x, y);
    auto&& conv = z.node()->owner_opr()->cast_final_safe<opr::Convolution>();
    using S = opr::Convolution::ExecutionPolicy::Strategy;
    ASSERT_EQ(S::HEURISTIC, conv.execution_policy_transient().strategy);
    gopt::enable_opr_algo_profiling_inplace({z + 2.3f});
    ASSERT_EQ(S::PROFILE, conv.execution_policy().strategy);
}
#endif
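
// enable_opr_use_profiling_cache_inplace selects the combined
// PROFILE | HEURISTIC strategy.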
TEST(TestGoptInference, ProfileCache) {
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    auto host_x = gen({4, 3, 8, 9}), host_y = gen({2, 3, 3, 3});
    auto x = opr::Host2DeviceCopy::make(*graph, host_x),
         y = opr::Host2DeviceCopy::make(*graph, host_y),
         z = opr::Convolution::make(x, y);
    auto&& conv = z.node()->owner_opr()->cast_final_safe<opr::Convolution>();
    using S = opr::Convolution::ExecutionPolicy::Strategy;
    ASSERT_EQ(S::HEURISTIC, conv.execution_policy_transient().strategy);
    gopt::enable_opr_use_profiling_cache_inplace({z + 2.3f});
    ASSERT_EQ(S::PROFILE | S::HEURISTIC, conv.execution_policy().strategy);
}
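
// modify_opr_algo_strategy_inplace can request an arbitrary strategy
// combination, here PROFILE | OPTIMIZED.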
TEST(TestGoptInference, FastProfileCache) {
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    auto host_x = gen({4, 3, 8, 9}), host_y = gen({2, 3, 3, 3});
    auto x = opr::Host2DeviceCopy::make(*graph, host_x),
         y = opr::Host2DeviceCopy::make(*graph, host_y),
         z = opr::Convolution::make(x, y);
    auto&& conv = z.node()->owner_opr()->cast_final_safe<opr::Convolution>();
    using S = opr::Convolution::ExecutionPolicy::Strategy;
    ASSERT_EQ(S::HEURISTIC, conv.execution_policy_transient().strategy);
    gopt::modify_opr_algo_strategy_inplace({z + 2.3f}, S::PROFILE | S::OPTIMIZED);
    ASSERT_EQ(S::PROFILE | S::OPTIMIZED, conv.execution_policy().strategy);
}
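
// set_opr_algo_workspace_limit_inplace caps the per-opr workspace, which
// otherwise defaults to unlimited.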
TEST(TestGoptInference, AlgoWorkspaceLimit) {
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    auto host_x = gen({4, 3, 8, 9}), host_y = gen({2, 3, 3, 3});
    auto x = opr::Host2DeviceCopy::make(*graph, host_x),
         y = opr::Host2DeviceCopy::make(*graph, host_y),
         z = opr::Convolution::make(x, y);
    auto&& conv = z.node()->owner_opr()->cast_final_safe<opr::Convolution>();
    ASSERT_EQ(
            std::numeric_limits<uint64_t>::max(),
            conv.execution_policy_transient().workspace_limit);
    gopt::set_opr_algo_workspace_limit_inplace({z + 2.3f}, 10000u);
    ASSERT_EQ(10000u, conv.execution_policy().workspace_limit);
}
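
// Direct check of FuseConvBiasNonlinPass: quantized conv + bias + RELU +
// TypeCvt must rewrite to the equivalent ConvBias for NCHW, NHWC and NCHW4.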
TEST_PASS(FuseConvBiasNonlinPass, Basic) {
    auto cn = CompNode::load("xpux");
    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp, const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name), dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp, const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name),
                dtype);
    };
    for (auto format :
         {opr::Convolution::Param::Format::NCHW, opr::Convolution::Param::Format::NHWC,
          opr::Convolution::Param::Format::NCHW4}) {
        opr::Convolution::Param param;
        param.format = format;
        SymbolVar x, w, b;
        if (format == opr::Convolution::Param::Format::NHWC) {
            x = mkvar("x", {20, 20, 20, 4}, dtype::QuantizedS8(2.5f)),
            w = mkcvar("w1", {24, 1, 1, 4}, dtype::QuantizedS8(2.5f)),
            b = mkcvar("b", {1, 1, 1, 24}, dtype::QuantizedS32(6.25f));
        } else if (format == opr::Convolution::Param::Format::NCHW) {
            x = mkvar("x", {20, 4, 20, 20}, dtype::QuantizedS8(2.5f)),
            w = mkcvar("w1", {24, 4, 1, 1}, dtype::QuantizedS8(2.5f)),
            b = mkcvar("b", {1, 24, 1, 1}, dtype::QuantizedS32(6.25f));
        } else {
            mgb_assert(format == opr::Convolution::Param::Format::NCHW4);
            x = mkvar("x", {20, 1, 20, 20, 4}, dtype::QuantizedS8(2.5f)),
            w = mkcvar("w1", {24, 1, 1, 1, 4}, dtype::QuantizedS8(2.5f)),
            b = mkcvar("b", {1, 6, 1, 1, 4}, dtype::QuantizedS32(6.25f));
        }
        auto y = opr::Convolution::make(x, w, param);
        y = opr::Elemwise::make({y + b}, opr::Elemwise::Param::Mode::RELU);
        y = opr::TypeCvt::make(y, dtype::QuantizedS8(2.5f));
        opr::ConvBias::Param conv_bias_param;
        conv_bias_param.format = format;
        conv_bias_param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
        auto concret_y = opr::ConvBias::make(
                x, w, b, conv_bias_param, {},
                OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
        check(concret_y, y);
    }
}
#if MGB_CUDA
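
// Requires a tensor-core capable GPU (sm75+): NCHW32 conversion on a graph
// with small spatial extents and a shortcut (z) input on the first ConvBias.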
TEST(TestEnableTensorCore, SmallInputShape) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
    auto sm_ver = prop.major * 10 + prop.minor;
    if (sm_ver < 75) {
        printf("This testcase is ignored due to insufficient cuda cap (got: %d, "
               "expected: %d)\n",
               sm_ver, 75);
        return;
    }
    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp, const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name), dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp, const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name),
                dtype);
    };
    auto x = mkvar("x", {32, 16, 4, 8, 4}, dtype::QuantizedS8(2.5f)),
         w = mkcvar("w1", {64, 16, 3, 3, 4}, dtype::QuantizedS8(2.5f)),
         b = mkcvar("b", {1, 16, 1, 1, 4}, dtype::QuantizedS32(6.25f)),
         z = mkcvar("b1", {32, 16, 2, 4, 4}, dtype::QuantizedS8(2.5f));
    opr::ConvBias::Param param;
    param.format = opr::ConvBias::Param::Format::NCHW4;
    param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
    param.stride_h = param.stride_w = 2;
    param.pad_h = param.pad_w = 1;
    auto y = opr::ConvBias::make(
            x, w, b, z, param, {}, OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
    y = opr::ConvBias::make(
            y, w, b, param, {}, OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
    y = opr::TypeCvt::make(y, dtype::Float32());
    SymbolVar y_opt;
    SymbolVar y_no_tc;
    {
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_nchw32().enable_fuse_conv_bias_nonlinearity();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    }
    {
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_fuse_conv_bias_nonlinearity();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_no_tc);
    }
    auto nr_dimshuffle = find_opr_num<mgb::opr::Dimshuffle>(y_opt);
    ASSERT_EQ(2u, nr_dimshuffle);
    HostTensorND host_y, host_y_opt;
    auto func = graph->compile(
            {make_callback_copy(y_no_tc, host_y),
             make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
}
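
// Both NCHW and NCHW4 source graphs must convert for tensor cores; the
// expected Dimshuffle count depends on the CUDA version (see the
// CUDA_VERSION branch below).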
TEST(TestEnableTensorCore, Nchw4Nchw) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
    auto sm_ver = prop.major * 10 + prop.minor;
    if (sm_ver < 75) {
        printf("This testcase is ignored due to insufficient cuda cap (got: %d, "
               "expected: %d)\n",
               sm_ver, 75);
        return;
    }
    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp, const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name), dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp, const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name),
                dtype);
    };
    auto mkshape = [](opr::ConvBias::Param::Format format, size_t N, size_t C, size_t H,
                      size_t W) -> TensorShape {
        mgb_assert(C % 4 == 0);
        if (format == opr::ConvBias::Param::Format::NCHW4) {
            return {N, C / 4, H, W, 4};
        } else {
            mgb_assert(format == opr::ConvBias::Param::Format::NCHW);
            return {N, C, H, W};
        }
    };
    for (auto format :
         {opr::ConvBias::Param::Format::NCHW, opr::ConvBias::Param::Format::NCHW4}) {
        auto x = mkvar("x", mkshape(format, 32, 64, 16, 16), dtype::QuantizedS8(2.5f)),
             w = mkcvar("w1", mkshape(format, 64, 64, 3, 3), dtype::QuantizedS8(2.5f)),
             b = mkcvar("b", mkshape(format, 1, 64, 1, 1), dtype::QuantizedS32(6.25f)),
             z = mkcvar("b1", mkshape(format, 32, 64, 8, 8), dtype::QuantizedS8(2.5f));
        opr::ConvBias::Param param;
        param.format = format;
        param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
        param.stride_h = param.stride_w = 2;
        param.pad_h = param.pad_w = 1;
        auto y = opr::ConvBias::make(
                x, w, b, z, param, {}, OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
        y = opr::ConvBias::make(
                y, w, b, param, {}, OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
        y = opr::TypeCvt::make(y, dtype::Float32());
        SymbolVar y_opt;
        SymbolVar y_no_tc;
        {
            auto options = gopt::OptimizeForInferenceOptions{};
            options.enable_nchw32().enable_fuse_conv_bias_nonlinearity();
            unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
        }
        {
            auto options = gopt::OptimizeForInferenceOptions{};
            options.enable_fuse_conv_bias_nonlinearity();
            unpack_vector(gopt::optimize_for_inference({y}, options), y_no_tc);
        }
        auto nr_dimshuffle = find_opr_num<mgb::opr::Dimshuffle>(y_opt);
        if (format == opr::ConvBias::Param::Format::NCHW4) {
#if CUDA_VERSION >= 10020
            //! try_conv_reformat_nchw322nchw4 used when cuda_version >= 10020
            ASSERT_EQ(1u, nr_dimshuffle);
#else
            ASSERT_EQ(2u, nr_dimshuffle);
#endif
        } else {
            ASSERT_EQ(2u, nr_dimshuffle);
        }
        std::string json_name;
        if (format == opr::ConvBias::Param::Format::NCHW4) {
            json_name = "TestGoptInference.Nchw4Nchw.NCHW4.json";
        } else {
            mgb_assert(format == opr::ConvBias::Param::Format::NCHW);
            json_name = "TestGoptInference.Nchw4Nchw.NCHW.json";
        }
        graph->compile({{y_opt, {}}})
                ->to_json()
                ->writeto_fpath(output_file(json_name.c_str()));
        HostTensorND host_y, host_y_opt;
        auto func = graph->compile(
                {make_callback_copy(y_no_tc, host_y),
                 make_callback_copy(y_opt, host_y_opt)});
        func->execute();
        MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
    }
}
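
// ConvBias carrying a z (residual) input must stay numerically identical
// after NCHW32 conversion.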
TEST(TestEnableTensorCore, ConvBiasWithZ) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
    auto sm_ver = prop.major * 10 + prop.minor;
    if (sm_ver < 75) {
        printf("This testcase is ignored due to insufficient cuda cap (got: %d, "
               "expected: %d)\n",
               sm_ver, 75);
        return;
    }
    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp, const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name), dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp, const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name),
                dtype);
    };
    auto x = mkvar("x", {32, 16, 16, 16, 4}, dtype::QuantizedS8(2.5f)),
         w = mkcvar("w1", {64, 16, 3, 3, 4}, dtype::QuantizedS8(2.5f)),
         b = mkcvar("b", {1, 16, 1, 1, 4}, dtype::QuantizedS32(6.25f)),
         z = mkvar("b1", {32, 16, 16, 16, 4}, dtype::QuantizedS8(2.5f));
    opr::ConvBias::Param param;
    param.format = opr::ConvBias::Param::Format::NCHW4;
    param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
    param.stride_h = param.stride_w = 1;
    param.pad_h = param.pad_w = 1;
    auto y = opr::ConvBias::make(
            x, w, b, z, param, {}, OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
    y = opr::TypeCvt::make(y, dtype::Float32());
    SymbolVar y_opt;
    SymbolVar y_no_tc;
    {
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_fuse_conv_bias_nonlinearity().enable_nchw32();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    }
    {
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_fuse_conv_bias_nonlinearity();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_no_tc);
    }
    HostTensorND host_y, host_y_opt;
    auto func = graph->compile(
            {make_callback_copy(y_no_tc, host_y),
             make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
}
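
// Pooling that consumes a converted ConvBias result must itself switch to
// NCHW32.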
TEST(TestEnableTensorCore, Pooling) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
    auto sm_ver = prop.major * 10 + prop.minor;
    if (sm_ver < 75) {
        printf("This testcase is ignored due to insufficient cuda cap (got: %d, "
               "expected: %d)\n",
               sm_ver, 75);
        return;
    }
    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp, const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name), dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp, const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name),
                dtype);
    };
    auto x = mkvar("x", {32, 16, 16, 16, 4}, dtype::QuantizedS8(2.5f)),
         w = mkcvar("w1", {64, 16, 3, 3, 4}, dtype::QuantizedS8(2.5f)),
         b = mkcvar("b", {1, 16, 1, 1, 4}, dtype::QuantizedS32(6.25f)),
         z = mkvar("b1", {32, 16, 16, 16, 4}, dtype::QuantizedS8(2.5f));
    opr::ConvBias::Param param;
    param.format = opr::ConvBias::Param::Format::NCHW4;
    param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
    param.stride_h = param.stride_w = 1;
    param.pad_h = param.pad_w = 1;
    auto y = opr::ConvBias::make(
            x, w, b, z, param, {}, OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
    opr::Pooling::Param pool_param;
    pool_param.format = opr::Pooling::Param::Format::NCHW4;
    y = opr::Pooling::make(y, pool_param);
    y = opr::TypeCvt::make(y, dtype::Float32());
    SymbolVar y_opt;
    SymbolVar y_no_tc;
    {
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_fuse_conv_bias_nonlinearity().enable_nchw32();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    }
    ASSERT_EQ(
            opr::Pooling::Param::Format::NCHW32,
            find_opr<opr::Pooling>(y_opt).param().format);
    {
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_fuse_conv_bias_nonlinearity();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_no_tc);
    }
    HostTensorND host_y, host_y_opt;
    auto func = graph->compile(
            {make_callback_copy(y_no_tc, host_y),
             make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
}
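
// BatchConvBias has no NCHW32 kernel: it must keep the NCHW4 format after the
// pass runs.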
TEST(TestEnableTensorCore, BatchConvBias) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
    auto sm_ver = prop.major * 10 + prop.minor;
    if (sm_ver < 75) {
        printf("This testcase is ignored due to insufficient cuda cap (got: %d, "
               "expected: %d)\n",
               sm_ver, 75);
        return;
    }
    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp, const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name), dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp, const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name),
                dtype);
    };
    auto inp = mkvar("inp", {32, 24, 24, 24, 4}, dtype::QuantizedS8(1.1f)),
         flt = mkcvar("flt", {32, 96, 24, 1, 1, 4}, dtype::QuantizedS8(1.2f)),
         bias = mkcvar("bias", {1, 24, 1, 1, 4}, dtype::QuantizedS32{1.1f * 1.2f});
    opr::BatchConvBias::Param param;
    param.format = opr::BatchConvBias::Param::Format::NCHW4;
    param.stride_h = param.stride_w = 1;
    param.pad_h = param.pad_w = 0;
    auto y = opr::BatchConvBias::make(
            inp, flt, bias, param, {}, OperatorNodeConfig{dtype::QuantizedS8{1.3f}});
    y = opr::TypeCvt::make(y, dtype::Float32());
    SymbolVar y_opt;
    SymbolVar y_no_tc;
    {
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_fuse_conv_bias_nonlinearity().enable_nchw32();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    }
    ASSERT_EQ(
            opr::BatchConvBias::Param::Format::NCHW4,
            find_opr<opr::BatchConvBias>(y_opt).param().format);
    {
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_fuse_conv_bias_nonlinearity();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_no_tc);
    }
    HostTensorND host_y, host_y_opt;
    auto func = graph->compile(
            {make_callback_copy(y_no_tc, host_y),
             make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
}
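// Builds a small NCHW4 residual block and checks that enabling nchw32 leaves
// exactly three Dimshuffle oprs and matches the non-tensor-core reference.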
TEST(TestGoptInference, EnableTensorCore) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
    auto sm_ver = prop.major * 10 + prop.minor;
    if (sm_ver < 75) {
        printf("This testcase is ignored due to insufficient cuda cap (got: %d, "
               "expected: %d)\n",
               sm_ver, 75);
        return;
    }
    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp, const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name), dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp, const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name),
                dtype);
    };
    auto x = mkvar("x", {32, 16, 16, 16, 4}, dtype::QuantizedS8(2.5f)),
         w = mkcvar("w1", {64, 16, 3, 3, 4}, dtype::QuantizedS8(2.5f)),
         b = mkcvar("b", {1, 16, 1, 1, 4}, dtype::QuantizedS32(6.25f)),
         b1 = mkvar("b1", {32, 16, 16, 16, 4}, dtype::QuantizedS8(2.5f));
    opr::Convolution::Param param;
    param.format = opr::Convolution::Param::Format::NCHW4;
    param.stride_h = param.stride_w = 1;
    param.pad_h = param.pad_w = 1;
    auto y = opr::Convolution::make(x, w, param);
    y = opr::Elemwise::make({y + b}, opr::Elemwise::Param::Mode::RELU);
    y = opr::TypeCvt::make(y, dtype::QuantizedS8(2.5f));
    auto y1 = y + b1, y2 = opr::Convolution::make(y, w, param),
         y3 = opr::Elemwise::make({y - b1}, opr::Elemwise::Param::Mode::RELU);
    y2 = opr::Elemwise::make({y2 + b}, opr::Elemwise::Param::Mode::RELU),
    y2 = opr::TypeCvt::make(y2, dtype::QuantizedS8(2.5f));
    auto y4 = y1 + y2 + y3;
    y4 = opr::TypeCvt::make(y4, dtype::Float32());
    SymbolVar y_opt;
    SymbolVar y_no_tc;
    {
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_fuse_conv_bias_nonlinearity().enable_nchw32();
        unpack_vector(gopt::optimize_for_inference({y4}, options), y_opt);
    }
    {
        // no enable_nchw32() here: y_no_tc is the non-tensor-core reference
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_fuse_conv_bias_nonlinearity();
        unpack_vector(gopt::optimize_for_inference({y4}, options), y_no_tc);
    }
    auto nr_dimshuffle = find_opr_num<mgb::opr::Dimshuffle>(y_opt);
    ASSERT_EQ(3u, nr_dimshuffle);
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(
                    output_file("TestGoptInference.EnableTensorCorePass.json"));
    HostTensorND host_y, host_y_opt;
    auto func = graph->compile(
            {make_callback_copy(y_no_tc, host_y),
             make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
}
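// Runs FuseConvBiasZPass over a block of three ConvBias oprs chained through
// quantized elemwise adds; the automatically fused graph must match a graph
// where z is fused into ConvBias manually.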
TEST(FuseConvBiasZPass, BlockFuse) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
    auto sm_ver = prop.major * 10 + prop.minor;
    if (sm_ver < 61) {
        printf("This testcase is ignored due to insufficient cuda cap (got: %d, "
               "expected: %d)\n",
               sm_ver, 61);
        return;
    }
    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp, const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name), dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp, const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name),
                dtype);
    };
    using ElemMultiMode = opr::ElemwiseMultiType::Param::Mode;
    using NonlineMode = opr::ConvBias::Param::NonlineMode;
    for (auto mode :
         {ElemMultiMode::QFUSE_ADD_RELU, ElemMultiMode::QFUSE_ADD_H_SWISH}) {
        auto x = mkvar("x", {32, 16, 16, 16, 4}, dtype::QuantizedS8(2.5f)),
             w1 = mkcvar("w1", {64, 16, 3, 3, 4}, dtype::QuantizedS8(2.5f)),
             b1 = mkcvar("b1", {1, 16, 1, 1, 4}, dtype::QuantizedS32(6.25f)),
             w2 = mkcvar("w2", {64, 16, 3, 3, 4}, dtype::QuantizedS8(2.5f)),
             b2 = mkcvar("b2", {1, 16, 1, 1, 4}, dtype::QuantizedS32(6.25f)),
             w3 = mkcvar("w3", {64, 16, 3, 3, 4}, dtype::QuantizedS8(2.5f)),
             b3 = mkcvar("b3", {1, 16, 1, 1, 4}, dtype::QuantizedS32(3.0f));
        NonlineMode nonline_mode = NonlineMode::RELU;
        if (mode == ElemMultiMode::QFUSE_ADD_H_SWISH) {
            nonline_mode = NonlineMode::H_SWISH;
        }
        opr::ConvBias::Param param;
        param.format = opr::Convolution::Param::Format::NCHW4;
        param.nonlineMode = nonline_mode;
        param.stride_h = param.stride_w = 1;
        param.pad_h = param.pad_w = 1;
        auto y1 = opr::ConvBias::make(
                x, w1, b1, param, {}, OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
        param.nonlineMode = opr::ConvBias::Param::NonlineMode::IDENTITY;
        auto y2 = opr::ConvBias::make(
                     y1, w2, b2, param, {},
                     OperatorNodeConfig{dtype::QuantizedS8(2.5f)}),
             y3 = opr::ElemwiseMultiType::make(
                     {y1, y2}, {mode}, OperatorNodeConfig{dtype::QuantizedS8(1.2f)});
        param.nonlineMode = nonline_mode;
        auto y4 = opr::ConvBias::make(
                     y3, w3, b3, param, {},
                     OperatorNodeConfig{dtype::QuantizedS8(2.5f)}),
             z = opr::ElemwiseMultiType::make(
                     {y3, y4}, {opr::ElemwiseMultiType::Param::Mode::QADD},
                     OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
        z = opr::TypeCvt::make(z, dtype::Float32());
        SymbolVar z_fuse;
        {
            auto options = gopt::OptimizeForInferenceOptions{};
            options.enable_fuse_conv_bias_nonlinearity()
                    .enable_fuse_conv_bias_with_z();
            unpack_vector(gopt::optimize_for_inference({z}, options), z_fuse);
        }
        graph->compile({{z_fuse, {}}})
                ->to_json()
                ->writeto_fpath(output_file("FuseConvBiasZPass.BlockFuse_fuse.json"));
        auto nr_elem_multi_type = find_opr_num<mgb::opr::ElemwiseMultiType>(z_fuse);
        MGB_MARK_USED_VAR(nr_elem_multi_type);
#if MGB_CUDA && (CUDNN_MAJOR == 8)
        ASSERT_EQ(2u, nr_elem_multi_type);
#else
        ASSERT_EQ(1u, nr_elem_multi_type);
        //! fuse z manually
        auto z0 = opr::ConvBias::make(
                x, w1, b1, param, {}, OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
        auto z1 = opr::ConvBias::make(
                     z0, w2, b2, z0, param, {},
                     OperatorNodeConfig{dtype::QuantizedS8(1.2f)}),
             z2 = opr::ConvBias::make(
                     z1, w3, b3, param, {},
                     OperatorNodeConfig{dtype::QuantizedS8(2.5f)}),
             z4 = opr::ElemwiseMultiType::make(
                     {z1, z2}, {opr::ElemwiseMultiType::Mode::QADD},
                     OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
        z4 = opr::TypeCvt::make(z4, dtype::Float32());
        SymbolVar z_nonfuse;
        {
            auto options = gopt::OptimizeForInferenceOptions{};
            options.enable_fuse_conv_bias_nonlinearity();
            unpack_vector(gopt::optimize_for_inference({z4}, options), z_nonfuse);
        }
        graph->compile({{z_nonfuse, {}}})
                ->to_json()
                ->writeto_fpath(
                        output_file("FuseConvBiasZPass.BlockFuse_nonfuse.json"));
        HostTensorND host_z_fuse, host_z_nonfuse;
        auto func = graph->compile(
                {make_callback_copy(z_nonfuse, host_z_nonfuse),
                 make_callback_copy(z_fuse, host_z_fuse)});
        func->execute();
        MGB_ASSERT_TENSOR_EQ(host_z_fuse, host_z_nonfuse);
#endif
    }
}
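// Verifies that the nchw32 pass merges the hand-written NCHW<->NCHW4
// reshape/dimshuffle chains around a ConvBias, leaving only three dimshuffles.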
TEST(TestEnableTensorCore, ShuffleMerge) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
    auto sm_ver = prop.major * 10 + prop.minor;
    if (sm_ver < 75) {
        printf("This testcase is ignored due to insufficient cuda cap (got: %d, "
               "expected: %d)\n",
               sm_ver, 75);
        return;
    }
    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp, const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name), dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp, const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name),
                dtype);
    };
    auto nchw2nchw4 = [](SymbolVar x) {
        auto xshp = opr::GetVarShape::make(x);
        auto cv = [&x](int v) { return x.make_scalar(v); };
        auto sub = [&xshp, &cv](int idx) {
            return opr::IndexAt::make(xshp, {{0, cv(idx)}});
        };
        auto tshp = opr::Concat::make({sub(0), sub(1) / 4, cv(4), sub(2), sub(3)}, 0);
        auto y0 = opr::Reshape::make(x, tshp);
        auto y1 = opr::Dimshuffle::make(y0, {0, 1, 3, 4, 2});
        return y1;
    };
    auto nchw42nchw = [](SymbolVar x) {
        auto xshp = opr::GetVarShape::make(x);
        auto cv = [&x](int v) { return x.make_scalar(v); };
        auto sub = [&xshp, &cv](int idx) {
            return opr::IndexAt::make(xshp, {{0, cv(idx)}});
        };
        auto tshp = opr::Concat::make({sub(0), sub(1) * 4, sub(2), sub(3)}, 0);
        auto y0 = opr::Dimshuffle::make(x, {0, 1, 4, 2, 3});
        auto y1 = opr::Reshape::make(y0, tshp);
        return y1;
    };
    auto x = mkvar("x", {32, 64, 16, 16}, dtype::QuantizedS8(2.5f)),
         w = mkcvar("w1", {64, 64, 3, 3}, dtype::QuantizedS8(2.5f)),
         b = mkcvar("b", {1, 64, 1, 1}, dtype::QuantizedS32(6.25f)),
         z = mkvar("b1", {32, 64, 16, 16}, dtype::QuantizedS8(2.5f));
    x = nchw2nchw4(x), w = nchw2nchw4(w), b = nchw2nchw4(b), z = nchw2nchw4(z);
    opr::ConvBias::Param param;
    param.format = opr::ConvBias::Param::Format::NCHW4;
    param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
    param.stride_h = param.stride_w = 1;
    param.pad_h = param.pad_w = 1;
    auto y = opr::ConvBias::make(
            x, w, b, z, param, {}, OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
    y = nchw42nchw(y);
    y = opr::TypeCvt::make(y, dtype::Float32());
    SymbolVar y_opt;
    SymbolVar y_no_tc;
    {
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_fuse_conv_bias_nonlinearity().enable_nchw32();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    }
    {
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_fuse_conv_bias_nonlinearity();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_no_tc);
    }
    auto nr_dimshuffle = find_opr_num<mgb::opr::Dimshuffle>(y_opt);
    ASSERT_EQ(3u, nr_dimshuffle);
    HostTensorND host_y, host_y_opt;
    auto func = graph->compile(
            {make_callback_copy(y_no_tc, host_y),
             make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
}
#endif
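// Checks which ElemwiseMultiType modes FuseConvBiasZPass can fold into a
// ConvBias: QADD and QFUSE_ADD_RELU should fuse, QMUL should not.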
TEST(FuseConvBiasZPass, Basic) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp, const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name), dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp, const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name),
                dtype);
    };
    auto format = opr::Convolution::Param::Format::NCHW4;
    auto x = mkvar("x", {32, 16, 16, 16, 4}, dtype::QuantizedS8(2.5f)),
         w = mkcvar("w1", {64, 16, 3, 3, 4}, dtype::QuantizedS8(2.5f)),
         b = mkcvar("b", {1, 16, 1, 1, 4}, dtype::QuantizedS32(6.25f)),
         b1 = mkvar("b1", {32, 16, 16, 16, 4}, dtype::QuantizedS8(2.5f)),
         b2 = mkvar("b2", {32, 16, 16, 16, 4}, dtype::QuantizedS8(2.5f));
    opr::ConvBias::Param conv_bias_param;
    conv_bias_param.format = format;
    conv_bias_param.stride_h = conv_bias_param.stride_w = 1;
    conv_bias_param.pad_h = conv_bias_param.pad_w = 1;
    auto y = opr::ConvBias::make(
            x, w, b, conv_bias_param, {},
            OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
    SymbolVar y_opt;
    // check each fuse mode
    for (auto mode :
         {opr::ElemwiseMultiType::Param::Mode::QADD,
          opr::ElemwiseMultiType::Param::Mode::QMUL,
          opr::ElemwiseMultiType::Param::Mode::QFUSE_ADD_RELU}) {
        auto y1 = opr::ElemwiseMultiType::make(
                {y, b1}, {mode}, OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
        {
            auto options = gopt::OptimizeForInferenceOptions{};
            options.enable_fuse_conv_bias_nonlinearity()
                    .enable_fuse_conv_bias_with_z()
                    .enable_nchw32();
            unpack_vector(gopt::optimize_for_inference({y1}, options), y_opt);
        }
        auto nr_elemwisemultitype = find_opr_num<opr::ElemwiseMultiType>(y_opt);
        if (mode == opr::ElemwiseMultiType::Param::Mode::QMUL) {
            ASSERT_NE(0u, nr_elemwisemultitype);
        } else {
            ASSERT_EQ(0u, nr_elemwisemultitype);
        }
        // a second z cannot be fused into the already-fused ConvBias-with-z
        if (mode == opr::ElemwiseMultiType::Param::Mode::QADD) {
            auto y2 = opr::ElemwiseMultiType::make(
                    {y1, b2}, {mode}, OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
            {
                auto options = gopt::OptimizeForInferenceOptions{};
                options.enable_fuse_conv_bias_nonlinearity()
                        .enable_fuse_conv_bias_with_z()
                        .enable_nchw32();
                unpack_vector(gopt::optimize_for_inference({y2}, options), y_opt);
            }
            auto nr_elemwisemultitype = find_opr_num<opr::ElemwiseMultiType>(y_opt);
            ASSERT_NE(0u, nr_elemwisemultitype);
        }
    }
}
#if MGB_CUDA
//! closed for the cu111 CI; reopen when the bug is fixed
#if CUDA_VERSION < 11000
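// Converts an int8 graph (built in both NCHW and NCHW4) to CHWN4 and compares
// it against the cuDNN-oriented graph produced by the plain fuse passes.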
TEST(TestGoptInference, EnableCHWN4) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
    auto sm_ver = prop.major * 10 + prop.minor;
    if (sm_ver < 61) {
        printf("This testcase is ignored due to insufficient cuda cap (got: %d, "
               "expected: %d)\n",
               sm_ver, 61);
        return;
    }
    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp, const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name), dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp, const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name),
                dtype);
    };
    auto mkshape = [](opr::ConvBias::Param::Format format, size_t N, size_t C,
                      size_t H, size_t W) -> TensorShape {
        mgb_assert(C % 4 == 0);
        if (format == opr::ConvBias::Param::Format::NCHW4) {
            return {N, C / 4, H, W, 4};
        } else {
            mgb_assert(format == opr::ConvBias::Param::Format::NCHW);
            return {N, C, H, W};
        }
    };
    for (auto format :
         {opr::ConvBias::Param::Format::NCHW, opr::ConvBias::Param::Format::NCHW4}) {
        auto x = mkvar("x", mkshape(format, 32, 64, 16, 16), dtype::QuantizedS8(2.5f)),
             w = mkcvar("w1", mkshape(format, 64, 64, 3, 3), dtype::QuantizedS8(2.5f)),
             b = mkcvar("b", mkshape(format, 1, 64, 1, 1), dtype::QuantizedS32(6.25f)),
             b1 = mkvar(
                     "b1", mkshape(format, 32, 64, 16, 16), dtype::QuantizedS8(2.5f));
        opr::ConvBias::Param param;
        param.format = format;
        param.stride_h = param.stride_w = 1;
        param.pad_h = param.pad_w = 1;
        param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
        auto y = opr::ConvBiasForward::make(
                x, w, b, param, {}, OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
        auto y1 = opr::ElemwiseMultiType::make(
                {y, b1}, opr::ElemwiseMultiType::Mode::QFUSE_ADD_RELU,
                OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
        auto y2 = opr::ConvBiasForward::make(
                y, w, b, param, {}, OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
        auto y3 = opr::ElemwiseMultiType::make(
                {y, b1}, opr::ElemwiseMultiType::Param::Mode::QSUB,
                OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
        auto y4 = opr::ElemwiseMultiType::make(
                {y1, y2}, opr::ElemwiseMultiType::Param::Mode::QADD,
                OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
        y4 = opr::ElemwiseMultiType::make(
                {y3, y4}, opr::ElemwiseMultiType::Param::Mode::QADD,
                OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
        y4 = opr::TypeCvt::make(y4, dtype::Float32());
        SymbolVar y_opt;
        SymbolVar y_cudnn;
        {
            auto options = gopt::OptimizeForInferenceOptions{};
            options.enable_chwn4();
            unpack_vector(gopt::optimize_for_inference({y4}, options), y_opt);
        }
        unpack_vector(
                gopt::GraphOptimizer{}
                        .add_pass<gopt::FuseConvBiasNonlinPass>()
                        .add_pass<gopt::FuseConvBiasZPass>()
                        .apply({{y4}})
                        .endpoint_vars(),
                y_cudnn);
        ASSERT_EQ(
                opr::ConvBias::Param::Format::CHWN4,
                find_opr<opr::ConvBias>(y_opt).param().format);
        HostTensorND host_y, host_y_opt;
        auto func = graph->compile(
                {make_callback_copy(y_cudnn, host_y),
                 make_callback_copy(y_opt, host_y_opt)});
        func->execute();
        MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
    }
}
#endif
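// Same CHWN4 conversion, with WarpPerspective oprs in both NCHW4 and NCHW
// formats mixed into the graph.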
TEST(TestGoptInference, EnableCHWN4WarpPespective) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
    auto sm_ver = prop.major * 10 + prop.minor;
    if (sm_ver < 61) {
        printf("This testcase is ignored due to insufficient cuda cap (got: %d, "
               "expected: %d)\n",
               sm_ver, 61);
        return;
    }
    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp, const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name), dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp, const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name),
                dtype);
    };
    std::shared_ptr<HostTensorND> mat = std::make_shared<HostTensorND>(
            cn, TensorShape{32, 3, 3}, dtype::Float32());
    warp_perspective_mat_gen(*mat, 32, 16, 16);
    auto mat_var = opr::Host2DeviceCopy::make(*graph, mat).rename("mat");
    auto x = mkvar("x", {32, 16, 16, 16, 4}, dtype::QuantizedS8(2.5f)),
         w = mkcvar("w1", {64, 16, 3, 3, 4}, dtype::QuantizedS8(2.5f)),
         b = mkcvar("b", {1, 16, 1, 1, 4}, dtype::QuantizedS32(6.25f));
    opr::ConvBias::Param param;
    param.format = opr::ConvBias::Param::Format::NCHW4;
    param.stride_h = param.stride_w = 1;
    param.pad_h = param.pad_w = 1;
    param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
    auto y = opr::ConvBiasForward::make(
            x, w, b, param, {}, OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
    opr::WarpPerspective::Param warp_param;
    warp_param.format = opr::WarpPerspective::Param::Format::NCHW4;
    auto y1 = opr::WarpPerspective::make(y, mat_var, TensorShape{16, 16}, warp_param);
    y1 = opr::TypeCvt::make(y1, dtype::Float32());
    auto nchw42nchw = [](SymbolVar x) {
        auto xshp = opr::GetVarShape::make(x);
        auto cv = [&x](int v) { return x.make_scalar(v); };
        auto sub = [&xshp, &cv](int idx) {
            return opr::IndexAt::make(xshp, {{0, cv(idx)}});
        };
        auto tshp = opr::Concat::make({sub(0), sub(1) * 4, sub(2), sub(3)}, 0);
        auto y0 = opr::Dimshuffle::make(x, {0, 1, 4, 2, 3});
        auto y1 = opr::Reshape::make(y0, tshp);
        return y1;
    };
    y1 = nchw42nchw(y1);
    warp_param.format = opr::WarpPerspective::Param::Format::NCHW;
    auto y2 = opr::WarpPerspective::make(y1, mat_var, TensorShape{16, 16}, warp_param);
    SymbolVar y_opt;
    SymbolVar y_cudnn;
    {
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_chwn4();
        unpack_vector(gopt::optimize_for_inference({y2}, options), y_opt);
    }
    unpack_vector(
            gopt::GraphOptimizer{}
                    .add_pass<gopt::FuseConvBiasNonlinPass>()
                    .add_pass<gopt::FuseConvBiasZPass>()
                    .apply({{y2}})
                    .endpoint_vars(),
            y_cudnn);
    HostTensorND host_y, host_y_opt;
    auto func = graph->compile(
            {make_callback_copy(y_cudnn, host_y),
             make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
}
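// CHWN4 conversion with Pooling oprs both before and after the NCHW4->NCHW
// layout boundary.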
TEST(TestGoptInference, EnableCHWN4Pooling) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
    auto sm_ver = prop.major * 10 + prop.minor;
    if (sm_ver < 61) {
        printf("This testcase is ignored due to insufficient cuda cap (got: %d, "
               "expected: %d)\n",
               sm_ver, 61);
        return;
    }
    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp, const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name), dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp, const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name),
                dtype);
    };
    auto x = mkvar("x", {32, 16, 16, 16, 4}, dtype::QuantizedS8(2.5f)),
         w = mkcvar("w1", {64, 16, 3, 3, 4}, dtype::QuantizedS8(2.5f)),
         b = mkcvar("b", {1, 16, 1, 1, 4}, dtype::QuantizedS32(6.25f));
    opr::ConvBias::Param param;
    param.format = opr::ConvBias::Param::Format::NCHW4;
    param.stride_h = param.stride_w = 1;
    param.pad_h = param.pad_w = 1;
    param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
    auto y = opr::ConvBiasForward::make(
            x, w, b, param, {}, OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
    opr::Pooling::Param pool_param;
    pool_param.format = opr::Pooling::Param::Format::NCHW4;
    y = opr::Pooling::make(y, pool_param);
    y = opr::TypeCvt::make(y, dtype::Float32());
    auto nchw42nchw = [](SymbolVar x) {
        auto xshp = opr::GetVarShape::make(x);
        auto cv = [&x](int v) { return x.make_scalar(v); };
        auto sub = [&xshp, &cv](int idx) {
            return opr::IndexAt::make(xshp, {{0, cv(idx)}});
        };
        auto tshp = opr::Concat::make({sub(0), sub(1) * 4, sub(2), sub(3)}, 0);
        auto y0 = opr::Dimshuffle::make(x, {0, 1, 4, 2, 3});
        auto y1 = opr::Reshape::make(y0, tshp);
        return y1;
    };
    y = nchw42nchw(y);
    pool_param.format = opr::Pooling::Param::Format::NCHW;
    auto y1 = opr::Pooling::make(y, pool_param);
    SymbolVar y_opt;
    SymbolVar y_cudnn;
    unpack_vector(
            gopt::GraphOptimizer{}
                    .add_pass<gopt::FuseConvBiasNonlinPass>()
                    .add_pass(gopt::EnableCHWN4Pass::make_chwn4_converter())
                    .add_pass<gopt::FuseConvBiasZPass>()
                    .apply({{y1}})
                    .endpoint_vars(),
            y_opt);
    unpack_vector(
            gopt::GraphOptimizer{}
                    .add_pass<gopt::FuseConvBiasNonlinPass>()
                    .add_pass<gopt::FuseConvBiasZPass>()
                    .apply({{y1}})
                    .endpoint_vars(),
            y_cudnn);
    HostTensorND host_y, host_y_opt;
    auto func = graph->compile(
            {make_callback_copy(y_cudnn, host_y),
             make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
}
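// After CHWN4 conversion, ShuffleShuffleRemovePass should cancel redundant
// layout shuffles: only two dimshuffles and no RelayoutFormat oprs may remain.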
TEST(TestGoptInference, EnableCHWN4ShuffleRemove) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
    auto sm_ver = prop.major * 10 + prop.minor;
    if (sm_ver < 61) {
        printf("This testcase is ignored due to insufficient cuda cap (got: %d, "
               "expected: %d)\n",
               sm_ver, 61);
        return;
    }
    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp, const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name), dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp, const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name),
                dtype);
    };
    auto nchw2nchw4 = [](SymbolVar x) {
        auto xshp = opr::GetVarShape::make(x);
        auto cv = [&x](int v) { return x.make_scalar(v); };
        auto sub = [&xshp, &cv](int idx) {
            return opr::IndexAt::make(xshp, {{0, cv(idx)}});
        };
        auto tshp = opr::Concat::make({sub(0), sub(1) / 4, cv(4), sub(2), sub(3)}, 0);
        auto y0 = opr::Reshape::make(x, tshp);
        auto y1 = opr::Dimshuffle::make(y0, {0, 1, 3, 4, 2});
        return y1;
    };
    auto nchw42nchw = [](SymbolVar x) {
        auto xshp = opr::GetVarShape::make(x);
        auto cv = [&x](int v) { return x.make_scalar(v); };
        auto sub = [&xshp, &cv](int idx) {
            return opr::IndexAt::make(xshp, {{0, cv(idx)}});
        };
        auto tshp = opr::Concat::make({sub(0), sub(1) * 4, sub(2), sub(3)}, 0);
        auto y0 = opr::Dimshuffle::make(x, {0, 1, 4, 2, 3});
        auto y1 = opr::Reshape::make(y0, tshp);
        return y1;
    };
    auto x = mkvar("x", {32, 64, 16, 16}, dtype::QuantizedS8(2.5f)),
         w = mkcvar("w1", {64, 16, 3, 3, 4}, dtype::QuantizedS8(2.5f)),
         b = mkcvar("b", {1, 16, 1, 1, 4}, dtype::QuantizedS32(6.25f)),
         b1 = mkcvar("b1", {32, 16, 16, 16, 4}, dtype::QuantizedS8{2.5f});
    x = nchw2nchw4(x);
    opr::ConvBias::Param param;
    param.format = opr::ConvBias::Param::Format::NCHW4;
    param.stride_h = param.stride_w = 1;
    param.pad_h = param.pad_w = 1;
    param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
    auto y = opr::ConvBiasForward::make(
            x, w, b, param, {}, OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
    auto y1 = opr::ElemwiseMultiType::make(
            {y, b1}, opr::ElemwiseMultiType::Mode::QFUSE_ADD_RELU,
            OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
    auto y2 = opr::ConvBiasForward::make(
            y, w, b, param, {}, OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
    auto y3 = opr::ElemwiseMultiType::make(
            {y, b1}, opr::ElemwiseMultiType::Param::Mode::QSUB,
            OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
    auto y4 = opr::ElemwiseMultiType::make(
            {y1, y2}, opr::ElemwiseMultiType::Param::Mode::QADD,
            OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
    y4 = opr::ElemwiseMultiType::make(
            {y3, y4}, opr::ElemwiseMultiType::Param::Mode::QADD,
            OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
    y4 = opr::TypeCvt::make(y4, dtype::Float32());
    y4 = nchw42nchw(y4);
    SymbolVar y_opt;
    SymbolVar y_cudnn;
    unpack_vector(
            gopt::GraphOptimizer{}
                    .add_pass<gopt::ParamRedistributePass>()
                    .add_pass<gopt::ParamFusePass>()
                    .add_pass<gopt::FuseConvBiasNonlinPass>()
                    .add_pass<gopt::FuseConvBiasZPass>()
                    .add_pass(gopt::EnableCHWN4Pass::make_chwn4_converter())
                    .add_pass<gopt::ShuffleShuffleRemovePass>()
                    .add_pass<gopt::ParamFusePass>()
                    .apply({{y4}})
                    .endpoint_vars(),
            y_opt);
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(
                    output_file("TestGoptInference.EnableCHWN4ShuffleRemove.json"));
    auto nr_dimshuffle = find_opr_num<mgb::opr::Dimshuffle>(y_opt);
    ASSERT_EQ(2u, nr_dimshuffle);
    auto nr_reformat = find_opr_num<mgb::opr::RelayoutFormat>(y_opt);
    ASSERT_EQ(0u, nr_reformat);
    unpack_vector(
            gopt::GraphOptimizer{}
                    .add_pass<gopt::FuseConvBiasNonlinPass>()
                    .add_pass<gopt::FuseConvBiasZPass>()
                    .apply({{y4}})
                    .endpoint_vars(),
            y_cudnn);
    HostTensorND host_y, host_y_opt;
    auto func = graph->compile(
            {make_callback_copy(y_cudnn, host_y),
             make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
}
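// enable_nchw4 on GPU: dense/group ConvBias and a deconvolution must all be
// rewritten to NCHW4, with exactly two boundary reshapes left in the graph.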
TEST(TestGoptInference, ConvertFormatNCHW4GPU) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
    auto sm_ver = prop.major * 10 + prop.minor;
    if (sm_ver < 61) {
        printf("This testcase is ignored due to insufficient cuda cap (got: %d, "
               "expected: %d)\n",
               sm_ver, 61);
        return;
    }
    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp, const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name), dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp, const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name),
                dtype);
    };
    auto x = mkvar("x", {2, 4, 16, 16}, dtype::QuantizedS8(2.5f));
    opr::ConvBias::Param param_conv_bias;
    param_conv_bias.format = opr::ConvBias::Param::Format::NCHW;
    param_conv_bias.stride_h = param_conv_bias.stride_w = 1;
    param_conv_bias.pad_h = param_conv_bias.pad_w = 1;
    param_conv_bias.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
    // dense
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::DENSE;
    auto w1 = mkcvar("w1", {8, 4, 3, 3}, dtype::QuantizedS8(2.5f)),
         b1 = mkcvar("b1", {1, 8, 1, 1}, dtype::QuantizedS32(6.25f));
    auto conv1 = opr::ConvBiasForward::make(
            x, w1, b1, param_conv_bias, {},
            OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
    // group
    // icpg != 1 && ocpg != 1
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::GROUP;
    auto w2 = mkcvar("w2", {2, 4, 4, 3, 3}, dtype::QuantizedS8(2.5f)),
         b2 = mkcvar("b2", {1, 8, 1, 1}, dtype::QuantizedS32(6.25f));
    auto conv2 = opr::ConvBiasForward::make(
            conv1, w2, b2, param_conv_bias, {},
            OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
    opr::Convolution::Param param_deconv;
    param_deconv.format = opr::Convolution::Param::Format::NCHW;
    param_deconv.stride_h = param_deconv.stride_w = 2;
    param_deconv.pad_h = param_deconv.pad_w = 2;
    // dense
    param_deconv.sparse = opr::Convolution::Param::Sparse::DENSE;
    auto w3 = mkcvar("w3", {8, 8, 4, 4}, dtype::QuantizedS8(2.5f));
    auto deconv1 = opr::ConvolutionBackwardData::make_deconv(
            conv2, w3, param_deconv, {}, OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
    auto deconv1_fp32 = opr::TypeCvt::make(deconv1, dtype::Float32());
    auto y = deconv1_fp32 + opr::TypeCvt::make(b2, dtype::Float32());
    SymbolVar y_opt;
    {
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_nchw4();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    }
    ASSERT_EQ(
            opr::ConvBias::Param::Format::NCHW4,
            find_opr<opr::ConvBias>(y_opt).param().format);
    ASSERT_EQ(
            opr::ConvolutionBackwardData::Param::Format::NCHW4,
            find_opr<opr::ConvolutionBackwardData>(y_opt).param().format);
    auto nr_reshape = find_opr_num<mgb::opr::Reshape>(y_opt);
    ASSERT_EQ(2u, nr_reshape);
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(
                    output_file("TestGoptInference.ConvertFormatNCHW4GPU.json"));
    HostTensorND host_y, host_y_opt;
    auto func = graph->compile(
            {make_callback_copy(y, host_y), make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
}
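// enable_nchw4 with float outputs: every ConvBias must end up in the mixed
// NCHW4_NCHW format.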
TEST(TestGoptInference, ConvertFormatNCHW4FloatGPU) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(6, 1);
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp, const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name), dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp, const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name),
                dtype);
    };
    auto x = mkvar("x", {2, 4, 16, 16}, dtype::QuantizedS8(1.2f));
    opr::ConvBias::Param param_conv_bias;
    param_conv_bias.pad_h = param_conv_bias.pad_w = 1;
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::DENSE;
    // conv1, with bias
    auto w1 = mkcvar("w1", {8, 4, 3, 3}, dtype::QuantizedS8(1.3f)),
         b1 = mkcvar("b1", {1, 8, 1, 1}, dtype::Float32());
    auto conv1 = opr::ConvBias::make(
            x, w1, b1, param_conv_bias, {}, OperatorNodeConfig{dtype::Float32()});
    // conv2, with bias and z
    auto w2 = mkcvar("w2", {8, 4, 3, 3}, dtype::QuantizedS8(1.3f)),
         b2 = mkcvar("b2", {1, 8, 1, 1}, dtype::Float32()),
         z2 = mkcvar("z2", {2, 8, 16, 16}, dtype::Float32());
    auto conv2 = opr::ConvBias::make(
            x, w2, b2, z2, param_conv_bias, {}, OperatorNodeConfig{dtype::Float32()});
    // conv3, relu
    param_conv_bias.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
    auto w3 = mkcvar("w3", {8, 4, 3, 3}, dtype::QuantizedS8(1.3f)),
         b3 = mkcvar("b3", {1, 8, 1, 1}, dtype::Float32()),
         z3 = mkcvar("z3", {2, 8, 16, 16}, dtype::Float32());
    auto conv3 = opr::ConvBias::make(
            x, w3, b3, z3, param_conv_bias, {}, OperatorNodeConfig{dtype::Float32()});
    auto y = conv1 + conv2 + conv3;
    SymbolVar y_opt;
    {
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_nchw4();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    }
    bool succ = true;
    auto cb = [&succ](cg::OperatorNodeBase* opr) {
        if (opr->same_type<opr::ConvBias>()) {
            auto& conv_bias = opr->cast_final_safe<opr::ConvBias>();
            if (conv_bias.param().format != opr::ConvBias::Param::Format::NCHW4_NCHW) {
                succ = false;
            }
        }
    };
    cg::DepOprIter{cb}.add(y_opt);
    ASSERT_TRUE(succ);
    HostTensorND host_y, host_y_opt;
    auto func = graph->compile(
            {make_callback_copy(y, host_y), make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-5);
}
#endif
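// enable_nchw4 must also rewrite the non-conv oprs on the path (Resize,
// WarpPerspective, Pooling) to NCHW4.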
TEST(TestGoptInference, ConvertFormatNCHW4NonConvOpr) {
    auto cn = CompNode::load("xpu0");
    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp, const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name), dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp, const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name),
                dtype);
    };
    auto mkcvarf32 = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name);
    };
    auto x = mkvar("x", {2, 4, 16, 16}, dtype::QuantizedS8(2.5f));
    opr::ConvBias::Param param_conv_bias;
    param_conv_bias.format = opr::ConvBias::Param::Format::NCHW;
    param_conv_bias.stride_h = param_conv_bias.stride_w = 1;
    param_conv_bias.pad_h = param_conv_bias.pad_w = 1;
    param_conv_bias.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
    // dense
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::DENSE;
    auto w1 = mkcvar("w1", {8, 4, 3, 3}, dtype::QuantizedS8(2.5f)),
         b1 = mkcvar("b1", {1, 8, 1, 1}, dtype::QuantizedS32(6.25f));
    auto conv1 = opr::ConvBiasForward::make(
            x, w1, b1, param_conv_bias, {},
            OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
    // test Resize
    auto shape_of = opr::GetVarShape::make(x);
    auto subtensor = opr::Subtensor::make(
            shape_of, {opr::Subtensor::AxisIndexer::make_interval(
                              0, x.make_scalar(2), None, x.make_scalar(1))});
    opr::Resize::Param param_resize;
    param_resize.format = opr::Resize::Param::Format::NCHW;
    auto resize = opr::ResizeForward::make(conv1, subtensor * 2, param_resize);
    // test WarpPerspective
    auto mat = mkcvarf32("mat", {2, 3, 3}),
         warp = opr::WarpPerspectiveForward::make(
                 resize, mat, nullptr, cg::var_from_tensor_shape(x, {32, 32}));
    opr::Pooling::Param pool_param;
    pool_param.format = opr::Pooling::Param::Format::NCHW;
    // test Pooling
    auto pool = opr::Pooling::make(warp, pool_param);
    // group
    // icpg != 1 && ocpg != 1
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::GROUP;
    auto w2 = mkcvar("w2", {2, 4, 4, 3, 3}, dtype::QuantizedS8(2.5f)),
         b2 = mkcvar("b2", {1, 8, 1, 1}, dtype::QuantizedS32(6.25f));
    auto conv2 = opr::ConvBiasForward::make(
            pool, w2, b2, param_conv_bias, {},
            OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
    auto add = opr::ElemwiseMultiType::make(
            {conv1, conv2}, {opr::ElemwiseMultiType::Param::Mode::QADD},
            OperatorNodeConfig{dtype::QuantizedS8{1.2f}});
    auto y = opr::TypeCvt::make(add, dtype::Float32());
    SymbolVar y_opt;
    {
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_nchw4();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    }
    auto nr_dimshuffle = find_opr_num<mgb::opr::Dimshuffle>(y_opt);
    ASSERT_EQ(2u, nr_dimshuffle);
    ASSERT_EQ(
            opr::ConvBias::Param::Format::NCHW4,
            find_opr<opr::ConvBias>(y_opt).param().format);
    ASSERT_EQ(
            opr::ResizeForward::Param::Format::NCHW4,
            find_opr<opr::ResizeForward>(y_opt).param().format);
    ASSERT_EQ(
            opr::WarpPerspectiveForward::Param::Format::NCHW4,
            find_opr<opr::WarpPerspectiveForward>(y_opt).param().format);
    ASSERT_EQ(
            opr::PoolingForward::Param::Format::NCHW4,
            find_opr<opr::PoolingForward>(y_opt).param().format);
}
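// On a plain CPU backend, enable_nchw4 should be a no-op: ConvBias stays NCHW
// and results are unchanged.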
TEST(TestGoptInference, ConvertFormatNCHW4) {
    HostTensorGenerator<> gen;
    auto cn = CompNode::load("cpu0");
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name);
    };
    auto x = mkvar("x", {2, 4, 16, 16});
    // ConvBias test dense
    opr::ConvBias::Param param_conv_bias;
    param_conv_bias.pad_h = param_conv_bias.pad_w = 1;
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::DENSE;
    auto w1 = mkcvar("w1", {8, 4, 3, 3}), b1 = mkcvar("b1", {1, 8, 1, 1});
    auto conv1 = opr::ConvBias::make(x, w1, b1, param_conv_bias);
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::GROUP;
    auto w2 = mkcvar("w2", {2, 4, 4, 3, 3}), b2 = mkcvar("b2", {1, 8, 1, 1});
    auto conv2 = opr::ConvBias::make(conv1, w2, b2, param_conv_bias);
    // Convolution
    opr::Convolution::Param param_conv;
    param_conv.pad_h = param_conv.pad_w = 1;
    param_conv.sparse = opr::Convolution::Param::Sparse::DENSE;
    auto w3 = mkcvar("w3", {8, 8, 3, 3});
    auto y = opr::Convolution::make(conv2, w3, param_conv);
    SymbolVar y_opt;
    {
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_nchw4();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    }
    ASSERT_EQ(
            opr::ConvBias::Param::Format::NCHW,
            find_opr<opr::ConvBias>(y_opt).param().format);
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(output_file("TestGoptInference.ConvertFormatNCHW4.json"));
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile(
            {make_callback_copy(y, host_y), make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
}
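// enable_nchw4 on an input with only three channels; the ConvBias oprs should
// still be converted to NCHW4.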
TEST(TestGoptInference, ConvertFormatNCHW4Ic3) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    REQUIRE_CUDA_COMPUTE_CAPABILITY(6, 1);
    HostTensorGenerator<dtype::Float32, RandomDistribution::UNIFORM> gen{
            1.2f, 127 * 127};
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp, const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp)).rename(name), dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp, const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp)).rename(name), dtype);
    };
    auto x = mkvar("x", {2, 3, 16, 16}, dtype::QuantizedS8(2.5f));
    // ConvBias test dense
    opr::ConvBias::Param param_conv_bias;
    param_conv_bias.pad_h = param_conv_bias.pad_w = 1;
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::DENSE;
    auto w1 = mkcvar("w1", {8, 3, 3, 3}, dtype::QuantizedS8(2.5f)),
         b1 = mkcvar("b1", {1, 8, 1, 1}, dtype::QuantizedS32(6.25f));
    auto conv1 = opr::ConvBias::make(
            x, w1, b1, param_conv_bias, {},
            OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::GROUP;
    auto w2 = mkcvar("w2", {2, 4, 4, 3, 3}, dtype::QuantizedS8(2.5f)),
         b2 = mkcvar("b2", {1, 8, 1, 1}, dtype::QuantizedS32(6.25f));
    auto conv2 = opr::ConvBias::make(
            conv1, w2, b2, param_conv_bias, {},
            OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
    auto y = opr::TypeCvt::make(conv2, dtype::Float32());
    SymbolVar y_opt;
    {
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_nchw4();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    }
    ASSERT_EQ(
            opr::ConvBias::Param::Format::NCHW4,
            find_opr<opr::ConvBias>(y_opt).param().format);
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(
                    output_file("TestGoptInference.ConvertFormatNCHW4Ic3.json"));
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile(
            {make_callback_copy(y, host_y), make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
}
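// Converts a float graph (hybrid first conv, channel-wise, group and dense
// convs, reduces, resize/warp) to NCHW88 and checks formats and results.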
TEST(TestGoptInference, ConvertFormatNCHW88) {
    HostTensorGenerator<> gen;
    auto cn = CompNode::load("cpu0");
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name);
    };
    auto host_x = gen({2, 3, 16, 16}, cn);
    auto x = opr::Host2DeviceCopy::make(*graph, host_x);
    //! Hybrid nchw88 mode
    opr::Convolution::Param param_conv;
    param_conv.pad_h = param_conv.pad_w = 1;
    auto w1 = mkcvar("w1", {8, 3, 3, 3}),
         conv1 = opr::Convolution::make(
                 x, w1, param_conv, {}, OperatorNodeConfig("conv1"));
    //! channel wise
    opr::ConvBias::Param param_conv_bias;
    param_conv_bias.pad_h = param_conv_bias.pad_w = 1;
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::GROUP;
    auto w2 = mkcvar("w2", {8, 1, 1, 3, 3}), b2 = mkcvar("b2", {1, 8, 1, 1}),
         conv2 = opr::ConvBias::make(conv1, w2, b2, param_conv_bias);
    //! group
    auto w3 = mkcvar("w3", {1, 8, 8, 3, 3}), b3 = mkcvar("b3", {1, 8, 1, 1}),
         conv3 = opr::ConvBias::make(conv2, w3, b3, param_conv_bias);
    //! reduce
    opr::Reduce::Param param_reduce1;
    param_reduce1.axis = 2;
    param_reduce1.mode = opr::Reduce::Mode::SUM;
    opr::Reduce::Param param_reduce2;
    param_reduce2.axis = 0;
    param_reduce2.mode = opr::Reduce::Mode::MAX;
    auto reduce1 = conv3 + opr::Reduce::make(conv3, param_reduce1) +
                   opr::Reduce::make(conv3, param_reduce2);
    auto shape_of = opr::GetVarShape::make(reduce1);
    auto subtensor = opr::Subtensor::make(
            shape_of, {opr::Subtensor::AxisIndexer::make_interval(
                              0, x.make_scalar(2), None, x.make_scalar(1))});
    opr::Resize::Param param_resize;
    param_resize.format = opr::Resize::Param::Format::NCHW;
    auto resize = opr::ResizeForward::make(reduce1, subtensor * 2, param_resize);
    auto mat = mkcvar("mat", {2, 3, 3}),
         warp = opr::WarpPerspectiveForward::make(
                 resize, mat, nullptr, cg::var_from_tensor_shape(x, {4, 4}));
    auto b = mkvar("b", {1, 8, 1, 1}),
         elem = opr::Elemwise::make({warp + b}, opr::Elemwise::Param::Mode::RELU);
    //! Dense
    param_conv_bias.pad_h = param_conv_bias.pad_w = 1;
    auto w4 = mkcvar("w4", {2, 6, 4, 3, 3}), b4 = mkcvar("b4", {1, 12, 1, 1}),
         conv4 = opr::ConvBias::make(elem, w4, b4, param_conv_bias);
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::DENSE;
    auto w5 = mkcvar("w5", {8, 12, 3, 3}), b5 = mkcvar("b5", {1, 8, 1, 1}),
         conv5 = opr::ConvBias::make(conv4, w5, b5, param_conv_bias);
    auto w6 = mkcvar("w6", {8, 8, 3, 3}), b6 = mkcvar("b6", {1, 8, 1, 1}),
         y = opr::ConvBias::make(conv5, w6, b6, param_conv_bias);
    SymbolVar y_opt;
    {
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_nchw88();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    }
    ASSERT_EQ(
            opr::ConvBias::Param::Format::NCHW88,
            find_opr<opr::Convolution>(y_opt, "conv1").param().format);
    ASSERT_EQ(
            opr::ConvBias::Param::Format::NCHW88,
            find_opr<opr::ConvBias>(y_opt).param().format);
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(output_file("TestGoptInference.ConvertFormatNCHW88.json"));
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile(
            {make_callback_copy(y, host_y), make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    //! may go to winograd on x86-32, so set the error to 1e-1
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-1);
    *host_x = *gen({2, 3, 32, 32}, cn);
    func->execute();
    //! may go to winograd on x86-32, so set the error to 1e-1
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-1);
}
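// NCHW44 conversion over a mixed float/int8 graph; convs that cannot use
// NCHW44 (e.g. conv1_f1 and conv5) must keep the NCHW format.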
  3188. TEST(TestGoptInference, ConvertFormatNCHW44) {
  3189. HostTensorGenerator<> gen;
  3190. auto cn = CompNode::load("cpu0");
  3191. auto graph = ComputingGraph::make();
  3192. graph->options().graph_opt_level = 0;
  3193. auto mkvar = [&](const char* name, const TensorShape& shp) {
  3194. return opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name);
  3195. };
  3196. auto mkcvar = [&](const char* name, const TensorShape& shp) {
  3197. return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name);
  3198. };
  3199. auto mkcvar_dtype = [&](const char* name, const TensorShape& shp,
  3200. const DType& dtype) {
  3201. return opr::TypeCvt::make(
  3202. opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name),
  3203. dtype);
  3204. };
  3205. auto host_x = gen({2, 3, 16, 16}, cn);
  3206. auto x = opr::Host2DeviceCopy::make(*graph, host_x);
  3207. //! Hybrid nchw44 mode
  3208. opr::Convolution::Param param_conv;
  3209. param_conv.pad_h = param_conv.pad_w = 1;
  3210. auto w1 = mkcvar("w1", {8, 3, 3, 3}),
  3211. conv1 = opr::Convolution::make(
  3212. x, w1, param_conv, {}, OperatorNodeConfig("conv1"));
  3213. //! no supported hybrid nchw44
  3214. opr::ConvBias::Param param_conv_bias_pad0;
  3215. param_conv_bias_pad0.pad_h = param_conv_bias_pad0.pad_w = 0;
  3216. auto w1_f1 = mkcvar("w1_1", {8, 3, 1, 1});
  3217. auto conv1_f1 = opr::ConvBias::make(
  3218. x, w1_f1, param_conv_bias_pad0, {}, OperatorNodeConfig("conv1_f1"));
  3219. auto conv1_add = conv1_f1 * conv1;
  3220. auto conv_1_q8 = opr::TypeCvt::make(conv1_add, dtype::QuantizedS8(2.5f));
  3221. //! s8 dense conv
  3222. opr::ConvBias::Param param_conv_bias;
  3223. param_conv_bias.pad_h = param_conv_bias.pad_w = 1;
  3224. auto w1_2 = mkcvar_dtype("w1_2", {8, 8, 3, 3}, dtype::QuantizedS8(2.5f));
  3225. auto b1_2 = mkcvar_dtype("b1_2", {1, 8, 1, 1}, dtype::QuantizedS32(6.25f));
  3226. auto conv_1_2 = opr::ConvBias::make(
  3227. conv_1_q8, w1_2, b1_2, param_conv_bias, {},
  3228. OperatorNodeConfig{"conv_1_2", cn, dtype::QuantizedS8{6.25f}});
  3229. auto conv_1_2_fp32 = opr::TypeCvt::make(conv_1_2, dtype::Float32());
  3230. //! channel wise
  3231. param_conv_bias.sparse = opr::ConvBias::Param::Sparse::GROUP;
  3232. auto w2 = mkcvar("w2", {8, 1, 1, 3, 3}), b2 = mkcvar("b2", {1, 8, 1, 1}),
  3233. conv2 = opr::ConvBias::make(conv_1_2_fp32, w2, b2, param_conv_bias);
  3234. //! group
  3235. auto w3 = mkcvar("w3", {2, 4, 4, 3, 3}), b3 = mkcvar("b3", {1, 8, 1, 1}),
  3236. conv3 = opr::ConvBias::make(conv2, w3, b3, param_conv_bias);
  3237. //! reduce
  3238. opr::Reduce::Param param_reduce1;
    param_reduce1.axis = 1;
    param_reduce1.mode = opr::Reduce::Mode::MIN;
    opr::Reduce::Param param_reduce2;
    param_reduce2.axis = 3;
    param_reduce2.mode = opr::Reduce::Mode::SUM_SQR;
    auto reduce1 = conv3 + opr::Reduce::make(conv3, param_reduce1) +
                   opr::Reduce::make(conv3, param_reduce2);
    auto shape_of = opr::GetVarShape::make(reduce1);
    auto subtensor = opr::Subtensor::make(
            shape_of, {opr::Subtensor::AxisIndexer::make_interval(
                              0, x.make_scalar(2), None, x.make_scalar(1))});
    opr::Resize::Param param_resize;
    param_resize.format = opr::Resize::Param::Format::NCHW;
    auto resize = opr::ResizeForward::make(reduce1, subtensor * 2, param_resize);
    auto mat = mkcvar("mat", {2, 3, 3}),
         warp = opr::WarpPerspectiveForward::make(
                 resize, mat, nullptr, cg::var_from_tensor_shape(x, {4, 4}));
    auto b = mkvar("b", {1, 8, 1, 1}),
         elem = opr::Elemwise::make({warp + b}, opr::Elemwise::Param::Mode::RELU);
    //! Dense
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::DENSE;
    param_conv_bias.pad_h = param_conv_bias.pad_w = 1;
    auto w3_2 = mkcvar("w3_2", {16, 8, 3, 3}), b3_2 = mkcvar("b3_2", {1, 16, 1, 1}),
         conv3_2 = opr::ConvBias::make(
                 elem, w3_2, b3_2, param_conv_bias, {}, OperatorNodeConfig("conv3_2"));
    //! s8 group conv
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::GROUP;
    auto conv3_2_q8 = opr::TypeCvt::make(conv3_2, dtype::QuantizedS8(2.5f));
    auto w3_3 = mkcvar_dtype("w3_3", {4, 8, 4, 3, 3}, dtype::QuantizedS8(2.5f)),
         b3_3 = mkcvar_dtype("b3_3", {1, 32, 1, 1}, dtype::QuantizedS32(6.25f)),
         conv3_3_q = opr::ConvBias::make(
                 conv3_2_q8, w3_3, b3_3, param_conv_bias, {},
                 OperatorNodeConfig{"conv_3_3_q", cn, dtype::QuantizedS8{6.25f}});
    auto conv3_3 = opr::TypeCvt::make(conv3_3_q, dtype::Float32());
    //! Dense
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::DENSE;
    auto w4 = mkcvar("w4", {16, 32, 3, 3}), b4 = mkcvar("b4", {1, 16, 1, 1}),
         conv4 = opr::ConvBias::make(
                 conv3_3, w4, b4, param_conv_bias, {}, OperatorNodeConfig("conv4"));
    auto w4_1 = mkcvar("w4_1", {16, 32, 1, 1}), b4_1 = mkcvar("b4_1", {2, 16, 4, 4}),
         conv4_1 = opr::ConvBias::make(
                 conv3_3, w4_1, b4_1, param_conv_bias_pad0, {},
                 OperatorNodeConfig("conv4_1"));
    auto conv4_add = conv4 + conv4_1;
    auto w5 = mkcvar("w5", {6, 16, 3, 3}), b5 = mkcvar("b5", {1, 6, 1, 1}),
         conv5 = opr::ConvBias::make(
                 conv4_add, w5, b5, param_conv_bias, {}, OperatorNodeConfig("conv5"));
    auto w6 = mkcvar("w6", {4, 6, 3, 3}), b6 = mkcvar("b6", {1, 4, 1, 1}),
         y = opr::ConvBias::make(
                 conv5, w6, b6, param_conv_bias, {}, OperatorNodeConfig("conv6"));
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_fuse_conv_bias_nonlinearity();
    options.enable_nchw44();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    ASSERT_EQ(
            opr::Convolution::Param::Format::NCHW44,
            find_opr<opr::Convolution>(y_opt, "conv1").param().format);
    ASSERT_EQ(
            opr::Convolution::Param::Format::NCHW,
            find_opr<opr::ConvBias>(y_opt, "conv1_f1").param().format);
    ASSERT_EQ(
            opr::Convolution::Param::Format::NCHW44,
            find_opr<opr::ConvBias>(y_opt, "conv_1_2").param().format);
    ASSERT_EQ(
            opr::Convolution::Param::Format::NCHW44,
            find_opr<opr::ConvBias>(y_opt, "conv3_2").param().format);
    ASSERT_EQ(
            opr::Convolution::Param::Format::NCHW44,
            find_opr<opr::ConvBias>(y_opt, "conv_3_3_q").param().format);
    ASSERT_EQ(
            opr::Convolution::Param::Format::NCHW44,
            find_opr<opr::ConvBias>(y_opt, "conv4").param().format);
    ASSERT_EQ(
            opr::Convolution::Param::Format::NCHW,
            find_opr<opr::ConvBias>(y_opt, "conv5").param().format);
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(output_file("TestGoptInference.ConvertFormatNCHW44.json"));
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile(
            {make_callback_copy(y, host_y), make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    //! may go to winograd on x86-32, so set the error tolerance to 1e-1
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-1);
    *host_x = *gen({2, 3, 32, 32}, cn);
    func->execute();
    //! may go to winograd on x86-32, so set the error tolerance to 1e-1
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-1);
}
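
// Checks that elementwise oprs whose inputs mix converted (NCHW44) and
// unconverted (NCHW) vars, e.g. a broadcast bias consumed by two branches,
// do not block the NCHW44 conversion of the surrounding convolutions.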
TEST(TestGoptInference, ConvertFormatNCHW44MultiInput) {
    HostTensorGenerator<> gen;
    auto cn = CompNode::load("cpu0");
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name);
    };
    auto host_x1 = gen({1, 8, 16, 16}, cn);
    auto host_x2 = gen({1, 1, 16, 16}, cn);
    auto x = opr::Host2DeviceCopy::make(*graph, host_x1);
    opr::Convolution::Param param_conv;
    param_conv.pad_h = param_conv.pad_w = 1;
    auto w1 = mkcvar("w1", {8, 8, 3, 3}),
         conv1 = opr::Convolution::make(x, w1, param_conv);
    auto b = mkvar("b", {1, 1, 16, 16}),
         elem0 = opr::Elemwise::make(
                 {conv1 + b + b}, opr::Elemwise::Param::Mode::RELU);
    auto w2 = mkcvar("w2", {8, 8, 3, 3}),
         conv2 = opr::Convolution::make(elem0, w2, param_conv);
    auto b1 = mkvar("b1", {1}),
         y = opr::Elemwise::make({conv2 + b1 + b}, opr::Elemwise::Param::Mode::RELU);
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_nchw44();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    ASSERT_EQ(
            opr::Convolution::Param::Format::NCHW44,
            find_opr<opr::Convolution>(y_opt).param().format);
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(output_file(
                    "TestGoptInference.ConvertFormatNCHW44MultiInput.json"));
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile(
            {make_callback_copy(y, host_y), make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    //! may go to winograd on x86-32, so set the error tolerance to 1e-1
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-1);
}
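
// A trailing Reshape cannot be expressed in the packed NCHW44 layout, so the
// pass has to restore plain NCHW in front of it; the convolution itself must
// still be converted and the results must stay bitwise-comparable.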
TEST(TestGoptInference, ConvertFormatNCHW44Reshape) {
    HostTensorGenerator<> gen;
    auto cn = CompNode::load("cpu0");
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name);
    };
    auto host_x1 = gen({1, 8, 16, 16}, cn);
    auto x = opr::Host2DeviceCopy::make(*graph, host_x1);
    opr::Convolution::Param param_conv;
    param_conv.pad_h = param_conv.pad_w = 1;
    auto w1 = mkcvar("w1", {8, 8, 3, 3}),
         conv1 = opr::Convolution::make(x, w1, param_conv);
    auto y = opr::Reshape::make(conv1, {8, 16 * 16});
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_nchw44();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    ASSERT_EQ(
            opr::Convolution::Param::Format::NCHW44,
            find_opr<opr::Convolution>(y_opt).param().format);
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(
                    output_file("TestGoptInference.ConvertFormatNCHW44Reshape.json"));
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile(
            {make_callback_copy(y, host_y), make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    //! may go to winograd on x86-32, so set the error tolerance to 1e-1
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-1);
}
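
// Same network shape as ConvertFormatNCHW44 above, but with
// enable_nchw44_dot(): quantized convolutions should be converted to
// NCHW44_DOT, float ones to plain NCHW44, and unsupported cases stay NCHW.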
TEST(TestGoptInference, ConvertFormatNCHW44_DOT) {
    HostTensorGenerator<> gen;
    auto cn = CompNode::load("cpu0");
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name);
    };
    auto mkcvar_dtype = [&](const char* name, const TensorShape& shp,
                            const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name),
                dtype);
    };
    auto host_x = gen({2, 3, 16, 16}, cn);
    auto x = opr::Host2DeviceCopy::make(*graph, host_x);
    //! Hybrid nchw44 mode
    opr::Convolution::Param param_conv;
    param_conv.pad_h = param_conv.pad_w = 1;
    auto w1 = mkcvar("w1", {8, 3, 3, 3}),
         conv1 = opr::Convolution::make(
                 x, w1, param_conv, {}, OperatorNodeConfig("conv1"));
    printf("create conv1 %s\n", conv1.node()->owner_opr()->dyn_typeinfo()->name);
    param_conv.pad_h = param_conv.pad_w = 1;
    //! hybrid nchw44 is not supported for this case
    opr::ConvBias::Param param_conv_bias_pad0;
    param_conv_bias_pad0.pad_h = param_conv_bias_pad0.pad_w = 0;
    auto b1 = mkcvar("b1", {1, 8, 1, 1});
    auto w1_f1 = mkcvar("w1_1", {8, 3, 1, 1});
    auto conv1_f1 = opr::ConvBias::make(
            x, w1_f1, b1, param_conv_bias_pad0, {}, OperatorNodeConfig("conv1_f1"));
    //! hybrid dot
    auto x_s = opr::TypeCvt::make(x, dtype::QuantizedS8(2.5f));
    auto w1_3 = mkcvar_dtype("w1_3", {8, 3, 3, 3}, dtype::QuantizedS8(2.5f));
    auto conv1_3_q = opr::Convolution::make(
            x_s, w1_3, param_conv, {},
            OperatorNodeConfig{"conv1_3_q", cn, dtype::QuantizedS8{6.25f}});
    auto conv1_3 = opr::TypeCvt::make(conv1_3_q, dtype::Float32());
    auto conv1_add = conv1_f1 * conv1 * conv1_3;
    auto conv_1_q8 = opr::TypeCvt::make(conv1_add, dtype::QuantizedS8(2.5f));
    //! s8 dense conv
    opr::ConvBias::Param param_conv_bias;
    param_conv_bias.pad_h = param_conv_bias.pad_w = 1;
    auto w1_2 = mkcvar_dtype("w1_2", {8, 8, 3, 3}, dtype::QuantizedS8(2.5f));
    auto conv_1_2 = opr::ConvBias::make(
            conv_1_q8, w1_2, param_conv_bias, {},
            OperatorNodeConfig{"conv_1_2", cn, dtype::QuantizedS8{6.25f}});
    auto conv_1_2_fp32 = opr::TypeCvt::make(conv_1_2, dtype::Float32());
    //! channel wise
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::GROUP;
    auto w2 = mkcvar("w2", {8, 1, 1, 3, 3}), b2 = mkcvar("b2", {1, 8, 1, 1}),
         conv2 = opr::ConvBias::make(conv_1_2_fp32, w2, b2, param_conv_bias);
    //! group
    auto w3 = mkcvar("w3", {2, 4, 4, 3, 3}), b3 = mkcvar("b3", {1, 8, 1, 1}),
         conv3 = opr::ConvBias::make(conv2, w3, b3, param_conv_bias);
    auto shape_of = opr::GetVarShape::make(conv3);
    auto subtensor = opr::Subtensor::make(
            shape_of, {opr::Subtensor::AxisIndexer::make_interval(
                              0, x.make_scalar(2), None, x.make_scalar(1))});
    opr::Resize::Param param_resize;
    param_resize.format = opr::Resize::Param::Format::NCHW;
    auto resize = opr::ResizeForward::make(conv3, subtensor * 2, param_resize);
    auto mat = mkcvar("mat", {2, 3, 3}),
         warp = opr::WarpPerspectiveForward::make(
                 resize, mat, nullptr, cg::var_from_tensor_shape(x, {4, 4}));
    auto b = mkvar("b", {1, 8, 1, 1}),
         elem = opr::Elemwise::make({warp + b}, opr::Elemwise::Param::Mode::RELU);
    //! Dense
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::DENSE;
    param_conv_bias.pad_h = param_conv_bias.pad_w = 1;
    auto w3_2 = mkcvar("w3_2", {16, 8, 3, 3}), b3_2 = mkcvar("b3_2", {1, 16, 1, 1}),
         conv3_2 = opr::ConvBias::make(
                 elem, w3_2, b3_2, param_conv_bias, {}, OperatorNodeConfig("conv3_2"));
    //! s8 group conv
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::GROUP;
    auto conv3_2_q8 = opr::TypeCvt::make(conv3_2, dtype::QuantizedS8(2.5f));
    auto w3_3 = mkcvar_dtype("w3_3", {4, 8, 4, 3, 3}, dtype::QuantizedS8(2.5f)),
         b3_3 = mkcvar_dtype("b3_3", {1, 32, 1, 1}, dtype::QuantizedS32(6.25f)),
         conv3_3_q = opr::ConvBias::make(
                 conv3_2_q8, w3_3, b3_3, param_conv_bias, {},
                 OperatorNodeConfig{"conv_3_3_q", cn, dtype::QuantizedS8{6.25f}});
    auto conv3_3 = opr::TypeCvt::make(conv3_3_q, dtype::Float32());
    //! Dense
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::DENSE;
    auto w4 = mkcvar("w4", {4, 32, 3, 3}), b4 = mkcvar("b4", {1, 4, 1, 1}),
         conv4 = opr::ConvBias::make(
                 conv3_3, w4, b4, param_conv_bias, {}, OperatorNodeConfig("conv4"));
    auto w5 = mkcvar("w5", {6, 4, 3, 3}), b5 = mkcvar("b5", {1, 6, 1, 1}),
         conv5 = opr::ConvBias::make(
                 conv4, w5, b5, param_conv_bias, {}, OperatorNodeConfig("conv5"));
    auto w6 = mkcvar("w6", {4, 6, 3, 3}), b6 = mkcvar("b6", {1, 4, 1, 1}),
         y = opr::ConvBias::make(
                 conv5, w6, b6, param_conv_bias, {}, OperatorNodeConfig("conv6"));
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_fuse_conv_bias_nonlinearity();
    options.enable_nchw44_dot();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    ASSERT_EQ(
            opr::Convolution::Param::Format::NCHW44,
            find_opr<opr::Convolution>(y_opt, "conv1").param().format);
    ASSERT_EQ(
            opr::Convolution::Param::Format::NCHW44_DOT,
            find_opr<opr::Convolution>(y_opt, "conv1_3_q").param().format);
    ASSERT_EQ(
            opr::Convolution::Param::Format::NCHW,
            find_opr<opr::ConvBias>(y_opt, "conv1_f1").param().format);
    ASSERT_EQ(
            opr::Convolution::Param::Format::NCHW44_DOT,
            find_opr<opr::ConvBias>(y_opt, "conv_1_2").param().format);
    ASSERT_EQ(
            opr::Convolution::Param::Format::NCHW44,
            find_opr<opr::ConvBias>(y_opt, "conv3_2").param().format);
    ASSERT_EQ(
            opr::Convolution::Param::Format::NCHW44_DOT,
            find_opr<opr::ConvBias>(y_opt, "conv_3_3_q").param().format);
    ASSERT_EQ(
            opr::Convolution::Param::Format::NCHW44,
            find_opr<opr::ConvBias>(y_opt, "conv4").param().format);
    ASSERT_EQ(
            opr::Convolution::Param::Format::NCHW,
            find_opr<opr::ConvBias>(y_opt, "conv5").param().format);
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(
                    output_file("TestGoptInference.ConvertFormatNCHW44_DOT.json"));
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile(
            {make_callback_copy(y, host_y), make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    //! may go to winograd on x86-32, so set the error tolerance to 1e-1
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-1);
    *host_x = *gen({2, 3, 32, 32}, cn);
    func->execute();
    //! may go to winograd on x86-32, so set the error tolerance to 1e-1
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-1);
}
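
// A group convolution with a single group should still be convertible to the
// NHWCD4 format (which is only implemented by the naive handle).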
TEST(TestGoptInference, ConvertFormatCD4GroupOneConv) {
    // hwcd4 is only supported by the naive handle
    NaiveMegDNNHandleScope naive_megdnn_handle;
    HostTensorGenerator<> gen;
    auto cn = CompNode::load("cpu0");
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name);
    };
    auto x = mkvar("x", {1, 3, 128, 128});
    // ConvBias
    opr::ConvBias::Param param_conv_bias;
    param_conv_bias.pad_h = param_conv_bias.pad_w = 1;
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::GROUP;
    auto w1 = mkcvar("w1", {1, 16, 3, 3, 3}), b1 = mkcvar("b1", {1, 16, 1, 1});
    auto conv1 = opr::ConvBias::make(x, w1, b1, param_conv_bias);
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::GROUP;
    // Convolution
    opr::Convolution::Param param_conv;
    param_conv.pad_h = param_conv.pad_w = 1;
    param_conv.sparse = opr::Convolution::Param::Sparse::GROUP;
    auto w3 = mkcvar("w3", {1, 16, 16, 3, 3});
    auto y = opr::Convolution::make(conv1, w3, param_conv);
    SymbolVar y_opt;
    {
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_nhwcd4();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    }
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile(
            {make_callback_copy(y, host_y), make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
}
#if MGB_CUDA
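
// The fuse-preprocess pass should collapse the hand-written TypeCvt +
// pad-to-4-channels + reshape + Dimshuffle chain below into a single
// RelayoutFormat opr.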
TEST(TestGoptInference, PreProcessCase0) {
    REQUIRE_GPU(1);
    HostTensorGenerator<dtype::Quantized8Asymm, RandomDistribution::UNIFORM> gen(
            dt_quint8(0), dt_quint8(50), 1, 128, 1234);
    auto cn = CompNode::load("gpu0");
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    size_t n = 1;
    size_t c = 3;
    size_t h = 16;
    size_t w = 16;
    auto host_x1 = gen({n, c, h, w}, cn);
    auto x = opr::Host2DeviceCopy::make(*graph, host_x1);
    auto x_q8 = opr::TypeCvt::make(x, dtype::QuantizedS8(1.f), cn);
    auto zero = DTypeScalar(dtype::QuantizedS8(1.f));
    auto zero_tensor = opr::ImmutableTensor::make(*graph, zero, cn);
    auto pad_channel_tensor = opr::Broadcast::make(zero_tensor, {n, 1, h, w}, cn);
    auto padded_x = opr::Concat::make({x_q8, pad_channel_tensor}, 1, cn)
                            .reshape({n, 1, 4, h, w});
    auto result = opr::Dimshuffle::make(padded_x, {0, 1, 3, 4, 2}, 5, cn);
    auto y = result;
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_fuse_preprocess();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(output_file("TestGoptInference.PreProcessCase0.json"));
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile(
            {make_callback_copy(y, host_y), make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-5);
    ASSERT_TRUE(y_opt.node()->owner_opr()->same_type<opr::RelayoutFormat>());
}
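
// Same as PreProcessCase0, but starting from a Uint8 input that is converted
// to float, shifted by -128 and quantized at the end; the whole chain should
// again fuse into one RelayoutFormat.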
TEST(TestGoptInference, PreProcessCase1) {
    REQUIRE_GPU(1);
    HostTensorGenerator<dtype::Uint8, RandomDistribution::UNIFORM> gen(0, 255);
    auto cn = CompNode::load("gpu0");
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    size_t n = 1;
    size_t c = 3;
    size_t h = 16;
    size_t w = 16;
    auto host_x1 = gen({n, c, h, w}, cn);
    auto x = opr::Host2DeviceCopy::make(*graph, host_x1);
    auto x_u8 = opr::TypeCvt::make(x, dtype::Float32(), cn);
    auto x_s8 = x_u8 - 128;
    auto zero = DTypeScalar(dtype::Float32());
    auto zero_tensor = opr::ImmutableTensor::make(*graph, zero, cn);
    auto pad_channel_tensor = opr::Broadcast::make(zero_tensor, {n, 1, h, w}, cn);
    auto padded_x = opr::Concat::make({x_s8, pad_channel_tensor}, 1, cn)
                            .reshape({n, 1, 4, h, w});
    auto nchw4_out = opr::Dimshuffle::make(padded_x, {0, 1, 3, 4, 2}, 5, cn);
    auto result = opr::TypeCvt::make(nchw4_out, dtype::QuantizedS8(1.f));
    auto y = result;
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_fuse_preprocess();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(output_file("TestGoptInference.PreProcessCase1.json"));
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile(
            {make_callback_copy(y, host_y), make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-5);
    ASSERT_TRUE(y_opt.node()->owner_opr()->same_type<opr::RelayoutFormat>());
}
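
// A WarpPerspective in front of the preprocessing chain should be fused with
// it, so the optimized endpoint is a WarpPerspective running in the
// NHWC_NCHW4_IC_SMALL format.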
TEST(TestGoptInference, WarpAndPreProcessCase0) {
    REQUIRE_GPU(1);
    HostTensorGenerator<dtype::Uint8, RandomDistribution::UNIFORM> gen(0, 255);
    auto cn = CompNode::load("gpu0");
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    size_t n = 1;
    size_t c = 3;
    size_t h = 16;
    size_t w = 16;
    auto host_x1 = gen({n, h, w, c}, cn);
    auto x = opr::Host2DeviceCopy::make(*graph, host_x1);
    auto mat_host =
            std::make_shared<HostTensorND>(cn, TensorShape{n, 3, 3}, dtype::Float32());
    warp_perspective_mat_gen(*mat_host, n, h, w);
    auto mat = opr::Host2DeviceCopy::make(*graph, mat_host).rename("mat");
    opr::WarpPerspective::Param warp_param;
    warp_param.format = opr::WarpPerspective::Param::Format::NHWC;
    auto x_warp = opr::WarpPerspective::make(x, mat, TensorShape{h, w}, warp_param);
    auto x_nchw = opr::Dimshuffle::make(x_warp, {0, 3, 1, 2}, 4, cn);
    auto x_u8 = opr::TypeCvt::make(x_nchw, dtype::Float32(), cn);
    auto x_s8 = x_u8 - 128;
    auto zero = DTypeScalar(dtype::Float32());
    auto zero_tensor = opr::ImmutableTensor::make(*graph, zero, cn);
    auto pad_channel_tensor = opr::Broadcast::make(zero_tensor, {n, 1, h, w}, cn);
    auto padded_x = opr::Concat::make({x_s8, pad_channel_tensor}, 1, cn)
                            .reshape({n, 1, 4, h, w});
    auto nchw4_out = opr::Dimshuffle::make(padded_x, {0, 1, 3, 4, 2}, 5, cn);
    auto result = opr::TypeCvt::make(nchw4_out, dtype::QuantizedS8(1.f));
    auto y = result;
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_fuse_preprocess();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    ASSERT_TRUE(y_opt.node()->owner_opr()->same_type<opr::WarpPerspective>());
    ASSERT_EQ(
            opr::WarpPerspective::Param::Format::NHWC_NCHW4_IC_SMALL,
            find_opr<opr::WarpPerspective>(y_opt).param().format);
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(
                    output_file("TestGoptInference.WarpAndPreProcessCase0.json"));
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile(
            {make_callback_copy(y, host_y), make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-5);
}
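
// With enable_nchw64(), a 3-channel NCHW input feeding a quantized ConvBias
// should be padded automatically, i.e. a RelayoutFormat with mode NCHW_NCHW4
// is inserted in front of the convolution.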
TEST(TestGoptInference, PreProcessCaseAutopadNCHW64) {
    REQUIRE_GPU(1);
    HostTensorGenerator<dtype::Uint8, RandomDistribution::UNIFORM> gen(0, 255);
    auto cn = CompNode::load("gpu0");
    auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
    auto sm_ver = prop.major * 10 + prop.minor;
    if (sm_ver < 75) {
        printf("This test case is ignored due to insufficient CUDA compute "
               "capability (got: %d, expected: %d)\n",
               sm_ver, 75);
        return;
    }
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkcvar = [&](const char* name, const TensorShape& shp, const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name),
                dtype);
    };
    size_t n = 2;
    size_t c = 3;
    size_t h = 32;
    size_t w = 32;
    auto host_x1 = gen({n, c, h, w}, cn);
    auto x = opr::Host2DeviceCopy::make(*graph, host_x1);
    auto x_u8_fp32 = opr::TypeCvt::make(x, dtype::Float32(), cn);
    auto x_s8_fp32 = x_u8_fp32 - 128;
    auto x_s8 = opr::TypeCvt::make(x_s8_fp32, dtype::QuantizedS8(2.5f), cn);
    auto weight = mkcvar("weight", {16, 3, 3, 3}, dtype::QuantizedS8(2.5f)),
         bias = mkcvar("bias", {1, 16, 1, 1}, dtype::QuantizedS32(6.25f));
    opr::ConvBias::Param param;
    param.format = opr::ConvBias::Param::Format::NCHW;
    param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
    param.stride_h = param.stride_w = 2;
    param.pad_h = param.pad_w = 1;
    auto result = opr::ConvBias::make(
            x_s8, weight, bias, param, {},
            OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
    auto y = result;
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_nchw64();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(
                    output_file("TestGoptInference.PreProcessCaseAutopadNCHW64.json"));
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile(
            {make_callback_copy(y, host_y), make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-5);
    ASSERT_TRUE(
            find_opr<opr::RelayoutFormat>(y_opt).param().mode ==
            opr::RelayoutFormat::Param::Mode::NCHW_NCHW4);
}
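
// NHWC variant of the autopad case: the hand-written zero-padding +
// Dimshuffle prologue should be recognized by preprocess fusion and replaced
// with a RelayoutFormat in mode NCHW_NCHW4.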
TEST(TestGoptInference, PreProcessCaseAutopadNHWC) {
    REQUIRE_GPU(1);
    HostTensorGenerator<dtype::Uint8, RandomDistribution::UNIFORM> gen(0, 255);
    auto cn = CompNode::load("gpu0");
    auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
    auto sm_ver = prop.major * 10 + prop.minor;
    if (sm_ver < 75) {
        printf("This test case is ignored due to insufficient CUDA compute "
               "capability (got: %d, expected: %d)\n",
               sm_ver, 75);
        return;
    }
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkcvar = [&](const char* name, const TensorShape& shp, const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name),
                dtype);
    };
    size_t n = 2;
    size_t c = 3;
    size_t h = 32;
    size_t w = 32;
    auto host_x1 = gen({n, c, h, w}, cn);
    auto x = opr::Host2DeviceCopy::make(*graph, host_x1);
    auto x_u8_fp32 = opr::TypeCvt::make(x, dtype::Float32(), cn);
    auto x_s8_fp32 = x_u8_fp32 - 128;
    auto x_s8 = opr::TypeCvt::make(x_s8_fp32, dtype::QuantizedS8(2.5f), cn);
    auto host_val = std::make_shared<HostTensorND>(cn, dtype::QuantizedS8(2.5f));
    TensorShape scalar{1, 1, 1, 1};
    host_val->resize(scalar);
    auto ptr = host_val->raw_ptr();
    size_t size_bytes =
            TensorLayout{scalar, dtype::QuantizedS8(2.5f)}.span().dist_byte();
    std::memset(ptr, 0, size_bytes);
    auto padding = opr::ImmutableTensor::make(*graph, *host_val);
    padding = opr::Broadcast::make(padding, {n, 1, h, w});
    auto padded_x = opr::Concat::make({x_s8, padding}, 1);
    auto nhwc_x = opr::Dimshuffle::make(padded_x, {0, 2, 3, 1});
    auto weight = mkcvar("weight", {16, 3, 3, 4}, dtype::QuantizedS8(2.5f)),
         bias = mkcvar("bias", {1, 1, 1, 16}, dtype::QuantizedS32(6.25f));
    opr::ConvBias::Param param;
    param.format = opr::ConvBias::Param::Format::NHWC;
    param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
    param.stride_h = param.stride_w = 2;
    param.pad_h = param.pad_w = 1;
    auto result = opr::ConvBias::make(
            nhwc_x, weight, bias, param, {},
            OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
    auto y = opr::TypeCvt::make(result, dtype::Float32());
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_fuse_preprocess();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(
                    output_file("TestGoptInference.PreProcessCaseAutopadNHWC.json"));
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile(
            {make_callback_copy(y, host_y), make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-5);
    ASSERT_TRUE(
            find_opr<opr::RelayoutFormat>(y_opt).param().mode ==
            opr::RelayoutFormat::Param::Mode::NCHW_NCHW4);
}
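
// Warp + Dimshuffle + TypeCvt without channel padding: fusion should produce
// a single WarpPerspective endpoint in NHWC_NCHW format.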
TEST(TestGoptInference, WarpAndPreProcessCase1) {
    REQUIRE_GPU(1);
    HostTensorGenerator<dtype::Uint8, RandomDistribution::UNIFORM> gen(0, 255);
    auto cn = CompNode::load("gpu0");
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    size_t n = 1;
    size_t c = 3;
    size_t h = 16;
    size_t w = 16;
    auto host_x1 = gen({n, h, w, c}, cn);
    auto x = opr::Host2DeviceCopy::make(*graph, host_x1);
    auto mat_host =
            std::make_shared<HostTensorND>(cn, TensorShape{n, 3, 3}, dtype::Float32());
    warp_perspective_mat_gen(*mat_host, n, h, w);
    auto mat = opr::Host2DeviceCopy::make(*graph, mat_host).rename("mat");
    opr::WarpPerspective::Param warp_param;
    warp_param.format = opr::WarpPerspective::Param::Format::NHWC;
    auto x_warp = opr::WarpPerspective::make(x, mat, TensorShape{h, w}, warp_param);
    auto x_nchw = opr::Dimshuffle::make(x_warp, {0, 3, 1, 2}, 4, cn);
    auto result = opr::TypeCvt::make(x_nchw, dtype::Float32(), cn);
    auto y = result;
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_fuse_preprocess();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    ASSERT_TRUE(y_opt.node()->owner_opr()->same_type<opr::WarpPerspective>());
    ASSERT_EQ(
            opr::WarpPerspective::Param::Format::NHWC_NCHW,
            find_opr<opr::WarpPerspective>(y_opt).param().format);
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(
                    output_file("TestGoptInference.WarpAndPreProcessCase1.json"));
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile(
            {make_callback_copy(y, host_y), make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-5);
}
#if CUDA_VERSION >= 10020
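
// FoldingConvBiasDimshufflePass should absorb the trailing NCHW4->NCHW
// shuffle-and-reshape into the ConvBias, turning its format into NCHW4_NCHW
// and leaving no standalone Dimshuffle behind.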
TEST(TestGoptInference, FoldingConvDimshuffle) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    REQUIRE_CUDA_COMPUTE_CAPABILITY(6, 1);
    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp, const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name), dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp, const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name),
                dtype);
    };
    auto nchw42nchw = [](SymbolVar x) {
        auto xshp = opr::GetVarShape::make(x);
        auto cv = [&x](int v) { return x.make_scalar(v); };
        auto sub = [&xshp, &cv](int idx) {
            return opr::IndexAt::make(xshp, {{0, cv(idx)}});
        };
        auto tshp0 = opr::Concat::make({sub(0), sub(1) * 4, sub(2), sub(3)}, 0);
        auto y0 = opr::Dimshuffle::make(x, {0, 1, 4, 2, 3});
        auto y1 = opr::Reshape::make(y0, tshp0);
        return y1;
    };
    auto x = mkvar("x", {32, 16, 4, 8, 4}, dtype::QuantizedS8(2.5f)),
         w = mkcvar("w", {64, 16, 3, 3, 4}, dtype::QuantizedS8(2.5f)),
         b = mkcvar("b", {1, 16, 1, 1, 4}, dtype::QuantizedS32(6.25f));
    opr::ConvBias::Param param;
    param.format = opr::ConvBias::Param::Format::NCHW4;
    param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
    param.stride_h = param.stride_w = 2;
    param.pad_h = param.pad_w = 1;
    auto y = opr::ConvBias::make(
            x, w, b, param, {}, OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
    y = opr::TypeCvt::make(y, dtype::Float32());
    y = nchw42nchw(y);
    SymbolVar y_fuse, y_non_fuse;
    unpack_vector(
            gopt::GraphOptimizer{}
                    .add_pass<gopt::ShuffleShuffleRemovePass>()
                    .add_pass<gopt::FoldingConvBiasDimshufflePass>()
                    .add_pass<gopt::ParamFusePass>()
                    .apply({{y}})
                    .endpoint_vars(),
            y_fuse);
    gopt::modify_opr_algo_strategy_inplace(
            {y_fuse},
            opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy::PROFILE);
    graph->compile({{y_fuse, {}}})
            ->to_json()
            ->writeto_fpath(
                    output_file("TestGoptInference.FoldingConvDimshuffle.json"));
    ASSERT_EQ(
            opr::ConvBias::Param::Format::NCHW4_NCHW,
            find_opr<opr::ConvBias>(y_fuse).param().format);
    ASSERT_EQ(0u, find_opr_num<opr::Dimshuffle>(y_fuse));
    unpack_vector(gopt::GraphOptimizer{}.apply({{y}}).endpoint_vars(), y_non_fuse);
    HostTensorND host_y_fuse, host_y_non_fuse;
    auto func = graph->compile(
            {make_callback_copy(y_fuse, host_y_fuse),
             make_callback_copy(y_non_fuse, host_y_non_fuse)});
    func->execute();
}
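
// Same folding, but for a hand-written NCHW4->NCHW32 reshuffle: the ConvBias
// should end up in NCHW4_NCHW32 format with no Dimshuffle remaining.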
TEST(TestGoptInference, FoldingConvDimshuffleNCHW4NCHW32) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    REQUIRE_CUDA_COMPUTE_CAPABILITY(6, 1);
    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp, const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name), dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp, const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name),
                dtype);
    };
    auto nchw42nchw32 = [](SymbolVar x) {
        auto xshp = opr::GetVarShape::make(x);
        auto cv = [&x](int v) { return x.make_scalar(v); };
        auto sub = [&xshp, &cv](int idx) {
            return opr::IndexAt::make(xshp, {{0, cv(idx)}});
        };
        auto tshp0 = opr::Concat::make(
                     {sub(0), sub(1) / 8, cv(8), sub(2), sub(3), sub(4)}, 0),
             tshp1 = opr::Concat::make(
                     {sub(0), sub(1) / 8, sub(2), sub(3), sub(4) * 8}, 0);
        auto y0 = opr::Reshape::make(x, tshp0);
        auto y1 = opr::Dimshuffle::make(y0, {0, 1, 3, 4, 2, 5});
        auto y2 = opr::Reshape::make(y1, tshp1);
        return y2;
    };
    auto x = mkvar("x", {32, 16, 4, 8, 4}, dtype::QuantizedS8(2.5f)),
         w = mkcvar("w", {64, 16, 3, 3, 4}, dtype::QuantizedS8(2.5f)),
         b = mkcvar("b", {1, 16, 1, 1, 4}, dtype::QuantizedS32(6.25f));
    opr::ConvBias::Param param;
    param.format = opr::ConvBias::Param::Format::NCHW4;
    param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
    param.stride_h = param.stride_w = 2;
    param.pad_h = param.pad_w = 1;
    auto y = opr::ConvBias::make(
            x, w, b, param, {}, OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
    y = nchw42nchw32(y);
    y = opr::TypeCvt::make(y, dtype::Float32());
    SymbolVar y_fuse, y_non_fuse;
    unpack_vector(
            gopt::GraphOptimizer{}
                    .add_pass<gopt::FoldingConvBiasDimshufflePass>()
                    .add_pass<gopt::ParamFusePass>()
                    .apply({{y}})
                    .endpoint_vars(),
            y_fuse);
    gopt::modify_opr_algo_strategy_inplace(
            {y_fuse},
            opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy::PROFILE);
    graph->compile({{y_fuse, {}}})
            ->to_json()
            ->writeto_fpath(output_file(
                    "TestGoptInference.FoldingConvDimshuffleNCHW4NCHW32.json"));
    ASSERT_EQ(
            opr::ConvBias::Param::Format::NCHW4_NCHW32,
            find_opr<opr::ConvBias>(y_fuse).param().format);
    ASSERT_EQ(0u, find_opr_num<opr::Dimshuffle>(y_fuse));
    unpack_vector(gopt::GraphOptimizer{}.apply({{y}}).endpoint_vars(), y_non_fuse);
    HostTensorND host_y_fuse, host_y_non_fuse;
    auto func = graph->compile(
            {make_callback_copy(y_fuse, host_y_fuse),
             make_callback_copy(y_non_fuse, host_y_non_fuse)});
    func->execute();
    MGB_ASSERT_TENSOR_EQ(host_y_fuse, host_y_non_fuse);
}
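
// With enable_nchw32() plus conv-bias fusion, one of the two convolutions is
// expected to be folded with the surrounding relayouts into a NCHW32_NCHW4
// ConvBias, leaving exactly one Dimshuffle in the optimized graph.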
TEST(TestGoptInference, FoldingConvDimshuffleNCHW32NCHW4) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    REQUIRE_CUDA_COMPUTE_CAPABILITY(7, 5);
    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp, const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name), dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp, const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name),
                dtype);
    };
    auto x = mkvar("x", {32, 16, 4, 8, 4}, dtype::QuantizedS8(2.5f)),
         w = mkcvar("w", {64, 16, 3, 3, 4}, dtype::QuantizedS8(2.5f)),
         b = mkcvar("b", {1, 16, 1, 1, 4}, dtype::QuantizedS32(6.25f)),
         w1 = mkcvar("w1", {16, 16, 3, 3, 4}, dtype::QuantizedS8(2.5f)),
         b1 = mkcvar("b1", {1, 4, 1, 1, 4}, dtype::QuantizedS32(6.25f));
    opr::ConvBias::Param param;
    param.format = opr::ConvBias::Param::Format::NCHW4;
    param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
    param.stride_h = param.stride_w = 2;
    param.pad_h = param.pad_w = 1;
    auto y = opr::ConvBias::make(
            x, w, b, param, {}, OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
    param.stride_h = param.stride_w = 1;
    y = opr::ConvBias::make(
            y, w1, b1, param, {}, OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
    y = opr::TypeCvt::make(y, dtype::Float32());
    SymbolVar y_fuse, y_non_fuse;
    {
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_nchw32().enable_fuse_conv_bias_nonlinearity();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_fuse);
    }
    graph->compile({{y_fuse, {}}})
            ->to_json()
            ->writeto_fpath(output_file(
                    "TestGoptInference.FoldingConvDimshuffleNCHW32NCHW4.json"));
    ASSERT_EQ(1u, find_opr_num<opr::Dimshuffle>(y_fuse));
    bool found = false;
    cg::DepOprIter{[&found](cg::OperatorNodeBase* opr) {
        if (!found && opr->same_type<opr::ConvBias>()) {
            opr::ConvBias* cb = &opr->cast_final_safe<opr::ConvBias>();
            if (cb->param().format == opr::ConvBias::Param::Format::NCHW32_NCHW4)
                found = true;
        }
    }}.add(y_fuse.node()->owner_opr());
    EXPECT_TRUE(found);
    unpack_vector(gopt::GraphOptimizer{}.apply({{y}}).endpoint_vars(), y_non_fuse);
    HostTensorND host_y_fuse, host_y_non_fuse;
    auto func = graph->compile(
            {make_callback_copy(y_fuse, host_y_fuse),
             make_callback_copy(y_non_fuse, host_y_non_fuse)});
    func->execute();
    MGB_ASSERT_TENSOR_EQ(host_y_fuse, host_y_non_fuse);
}
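
// Mixed-precision folding: a QuantizedS8 ConvBias followed by a QuantizedS4
// one; with enable_nchw64() the pass should produce a ConvBias in NCHW4_NHWC
// format somewhere on the optimized path.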
TEST(TestGoptInference, FoldingConvDimshuffleNCHW4NHWC) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    REQUIRE_CUDA_COMPUTE_CAPABILITY(7, 5);
    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp, const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name), dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp, const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name),
                dtype);
    };
    auto x = mkvar("x", {32, 4, 23, 40}, dtype::QuantizedS8(2.5f)),
         w = mkcvar("w", {32, 4, 3, 3}, dtype::QuantizedS8(2.5f)),
         b = mkcvar("b", {1, 32, 1, 1}, dtype::QuantizedS32(6.25f)),
         w1 = mkcvar("w1", {32, 32, 3, 3}, dtype::QuantizedS4(1.234f)),
         b1 = mkcvar("b1", {1, 32, 1, 1}, dtype::QuantizedS32(12.34567f * 1.234f));
    opr::ConvBias::Param param;
    param.format = opr::ConvBias::Param::Format::NCHW;
    param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
    param.stride_h = param.stride_w = 1;
    param.pad_h = param.pad_w = 1;
    auto y = opr::ConvBias::make(
            x, w, b, param, {}, OperatorNodeConfig{dtype::QuantizedS8(12.34567f)});
    y = opr::TypeCvt::make(y, dtype::QuantizedS4(12.34567f));
    y = opr::ConvBias::make(
            y, w1, b1, param, {}, OperatorNodeConfig{dtype::QuantizedS4(56.71234f)});
    y = opr::TypeCvt::make(y, dtype::Float32());
    SymbolVar y_fuse, y_non_fuse;
    {
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_nchw64();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_fuse);
    }
    using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy;
    S strategy = S::PROFILE;
    gopt::modify_opr_algo_strategy_inplace({y_fuse}, strategy);
    HostTensorND host_y_fuse;
    auto func1 = graph->compile({make_callback_copy(y_fuse, host_y_fuse)});
    func1->execute();
    graph->compile({{y_fuse, {}}})
            ->to_json()
            ->writeto_fpath(output_file(
                    "TestGoptInference.FoldingConvDimshuffleNCHW4NHWC.json"));
    size_t nr_typecvt = find_opr_num<opr::TypeCvt>(y_fuse);
    ASSERT_EQ(2u, nr_typecvt);
    bool found = false;
    cg::DepOprIter{[&found](cg::OperatorNodeBase* opr) {
        if (!found && opr->same_type<opr::ConvBias>()) {
            opr::ConvBias* cb = &opr->cast_final_safe<opr::ConvBias>();
            if (cb->param().format == opr::ConvBias::Param::Format::NCHW4_NHWC)
                found = true;
        }
    }}.add(y_fuse.node()->owner_opr());
    EXPECT_TRUE(found);
    unpack_vector(gopt::GraphOptimizer{}.apply({{y}}).endpoint_vars(), y_non_fuse);
    gopt::modify_opr_algo_strategy_inplace({y_non_fuse}, strategy);
    HostTensorND host_y_non_fuse;
    auto func2 = graph->compile({make_callback_copy(y_non_fuse, host_y_non_fuse)});
    func2->execute();
    MGB_ASSERT_TENSOR_EQ(host_y_fuse, host_y_non_fuse);
}
#endif
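
// PaddingChannelPass should pad the 20- and 24-channel convolutions below up
// to 32 channels while keeping the endpoint shape and values unchanged.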
TEST(TestGoptInference, PaddingChannels) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    REQUIRE_CUDA_COMPUTE_CAPABILITY(6, 1);
    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp, const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name), dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp, const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name),
                dtype);
    };
    auto x = mkvar("x", {16, 3, 14, 14}, dtype::QuantizedS8(2.5f)),
         w = mkcvar("w", {20, 3, 3, 3}, dtype::QuantizedS8(2.5f)),
         b = mkcvar("b", {1, 20, 1, 1}, dtype::QuantizedS32(6.25f));
    opr::ConvBias::Param param;
    param.format = opr::ConvBias::Param::Format::NCHW;
    param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
    param.stride_h = param.stride_w = 1;
    param.pad_h = param.pad_w = 1;
    auto y = opr::ConvBias::make(
            x, w, b, param, {}, OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
    auto w1 = mkcvar("w1", {24, 20, 3, 3}, dtype::QuantizedS8(2.5f)),
         b1 = mkcvar("b1", {1, 24, 1, 1}, dtype::QuantizedS32(6.25f));
    auto y1 = opr::ConvBias::make(
            y, w1, b1, param, {}, OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
    auto w2 = mkcvar("w2", {20, 24, 3, 3}, dtype::QuantizedS8(2.5f)),
         b2 = mkcvar("b2", {1, 20, 1, 1}, dtype::QuantizedS32(6.25f));
    auto y2 = opr::ConvBias::make(
            y1, w2, b2, param, {}, OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
    using ElemMultiMode = opr::ElemwiseMultiType::Param::Mode;
    auto y3 = opr::ElemwiseMultiType::make(
            {y, y2}, {ElemMultiMode::QFUSE_ADD_RELU},
            OperatorNodeConfig{dtype::QuantizedS8{1.2f}});
    y3 = opr::TypeCvt::make(y3, dtype::Float32());
    SymbolVar y3_pad;
    unpack_vector(
            gopt::GraphOptimizer{}
                    .add_pass<gopt::PaddingChannelPass>()
                    .apply({{y3}})
                    .endpoint_vars(),
            y3_pad);
    ASSERT_EQ(y3_pad.node()->shape()[1], y3.node()->shape()[1]);
    SmallVector<cg::OperatorNodeBase*> oprs;
    auto cb = [&oprs](cg::OperatorNodeBase* opr) {
        if (opr->same_type<opr::ConvBias>()) {
            oprs.push_back(opr);
        }
    };
    cg::DepOprIter{cb}.add(y3_pad.node()->owner_opr());
    ASSERT_EQ(oprs.size(), 3);
    ASSERT_EQ(oprs[0]->output(0)->shape()[1], 32);
    ASSERT_EQ(oprs[1]->output(0)->shape()[1], 32);
    ASSERT_EQ(oprs[2]->output(0)->shape()[1], 32);
    HostTensorND t1, t2;
    auto func1 = graph->compile({make_callback_copy(y3, t1)});
    func1->execute();
    auto func2 = graph->compile({make_callback_copy(y3_pad, t2)});
    func2->execute();
    MGB_ASSERT_TENSOR_EQ(t1, t2);
}
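
// A Concat along the batch dimension downstream of padded convolutions must
// not be confused by the extra channels; results must match the unpadded
// graph exactly.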
TEST(TestGoptInference, ConcatAfterPaddingChannels) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    REQUIRE_CUDA_COMPUTE_CAPABILITY(6, 1);
    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp, const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name), dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp, const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name),
                dtype);
    };
    auto x = mkvar("x", {16, 3, 14, 14}, dtype::QuantizedS8(2.5f)),
         w = mkcvar("w", {18, 3, 3, 3}, dtype::QuantizedS8(2.5f)),
         b = mkcvar("b", {1, 18, 1, 1}, dtype::QuantizedS32(6.25f));
    opr::ConvBias::Param param;
    param.format = opr::ConvBias::Param::Format::NCHW;
    param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
    param.stride_h = param.stride_w = 1;
    param.pad_h = param.pad_w = 1;
    auto y = opr::ConvBias::make(
            x, w, b, param, {}, OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
    auto w1 = mkcvar("w1", {18, 18, 3, 3}, dtype::QuantizedS8(2.5f)),
         b1 = mkcvar("b1", {1, 18, 1, 1}, dtype::QuantizedS32(6.25f));
    auto y1 = opr::ConvBias::make(
            y, w1, b1, param, {}, OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
    // concat at batch dim
    auto y2 = opr::Concat::make({y, y1}, 0);
    y2 = opr::TypeCvt::make(y2, dtype::Float32());
    SymbolVar y2_pad;
    unpack_vector(
            gopt::GraphOptimizer{}
                    .add_pass<gopt::PaddingChannelPass>()
                    .apply({{y2}})
                    .endpoint_vars(),
            y2_pad);
    ASSERT_EQ(y2_pad.node()->shape()[1], y2.node()->shape()[1]);
    SmallVector<cg::OperatorNodeBase*> oprs;
    auto cb = [&oprs](cg::OperatorNodeBase* opr) {
        if (opr->same_type<opr::ConvBias>()) {
            oprs.push_back(opr);
        }
    };
    cg::DepOprIter{cb}.add(y2_pad.node()->owner_opr());
    ASSERT_EQ(oprs.size(), 2);
    ASSERT_EQ(oprs[0]->output(0)->shape()[1], 32);
    ASSERT_EQ(oprs[1]->output(0)->shape()[1], 32);
    HostTensorND t1, t2;
    auto func1 = graph->compile({make_callback_copy(y2, t1)});
    func1->execute();
    auto func2 = graph->compile({make_callback_copy(y2_pad, t2)});
    func2->execute();
    MGB_ASSERT_TENSOR_EQ(t1, t2);
}
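
// Channel padding should be propagated through Pooling, i.e. the pooling opr
// in the padded graph operates on the 32-channel tensor.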
TEST(TestGoptInference, PaddingChannelsWithPooling) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    REQUIRE_CUDA_COMPUTE_CAPABILITY(6, 1);
    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp, const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name), dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp, const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name),
                dtype);
    };
    auto x = mkvar("x", {16, 3, 14, 14}, dtype::QuantizedS8(2.5f)),
         w = mkcvar("w", {20, 3, 3, 3}, dtype::QuantizedS8(2.5f)),
         b = mkcvar("b", {1, 20, 1, 1}, dtype::QuantizedS32(6.25f));
    opr::ConvBias::Param param;
    param.format = opr::ConvBias::Param::Format::NCHW;
    param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
    param.stride_h = param.stride_w = 1;
    param.pad_h = param.pad_w = 1;
    auto y = opr::ConvBias::make(
            x, w, b, param, {}, OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
    auto w1 = mkcvar("w1", {24, 20, 3, 3}, dtype::QuantizedS8(2.5f)),
         b1 = mkcvar("b1", {1, 24, 1, 1}, dtype::QuantizedS32(6.25f));
    auto y1 = opr::ConvBias::make(
            y, w1, b1, param, {}, OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
    opr::Pooling::Param pool_param;
    pool_param.format = opr::Pooling::Param::Format::NCHW;
    y1 = opr::Pooling::make(y1, pool_param);
    y1 = opr::TypeCvt::make(y1, dtype::Float32());
    SymbolVar y1_pad;
    unpack_vector(
            gopt::GraphOptimizer{}
                    .add_pass<gopt::PaddingChannelPass>()
                    .apply({{y1}})
                    .endpoint_vars(),
            y1_pad);
    ASSERT_EQ(y1_pad.node()->shape()[1], y1.node()->shape()[1]);
    SmallVector<cg::OperatorNodeBase*> oprs;
    auto cb = [&oprs](cg::OperatorNodeBase* opr) {
        if (opr->same_type<opr::Pooling>()) {
            oprs.push_back(opr);
        }
    };
    cg::DepOprIter{cb}.add(y1_pad.node()->owner_opr());
    ASSERT_EQ(oprs[0]->output(0)->shape()[1], 32);
    HostTensorND t1, t2;
    auto func1 = graph->compile({make_callback_copy(y1, t1)});
    func1->execute();
    auto func2 = graph->compile({make_callback_copy(y1_pad, t2)});
    func2->execute();
    MGB_ASSERT_TENSOR_EQ(t1, t2);
}
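
// WarpPerspective should likewise consume the padded 32-channel tensor while
// the endpoint shape and values remain unchanged.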
// FIXME: replace "cpu" with "gpu" to enable GPU validation
TEST(TestGoptInference, PaddingChannelsWithWarpPerspective) {
    auto cn = CompNode::load("cpu0");
    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp, const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name), dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp, const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name),
                dtype);
    };
    std::shared_ptr<HostTensorND> mat = std::make_shared<HostTensorND>(
            cn, TensorShape{16, 3, 3}, dtype::Float32());
    warp_perspective_mat_gen(*mat, 16, 14, 14);
    auto mat_var = opr::Host2DeviceCopy::make(*graph, mat).rename("mat");
    auto x = mkvar("x", {16, 3, 14, 14}, dtype::QuantizedS8(2.5f)),
         w = mkcvar("w", {20, 3, 3, 3}, dtype::QuantizedS8(2.5f)),
         b = mkcvar("b", {1, 20, 1, 1}, dtype::QuantizedS32(6.25f));
    opr::ConvBias::Param param;
    param.format = opr::ConvBias::Param::Format::NCHW;
    param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
    param.stride_h = param.stride_w = 1;
    param.pad_h = param.pad_w = 1;
    auto y = opr::ConvBias::make(
            x, w, b, param, {}, OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
    auto w1 = mkcvar("w1", {24, 20, 3, 3}, dtype::QuantizedS8(2.5f)),
         b1 = mkcvar("b1", {1, 24, 1, 1}, dtype::QuantizedS32(6.25f));
    auto y1 = opr::ConvBias::make(
            y, w1, b1, param, {}, OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
    opr::WarpPerspective::Param warp_param;
    warp_param.format = opr::WarpPerspective::Param::Format::NCHW;
    y1 = opr::WarpPerspective::make(y1, mat_var, TensorShape{14, 14}, warp_param);
    y1 = opr::TypeCvt::make(y1, dtype::Float32());
    SymbolVar y1_pad;
    unpack_vector(
            gopt::GraphOptimizer{}
                    .add_pass<gopt::PaddingChannelPass>()
                    .apply({{y1}})
                    .endpoint_vars(),
            y1_pad);
    ASSERT_EQ(y1_pad.node()->shape()[1], y1.node()->shape()[1]);
    SmallVector<cg::OperatorNodeBase*> oprs;
    auto cb = [&oprs](cg::OperatorNodeBase* opr) {
        if (opr->same_type<opr::WarpPerspective>()) {
            oprs.push_back(opr);
        }
    };
    cg::DepOprIter{cb}.add(y1_pad.node()->owner_opr());
    ASSERT_EQ(oprs[0]->output(0)->shape()[1], 32);
    HostTensorND t1, t2;
    auto func1 = graph->compile({make_callback_copy(y1, t1)});
    func1->execute();
    auto func2 = graph->compile({make_callback_copy(y1_pad, t2)});
    func2->execute();
    MGB_ASSERT_TENSOR_EQ(t1, t2);
}
#endif

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}