/**
 * \file src/gopt/test/inference.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */
#include "megbrain/opr/dnn/local.h"

#include "megbrain/test/helper.h"

#include "megbrain/gopt/basic_arith.h"
#include "megbrain/gopt/gtrans.h"
#include "megbrain/gopt/inference.h"

#include "megbrain/opr/basic_arith_wrapper.h"
#include "megbrain/opr/blas.h"
#include "megbrain/opr/dnn/batch_norm.h"
#include "megbrain/opr/dnn/convolution.h"
#include "megbrain/opr/dnn/pooling.h"
#include "megbrain/opr/imgproc.h"
#include "megbrain/opr/io.h"
#include "megbrain/opr/nn_int.h"
#include "megbrain/opr/tensor_gen.h"
#include "megbrain/opr/tensor_manip.h"
#include "megbrain/opr/utility.h"

#include "./helper.h"
#include "megbrain/comp_node_env.h"
#include "megdnn/tensor_format.h"

#include <random>

using namespace mgb;

namespace {
//! find the first operator of a specific type; raise an exception if not found
template <typename T>
T& find_opr(SymbolVar endpoint) {
    T* found = nullptr;
    auto cb = [&found](cg::OperatorNodeBase* opr) {
        if (!found && opr->same_type<T>()) {
            found = &opr->cast_final_safe<T>();
        }
    };
    cg::DepOprIter{cb}.add(endpoint.node()->owner_opr());
    mgb_assert(found, "not found opr from %s", endpoint.node()->name().c_str());
    return *found;
}
template <typename T>
T& find_opr(SymbolVar endpoint, const std::string& node_name) {
    T* found = nullptr;
    auto cb = [&found, &node_name](cg::OperatorNodeBase* opr) {
        if (!found && opr->same_type<T>() && opr->name() == node_name) {
            found = &opr->cast_final_safe<T>();
        }
    };
    cg::DepOprIter{cb}.add(endpoint.node()->owner_opr());
    mgb_assert(found, "not found opr %s from %s", node_name.c_str(),
               endpoint.node()->name().c_str());
    return *found;
}

template <typename T>
size_t find_opr_num(SymbolVar endpoint) {
    size_t opr_num = 0;
    auto cb = [&opr_num](cg::OperatorNodeBase* opr) {
        if (opr->same_type<T>()) {
            opr_num++;
        }
    };
    cg::DepOprIter{cb}.add(endpoint.node()->owner_opr());
    return opr_num;
}
class NaiveMegDNNHandleScope {
    int m_orig_level;

public:
    NaiveMegDNNHandleScope()
            : m_orig_level{MegDNNHandle::exchange_default_dbg_level(2)} {
        CompNode::finalize();
    }
    ~NaiveMegDNNHandleScope() {
        auto set = MegDNNHandle::exchange_default_dbg_level(m_orig_level);
        mgb_assert(set == 2);
        CompNode::finalize();
    }
};
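
// Usage note (added comment, not in the original file): NaiveMegDNNHandleScope
// is an RAII guard that switches MegDNN to the naive debug handle for the
// lifetime of a test and restores the previous level on destruction; the
// NHWCD4 tests below rely on it. A minimal sketch:
//
//     TEST(TestGoptInference, SomeNaiveOnlyCase) {
//         NaiveMegDNNHandleScope naive_megdnn_handle;
//         // build and run the graph; the original handle is restored on exit
//     }
//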
#if MGB_CUDA
//! this function is only used in TestGoptInference.EnableCHWN4...
void warp_perspective_mat_gen(HostTensorND& mat, size_t N, size_t INP_H,
                              size_t INP_W) {
    static std::mt19937 rng(next_rand_seed());
    auto rand_real = [&](double lo, double hi) {
        return rng() / (std::mt19937::max() + 1.0) * (hi - lo) + lo;
    };
    auto rand_real2 = [&](double range) { return rand_real(-range, range); };
    auto ptr = mat.ptr<float>();
    for (size_t i = 0; i < N; ++i) {
        auto rot = rand_real(0, M_PI * 2), scale = rand_real(0.8, 1.2),
             sheer = rand_real(0.9, 1.1), dy = rand_real2(INP_H * 0.5),
             dx = rand_real2(INP_W * 0.5), ky = rand_real2(0.1 / INP_H),
             kx = rand_real2(0.1 / INP_W), kb = rand_real2(0.1) + 1;
        ptr[0] = ptr[4] = cos(rot) * scale;
        ptr[1] = -(ptr[3] = sin(rot) * scale);
        ptr[3] *= sheer;
        ptr[4] *= sheer;
        ptr[2] = dx;
        ptr[5] = dy;
        ptr[6] = kx;
        ptr[7] = ky;
        ptr[8] = kb;
        ptr += 9;
    }
    mgb_assert(ptr == mat.ptr<float>() + mat.shape().total_nr_elems());
}
#endif

}  // namespace
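
// Note on the common test pattern below (added comment, not in the original
// file): each test builds a small graph with graph_opt_level = 0 so that no
// implicit optimization interferes, runs an explicit gopt pass pipeline, and
// then compares the optimized endpoints against the unoptimized ones. A
// minimal sketch of that pipeline, for a SymbolVar `y`:
//
//     SymbolVar y_opt;
//     unpack_vector(gopt::GraphOptimizer{}
//                           .add_pass<gopt::ParamFusePass>()
//                           .apply({{y}})
//                           .endpoint_vars(),
//                   y_opt);
//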
TEST(TestGoptInference, ParamFuseConstEndPoint) {
    constexpr size_t SIZE = 23;
    HostTensorGenerator<> gen;
    auto host_x = gen({SIZE}), host_y = gen({1}), host_p = gen({1});
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto x = opr::SharedDeviceTensor::make(*graph, *host_x),
         y = opr::SharedDeviceTensor::make(*graph, *host_y),
         p = opr::Host2DeviceCopy::make(*graph, host_p), q = p + x, a = y + 3,
         z0 = a + q, z1 = a + 4;
    HostTensorND host_z0, host_z1;
    SymbolVar z0_1, z1_1;
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::ParamFusePass>()
                          .apply({{z1, z0}})
                          .endpoint_vars(),
                  z1_1, z0_1);

    auto func = graph->compile({make_callback_copy(z0_1, host_z0),
                                make_callback_copy(z1_1, host_z1)});
    func->to_json()->writeto_fpath(
            output_file("TestGoptInference.ParamFuseEndPoint.json"));
    func->execute();

    int nr_opr = 0;
    func->iter_opr_seq([&](cg::OperatorNodeBase*) {
        ++nr_opr;
        return true;
    });
    ASSERT_EQ(8, nr_opr);

    auto px = host_x->ptr<float>(), pz0 = host_z0.ptr<float>();
    auto yv = host_y->ptr<float>()[0], pv = host_p->ptr<float>()[0],
         pz1 = host_z1.ptr<float>()[0];
    for (size_t i = 0; i < SIZE; ++i) {
        MGB_ASSERT_FLOAT_EQ(px[i] + yv + 3 + pv, pz0[i]);
    }
    MGB_ASSERT_FLOAT_EQ(yv + 7, pz1);
}

TEST(TestGoptInference, ParamFuse) {
    constexpr size_t SIZE = 23;
    HostTensorGenerator<> gen;
    auto host_x = gen({SIZE}), host_y = gen({1}), host_p = gen({1});
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto x = opr::SharedDeviceTensor::make(*graph, *host_x),
         y = opr::SharedDeviceTensor::make(*graph, *host_y),
         p = opr::Host2DeviceCopy::make(*graph, host_p),
         z = x + y,     // endpoint
         q = x * y + p; // middle point
    SymbolVar z1, q1;
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::ParamFusePass>()
                          .apply({{z, q}})
                          .endpoint_vars(),
                  z1, q1);

    ASSERT_TRUE(z1.node()->owner_opr()->same_type<opr::SharedDeviceTensor>());
    ASSERT_NE(q1.node()->owner_opr(), q.node()->owner_opr());
    ASSERT_EQ(q1.node()->owner_opr()->dyn_typeinfo(),
              q.node()->owner_opr()->dyn_typeinfo());

    HostTensorND host_z, host_q;
    auto func = graph->compile(
            {make_callback_copy(z1, host_z), make_callback_copy(q1, host_q)});
    func->execute();

    int nr_opr = 0;
    func->iter_opr_seq([&](cg::OperatorNodeBase*) {
        ++nr_opr;
        return true;
    });
    ASSERT_EQ(6, nr_opr);

    auto px = host_x->ptr<float>(), pz = host_z.ptr<float>(),
         pq = host_q.ptr<float>();
    auto yv = host_y->ptr<float>()[0], pv = host_p->ptr<float>()[0];
    for (size_t i = 0; i < SIZE; ++i) {
        MGB_ASSERT_FLOAT_EQ(px[i] + yv, pz[i]);
        MGB_ASSERT_FLOAT_EQ(px[i] * yv + pv, pq[i]);
    }
}
TEST(TestGoptInference, ParamFuseMultiDeviceTensorHolder) {
    constexpr size_t SIZE = 23;
    HostTensorGenerator<> gen;
    auto host_x = gen({SIZE}), host_y = gen({1}), host_p = gen({1});
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto x = opr::SharedDeviceTensor::make(*graph, *host_x),
         y = opr::SharedDeviceTensor::make(*graph, *host_y),
         p = opr::Host2DeviceCopy::make(*graph, host_p),
         z = x + y,     //! endpoint
         q = x * y + p; //! middle point
    SymbolVar z1, q1;
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::ParamMergePass>()
                          .apply({{z}})
                          .endpoint_vars(),
                  z1);
    ASSERT_TRUE(z1.node()
                        ->owner_opr()
                        ->input(0)
                        ->owner_opr()
                        ->same_type<opr::MultipleDeviceTensorHolder>());

    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::ParamMergePass>()
                          .add_pass<gopt::ParamFusePass>()
                          .apply({{z, q}})
                          .endpoint_vars(),
                  z1, q1);

    ASSERT_TRUE(z1.node()->owner_opr()->same_type<opr::SharedDeviceTensor>());
    ASSERT_NE(q1.node()->owner_opr(), q.node()->owner_opr());
    ASSERT_EQ(q1.node()->owner_opr()->dyn_typeinfo(),
              q.node()->owner_opr()->dyn_typeinfo());

    HostTensorND host_z, host_q;
    auto func = graph->compile(
            {make_callback_copy(z1, host_z), make_callback_copy(q1, host_q)});
    func->execute();

    int nr_opr = 0;
    func->iter_opr_seq([&](cg::OperatorNodeBase* op) {
        ++nr_opr;
        return true;
    });
    ASSERT_EQ(6, nr_opr);

    auto px = host_x->ptr<float>(), pz = host_z.ptr<float>(),
         pq = host_q.ptr<float>();
    auto yv = host_y->ptr<float>()[0], pv = host_p->ptr<float>()[0];
    for (size_t i = 0; i < SIZE; ++i) {
        MGB_ASSERT_FLOAT_EQ(px[i] + yv, pz[i]);
        MGB_ASSERT_FLOAT_EQ(px[i] * yv + pv, pq[i]);
    }
}

TEST(TestGoptInference, ParamFuseMultiRead) {
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp)).rename(name);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp)).rename(name);
    };
    auto x = mkvar("x", {23}), p0 = mkcvar("p0", {1}), p1 = mkcvar("p1", {1}),
         z0 = x * (p0 + p1) + x / (p0 + p1);
    SymbolVar z1;
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::ParamFusePass>()
                          .apply({{z0}})
                          .endpoint_vars(),
                  z1);
    ASSERT_NE(z0.node(), z1.node());
    ASSERT_TRUE(z1.node()
                        ->owner_opr()
                        ->input(0)
                        ->owner_opr()
                        ->input(1)
                        ->owner_opr()
                        ->same_type<opr::SharedDeviceTensor>());
    ASSERT_TRUE(z1.node()
                        ->owner_opr()
                        ->input(1)
                        ->owner_opr()
                        ->input(1)
                        ->owner_opr()
                        ->same_type<opr::SharedDeviceTensor>());
    HostTensorND host_z0, host_z1;
    graph->compile({make_callback_copy(z0, host_z0),
                    make_callback_copy(z1, host_z1)})
            ->execute();
    MGB_ASSERT_TENSOR_EQ(host_z0, host_z1);
}

TEST(TestGoptInference, ParamFuseStaticInfer) {
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp)).rename(name);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp)).rename(name);
    };
    auto a = mkvar("x", {4}),
         b = a.reshape(opr::GetVarShape::make(mkcvar("tshp", {2, 2})));
    SymbolVar b1;
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::ParamFusePass>()
                          .apply({{b}})
                          .endpoint_vars(),
                  b1);
    ASSERT_EQ(b1, a.reshape({2, 2}));
}
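
// Summary (added comment, not in the original file): taken together, the
// ParamFuse* tests above pin down the contract of gopt::ParamFusePass: a
// subgraph whose inputs are all parameters (SharedDeviceTensor) is evaluated
// ahead of time and collapsed into a single precomputed SharedDeviceTensor,
// while anything fed by Host2DeviceCopy stays dynamic. For example, with
// x and y both SharedDeviceTensor:
//
//     // before the pass: z1 = x + y is an Elemwise operator
//     // after the pass:  z1 is itself a SharedDeviceTensor
//     ASSERT_TRUE(z1.node()->owner_opr()->same_type<opr::SharedDeviceTensor>());
//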
TEST(TestGoptInference, ParamRedistributeConvMul) {
    constexpr size_t N = 4, IC = 3, IH = 5, IW = 4, OC = 4, KH = 3, KW = 2;
    HostTensorGenerator<> gen;
    auto host_x = gen({N, IC, IH, IW}), host_k = gen({IC}),
         host_w = gen({OC, IC, KH, KW});
    auto graph = ComputingGraph::make();
    auto x = opr::Host2DeviceCopy::make(*graph, host_x),
         k = opr::Dimshuffle::make(
                 opr::SharedDeviceTensor::make(*graph, *host_k),
                 {-1, 0, -1, -1}),
         w = opr::SharedDeviceTensor::make(*graph, *host_w),
         y0 = opr::Convolution::make(x * k, w);
    SymbolVar y1;
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::ParamRedistributePass>()
                          .apply({{y0}})
                          .endpoint_vars(),
                  y1);
    ASSERT_NE(y0.node(), y1.node());
    HostTensorND host_y0, host_y1;
    auto func = graph->compile(
            {make_callback_copy(y0, host_y0), make_callback_copy(y1, host_y1)});
    func->execute();
    MGB_ASSERT_TENSOR_EQ(host_y0, host_y1);
}

TEST(TestGoptInference, ParamRedistributeConvMulUniqReader) {
    constexpr size_t N = 4, C = 3, IH = 5, IW = 4, KH = 1, KW = 1;
    HostTensorGenerator<> gen;
    auto host_x = gen({N, C, IH, IW}), host_k = gen({C}),
         host_w = gen({C, C, KH, KW});
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto x = opr::Host2DeviceCopy::make(*graph, host_x),
         k = opr::Dimshuffle::make(
                 opr::SharedDeviceTensor::make(*graph, *host_k) + 2,
                 {-1, 0, -1, -1}),
         w = opr::SharedDeviceTensor::make(*graph, *host_w),
         // y0 should be replaced
         y0 = opr::powf(opr::Convolution::make(x * k, w).rename("y0") + 2,
                        2),
         y0k = (y0 * k).rename("y0k"),
         // y0k is accessed twice, so it should not be replaced
         y1 = opr::Convolution::make(y0k, w).rename("y1"), z0 = y1 / y0k;
    SymbolVar z1;
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::ParamRedistributePass>()
                          .apply({{z0}})
                          .endpoint_vars(),
                  z1);
    ASSERT_NE(z0.node(), z1.node());
    auto y1_repl = z1.node()->owner_opr()->input(0)->owner_opr();
    ASSERT_TRUE(y1_repl->same_type<opr::Convolution>());
    ASSERT_EQ(y1_repl->input(0), z1.node()->owner_opr()->input(1));
    HostTensorND host_z0, host_z1;
    auto func = graph->compile(
            {make_callback_copy(z0, host_z0), make_callback_copy(z1, host_z1)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_z0, host_z1, 5e-5);
}

TEST(TestGoptInference, ParamRedistributeMulConvMul) {
    constexpr size_t N = 4, IC = 3, IH = 5, IW = 4, OC = 4, KH = 3, KW = 2;
    HostTensorGenerator<> gen;
    auto host_x = gen({N, IC, IH, IW}), host_k1 = gen({IC}),
         host_k2 = gen({1, OC, 1, 1}), host_w = gen({OC, IC, KH, KW});
    auto graph = ComputingGraph::make();
    auto x = opr::Host2DeviceCopy::make(*graph, host_x),
         k1 = opr::Dimshuffle::make(
                 opr::SharedDeviceTensor::make(*graph, *host_k1),
                 {-1, 0, -1, -1}),
         k2 = opr::SharedDeviceTensor::make(*graph, *host_k2),
         w = opr::SharedDeviceTensor::make(*graph, *host_w),
         y0 = opr::Convolution::make(x * k1, w) * k2;
    SymbolVar y1;
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::ParamRedistributePass>()
                          .add_pass<gopt::ParamFusePass>()
                          .apply({{y0}})
                          .endpoint_vars(),
                  y1);
    auto y1opr = y1.node()->owner_opr();
    ASSERT_TRUE(y1opr->same_type<opr::Convolution>());
    ASSERT_EQ(y1opr->input(0), x.node());
    HostTensorND host_y0, host_y1;
    auto func = graph->compile(
            {make_callback_copy(y0, host_y0), make_callback_copy(y1, host_y1)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y0, host_y1, 5e-6);
}
TEST(TestGoptInference, ParamRedistributeConvAdd) {
    constexpr size_t N = 4, IC = 3, IH = 5, IW = 4, OC = 4, KH = 3, KW = 2;
    HostTensorGenerator<> gen;
    auto host_x = gen({N, IC, IH, IW}), host_b = gen({IC}),
         host_w = gen({OC, IC, KH, KW});
    auto graph = ComputingGraph::make();
    auto x = opr::Host2DeviceCopy::make(*graph, host_x),
         b = opr::Dimshuffle::make(
                 opr::SharedDeviceTensor::make(*graph, *host_b),
                 {-1, 0, -1, -1}),
         w = opr::SharedDeviceTensor::make(*graph, *host_w),
         y0 = opr::Convolution::make(x + b, w);
    SymbolVar y1;
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::ParamRedistributePass>()
                          .add_pass<gopt::ParamFusePass>()
                          .apply({{y0}})
                          .endpoint_vars(),
                  y1);
    ASSERT_NE(y0.node(), y1.node());
    HostTensorND host_y0, host_y1;
    auto func = graph->compile(
            {make_callback_copy(y0, host_y0), make_callback_copy(y1, host_y1)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y0, host_y1, 1e-5);
}

TEST(TestGoptInference, ParamRedistributeDistThenReasso) {
    constexpr size_t N = 4, IC0 = 3, IC1 = 6, IH = 5, IW = 4, OC = 4, KH = 3,
                     KW = 2;
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp)).rename(name);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp)).rename(name);
    };
    auto x0 = mkvar("x0", {N, IC0, IH, IW}), x1 = mkvar("x1", {N, IC1, IH, IW}),
         k0 = opr::Dimshuffle::make(mkcvar("x1_", {IC0}), {-1, 0, -1, -1})
                      .rename("x1"),
         w0 = mkcvar("w0", {OC, IC0, KH, KW}),
         k1 = mkcvar("k1", {1, IC1, 1, 1}),
         w1 = mkcvar("w1", {OC, IC1, KH, KW}), b0 = mkvar("b0", {1, OC, 1, 1}),
         b1 = mkcvar("b1", {1}), k2 = mkcvar("k2", {1}),
         y0 = (opr::Convolution::make(x0 * k0, w0) +
               opr::Convolution::make(x1 + k1, w1) + b0 + b1) *
              k2;
    SymbolVar y1;
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::ParamRedistributePass>()
                          .add_pass<gopt::ReorderArithChainPass>(
                                  gopt::ConstVarType::IMMUTABLE_AND_PARAM)
                          .add_pass<gopt::ParamFusePass>()
                          .apply({{y0}})
                          .endpoint_vars(),
                  y1);
    ASSERT_NE(y0.node(), y1.node());
    HostTensorND host_y0, host_y1;
    auto func = graph->compile(
            {make_callback_copy(y0, host_y0), make_callback_copy(y1, host_y1)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y0, host_y1, 1e-5);

    auto chain =
            gopt::extract_opr_leaves(y1.node(), [](cg::OperatorNodeBase* opr) {
                return gopt::as_elem_opr(opr, opr::Elemwise::Mode::ADD);
            });
    size_t nr_conv = 0;
    for (auto i : chain) {
        auto opr = i->owner_opr();
        if (opr->same_type<opr::Convolution>()) {
            ++nr_conv;
            ASSERT_TRUE(opr->input(0)
                                ->owner_opr()
                                ->same_type<opr::Host2DeviceCopy>());
            ASSERT_TRUE(opr->input(1)
                                ->owner_opr()
                                ->same_type<opr::SharedDeviceTensor>());
        }
    }
    ASSERT_EQ(2u, nr_conv);
    ASSERT_EQ(4u, chain.size());
}
TEST(TestGoptInference, ParamRedistributeMultiChange) {
    constexpr size_t N = 4, IC = 3, IH = 5, IW = 4, OC = 4, KH = 3, KW = 2;
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp)).rename(name);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp)).rename(name);
    };
    auto x = mkvar("x", {N, IC, IH, IW}), k0 = mkcvar("k0", {1, IC, 1, 1}),
         b0 = mkcvar("b0", {1, IC, 1, 1}), k1 = mkcvar("k0", {1}),
         b1 = mkcvar("b0", {1}), w = mkcvar("w", {OC, IC, KH, KW}),
         y0 = (opr::Convolution::make(x * k0 + b0, w) + b1) * k1;
    SymbolVar y1;
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::ParamRedistributePass>()
                          .add_pass<gopt::ParamFusePass>()
                          .apply({{y0}})
                          .endpoint_vars(),
                  y1);
    ASSERT_NE(y0.node(), y1.node());
    HostTensorND host_y0, host_y1;
    auto func = graph->compile(
            {make_callback_copy(y0, host_y0), make_callback_copy(y1, host_y1)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y0, host_y1, 1e-5);

    auto y1elem = gopt::as_elem_opr(y1.node(), opr::Elemwise::Mode::ADD);
    ASSERT_TRUE(y1elem);
    auto yconv = y1elem->input(0)->owner_opr();
    if (!yconv->same_type<opr::Convolution>())
        yconv = y1elem->input(1)->owner_opr();
    ASSERT_TRUE(yconv->same_type<opr::Convolution>());
    ASSERT_EQ(x.node(), yconv->input(0));
}

TEST(TestGoptInference, ParamRedistributeMultiReader) {
    constexpr size_t N = 4, IC = 3, IH = 5, IW = 4, OC = 4, KH = 3, KW = 2;
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp)).rename(name);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp)).rename(name);
    };
    auto x = mkvar("x", {N, IC, IH, IW}), k = mkcvar("k", {1, OC, 1, 1}),
         w = mkcvar("w", {OC, IC, KH, KW});
    auto conv = opr::Convolution::make(x, w);
    auto t = conv * k;
    auto y0 = t * 4.2f + t * 2.4f;
    SymbolVar y1;
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::ParamRedistributePass>()
                          .add_pass<gopt::ParamFusePass>()
                          .apply({{y0}})
                          .endpoint_vars(),
                  y1);
    ASSERT_NE(y0.node(), y1.node());
    HostTensorND host_y0, host_y1;
    auto func = graph->compile(
            {make_callback_copy(y0, host_y0), make_callback_copy(y1, host_y1)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y0, host_y1, 1e-5);

    auto y1elem = gopt::as_elem_opr(y1.node(), opr::Elemwise::Mode::ADD);
    ASSERT_TRUE(y1elem);
    auto ymul0 = gopt::as_elem_opr(y1elem->input(0), opr::Elemwise::Mode::MUL),
         ymul1 = gopt::as_elem_opr(y1elem->input(1), opr::Elemwise::Mode::MUL);
    ASSERT_TRUE(ymul0);
    ASSERT_TRUE(ymul1);
    auto yconv = ymul0->input(0)->owner_opr();
    if (!yconv->same_type<opr::Convolution>()) {
        yconv = ymul0->input(1)->owner_opr();
    }
    ASSERT_TRUE(yconv->same_type<opr::Convolution>());
    if (ymul1->input(0) != yconv->output(0)) {
        ASSERT_EQ(yconv->output(0), ymul1->input(1));
    }
    ASSERT_EQ(x.node(), yconv->input(0));
}
TEST(TestGoptInference, ParamFuseBiasMerge) {
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp)).rename(name);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp)).rename(name);
    };
    auto x = mkvar("x", {6, 3, 8, 8}), w1 = mkcvar("w1", {4, 3, 3, 3}),
         w2 = mkcvar("w2", {4, 3, 3, 3}), b1 = mkcvar("b1", {1, 4, 1, 1}),
         b2 = mkcvar("b2", {1, 4, 1, 1}),
         y1 = opr::Convolution::make(x, w1) + b1,
         y2 = opr::Convolution::make(x, w2) + b2, y = y1 + y2;
    SymbolVar y_opt;
    unpack_vector(gopt::optimize_for_inference({y}), y_opt);
    HostTensorND host_y, host_y_opt;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(
                    output_file("TestGoptInference.ParamFuseConvMerge.json"));
    auto chain = gopt::extract_opr_leaves(
            y_opt.node(), [](cg::OperatorNodeBase* opr) {
                return gopt::as_elem_opr(opr, opr::Elemwise::Mode::ADD);
            });
    ASSERT_EQ(3u, chain.size());
}

TEST(TestGoptInference, Float16IOFloat32Compute) {
    constexpr size_t INP_H = 10, INP_W = 10;
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp)).rename(name);
    };
    graph->options().graph_opt_level = 0;
    auto a = mkvar("a", {1, 4, INP_H, INP_W}),
         s0 = mkvar("s0", {20, 3, INP_H, INP_W}),
         s1 = mkvar("s1", {4, 3, 1, 1});
    auto b = opr::Convolution::make(s0, s1, {}, {});
    auto y = a + b;
    y = opr::Concat::make({y, -y}, 0);
    y = opr::Reduce::make(y, {}, y.make_scalar(1));
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_f16_io_f32_comp();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    ASSERT_EQ(y_opt.dtype(), dtype::Float32());
    HostTensorND host_y, host_y_opt;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
}
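
// Note (added comment, not in the original file): the two float16 knobs used
// in these tests differ in where the halving happens, as the C32 test below
// also checks via its compute_mode assertion. A minimal sketch:
//
//     auto options = gopt::OptimizeForInferenceOptions{};
//     options.enable_f16_io_comp();      // f16 storage and f16 arithmetic
//     // or:
//     options.enable_f16_io_f32_comp();  // f16 storage, f32 arithmetic
//     unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
//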
TEST(TestGoptInference, Float16IOFloat32ComputeWarpPerspective) {
    constexpr size_t INP_H = 10, INP_W = 10, N = 2;
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp)).rename(name);
    };
    graph->options().graph_opt_level = 0;
    auto a = mkvar("a", {N, 4, INP_H, INP_W});
    float value1 = M_PI, value2 = 0.6;
    auto gen_mat = [&](HostTensorND& mat) {
        auto ptr = mat.ptr<float>();
        for (size_t i = 0; i < N; ++i) {
            auto rot = value1, scale = value2, sheer = value1, dy = value2,
                 dx = value2, ky = value2, kx = value2, kb = value2;
            ptr[0] = ptr[4] = cos(rot) * scale;
            ptr[1] = -(ptr[3] = sin(rot) * scale);
            ptr[3] *= sheer;
            ptr[4] *= sheer;
            ptr[2] = dx;
            ptr[5] = dy;
            ptr[6] = kx;
            ptr[7] = ky;
            ptr[8] = kb;
            ptr += 9;
        }
        mgb_assert(ptr == mat.ptr<float>() + mat.shape().total_nr_elems());
    };
    auto mat_host = std::make_shared<HostTensorND>(
            a.node()->comp_node(), TensorShape{N, 3, 3}, dtype::Float32());
    gen_mat(*mat_host);
    auto mat = opr::Host2DeviceCopy::make(*graph, mat_host).rename("mat");
    TensorShape out_shp{20, 20};
    auto y = opr::WarpPerspective::make(a, mat, out_shp);
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_f16_io_f32_comp();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    ASSERT_EQ(y_opt.dtype(), dtype::Float32());
    HostTensorND host_y, host_y_opt;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
}

TEST(TestGoptInference, Float16IOFloat32ComputeRemap) {
    auto cn = CompNode::load("cpu1");
    constexpr size_t INP_H = 10, INP_W = 10, N = 2;
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name);
    };
    graph->options().graph_opt_level = 0;
    auto a = mkvar("a", {N, 4, INP_H, INP_W});
    auto gen_map = [&](HostTensorND& mat) {
        auto ptr = mat.ptr<float>();
        for (size_t n = 0; n < N; ++n) {
            for (int h = 0; h < 5; ++h) {
                for (int w = 0; w < 5; ++w) {
                    *ptr++ = (h * 5 * 2) + 5 * 2 + 0;
                    *ptr++ = (h * 5 * 2) + 5 * 2 + 1;
                }
            }
        }
        mgb_assert(ptr == mat.ptr<float>() + mat.shape().total_nr_elems());
    };
    auto map_host = std::make_shared<HostTensorND>(
            a.node()->comp_node(), TensorShape{N, 5, 5, 2}, dtype::Float32());
    gen_map(*map_host);
    auto map = opr::Host2DeviceCopy::make(*graph, map_host).rename("map");
    auto y = opr::Remap::make(a, map);
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_f16_io_f32_comp();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    ASSERT_EQ(y_opt.dtype(), dtype::Float32());
    HostTensorND host_y, host_y_opt;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
}

TEST(TestGoptInference, Uint8IOFloat16ComputeWarpPerspective) {
    constexpr size_t INP_H = 10, INP_W = 10, N = 2;
    HostTensorGenerator<dtype::Uint8> gen_uint8;
    auto graph = ComputingGraph::make();
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen_uint8(shp)).rename(name);
    };
    graph->options().graph_opt_level = 0;
    auto a = mkvar("a", {N, 4, INP_H, INP_W});
    float value1 = M_PI, value2 = 0.6;
    auto gen_mat = [&](HostTensorND& mat) {
        auto ptr = mat.ptr<float>();
        for (size_t i = 0; i < N; ++i) {
            auto rot = value1, scale = value2, sheer = value1, dy = value2,
                 dx = value2, ky = value2, kx = value2, kb = value2;
            ptr[0] = ptr[4] = cos(rot) * scale;
            ptr[1] = -(ptr[3] = sin(rot) * scale);
            ptr[3] *= sheer;
            ptr[4] *= sheer;
            ptr[2] = dx;
            ptr[5] = dy;
            ptr[6] = kx;
            ptr[7] = ky;
            ptr[8] = kb;
            ptr += 9;
        }
        mgb_assert(ptr == mat.ptr<float>() + mat.shape().total_nr_elems());
    };
    auto mat_host = std::make_shared<HostTensorND>(
            a.node()->comp_node(), TensorShape{N, 3, 3}, dtype::Float32());
    gen_mat(*mat_host);
    auto mat = opr::Host2DeviceCopy::make(*graph, mat_host).rename("mat");
    TensorShape out_shp{20, 20};
    auto y = opr::WarpPerspective::make(a, mat, out_shp);
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_f16_io_comp();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    ASSERT_EQ(y_opt.dtype(), dtype::Uint8());
    HostTensorND host_y, host_y_opt;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
}
TEST(TestGoptInference, Float32TOFloat16) {
    CompNode cn = CompNode::load("cpu0");
    HostTensorGenerator<> gen(0, 1, 0);
    auto host_x0 = gen({1, 4, 16, 8}, cn), host_x1 = gen({2, 3, 16, 8}, cn),
         host_x2 = gen({4, 3, 1, 1}, cn);
    auto graph = ComputingGraph::make();
    auto make_f32_to_f16_graph = [&]() {
        graph->options().graph_opt_level = 0;
        auto d0 = opr::Host2DeviceCopy::make(*graph, host_x0),
             d1 = opr::Host2DeviceCopy::make(*graph, host_x1),
             d2 = opr::SharedDeviceTensor::make(*graph, *host_x2);
        auto b = opr::Convolution::make(d1, d2, {}, {});
        auto y = d0 + b;
        y = opr::Reduce::make(y, {}, y.make_scalar(1));
        SymbolVar y_opt;
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_f16_io_comp();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
        return y_opt;
    };
    auto make_f16_graph = [&]() {
        auto d0 = opr::TypeCvt::make(
                     opr::Host2DeviceCopy::make(*graph, host_x0),
                     dtype::Float16{}),
             d1 = opr::TypeCvt::make(
                     opr::Host2DeviceCopy::make(*graph, host_x1),
                     dtype::Float16{}),
             d2 = opr::TypeCvt::make(
                     opr::SharedDeviceTensor::make(*graph, *host_x2),
                     dtype::Float16{});
        auto b = opr::Convolution::make(d1, d2, {}, {});
        SymbolVar y = d0 + b;
        y = opr::Reduce::make(y, {}, y.make_scalar(1));
        y = opr::TypeCvt::make(y, dtype::Float32{});
        return y;
    };
    auto y_opt = make_f32_to_f16_graph();
    auto y = make_f16_graph();
    ASSERT_EQ(y_opt.dtype(), dtype::Float32{});
    ASSERT_EQ(y.dtype(), dtype::Float32{});
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
}

TEST(TestGoptInference, Float32TOFloat16C32) {
    CompNode cn = CompNode::load("cpu0");
    HostTensorGenerator<> gen(0, 1, 0);
    auto host_x0 = gen({1, 4, 1, 1}, cn), host_x1 = gen({2, 3, 16, 8}, cn),
         host_x2 = gen({4, 3, 1, 1}, cn);
    auto graph = ComputingGraph::make();
    auto make_f32_to_f16_graph = [&]() {
        graph->options().graph_opt_level = 0;
        auto d0 = opr::Host2DeviceCopy::make(*graph, host_x0),
             d1 = opr::Host2DeviceCopy::make(*graph, host_x1),
             d2 = opr::SharedDeviceTensor::make(*graph, *host_x2);
        auto y = opr::ConvBias::make(d1, d2, d0);
        y = opr::Reduce::make(y, {}, y.make_scalar(1));
        SymbolVar y_opt;
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_f16_io_f32_comp();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
        return y_opt;
    };
    auto make_f16_graph = [&]() {
        auto d0 = opr::TypeCvt::make(
                     opr::TypeCvt::make(
                             opr::Host2DeviceCopy::make(*graph, host_x0),
                             dtype::Float16{}),
                     dtype::Float32{}),
             d1 = opr::TypeCvt::make(
                     opr::TypeCvt::make(
                             opr::Host2DeviceCopy::make(*graph, host_x1),
                             dtype::Float16{}),
                     dtype::Float32{}),
             d2 = opr::TypeCvt::make(
                     opr::TypeCvt::make(
                             opr::SharedDeviceTensor::make(*graph, *host_x2),
                             dtype::Float16{}),
                     dtype::Float32{});
        auto y = opr::ConvBias::make(d1, d2, d0);
        y = opr::Reduce::make(y, {}, y.make_scalar(1));
        y = opr::TypeCvt::make(opr::TypeCvt::make(y, dtype::Float16{}),
                               dtype::Float32{});
        return y;
    };
    auto y_opt = make_f32_to_f16_graph();
    auto y = make_f16_graph();
    ASSERT_EQ(find_opr<opr::ConvBias>(y_opt).param().compute_mode,
              opr::ConvBias::Param::ConvBias::ComputeMode::FLOAT32);
    ASSERT_EQ(y_opt.dtype(), dtype::Float32{});
    ASSERT_EQ(y.dtype(), dtype::Float32{});
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
}
TEST(TestGoptInference, Float32TOFloat16EndpointElemwise) {
    CompNode cn = CompNode::load("cpu0");
    HostTensorGenerator<> gen(0, 1, 0);
    auto host_x0 = gen({1, 4, 16, 8}, cn), host_x1 = gen({2, 3, 16, 8}, cn),
         host_x2 = gen({4, 3, 1, 1}, cn);
    auto graph = ComputingGraph::make();
    auto make_f32_to_f16_graph = [&]() {
        graph->options().graph_opt_level = 0;
        auto d0 = opr::Host2DeviceCopy::make(*graph, host_x0),
             d1 = opr::Host2DeviceCopy::make(*graph, host_x1),
             d2 = opr::SharedDeviceTensor::make(*graph, *host_x2);
        auto b = opr::Convolution::make(d1, d2, {}, {});
        auto y = d0 + b;
        SymbolVar y_opt;
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_f16_io_comp();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
        return y_opt;
    };
    auto make_f16_graph = [&]() {
        auto d0 = opr::TypeCvt::make(
                     opr::Host2DeviceCopy::make(*graph, host_x0),
                     dtype::Float16{}),
             d1 = opr::TypeCvt::make(
                     opr::Host2DeviceCopy::make(*graph, host_x1),
                     dtype::Float16{}),
             d2 = opr::TypeCvt::make(
                     opr::SharedDeviceTensor::make(*graph, *host_x2),
                     dtype::Float16{});
        auto b = opr::Convolution::make(d1, d2, {}, {});
        SymbolVar y = d0 + b;
        y = opr::TypeCvt::make(y, dtype::Float32{});
        return y;
    };
    auto y_opt = make_f32_to_f16_graph();
    auto y = make_f16_graph();
    ASSERT_EQ(y_opt.dtype(), dtype::Float32{});
    ASSERT_EQ(y.dtype(), dtype::Float32{});
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
}

TEST(TestGoptInference, Float32TOFloat16Linspace) {
    CompNode cn = CompNode::load("cpu0");
    HostTensorGenerator<> gen(0, 1, 0);
    auto host_x = gen({3, 1}, cn);
    auto graph = ComputingGraph::make();
    auto make_f32_to_f16_graph = [&]() {
        graph->options().graph_opt_level = 0;
        auto x = opr::Host2DeviceCopy::make(*graph, host_x);
        auto xshp = opr::GetVarShape::make(x);
        auto cv = [&x](int v) { return x.make_scalar(v); };
        auto sub = [&xshp, &cv](int idx) {
            return opr::IndexAt::make(xshp, {{0, cv(idx)}});
        };
        auto lin = opr::Linspace::make(cv(0), sub(0) - 1, sub(0), {}, {});
        auto shp = opr::Concat::make({sub(1), sub(0)}, 0);
        auto y = opr::Reshape::make(lin, shp);
        auto mm = opr::MatrixMul::make(x, y);
        SymbolVar mm_opt;
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_f16_io_comp();
        unpack_vector(gopt::optimize_for_inference({mm}, options), mm_opt);
        return mm_opt;
    };
    auto make_f16_graph = [&]() {
        auto x = opr::TypeCvt::make(opr::Host2DeviceCopy::make(*graph, host_x),
                                    dtype::Float16());
        auto xshp = opr::GetVarShape::make(x);
        auto cv = [&x](int v) { return x.make_scalar(v); };
        auto sub = [&xshp, &cv](int idx) {
            return opr::IndexAt::make(xshp, {{0, cv(idx)}});
        };
        auto lin = opr::Linspace::make(cv(0), sub(0) - 1, sub(0), {}, {});
        lin = opr::TypeCvt::make(lin, dtype::Float16());
        auto shp = opr::Concat::make({sub(1), sub(0)}, 0);
        auto y = opr::Reshape::make(lin, shp);
        auto mm = opr::MatrixMul::make(x, y);
        mm = opr::TypeCvt::make(mm, dtype::Float32{});
        return mm;
    };
    auto y_opt = make_f32_to_f16_graph();
    auto y = make_f16_graph();
    ASSERT_EQ(y_opt.dtype(), dtype::Float32{});
    ASSERT_EQ(y.dtype(), dtype::Float32{});
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
}

TEST(TestGoptInference, Float32TOFloat16Endpoints) {
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp)).rename(name);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp)).rename(name);
    };
    graph->options().graph_opt_level = 0;
    opr::Convolution::Param param;
    param.pad_h = param.pad_w = 0;
    auto x = mkvar("x", {8, 8, 8, 8}), y = mkvar("y", {8, 8, 8, 8}),
         w = mkcvar("w", {4, 8, 3, 3}),
         z = opr::Convolution::make(x + y, w, param);
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_f16_io_f32_comp();
    SymbolVarArray out = gopt::optimize_for_inference({x + y, z}, options);
    ASSERT_EQ(out[0].dtype(), dtype::Float32());
    ASSERT_EQ(out[1].dtype(), dtype::Float32());
    ASSERT_EQ(out[0].node()->owner_opr()->input(0)->dtype(), dtype::Float16());
    ASSERT_EQ(out[1].node()->owner_opr()->input(0)->dtype(), dtype::Float16());
}
  949. TEST(TestGoptInference, ConvertFormatNHWCD4) {
  950. // hwcd4 is only supported in naive handle
  951. NaiveMegDNNHandleScope naive_megdnn_handle;
  952. HostTensorGenerator<> gen;
  953. auto cn = CompNode::load("cpu0");
  954. auto graph = ComputingGraph::make();
  955. graph->options().graph_opt_level = 0;
  956. auto mkvar = [&](const char* name, const TensorShape& shp) {
  957. return opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name);
  958. };
  959. auto mkcvar = [&](const char* name, const TensorShape& shp) {
  960. return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
  961. .rename(name);
  962. };
  963. auto host_x = gen({8, 8, 8, 8}, cn);
  964. auto x = opr::Host2DeviceCopy::make(*graph, host_x);
  965. opr::Convolution::Param param;
  966. param.pad_h = param.pad_w = 0;
  967. auto w1 = mkcvar("w1", {4, 8, 3, 3}),
  968. conv = opr::Convolution::make(x, w1, param);
  969. auto shape_of = opr::GetVarShape::make(conv);
  970. auto subtensor = opr::Subtensor::make(
  971. shape_of, {opr::Subtensor::AxisIndexer::make_interval(
  972. 0, x.make_scalar(2), None, x.make_scalar(1))});
  973. opr::Resize::Param param_resize;
  974. param_resize.format = opr::Resize::Param::Format::NCHW;
  975. auto resize = opr::ResizeForward::make(conv, subtensor * 2, param_resize);
  976. auto mat = mkcvar("mat", {8, 3, 3}),
  977. warp = opr::WarpPerspectiveForward::make(
  978. resize, mat, nullptr, cg::var_from_tensor_shape(x, {4, 4}));
  979. auto b = mkvar("b", {1, 4, 1, 1}),
  980. elem = opr::Elemwise::make({warp + b},
  981. opr::Elemwise::Param::Mode::RELU);
  982. param.pad_h = param.pad_w = 1;
  983. auto w2 = mkcvar("w2", {4, 4, 3, 3}),
  984. y = opr::Convolution::make(elem, w2, param);
  985. SymbolVar y_opt;
  986. auto options = gopt::OptimizeForInferenceOptions{};
  987. options.enable_nhwcd4();
  988. unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
  989. ASSERT_EQ(opr::Convolution::Param::Format::NHWCD4,
  990. find_opr<opr::Convolution>(y_opt).param().format);
  991. graph->compile({{y_opt, {}}})
  992. ->to_json()
  993. ->writeto_fpath(
  994. output_file("TestGoptInference.ConvertFormatNHWCD4.json"));
  995. HostTensorND host_y_opt, host_y;
  996. auto func = graph->compile({make_callback_copy(y, host_y),
  997. make_callback_copy(y_opt, host_y_opt)});
  998. func->execute();
  999. MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
  1000. *host_x = *gen({8, 8, 16, 16}, cn);
  1001. func->execute();
  1002. MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
  1003. }
TEST(TestGoptInference, ConvertFormatNHWCD4LOCAL) {
    // hwcd4 is only supported in naive handle
    NaiveMegDNNHandleScope naive_megdnn_handle;

    HostTensorGenerator<> gen;
    auto cn = CompNode::load("cpu0");
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                .rename(name);
    };

    auto host_x = gen({2, 8, 8, 16}, cn);
    auto x = opr::Host2DeviceCopy::make(*graph, host_x);

    opr::Convolution::Param param;
    param.pad_h = param.pad_w = 1;
    auto w1 = mkcvar("w1", {4, 8, 3, 3}),
         conv1 = opr::Convolution::make(x, w1, param);

    auto w2 = mkcvar("w2", {8, 16, 4, 3, 3, 4}),
         local = opr::Local::make(conv1, w2, param);

    auto w3 = mkcvar("w3", {4, 4, 3, 3}),
         conv2 = opr::Convolution::make(local, w3, param);

    opr::GroupLocal::Param param_group_local;
    param_group_local.pad_h = param_group_local.pad_w = 1;
    auto w4 = mkcvar("w4", {2, 8, 16, 2, 3, 3, 2}),
         group_local = opr::GroupLocal::make(conv2, w4, param_group_local);

    auto w5 = mkcvar("w5", {4, 4, 3, 3}),
         y = opr::Convolution::make(group_local, w5, param);

    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_nhwcd4();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);

    ASSERT_EQ(opr::Convolution::Param::Format::NHWCD4,
              find_opr<opr::Convolution>(y_opt).param().format);
    ASSERT_EQ(opr::Local::Param::Format::NCHW,
              find_opr<opr::Local>(y_opt).param().format);
    ASSERT_EQ(opr::GroupLocal::Param::Format::NCHW,
              find_opr<opr::GroupLocal>(y_opt).param().format);

    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(output_file(
                    "TestGoptInference.ConvertFormatNHWCD4LOCAL.json"));

    HostTensorND host_y_opt, host_y;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
}
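
// ConvertFormatNHWCD4Deconv: ConvolutionBackwardData (deconvolution) is not
// converted to NHWCD4; only the preceding forward convolution changes format,
// as the two format assertions below verify.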
TEST(TestGoptInference, ConvertFormatNHWCD4Deconv) {
    // hwcd4 is only supported in naive handle
    NaiveMegDNNHandleScope naive_megdnn_handle;

    HostTensorGenerator<> gen;
    auto cn = CompNode::load("cpu0");
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                .rename(name);
    };

    auto host_x = gen({8, 8, 8, 8}, cn);
    auto x = opr::Host2DeviceCopy::make(*graph, host_x);

    opr::Convolution::Param param;
    param.pad_h = param.pad_w = 0;
    auto w0 = mkcvar("w0", {4, 8, 2, 2}),
         conv = opr::Convolution::make(x, w0, param);

    auto w1 = mkcvar("w1", {4, 1, 2, 2}),
         y = opr::ConvolutionBackwardData::make(w1, conv, param, {}, {});

    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_nhwcd4();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);

    ASSERT_EQ(opr::Convolution::Param::Format::NCHW,
              find_opr<opr::ConvolutionBackwardData>(y_opt).param().format);
    ASSERT_EQ(opr::Convolution::Param::Format::NHWCD4,
              find_opr<opr::Convolution>(y_opt).param().format);

    HostTensorND host_y_opt, host_y;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
}
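
// ConvertFormatNHWCD4Qint8: the converter must preserve quantized dtypes
// (QuantizedS8 input/weight, QuantizedS32 bias) when switching ConvBias to
// NHWCD4; outputs are cast back to Float32 before comparison.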
TEST(TestGoptInference, ConvertFormatNHWCD4Qint8) {
    // hwcd4 is only supported in naive handle
    NaiveMegDNNHandleScope naive_megdnn_handle;

    HostTensorGenerator<> gen;
    auto cn = CompNode::load("cpu0");
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkcvar = [&](const char* name, const TensorShape& shp,
                      const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                        .rename(name),
                dtype);
    };

    auto host_x = gen({8, 8, 8, 8}, cn);
    auto _x = opr::Host2DeviceCopy::make(*graph, host_x),
         x = opr::TypeCvt::make(_x, dtype::QuantizedS8(0.2f));

    opr::ConvBias::Param param;
    param.pad_h = param.pad_w = 0;
    auto w = mkcvar("w", {4, 8, 3, 3}, dtype::QuantizedS8(0.1f)),
         b = mkcvar("b", {1, 4, 1, 1}, dtype::QuantizedS32(0.02f)),
         y = opr::ConvBias::make(x, w, b, param, {},
                                 OperatorNodeConfig{dtype::QuantizedS8(0.2f)});

    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_nhwcd4();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);

    ASSERT_EQ(opr::ConvBias::Param::Format::NHWCD4,
              find_opr<opr::ConvBias>(y_opt).param().format);

    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(output_file(
                    "TestGoptInference.ConvertFormatNHWCD4Qint8.json"));

    auto float_y = opr::TypeCvt::make(y, dtype::Float32()),
         float_y_opt = opr::TypeCvt::make(y_opt, dtype::Float32());

    HostTensorND host_y_opt, host_y;
    auto func = graph->compile({make_callback_copy(float_y, host_y),
                                make_callback_copy(float_y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
}
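
// ConvertFormatPadIC: exercises the input-channel padding path of the
// NHWCD4 converter; the 6-channel inputs feeding Resize/Concat are not
// 4-aligned, so channels must be padded before the image layout applies.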
TEST(TestGoptInference, ConvertFormatPadIC) {
    // hwcd4 is only supported in naive handle
    NaiveMegDNNHandleScope naive_megdnn_handle;

    HostTensorGenerator<> gen;
    auto cn = CompNode::load("cpu0");
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                .rename(name);
    };

    auto host_inp1 = gen({1, 6, 128, 128}, cn),
         host_inp2 = gen({1, 6, 256, 256}, cn);
    auto inp1 = opr::Host2DeviceCopy::make(*graph, host_inp1),
         inp2 = opr::Host2DeviceCopy::make(*graph, host_inp2);

    auto shape_tmp = mkcvar("tmp", {256, 256});
    auto shape_of = opr::GetVarShape::make(shape_tmp);
    opr::Resize::Param param_resize;
    param_resize.format = opr::Resize::Param::Format::NCHW;
    auto resize = opr::ResizeForward::make(inp1, shape_of, param_resize);
    auto concat = opr::Concat::make({inp2, resize}, 1);

    opr::Convolution::Param param;
    param.pad_h = param.pad_w = 1;
    param.sparse = opr::Convolution::Param::Sparse::DENSE;

    auto w1 = mkcvar("w1", {12, 12, 3, 3});
    auto y = opr::Convolution::make(concat, w1, param);

    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_nhwcd4();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);

    HostTensorND host_y_opt, host_y;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
}
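
// ConvertBatchNormPass: at inference time BatchNorm reduces to an affine
// transform, y = (x - mean) / sqrt(variance) * scale + bias, which the pass
// rewrites into plain elementwise oprs; the assertion below checks that no
// BatchNorm opr survives the rewrite.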
TEST(TestGoptInference, ConvertBatchNormPass) {
    auto cn = CompNode::load("cpu0");
    HostTensorGenerator<> gen(0, 1, 0);
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                .rename(name);
    };

    using Param = opr::BatchNorm::Param;
    Param param(Param::ParamDim::DIM_1C11, Param::FwdMode::INFERENCE);
    TensorShape shp = {1, 3, 1, 1};
    auto x = mkvar("x", {2, 3, 16, 24}), scale = mkcvar("scale", shp),
         bias = mkcvar("bias", shp), mean = mkcvar("mean", shp);
    auto host_variance = gen(shp, cn);
    for (size_t i = 0; i < shp.total_nr_elems(); ++i) {
        host_variance->ptr<float>()[i] =
                std::abs(host_variance->ptr<float>()[i]);
    }
    auto variance = opr::SharedDeviceTensor::make(*graph, *host_variance)
                            .rename("variance");
    auto y = opr::BatchNorm::make(x, scale, bias, mean, variance, param)[4];

    SymbolVar y_opt;
    unpack_vector(gopt::optimize_for_inference(
                          {y}, gopt::OptimizeForInferenceOptions{}),
                  y_opt);
    ASSERT_EQ(0u, find_opr_num<opr::BatchNorm>(y_opt));

    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(
                    output_file("TestGoptInference.ConvertBatchNormPass.json"));

    HostTensorND host_y, host_y_opt;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-5);
}
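
// ConvBiasNonlinearityFusePass: conv + bias-add + activation chains are fused
// into ConvBias oprs. The graph below shares y_cut between a fusible branch
// and a plain RELU branch, so the pass must fuse without breaking the shared
// subexpression; the input-count assertion inspects the fused opr.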
TEST(TestGoptInference, ConvBiasNonlinearityFusePass) {
    // hwcd4 is only supported in naive handle
    NaiveMegDNNHandleScope naive_megdnn_handle;

    auto cn = CompNode::load("cpu0");
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                .rename(name);
    };

    opr::Convolution::Param param;
    auto x = mkvar("x", {5, 8, 16, 24}), w1 = mkcvar("w1", {4, 8, 1, 1}),
         w2 = mkcvar("w2", {4, 4, 3, 3}), b1 = mkcvar("b1", {1, 4, 1, 1}),
         b2 = mkcvar("b2", {1, 4, 1, 1}), w3 = mkcvar("w3", {8, 4, 1, 1}),
         y_cut = opr::Convolution::make(x, w1, param),
         y1 = opr::Elemwise::make({y_cut + b1},
                                  opr::Elemwise::Param::Mode::RELU);
    param.pad_w = param.pad_h = 1;
    auto y2 = opr::Elemwise::make({opr::Convolution::make(y1, w2, param) + b2},
                                  opr::Elemwise::Param::Mode::SIGMOID);
    param.pad_w = param.pad_h = 0;
    auto y3 = opr::Convolution::make(y2, w3, param), y_tmp = y3 + x,
         y_expand =
                 opr::Elemwise::make({y_cut}, opr::Elemwise::Param::Mode::RELU),
         y_y = opr::Convolution::make(y_expand, w3, param), y = y_y + y_tmp;

    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_nhwcd4().enable_fuse_conv_bias_nonlinearity();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    ASSERT_EQ(3u, find_opr<opr::ConvBias>(y_opt).input().size());

    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(output_file(
                    "TestGoptInference.FuseConvBiasNonlinPass.json"));

    HostTensorND host_y, host_y_opt;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-4);
}
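
// ParamMergePass: multiple SharedDeviceTensor params are merged into a single
// MultipleDeviceTensorHolder so a deployed model holds fewer discrete
// parameter tensors; the holder must expose one output per original param
// (two here, on two different comp nodes).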
TEST(TestGoptInference, ParamMerge) {
    auto cns = load_multiple_xpus(2);
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    auto var0 = opr::SharedDeviceTensor::make(*graph, *gen({2, 3}, cns[0])),
         var1 = opr::SharedDeviceTensor::make(*graph, *gen({1, 3}, cns[1])),
         y = var0 + opr::Copy::make(var1, {cns[0]});
    HostTensorND y_expected_val;
    graph->compile({make_callback_copy(y, y_expected_val)})->execute();

    SymbolVar y_opt;
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::ParamMergePass>()
                          .apply({{y}})
                          .endpoint_vars(),
                  y_opt);
    auto opr = y_opt.node()->owner_opr();
    ASSERT_EQ(2u, opr->input().size());
    ASSERT_EQ(2u,
              find_opr<opr::MultipleDeviceTensorHolder>(y_opt).output().size());
    HostTensorND y_got_val;
    graph->compile({make_callback_copy(y_opt, y_got_val)})->execute();
    MGB_ASSERT_TENSOR_EQ(y_expected_val, y_got_val);
}
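
// ParamMergeFormat: same as ParamMerge, but the params carry a non-default
// tensor format (Image2DPack4), so the merged holder must preserve formats
// via MultipleDeviceTensorWithFormatHolder.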
TEST(TestGoptInference, ParamMergeFormat) {
    auto cns = load_multiple_xpus(2);

    auto make_dv = [](const HostTensorND& hv) {
        TensorLayout layout{hv.layout(), hv.layout().dtype,
                            megdnn::Image2DPack4TensorFormat::make_raw(1, 64)};
        auto ret = std::make_shared<DeviceTensorND>(hv.comp_node(), layout);
        ret->copy_from_fixlayout(hv).sync();
        return ret;
    };

    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    auto var0 = opr::SharedDeviceTensorWithFormat::make(
                 *graph, make_dv(*gen({2, 32}, cns[0]))),
         var1 = opr::SharedDeviceTensorWithFormat::make(
                 *graph, make_dv(*gen({1, 32}, cns[1]))),
         y = var0 + opr::Copy::make(var1, {cns[0]});
    HostTensorND y_expected_val;
    graph->compile({make_callback_copy(y, y_expected_val)})->execute();

    SymbolVar y_opt;
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::ParamMergePass>()
                          .apply({{y}})
                          .endpoint_vars(),
                  y_opt);
    auto opr = y_opt.node()->owner_opr();
    ASSERT_EQ(2u, opr->input().size());
    ASSERT_EQ(2u, find_opr<opr::MultipleDeviceTensorWithFormatHolder>(y_opt)
                          .output()
                          .size());
    HostTensorND y_got_val;
    graph->compile({make_callback_copy(y_opt, y_got_val)})->execute();
    MGB_ASSERT_TENSOR_EQ(y_expected_val, y_got_val);
}
#if MGB_ENABLE_FASTRUN
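// AlgoProfile: gopt::enable_opr_algo_profiling_inplace walks the endpoint
// vars and flips every algo-aware opr (here a Convolution) from the default
// HEURISTIC execution strategy to PROFILE; ProfileCache and
// AlgoWorkspaceLimit below exercise PROFILE_HEURISTIC and the
// workspace-limit setter the same way.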
TEST(TestGoptInference, AlgoProfile) {
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    auto host_x = gen({4, 3, 8, 9}), host_y = gen({2, 3, 3, 3});
    auto x = opr::Host2DeviceCopy::make(*graph, host_x),
         y = opr::Host2DeviceCopy::make(*graph, host_y),
         z = opr::Convolution::make(x, y);
    auto&& conv = z.node()->owner_opr()->cast_final_safe<opr::Convolution>();
    using S = opr::Convolution::ExecutionPolicy::Strategy;
    ASSERT_EQ(S::HEURISTIC, conv.execution_policy_transient().strategy);
    gopt::enable_opr_algo_profiling_inplace({z + 2.3f});
    ASSERT_EQ(S::PROFILE, conv.execution_policy().strategy);
}
#endif

TEST(TestGoptInference, ProfileCache) {
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    auto host_x = gen({4, 3, 8, 9}), host_y = gen({2, 3, 3, 3});
    auto x = opr::Host2DeviceCopy::make(*graph, host_x),
         y = opr::Host2DeviceCopy::make(*graph, host_y),
         z = opr::Convolution::make(x, y);
    auto&& conv = z.node()->owner_opr()->cast_final_safe<opr::Convolution>();
    using S = opr::Convolution::ExecutionPolicy::Strategy;
    ASSERT_EQ(S::HEURISTIC, conv.execution_policy_transient().strategy);
    gopt::enable_opr_use_profiling_cache_inplace({z + 2.3f});
    ASSERT_EQ(S::PROFILE_HEURISTIC, conv.execution_policy().strategy);
}

TEST(TestGoptInference, AlgoWorkspaceLimit) {
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    auto host_x = gen({4, 3, 8, 9}), host_y = gen({2, 3, 3, 3});
    auto x = opr::Host2DeviceCopy::make(*graph, host_x),
         y = opr::Host2DeviceCopy::make(*graph, host_y),
         z = opr::Convolution::make(x, y);
    auto&& conv = z.node()->owner_opr()->cast_final_safe<opr::Convolution>();
    ASSERT_EQ(std::numeric_limits<uint64_t>::max(),
              conv.execution_policy_transient().workspace_limit);
    gopt::set_opr_algo_workspace_limit_inplace({z + 2.3f}, 10000u);
    ASSERT_EQ(10000u, conv.execution_policy().workspace_limit);
}
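
// FuseConvBiasNonlinPass.Basic: for each supported layout (NCHW, NHWC,
// NCHW4), a quantized conv + add + RELU + typecvt chain must rewrite to the
// equivalent hand-built ConvBias opr; check() compares the two graphs.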
TEST_PASS(FuseConvBiasNonlinPass, Basic) {
    auto cn = CompNode::load("xpux");

    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
                dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp,
                      const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                        .rename(name),
                dtype);
    };

    for (auto format : {opr::Convolution::Param::Format::NCHW,
                        opr::Convolution::Param::Format::NHWC,
                        opr::Convolution::Param::Format::NCHW4}) {
        opr::Convolution::Param param;
        param.format = format;
        SymbolVar x, w, b;
        if (format == opr::Convolution::Param::Format::NHWC) {
            x = mkvar("x", {20, 20, 20, 4}, dtype::QuantizedS8(2.5f)),
            w = mkcvar("w1", {24, 1, 1, 4}, dtype::QuantizedS8(2.5f)),
            b = mkcvar("b", {1, 1, 1, 24}, dtype::QuantizedS32(6.25f));
        } else if (format == opr::Convolution::Param::Format::NCHW) {
            x = mkvar("x", {20, 4, 20, 20}, dtype::QuantizedS8(2.5f)),
            w = mkcvar("w1", {24, 4, 1, 1}, dtype::QuantizedS8(2.5f)),
            b = mkcvar("b", {1, 24, 1, 1}, dtype::QuantizedS32(6.25f));
        } else {
            mgb_assert(format == opr::Convolution::Param::Format::NCHW4);
            x = mkvar("x", {20, 1, 20, 20, 4}, dtype::QuantizedS8(2.5f)),
            w = mkcvar("w1", {24, 1, 1, 1, 4}, dtype::QuantizedS8(2.5f)),
            b = mkcvar("b", {1, 6, 1, 1, 4}, dtype::QuantizedS32(6.25f));
        }
        auto y = opr::Convolution::make(x, w, param);
        y = opr::Elemwise::make({y + b}, opr::Elemwise::Param::Mode::RELU);
        y = opr::TypeCvt::make(y, dtype::QuantizedS8(2.5f));

        opr::ConvBias::Param conv_bias_param;
        conv_bias_param.format = format;
        conv_bias_param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
        auto concret_y = opr::ConvBias::make(
                x, w, b, conv_bias_param, {},
                OperatorNodeConfig{dtype::QuantizedS8(2.5f)});

        check(concret_y, y);
    }
}
#if MGB_CUDA
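// The TensorCore tests require an NVIDIA GPU with compute capability >= 7.5
// and check the NCHW4 -> NCHW32 conversion: NCHW32 packs 32 channels per
// block so quantized ConvBias can use TensorCore kernels, with layouts
// shuffled back at graph boundaries (hence the Dimshuffle counts asserted).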
TEST(TestEnableTensorCore, SmallInputShape) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
    auto sm_ver = prop.major * 10 + prop.minor;
    if (sm_ver < 75) {
        printf("This testcase is ignored due to insufficient cuda cap (got: "
               "%d, expected: %d)\n",
               sm_ver, 75);
        return;
    }

    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
                dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp,
                      const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                        .rename(name),
                dtype);
    };

    auto x = mkvar("x", {32, 16, 4, 8, 4}, dtype::QuantizedS8(2.5f)),
         w = mkcvar("w1", {64, 16, 3, 3, 4}, dtype::QuantizedS8(2.5f)),
         b = mkcvar("b", {1, 16, 1, 1, 4}, dtype::QuantizedS32(6.25f)),
         z = mkcvar("b1", {32, 16, 2, 4, 4}, dtype::QuantizedS8(2.5f));

    opr::ConvBias::Param param;
    param.format = opr::ConvBias::Param::Format::NCHW4;
    param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
    param.stride_h = param.stride_w = 2;
    param.pad_h = param.pad_w = 1;

    auto y = opr::ConvBias::make(x, w, b, z, param, {},
                                 OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
    y = opr::ConvBias::make(y, w, b, param, {},
                            OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
    y = opr::TypeCvt::make(y, dtype::Float32());

    SymbolVar y_opt;
    SymbolVar y_no_tc;
    {
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_nchw32().enable_fuse_conv_bias_nonlinearity();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    }
    {
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_fuse_conv_bias_nonlinearity();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_no_tc);
    }
    auto nr_dimshuffle = find_opr_num<mgb::opr::Dimshuffle>(y_opt);
    ASSERT_EQ(2u, nr_dimshuffle);

    HostTensorND host_y, host_y_opt;
    auto func = graph->compile({make_callback_copy(y_no_tc, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
}
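
// Nchw4Nchw: runs the same quantized ConvBias stack from both an NCHW and an
// NCHW4 source graph (mkshape builds the matching shapes, e.g. C=64 becomes
// {.., 16, .., .., 4} in NCHW4) and expects the NCHW32 conversion to produce
// identical results either way.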
TEST(TestEnableTensorCore, Nchw4Nchw) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
    auto sm_ver = prop.major * 10 + prop.minor;
    if (sm_ver < 75) {
        printf("This testcase is ignored due to insufficient cuda cap (got: "
               "%d, expected: %d)\n",
               sm_ver, 75);
        return;
    }

    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
                dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp,
                      const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                        .rename(name),
                dtype);
    };
    auto mkshape = [](opr::ConvBias::Param::Format format, size_t N, size_t C,
                      size_t H, size_t W) -> TensorShape {
        mgb_assert(C % 4 == 0);
        if (format == opr::ConvBias::Param::Format::NCHW4) {
            return {N, C / 4, H, W, 4};
        } else {
            mgb_assert(format == opr::ConvBias::Param::Format::NCHW);
            return {N, C, H, W};
        }
    };

    for (auto format : {opr::ConvBias::Param::Format::NCHW,
                        opr::ConvBias::Param::Format::NCHW4}) {
        auto x = mkvar("x", mkshape(format, 32, 64, 16, 16),
                       dtype::QuantizedS8(2.5f)),
             w = mkcvar("w1", mkshape(format, 64, 64, 3, 3),
                        dtype::QuantizedS8(2.5f)),
             b = mkcvar("b", mkshape(format, 1, 64, 1, 1),
                        dtype::QuantizedS32(6.25f)),
             z = mkcvar("b1", mkshape(format, 32, 64, 8, 8),
                        dtype::QuantizedS8(2.5f));
        opr::ConvBias::Param param;
        param.format = format;
        param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
        param.stride_h = param.stride_w = 2;
        param.pad_h = param.pad_w = 1;
        auto y = opr::ConvBias::make(
                x, w, b, z, param, {},
                OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
        y = opr::ConvBias::make(y, w, b, param, {},
                                OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
        y = opr::TypeCvt::make(y, dtype::Float32());

        SymbolVar y_opt;
        SymbolVar y_no_tc;
        {
            auto options = gopt::OptimizeForInferenceOptions{};
            options.enable_nchw32().enable_fuse_conv_bias_nonlinearity();
            unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
        }
        {
            auto options = gopt::OptimizeForInferenceOptions{};
            options.enable_fuse_conv_bias_nonlinearity();
            unpack_vector(gopt::optimize_for_inference({y}, options), y_no_tc);
        }
        auto nr_dimshuffle = find_opr_num<mgb::opr::Dimshuffle>(y_opt);
        std::string json_name;
        ASSERT_EQ(2u, nr_dimshuffle);
        if (format == opr::ConvBias::Param::Format::NCHW4) {
            json_name = "TestGoptInference.Nchw4Nchw.NCHW4.json";
        } else {
            mgb_assert(format == opr::ConvBias::Param::Format::NCHW);
            json_name = "TestGoptInference.Nchw4Nchw.NCHW.json";
        }
        graph->compile({{y_opt, {}}})
                ->to_json()
                ->writeto_fpath(output_file(json_name.c_str()));

        HostTensorND host_y, host_y_opt;
        auto func = graph->compile({make_callback_copy(y_no_tc, host_y),
                                    make_callback_copy(y_opt, host_y_opt)});
        func->execute();
        MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
    }
}
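
// ConvBiasWithZ: a ConvBias with an extra residual input z must survive the
// NCHW32 conversion and still match the non-TensorCore result bit-for-bit.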
TEST(TestEnableTensorCore, ConvBiasWithZ) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
    auto sm_ver = prop.major * 10 + prop.minor;
    if (sm_ver < 75) {
        printf("This testcase is ignored due to insufficient cuda cap (got: "
               "%d, expected: %d)\n",
               sm_ver, 75);
        return;
    }

    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
                dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp,
                      const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                        .rename(name),
                dtype);
    };

    auto x = mkvar("x", {32, 16, 16, 16, 4}, dtype::QuantizedS8(2.5f)),
         w = mkcvar("w1", {64, 16, 3, 3, 4}, dtype::QuantizedS8(2.5f)),
         b = mkcvar("b", {1, 16, 1, 1, 4}, dtype::QuantizedS32(6.25f)),
         z = mkvar("b1", {32, 16, 16, 16, 4}, dtype::QuantizedS8(2.5f));

    opr::ConvBias::Param param;
    param.format = opr::ConvBias::Param::Format::NCHW4;
    param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
    param.stride_h = param.stride_w = 1;
    param.pad_h = param.pad_w = 1;

    auto y = opr::ConvBias::make(x, w, b, z, param, {},
                                 OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
    y = opr::TypeCvt::make(y, dtype::Float32());

    SymbolVar y_opt;
    SymbolVar y_no_tc;
    {
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_fuse_conv_bias_nonlinearity().enable_nchw32();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    }
    {
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_fuse_conv_bias_nonlinearity();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_no_tc);
    }
    HostTensorND host_y, host_y_opt;
    auto func = graph->compile({make_callback_copy(y_no_tc, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
}
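
// EnableTensorCore: the intermediate y feeds several elementwise users, so
// after the NCHW32 conversion only boundary relayouts should remain (three
// Dimshuffle oprs are expected); results are checked against the graph
// optimized without the NCHW32 pass.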
TEST(TestGoptInference, EnableTensorCore) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
    auto sm_ver = prop.major * 10 + prop.minor;
    if (sm_ver < 75) {
        printf("This testcase is ignored due to insufficient cuda cap (got: "
               "%d, expected: %d)\n",
               sm_ver, 75);
        return;
    }

    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
                dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp,
                      const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                        .rename(name),
                dtype);
    };

    auto x = mkvar("x", {32, 16, 16, 16, 4}, dtype::QuantizedS8(2.5f)),
         w = mkcvar("w1", {64, 16, 3, 3, 4}, dtype::QuantizedS8(2.5f)),
         b = mkcvar("b", {1, 16, 1, 1, 4}, dtype::QuantizedS32(6.25f)),
         b1 = mkvar("b1", {32, 16, 16, 16, 4}, dtype::QuantizedS8(2.5f));

    opr::Convolution::Param param;
    param.format = opr::Convolution::Param::Format::NCHW4;
    param.stride_h = param.stride_w = 1;
    param.pad_h = param.pad_w = 1;

    auto y = opr::Convolution::make(x, w, param);
    y = opr::Elemwise::make({y + b}, opr::Elemwise::Param::Mode::RELU);
    y = opr::TypeCvt::make(y, dtype::QuantizedS8(2.5f));

    auto y1 = y + b1, y2 = opr::Convolution::make(y, w, param),
         y3 = opr::Elemwise::make({y - b1}, opr::Elemwise::Param::Mode::RELU);
    y2 = opr::Elemwise::make({y2 + b}, opr::Elemwise::Param::Mode::RELU),
    y2 = opr::TypeCvt::make(y2, dtype::QuantizedS8(2.5f));
    auto y4 = y1 + y2 + y3;
    y4 = opr::TypeCvt::make(y4, dtype::Float32());

    SymbolVar y_opt;
    SymbolVar y_no_tc;
    {
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_fuse_conv_bias_nonlinearity().enable_nchw32();
        unpack_vector(gopt::optimize_for_inference({y4}, options), y_opt);
    }
    {
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_fuse_conv_bias_nonlinearity();
        unpack_vector(gopt::optimize_for_inference({y4}, options), y_no_tc);
    }
    auto nr_dimshuffle = find_opr_num<mgb::opr::Dimshuffle>(y_opt);
    ASSERT_EQ(3u, nr_dimshuffle);

    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(
                    output_file("TestGoptInference.EnableTensorCorePass.json"));

    HostTensorND host_y, host_y_opt;
    auto func = graph->compile({make_callback_copy(y_no_tc, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
}
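
// BlockFuse: z-fusion across a residual block. The pass should fold the
// QFUSE_ADD_* elementwise into the producing ConvBias so that only the final
// QADD remains (one ElemwiseMultiType opr), matching a manually fused graph.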
TEST(FuseConvBiasZPass, BlockFuse) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
    auto sm_ver = prop.major * 10 + prop.minor;
    if (sm_ver < 61) {
        printf("This testcase is ignored due to insufficient cuda cap (got: "
               "%d, expected: %d)\n",
               sm_ver, 61);
        return;
    }

    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
                dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp,
                      const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                        .rename(name),
                dtype);
    };

    using ElemMultiMode = opr::ElemwiseMultiType::Param::Mode;
    using NonlineMode = opr::ConvBias::Param::NonlineMode;
    for (auto mode :
         {ElemMultiMode::QFUSE_ADD_RELU, ElemMultiMode::QFUSE_ADD_H_SWISH}) {
        auto x = mkvar("x", {32, 16, 16, 16, 4}, dtype::QuantizedS8(2.5f)),
             w1 = mkcvar("w1", {64, 16, 3, 3, 4}, dtype::QuantizedS8(2.5f)),
             b1 = mkcvar("b1", {1, 16, 1, 1, 4}, dtype::QuantizedS32(6.25f)),
             w2 = mkcvar("w2", {64, 16, 3, 3, 4}, dtype::QuantizedS8(2.5f)),
             b2 = mkcvar("b2", {1, 16, 1, 1, 4}, dtype::QuantizedS32(6.25f)),
             w3 = mkcvar("w3", {64, 16, 3, 3, 4}, dtype::QuantizedS8(2.5f)),
             b3 = mkcvar("b3", {1, 16, 1, 1, 4}, dtype::QuantizedS32(3.0f));
        NonlineMode nonline_mode = NonlineMode::RELU;
        if (mode == ElemMultiMode::QFUSE_ADD_H_SWISH) {
            nonline_mode = NonlineMode::H_SWISH;
        }

        opr::ConvBias::Param param;
        param.format = opr::Convolution::Param::Format::NCHW4;
        param.nonlineMode = nonline_mode;
        param.stride_h = param.stride_w = 1;
        param.pad_h = param.pad_w = 1;
        auto y1 = opr::ConvBias::make(
                x, w1, b1, param, {},
                OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
        param.nonlineMode = opr::ConvBias::Param::NonlineMode::IDENTITY;
        auto y2 = opr::ConvBias::make(
                     y1, w2, b2, param, {},
                     OperatorNodeConfig{dtype::QuantizedS8(2.5f)}),
             y3 = opr::ElemwiseMultiType::make(
                     {y1, y2}, {mode},
                     OperatorNodeConfig{dtype::QuantizedS8(1.2f)});
        param.nonlineMode = nonline_mode;
        auto y4 = opr::ConvBias::make(
                     y3, w3, b3, param, {},
                     OperatorNodeConfig{dtype::QuantizedS8(2.5f)}),
             z = opr::ElemwiseMultiType::make(
                     {y3, y4}, {opr::ElemwiseMultiType::Param::Mode::QADD},
                     OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
        z = opr::TypeCvt::make(z, dtype::Float32());

        //! fuse z manually
        auto z0 = opr::ConvBias::make(
                x, w1, b1, param, {},
                OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
        auto z1 = opr::ConvBias::make(
                     z0, w2, b2, z0, param, {},
                     OperatorNodeConfig{dtype::QuantizedS8(1.2f)}),
             z2 = opr::ConvBias::make(
                     z1, w3, b3, param, {},
                     OperatorNodeConfig{dtype::QuantizedS8(2.5f)}),
             z4 = opr::ElemwiseMultiType::make(
                     {z1, z2}, {opr::ElemwiseMultiType::Mode::QADD},
                     OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
        z4 = opr::TypeCvt::make(z4, dtype::Float32());

        SymbolVar z_fuse;
        SymbolVar z_nonfuse;
        {
            auto options = gopt::OptimizeForInferenceOptions{};
            options.enable_fuse_conv_bias_nonlinearity()
                    .enable_fuse_conv_bias_with_z();
            unpack_vector(gopt::optimize_for_inference({z}, options), z_fuse);
        }
        {
            auto options = gopt::OptimizeForInferenceOptions{};
            options.enable_fuse_conv_bias_nonlinearity();
            unpack_vector(gopt::optimize_for_inference({z4}, options),
                          z_nonfuse);
        }
        auto nr_elem_multi_type =
                find_opr_num<mgb::opr::ElemwiseMultiType>(z_fuse);
        MGB_MARK_USED_VAR(nr_elem_multi_type);
        ASSERT_EQ(1u, nr_elem_multi_type);
        graph->compile({{z_fuse, {}}})
                ->to_json()
                ->writeto_fpath(
                        output_file("FuseConvBiasZPass.BlockFuse_fuse.json"));
        graph->compile({{z_nonfuse, {}}})
                ->to_json()
                ->writeto_fpath(output_file(
                        "FuseConvBiasZPass.BlockFuse_nonfuse.json"));

        HostTensorND host_z_fuse, host_z_nonfuse;
        auto func =
                graph->compile({make_callback_copy(z_nonfuse, host_z_nonfuse),
                                make_callback_copy(z_fuse, host_z_fuse)});
        func->execute();
        MGB_ASSERT_TENSOR_EQ(host_z_fuse, host_z_nonfuse);
    }
}
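
// ShuffleMerge: the graph enters NCHW4 via explicit Reshape+Dimshuffle pairs
// (nchw2nchw4: {N,C,H,W} -> {N,C/4,4,H,W} -> {N,C/4,H,W,4}, and the inverse
// nchw42nchw); the NCHW32 pass should merge these user shuffles with its own
// relayouts instead of stacking extra ones (the Dimshuffle count is pinned).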
TEST(TestEnableTensorCore, ShuffleMerge) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
    auto sm_ver = prop.major * 10 + prop.minor;
    if (sm_ver < 75) {
        printf("This testcase is ignored due to insufficient cuda cap (got: "
               "%d, expected: %d)\n",
               sm_ver, 75);
        return;
    }

    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
                dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp,
                      const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                        .rename(name),
                dtype);
    };

    auto nchw2nchw4 = [](SymbolVar x) {
        auto xshp = opr::GetVarShape::make(x);
        auto cv = [&x](int v) { return x.make_scalar(v); };
        auto sub = [&xshp, &cv](int idx) {
            return opr::IndexAt::make(xshp, {{0, cv(idx)}});
        };
        auto tshp = opr::Concat::make(
                {sub(0), sub(1) / 4, cv(4), sub(2), sub(3)}, 0);
        auto y0 = opr::Reshape::make(x, tshp);
        auto y1 = opr::Dimshuffle::make(y0, {0, 1, 3, 4, 2});
        return y1;
    };
    auto nchw42nchw = [](SymbolVar x) {
        auto xshp = opr::GetVarShape::make(x);
        auto cv = [&x](int v) { return x.make_scalar(v); };
        auto sub = [&xshp, &cv](int idx) {
            return opr::IndexAt::make(xshp, {{0, cv(idx)}});
        };
        auto tshp = opr::Concat::make({sub(0), sub(1) * 4, sub(2), sub(3)}, 0);
        auto y0 = opr::Dimshuffle::make(x, {0, 1, 4, 2, 3});
        auto y1 = opr::Reshape::make(y0, tshp);
        return y1;
    };

    auto x = mkvar("x", {32, 64, 16, 16}, dtype::QuantizedS8(2.5f)),
         w = mkcvar("w1", {64, 64, 3, 3}, dtype::QuantizedS8(2.5f)),
         b = mkcvar("b", {1, 64, 1, 1}, dtype::QuantizedS32(6.25f)),
         z = mkvar("b1", {32, 64, 16, 16}, dtype::QuantizedS8(2.5f));
    x = nchw2nchw4(x), w = nchw2nchw4(w), b = nchw2nchw4(b), z = nchw2nchw4(z);

    opr::ConvBias::Param param;
    param.format = opr::ConvBias::Param::Format::NCHW4;
    param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
    param.stride_h = param.stride_w = 1;
    param.pad_h = param.pad_w = 1;

    auto y = opr::ConvBias::make(x, w, b, z, param, {},
                                 OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
    y = nchw42nchw(y);
    y = opr::TypeCvt::make(y, dtype::Float32());

    SymbolVar y_opt;
    SymbolVar y_no_tc;
    {
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_fuse_conv_bias_nonlinearity().enable_nchw32();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    }
    {
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_fuse_conv_bias_nonlinearity();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_no_tc);
    }
    auto nr_dimshuffle = find_opr_num<mgb::opr::Dimshuffle>(y_opt);
    ASSERT_EQ(3u, nr_dimshuffle);

    HostTensorND host_y, host_y_opt;
    auto func = graph->compile({make_callback_copy(y_no_tc, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
}
#endif
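
// FuseConvBiasZPass.Basic: only elementwise modes that map onto ConvBias's
// fused-z semantics may be folded: QADD and QFUSE_ADD_RELU fuse (no
// ElemwiseMultiType left), QMUL must not; a second z-add on an already fused
// ConvBias must also stay unfused.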
TEST(FuseConvBiasZPass, Basic) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");

    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
                dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp,
                      const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                        .rename(name),
                dtype);
    };

    auto format = opr::Convolution::Param::Format::NCHW4;
    auto x = mkvar("x", {32, 16, 16, 16, 4}, dtype::QuantizedS8(2.5f)),
         w = mkcvar("w1", {64, 16, 3, 3, 4}, dtype::QuantizedS8(2.5f)),
         b = mkcvar("b", {1, 16, 1, 1, 4}, dtype::QuantizedS32(6.25f)),
         b1 = mkvar("b1", {32, 16, 16, 16, 4}, dtype::QuantizedS8(2.5f)),
         b2 = mkvar("b2", {32, 16, 16, 16, 4}, dtype::QuantizedS8(2.5f));

    opr::ConvBias::Param conv_bias_param;
    conv_bias_param.format = format;
    conv_bias_param.stride_h = conv_bias_param.stride_w = 1;
    conv_bias_param.pad_h = conv_bias_param.pad_w = 1;
    auto y = opr::ConvBias::make(x, w, b, conv_bias_param, {},
                                 OperatorNodeConfig{dtype::QuantizedS8(2.5f)});

    SymbolVar y_opt;
    // check fuse mode
    for (auto mode : {opr::ElemwiseMultiType::Param::Mode::QADD,
                      opr::ElemwiseMultiType::Param::Mode::QMUL,
                      opr::ElemwiseMultiType::Param::Mode::QFUSE_ADD_RELU}) {
        auto y1 = opr::ElemwiseMultiType::make(
                {y, b1}, {mode}, OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
        {
            auto options = gopt::OptimizeForInferenceOptions{};
            options.enable_fuse_conv_bias_nonlinearity()
                    .enable_fuse_conv_bias_with_z()
                    .enable_nchw32();
            unpack_vector(gopt::optimize_for_inference({y1}, options), y_opt);
        }
        auto nr_elemwisemultitype = find_opr_num<opr::ElemwiseMultiType>(y_opt);
        if (mode == opr::ElemwiseMultiType::Param::Mode::QMUL) {
            ASSERT_NE(0u, nr_elemwisemultitype);
        } else {
            ASSERT_EQ(0u, nr_elemwisemultitype);
        }
        // fuse convbiasz and z
        if (mode == opr::ElemwiseMultiType::Param::Mode::QADD) {
            auto y2 = opr::ElemwiseMultiType::make(
                    {y1, b2}, {mode},
                    OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
            {
                auto options = gopt::OptimizeForInferenceOptions{};
                options.enable_fuse_conv_bias_nonlinearity()
                        .enable_fuse_conv_bias_with_z()
                        .enable_nchw32();
                unpack_vector(gopt::optimize_for_inference({y2}, options),
                              y_opt);
            }
            auto nr_elemwisemultitype =
                    find_opr_num<opr::ElemwiseMultiType>(y_opt);
            ASSERT_NE(0u, nr_elemwisemultitype);
        }
    }
}
#if MGB_CUDA
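// The CHWN4 tests cover EnableCHWN4Pass: CHWN4 stores tensors as roughly
// {C/4, H, W, N, 4}, targeting int8 inference on pre-TensorCore GPUs
// (sm_ver >= 61); results must match the plain cuDNN-oriented graph that only
// runs the ConvBias fusion passes.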
TEST(TestGoptInference, EnableCHWN4) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
    auto sm_ver = prop.major * 10 + prop.minor;
    if (sm_ver < 61) {
        printf("This testcase is ignored due to insufficient cuda cap (got: "
               "%d, expected: %d)\n",
               sm_ver, 61);
        return;
    }

    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
                dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp,
                      const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                        .rename(name),
                dtype);
    };
    auto mkshape = [](opr::ConvBias::Param::Format format, size_t N, size_t C,
                      size_t H, size_t W) -> TensorShape {
        mgb_assert(C % 4 == 0);
        if (format == opr::ConvBias::Param::Format::NCHW4) {
            return {N, C / 4, H, W, 4};
        } else {
            mgb_assert(format == opr::ConvBias::Param::Format::NCHW);
            return {N, C, H, W};
        }
    };

    for (auto format : {opr::ConvBias::Param::Format::NCHW,
                        opr::ConvBias::Param::Format::NCHW4}) {
        auto x = mkvar("x", mkshape(format, 32, 64, 16, 16),
                       dtype::QuantizedS8(2.5f)),
             w = mkcvar("w1", mkshape(format, 64, 64, 3, 3),
                        dtype::QuantizedS8(2.5f)),
             b = mkcvar("b", mkshape(format, 1, 64, 1, 1),
                        dtype::QuantizedS32(6.25f)),
             b1 = mkvar("b1", mkshape(format, 32, 64, 16, 16),
                        dtype::QuantizedS8(2.5f));
        opr::ConvBias::Param param;
        param.format = format;
        param.stride_h = param.stride_w = 1;
        param.pad_h = param.pad_w = 1;
        param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;

        auto y = opr::ConvBiasForward::make(
                x, w, b, param, {},
                OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
        auto y1 = opr::ElemwiseMultiType::make(
                {y, b1}, opr::ElemwiseMultiType::Mode::QFUSE_ADD_RELU,
                OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
        auto y2 = opr::ConvBiasForward::make(
                y, w, b, param, {},
                OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
        auto y3 = opr::ElemwiseMultiType::make(
                {y, b1}, opr::ElemwiseMultiType::Param::Mode::QSUB,
                OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
        auto y4 = opr::ElemwiseMultiType::make(
                {y1, y2}, opr::ElemwiseMultiType::Param::Mode::QADD,
                OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
        y4 = opr::ElemwiseMultiType::make(
                {y3, y4}, opr::ElemwiseMultiType::Param::Mode::QADD,
                OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
        y4 = opr::TypeCvt::make(y4, dtype::Float32());

        SymbolVar y_opt;
        SymbolVar y_cudnn;
        {
            auto options = gopt::OptimizeForInferenceOptions{};
            options.enable_chwn4();
            unpack_vector(gopt::optimize_for_inference({y4}, options), y_opt);
        }
        unpack_vector(gopt::GraphOptimizer{}
                              .add_pass<gopt::FuseConvBiasNonlinPass>()
                              .add_pass<gopt::FuseConvBiasZPass>()
                              .apply({{y4}})
                              .endpoint_vars(),
                      y_cudnn);
        ASSERT_EQ(opr::ConvBias::Param::Format::CHWN4,
                  find_opr<opr::ConvBias>(y_opt).param().format);

        HostTensorND host_y, host_y_opt;
        auto func = graph->compile({make_callback_copy(y_cudnn, host_y),
                                    make_callback_copy(y_opt, host_y_opt)});
        func->execute();
        MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
    }
}
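
// EnableCHWN4WarpPespective: the graph mixes an NCHW4 WarpPerspective fed by
// the ConvBias with a trailing NCHW one; the CHWN4 conversion must keep both
// producing the same values as the unconverted graph.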
TEST(TestGoptInference, EnableCHWN4WarpPespective) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
    auto sm_ver = prop.major * 10 + prop.minor;
    if (sm_ver < 61) {
        printf("This testcase is ignored due to insufficient cuda cap (got: "
               "%d, expected: %d)\n",
               sm_ver, 61);
        return;
    }

    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
                dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp,
                      const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                        .rename(name),
                dtype);
    };

    std::shared_ptr<HostTensorND> mat = std::make_shared<HostTensorND>(
            cn, TensorShape{32, 3, 3}, dtype::Float32());
    warp_perspective_mat_gen(*mat, 32, 16, 16);
    auto mat_var = opr::Host2DeviceCopy::make(*graph, mat).rename("mat");

    auto x = mkvar("x", {32, 16, 16, 16, 4}, dtype::QuantizedS8(2.5f)),
         w = mkcvar("w1", {64, 16, 3, 3, 4}, dtype::QuantizedS8(2.5f)),
         b = mkcvar("b", {1, 16, 1, 1, 4}, dtype::QuantizedS32(6.25f));
    opr::ConvBias::Param param;
    param.format = opr::ConvBias::Param::Format::NCHW4;
    param.stride_h = param.stride_w = 1;
    param.pad_h = param.pad_w = 1;
    param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;

    auto y = opr::ConvBiasForward::make(
            x, w, b, param, {}, OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
    opr::WarpPerspective::Param warp_param;
    warp_param.format = opr::WarpPerspective::Param::Format::NCHW4;
    auto y1 = opr::WarpPerspective::make(y, mat_var, TensorShape{16, 16},
                                         warp_param);
    y1 = opr::TypeCvt::make(y1, dtype::Float32());

    auto nchw42nchw = [](SymbolVar x) {
        auto xshp = opr::GetVarShape::make(x);
        auto cv = [&x](int v) { return x.make_scalar(v); };
        auto sub = [&xshp, &cv](int idx) {
            return opr::IndexAt::make(xshp, {{0, cv(idx)}});
        };
        auto tshp = opr::Concat::make({sub(0), sub(1) * 4, sub(2), sub(3)}, 0);
        auto y0 = opr::Dimshuffle::make(x, {0, 1, 4, 2, 3});
        auto y1 = opr::Reshape::make(y0, tshp);
        return y1;
    };
    y1 = nchw42nchw(y1);
    warp_param.format = opr::WarpPerspective::Param::Format::NCHW;
    auto y2 = opr::WarpPerspective::make(y1, mat_var, TensorShape{16, 16},
                                         warp_param);

    SymbolVar y_opt;
    SymbolVar y_cudnn;
    {
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_chwn4();
        unpack_vector(gopt::optimize_for_inference({y2}, options), y_opt);
    }
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::FuseConvBiasNonlinPass>()
                          .add_pass<gopt::FuseConvBiasZPass>()
                          .apply({{y2}})
                          .endpoint_vars(),
                  y_cudnn);

    HostTensorND host_y, host_y_opt;
    auto func = graph->compile({make_callback_copy(y_cudnn, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
}
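
// EnableCHWN4Pooling: same setup with an NCHW4 Pooling after the ConvBias;
// the pooling must not block the CHWN4 conversion, so the converted graph is
// checked against one that only runs the fusion passes.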
TEST(TestGoptInference, EnableCHWN4Pooling) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
    auto sm_ver = prop.major * 10 + prop.minor;
    if (sm_ver < 61) {
        printf("This testcase is ignored due to insufficient cuda cap (got: "
               "%d, expected: %d)\n",
               sm_ver, 61);
        return;
    }

    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
                dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp,
                      const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                        .rename(name),
                dtype);
    };

    auto x = mkvar("x", {32, 16, 16, 16, 4}, dtype::QuantizedS8(2.5f)),
         w = mkcvar("w1", {64, 16, 3, 3, 4}, dtype::QuantizedS8(2.5f)),
         b = mkcvar("b", {1, 16, 1, 1, 4}, dtype::QuantizedS32(6.25f));
    opr::ConvBias::Param param;
    param.format = opr::ConvBias::Param::Format::NCHW4;
    param.stride_h = param.stride_w = 1;
    param.pad_h = param.pad_w = 1;
    param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;

    auto y = opr::ConvBiasForward::make(
            x, w, b, param, {}, OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
    opr::Pooling::Param pool_param;
    pool_param.format = opr::Pooling::Param::Format::NCHW4;
    y = opr::Pooling::make(y, pool_param);
    y = opr::TypeCvt::make(y, dtype::Float32());

    auto nchw42nchw = [](SymbolVar x) {
        auto xshp = opr::GetVarShape::make(x);
        auto cv = [&x](int v) { return x.make_scalar(v); };
        auto sub = [&xshp, &cv](int idx) {
            return opr::IndexAt::make(xshp, {{0, cv(idx)}});
        };
        auto tshp = opr::Concat::make({sub(0), sub(1) * 4, sub(2), sub(3)}, 0);
        auto y0 = opr::Dimshuffle::make(x, {0, 1, 4, 2, 3});
        auto y1 = opr::Reshape::make(y0, tshp);
        return y1;
    };
    y = nchw42nchw(y);
    pool_param.format = opr::Pooling::Param::Format::NCHW;
    auto y1 = opr::Pooling::make(y, pool_param);

    SymbolVar y_opt;
    SymbolVar y_cudnn;
    unpack_vector(
            gopt::GraphOptimizer{}
                    .add_pass<gopt::FuseConvBiasNonlinPass>()
                    .add_pass(gopt::EnableCHWN4Pass::make_chwn4_converter())
                    .add_pass<gopt::FuseConvBiasZPass>()
                    .apply({{y1}})
                    .endpoint_vars(),
            y_opt);
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::FuseConvBiasNonlinPass>()
                          .add_pass<gopt::FuseConvBiasZPass>()
                          .apply({{y1}})
                          .endpoint_vars(),
                  y_cudnn);

    HostTensorND host_y, host_y_opt;
    auto func = graph->compile({make_callback_copy(y_cudnn, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
}
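
// EnableCHWN4ShuffleRemove: after converting to CHWN4,
// ShuffleShuffleRemovePass should cancel the user-inserted Reshape/Dimshuffle
// pairs against the ones the converter adds, leaving only two Dimshuffles and
// no RelayoutFormat opr, as asserted below.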
TEST(TestGoptInference, EnableCHWN4ShuffleRemove) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
    auto sm_ver = prop.major * 10 + prop.minor;
    if (sm_ver < 61) {
        printf("This testcase is ignored due to insufficient cuda cap (got: "
               "%d, expected: %d)\n",
               sm_ver, 61);
        return;
    }

    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
                dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp,
                      const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                        .rename(name),
                dtype);
    };

    auto nchw2nchw4 = [](SymbolVar x) {
        auto xshp = opr::GetVarShape::make(x);
        auto cv = [&x](int v) { return x.make_scalar(v); };
        auto sub = [&xshp, &cv](int idx) {
            return opr::IndexAt::make(xshp, {{0, cv(idx)}});
        };
        auto tshp = opr::Concat::make(
                {sub(0), sub(1) / 4, cv(4), sub(2), sub(3)}, 0);
        auto y0 = opr::Reshape::make(x, tshp);
        auto y1 = opr::Dimshuffle::make(y0, {0, 1, 3, 4, 2});
        return y1;
    };
    auto nchw42nchw = [](SymbolVar x) {
        auto xshp = opr::GetVarShape::make(x);
        auto cv = [&x](int v) { return x.make_scalar(v); };
        auto sub = [&xshp, &cv](int idx) {
            return opr::IndexAt::make(xshp, {{0, cv(idx)}});
        };
        auto tshp = opr::Concat::make({sub(0), sub(1) * 4, sub(2), sub(3)}, 0);
        auto y0 = opr::Dimshuffle::make(x, {0, 1, 4, 2, 3});
        auto y1 = opr::Reshape::make(y0, tshp);
        return y1;
    };

    auto x = mkvar("x", {32, 64, 16, 16}, dtype::QuantizedS8(2.5f)),
         w = mkcvar("w1", {64, 16, 3, 3, 4}, dtype::QuantizedS8(2.5f)),
         b = mkcvar("b", {1, 16, 1, 1, 4}, dtype::QuantizedS32(6.25f)),
         b1 = mkcvar("b1", {32, 16, 16, 16, 4}, dtype::QuantizedS8{2.5f});
    x = nchw2nchw4(x);

    opr::ConvBias::Param param;
    param.format = opr::ConvBias::Param::Format::NCHW4;
    param.stride_h = param.stride_w = 1;
    param.pad_h = param.pad_w = 1;
    param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;

    auto y = opr::ConvBiasForward::make(
            x, w, b, param, {}, OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
    auto y1 = opr::ElemwiseMultiType::make(
            {y, b1}, opr::ElemwiseMultiType::Mode::QFUSE_ADD_RELU,
            OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
    auto y2 = opr::ConvBiasForward::make(
            y, w, b, param, {}, OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
    auto y3 = opr::ElemwiseMultiType::make(
            {y, b1}, opr::ElemwiseMultiType::Param::Mode::QSUB,
            OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
    auto y4 = opr::ElemwiseMultiType::make(
            {y1, y2}, opr::ElemwiseMultiType::Param::Mode::QADD,
            OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
    y4 = opr::ElemwiseMultiType::make(
            {y3, y4}, opr::ElemwiseMultiType::Param::Mode::QADD,
            OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
    y4 = opr::TypeCvt::make(y4, dtype::Float32());
    y4 = nchw42nchw(y4);

    SymbolVar y_opt;
    SymbolVar y_cudnn;
    unpack_vector(
            gopt::GraphOptimizer{}
                    .add_pass<gopt::ParamRedistributePass>()
                    .add_pass<gopt::ParamFusePass>()
                    .add_pass<gopt::FuseConvBiasNonlinPass>()
                    .add_pass<gopt::FuseConvBiasZPass>()
                    .add_pass(gopt::EnableCHWN4Pass::make_chwn4_converter())
                    .add_pass<gopt::ShuffleShuffleRemovePass>()
                    .add_pass<gopt::ParamFusePass>()
                    .apply({{y4}})
                    .endpoint_vars(),
            y_opt);
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(output_file(
                    "TestGoptInference.EnableCHWN4ShuffleRemove.json"));
    auto nr_dimshuffle = find_opr_num<mgb::opr::Dimshuffle>(y_opt);
    ASSERT_EQ(2u, nr_dimshuffle);
    auto nr_reformat = find_opr_num<mgb::opr::RelayoutFormat>(y_opt);
    ASSERT_EQ(0u, nr_reformat);

    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::FuseConvBiasNonlinPass>()
                          .add_pass<gopt::FuseConvBiasZPass>()
                          .apply({{y4}})
                          .endpoint_vars(),
                  y_cudnn);

    HostTensorND host_y, host_y_opt;
    auto func = graph->compile({make_callback_copy(y_cudnn, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
}
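
// ConvertFormatNCHW4GPU: the NCHW4 converter handles both dense and group
// ConvBias; weights are reshaped into the packed layout (hence the two
// Reshape oprs asserted below) and the whole chain switches to NCHW4.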
TEST(TestGoptInference, ConvertFormatNCHW4GPU) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
    auto sm_ver = prop.major * 10 + prop.minor;
    if (sm_ver < 61) {
        printf("This testcase is ignored due to insufficient cuda cap (got: "
               "%d, expected: %d)\n",
               sm_ver, 61);
        return;
    }
    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
                dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp,
                      const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                        .rename(name),
                dtype);
    };
    auto x = mkvar("x", {2, 4, 16, 16}, dtype::QuantizedS8(2.5f));
    opr::ConvBias::Param param_conv_bias;
    param_conv_bias.format = opr::ConvBias::Param::Format::NCHW;
    param_conv_bias.stride_h = param_conv_bias.stride_w = 1;
    param_conv_bias.pad_h = param_conv_bias.pad_w = 1;
    param_conv_bias.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
    // dense
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::DENSE;
    auto w1 = mkcvar("w1", {8, 4, 3, 3}, dtype::QuantizedS8(2.5f)),
         b1 = mkcvar("b1", {1, 8, 1, 1}, dtype::QuantizedS32(6.25f));
    auto conv1 = opr::ConvBiasForward::make(
            x, w1, b1, param_conv_bias, {},
            OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
    // group
    // icpg != 1 && ocpg != 1
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::GROUP;
    auto w2 = mkcvar("w2", {2, 4, 4, 3, 3}, dtype::QuantizedS8(2.5f)),
         b2 = mkcvar("b2", {1, 8, 1, 1}, dtype::QuantizedS32(6.25f));
    auto conv2 = opr::ConvBiasForward::make(
            conv1, w2, b2, param_conv_bias, {},
            OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
    auto y = opr::TypeCvt::make(conv2, dtype::Float32());
    SymbolVar y_opt;
    {
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_nchw4();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    }
    ASSERT_EQ(opr::ConvBias::Param::Format::NCHW4,
              find_opr<opr::ConvBias>(y_opt).param().format);
    auto nr_reshape = find_opr_num<mgb::opr::Reshape>(y_opt);
    ASSERT_EQ(2u, nr_reshape);
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(output_file(
                    "TestGoptInference.ConvertFormatNCHW4GPU.json"));
    HostTensorND host_y, host_y_opt;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
}
#endif
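
// Checks that the NCHW4 converter also rewrites the non-convolution oprs
// sandwiched between the ConvBias oprs (Resize, WarpPerspective, Pooling)
// to Format::NCHW4, so that only two Dimshuffles remain at the graph
// boundary instead of a relayout around every opr.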
TEST(TestGoptInference, ConvertFormatNCHW4NonConvOpr) {
    auto cn = CompNode::load("xpu0");
    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
                dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp,
                      const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                        .rename(name),
                dtype);
    };
    auto mkcvarf32 = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                .rename(name);
    };
    auto x = mkvar("x", {2, 4, 16, 16}, dtype::QuantizedS8(2.5f));
    opr::ConvBias::Param param_conv_bias;
    param_conv_bias.format = opr::ConvBias::Param::Format::NCHW;
    param_conv_bias.stride_h = param_conv_bias.stride_w = 1;
    param_conv_bias.pad_h = param_conv_bias.pad_w = 1;
    param_conv_bias.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
    // dense
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::DENSE;
    auto w1 = mkcvar("w1", {8, 4, 3, 3}, dtype::QuantizedS8(2.5f)),
         b1 = mkcvar("b1", {1, 8, 1, 1}, dtype::QuantizedS32(6.25f));
    auto conv1 = opr::ConvBiasForward::make(
            x, w1, b1, param_conv_bias, {},
            OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
    // test Resize
    auto shape_of = opr::GetVarShape::make(x);
    auto subtensor = opr::Subtensor::make(
            shape_of, {opr::Subtensor::AxisIndexer::make_interval(
                              0, x.make_scalar(2), None, x.make_scalar(1))});
    opr::Resize::Param param_resize;
    param_resize.format = opr::Resize::Param::Format::NCHW;
    auto resize = opr::ResizeForward::make(conv1, subtensor * 2, param_resize);
    // test WarpPerspective
    auto mat = mkcvarf32("mat", {2, 3, 3}),
         warp = opr::WarpPerspectiveForward::make(
                 resize, mat, nullptr, cg::var_from_tensor_shape(x, {32, 32}));
    opr::Pooling::Param pool_param;
    pool_param.format = opr::Pooling::Param::Format::NCHW;
    // test Pooling
    auto pool = opr::Pooling::make(warp, pool_param);
    // group
    // icpg != 1 && ocpg != 1
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::GROUP;
    auto w2 = mkcvar("w2", {2, 4, 4, 3, 3}, dtype::QuantizedS8(2.5f)),
         b2 = mkcvar("b2", {1, 8, 1, 1}, dtype::QuantizedS32(6.25f));
    auto conv2 = opr::ConvBiasForward::make(
            pool, w2, b2, param_conv_bias, {},
            OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
    auto add = opr::ElemwiseMultiType::make(
            {conv1, conv2}, {opr::ElemwiseMultiType::Param::Mode::QADD},
            OperatorNodeConfig{dtype::QuantizedS8{1.2f}});
    auto y = opr::TypeCvt::make(add, dtype::Float32());
    SymbolVar y_opt;
    {
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_nchw4();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    }
    auto nr_dimshuffle = find_opr_num<mgb::opr::Dimshuffle>(y_opt);
    ASSERT_EQ(2u, nr_dimshuffle);
    ASSERT_EQ(opr::ConvBias::Param::Format::NCHW4,
              find_opr<opr::ConvBias>(y_opt).param().format);
    ASSERT_EQ(opr::ResizeForward::Param::Format::NCHW4,
              find_opr<opr::ResizeForward>(y_opt).param().format);
    ASSERT_EQ(opr::WarpPerspectiveForward::Param::Format::NCHW4,
              find_opr<opr::WarpPerspectiveForward>(y_opt).param().format);
    ASSERT_EQ(opr::PoolingForward::Param::Format::NCHW4,
              find_opr<opr::PoolingForward>(y_opt).param().format);
}
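
// Float32 graph on CPU: the pass is expected to leave the float ConvBias
// oprs in Format::NCHW. For reference, NCHW4 packs every 4 channels into
// a trailing axis:
//     NCHW {N, C, H, W}  ->  NCHW4 {N, C/4, H, W, 4}
// e.g. {2, 8, 16, 16} becomes {2, 2, 16, 16, 4}.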
TEST(TestGoptInference, ConvertFormatNCHW4) {
    HostTensorGenerator<> gen;
    auto cn = CompNode::load("cpu0");
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                .rename(name);
    };
    auto x = mkvar("x", {2, 4, 16, 16});
    // ConvBias test dense
    opr::ConvBias::Param param_conv_bias;
    param_conv_bias.pad_h = param_conv_bias.pad_w = 1;
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::DENSE;
    auto w1 = mkcvar("w1", {8, 4, 3, 3}), b1 = mkcvar("b1", {1, 8, 1, 1});
    auto conv1 = opr::ConvBias::make(x, w1, b1, param_conv_bias);
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::GROUP;
    auto w2 = mkcvar("w2", {2, 4, 4, 3, 3}), b2 = mkcvar("b2", {1, 8, 1, 1});
    auto conv2 = opr::ConvBias::make(conv1, w2, b2, param_conv_bias);
    // Convolution
    opr::Convolution::Param param_conv;
    param_conv.pad_h = param_conv.pad_w = 1;
    param_conv.sparse = opr::Convolution::Param::Sparse::DENSE;
    auto w3 = mkcvar("w3", {8, 8, 3, 3});
    auto y = opr::Convolution::make(conv2, w3, param_conv);
    SymbolVar y_opt;
    {
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_nchw4();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    }
    ASSERT_EQ(opr::ConvBias::Param::Format::NCHW,
              find_opr<opr::ConvBias>(y_opt).param().format);
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(
                    output_file("TestGoptInference.ConvertFormatNCHW4.json"));
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
}
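
// Same conversion with a 3-channel input (ic = 3, not a multiple of 4):
// the quantized convs are still expected to end up in Format::NCHW4 even
// though the first conv's input channels cannot be packed directly.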
TEST(TestGoptInference, ConvertFormatNCHW4Ic3) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    REQUIRE_CUDA_COMPUTE_CAPABILITY(6, 1);
    HostTensorGenerator<dtype::Float32, RandomDistribution::UNIFORM> gen{
            1.2f, 127 * 127};
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp)).rename(name),
                dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp,
                      const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp)).rename(name),
                dtype);
    };
    auto x = mkvar("x", {2, 3, 16, 16}, dtype::QuantizedS8(2.5f));
    // ConvBias test dense
    opr::ConvBias::Param param_conv_bias;
    param_conv_bias.pad_h = param_conv_bias.pad_w = 1;
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::DENSE;
    auto w1 = mkcvar("w1", {8, 3, 3, 3}, dtype::QuantizedS8(2.5f)),
         b1 = mkcvar("b1", {1, 8, 1, 1}, dtype::QuantizedS32(6.25f));
    auto conv1 =
            opr::ConvBias::make(x, w1, b1, param_conv_bias, {},
                                OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::GROUP;
    auto w2 = mkcvar("w2", {2, 4, 4, 3, 3}, dtype::QuantizedS8(2.5f)),
         b2 = mkcvar("b2", {1, 8, 1, 1}, dtype::QuantizedS32(6.25f));
    auto conv2 =
            opr::ConvBias::make(conv1, w2, b2, param_conv_bias, {},
                                OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
    auto y = opr::TypeCvt::make(conv2, dtype::Float32());
    SymbolVar y_opt;
    {
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_nchw4();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    }
    ASSERT_EQ(opr::ConvBias::Param::Format::NCHW4,
              find_opr<opr::ConvBias>(y_opt).param().format);
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(output_file(
                    "TestGoptInference.ConvertFormatNCHW4Ic3.json"));
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
}
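
// NCHW88 (8-channel packing, mainly for x86) variant: exercises a hybrid
// first conv from 3 input channels, channel-wise and group convs, with
// Resize/WarpPerspective/elemwise in the middle; the hybrid conv1 and the
// following ConvBias oprs are expected to convert to Format::NCHW88.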
TEST(TestGoptInference, ConvertFormatNCHW88) {
    HostTensorGenerator<> gen;
    auto cn = CompNode::load("cpu0");
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                .rename(name);
    };
    auto host_x = gen({2, 3, 16, 16}, cn);
    auto x = opr::Host2DeviceCopy::make(*graph, host_x);
    //! Hybrid nchw88 mode
    opr::Convolution::Param param_conv;
    param_conv.pad_h = param_conv.pad_w = 1;
    auto w1 = mkcvar("w1", {8, 3, 3, 3}),
         conv1 = opr::Convolution::make(x, w1, param_conv, {},
                                        OperatorNodeConfig("conv1"));
    //! channel wise
    opr::ConvBias::Param param_conv_bias;
    param_conv_bias.pad_h = param_conv_bias.pad_w = 1;
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::GROUP;
    auto w2 = mkcvar("w2", {8, 1, 1, 3, 3}), b2 = mkcvar("b2", {1, 8, 1, 1}),
         conv2 = opr::ConvBias::make(conv1, w2, b2, param_conv_bias);
    //! group
    auto w3 = mkcvar("w3", {1, 8, 8, 3, 3}), b3 = mkcvar("b3", {1, 8, 1, 1}),
         conv3 = opr::ConvBias::make(conv2, w3, b3, param_conv_bias);
    auto shape_of = opr::GetVarShape::make(conv3);
    auto subtensor = opr::Subtensor::make(
            shape_of, {opr::Subtensor::AxisIndexer::make_interval(
                              0, x.make_scalar(2), None, x.make_scalar(1))});
    opr::Resize::Param param_resize;
    param_resize.format = opr::Resize::Param::Format::NCHW;
    auto resize = opr::ResizeForward::make(conv3, subtensor * 2, param_resize);
    auto mat = mkcvar("mat", {2, 3, 3}),
         warp = opr::WarpPerspectiveForward::make(
                 resize, mat, nullptr, cg::var_from_tensor_shape(x, {4, 4}));
    auto b = mkvar("b", {1, 8, 1, 1}),
         elem = opr::Elemwise::make({warp + b},
                                    opr::Elemwise::Param::Mode::RELU);
    //! Dense
    param_conv_bias.pad_h = param_conv_bias.pad_w = 1;
    auto w4 = mkcvar("w4", {2, 6, 4, 3, 3}), b4 = mkcvar("b4", {1, 12, 1, 1}),
         conv4 = opr::ConvBias::make(elem, w4, b4, param_conv_bias);
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::DENSE;
    auto w5 = mkcvar("w5", {8, 12, 3, 3}), b5 = mkcvar("b5", {1, 8, 1, 1}),
         conv5 = opr::ConvBias::make(conv4, w5, b5, param_conv_bias);
    auto w6 = mkcvar("w6", {8, 8, 3, 3}), b6 = mkcvar("b6", {1, 8, 1, 1}),
         y = opr::ConvBias::make(conv5, w6, b6, param_conv_bias);
    SymbolVar y_opt;
    {
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_nchw88();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    }
    ASSERT_EQ(opr::ConvBias::Param::Format::NCHW88,
              find_opr<opr::Convolution>(y_opt, "conv1").param().format);
    ASSERT_EQ(opr::ConvBias::Param::Format::NCHW88,
              find_opr<opr::ConvBias>(y_opt).param().format);
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(
                    output_file("TestGoptInference.ConvertFormatNCHW88.json"));
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    //! may go to winograd on x86-32, so set error to 1e-1
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-1);
    *host_x = *gen({2, 3, 32, 32}, cn);
    func->execute();
    //! may go to winograd on x86-32, so set error to 1e-1
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-1);
}
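
// NCHW44 (ARM 4-channel packing) variant on a mixed float/quantized graph:
// convs whose channel counts fit the 4-packing are expected to convert to
// Format::NCHW44, while unsupported cases (the 1x1 hybrid conv1_f1, and
// conv5 with only 6 output channels) must stay in Format::NCHW.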
TEST(TestGoptInference, ConvertFormatNCHW44) {
    HostTensorGenerator<> gen;
    auto cn = CompNode::load("cpu0");
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                .rename(name);
    };
    auto mkcvar_dtype = [&](const char* name, const TensorShape& shp,
                            const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                        .rename(name),
                dtype);
    };
    auto host_x = gen({2, 3, 16, 16}, cn);
    auto x = opr::Host2DeviceCopy::make(*graph, host_x);
    //! Hybrid nchw44 mode
    opr::Convolution::Param param_conv;
    param_conv.pad_h = param_conv.pad_w = 1;
    auto w1 = mkcvar("w1", {8, 3, 3, 3}),
         conv1 = opr::Convolution::make(x, w1, param_conv, {},
                                        OperatorNodeConfig("conv1"));
    //! hybrid nchw44 not supported
    opr::ConvBias::Param param_conv_bias_pad0;
    param_conv_bias_pad0.pad_h = param_conv_bias_pad0.pad_w = 0;
    auto w1_f1 = mkcvar("w1_1", {8, 3, 1, 1});
    auto conv1_f1 = opr::ConvBias::make(x, w1_f1, param_conv_bias_pad0, {},
                                        OperatorNodeConfig("conv1_f1"));
    auto conv1_add = conv1_f1 * conv1;
    auto conv_1_q8 = opr::TypeCvt::make(conv1_add, dtype::QuantizedS8(2.5f));
    //! s8 dense conv
    opr::ConvBias::Param param_conv_bias;
    param_conv_bias.pad_h = param_conv_bias.pad_w = 1;
    auto w1_2 = mkcvar_dtype("w1_2", {8, 8, 3, 3}, dtype::QuantizedS8(2.5f));
    auto b1_2 = mkcvar_dtype("b1_2", {1, 8, 1, 1}, dtype::QuantizedS32(6.25f));
    auto conv_1_2 = opr::ConvBias::make(
            conv_1_q8, w1_2, b1_2, param_conv_bias, {},
            OperatorNodeConfig{"conv_1_2", cn, dtype::QuantizedS8{6.25f}});
    auto conv_1_2_fp32 = opr::TypeCvt::make(conv_1_2, dtype::Float32());
    //! channel wise
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::GROUP;
    auto w2 = mkcvar("w2", {8, 1, 1, 3, 3}), b2 = mkcvar("b2", {1, 8, 1, 1}),
         conv2 = opr::ConvBias::make(conv_1_2_fp32, w2, b2, param_conv_bias);
    //! group
    auto w3 = mkcvar("w3", {2, 4, 4, 3, 3}), b3 = mkcvar("b3", {1, 8, 1, 1}),
         conv3 = opr::ConvBias::make(conv2, w3, b3, param_conv_bias);
    auto shape_of = opr::GetVarShape::make(conv3);
    auto subtensor = opr::Subtensor::make(
            shape_of, {opr::Subtensor::AxisIndexer::make_interval(
                              0, x.make_scalar(2), None, x.make_scalar(1))});
    opr::Resize::Param param_resize;
    param_resize.format = opr::Resize::Param::Format::NCHW;
    auto resize = opr::ResizeForward::make(conv3, subtensor * 2, param_resize);
    auto mat = mkcvar("mat", {2, 3, 3}),
         warp = opr::WarpPerspectiveForward::make(
                 resize, mat, nullptr, cg::var_from_tensor_shape(x, {4, 4}));
    auto b = mkvar("b", {1, 8, 1, 1}),
         elem = opr::Elemwise::make({warp + b},
                                    opr::Elemwise::Param::Mode::RELU);
    //! Dense
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::DENSE;
    param_conv_bias.pad_h = param_conv_bias.pad_w = 1;
    auto w3_2 = mkcvar("w3_2", {16, 8, 3, 3}),
         b3_2 = mkcvar("b3_2", {1, 16, 1, 1}),
         conv3_2 = opr::ConvBias::make(elem, w3_2, b3_2, param_conv_bias, {},
                                       OperatorNodeConfig("conv3_2"));
    //! s8 group conv
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::GROUP;
    auto conv3_2_q8 = opr::TypeCvt::make(conv3_2, dtype::QuantizedS8(2.5f));
    auto w3_3 = mkcvar_dtype("w3_3", {4, 8, 4, 3, 3}, dtype::QuantizedS8(2.5f)),
         b3_3 = mkcvar_dtype("b3_3", {1, 32, 1, 1}, dtype::QuantizedS32(6.25f)),
         conv3_3_q = opr::ConvBias::make(
                 conv3_2_q8, w3_3, b3_3, param_conv_bias, {},
                 OperatorNodeConfig{"conv_3_3_q", cn,
                                    dtype::QuantizedS8{6.25f}});
    auto conv3_3 = opr::TypeCvt::make(conv3_3_q, dtype::Float32());
    //! Dense
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::DENSE;
    auto w4 = mkcvar("w4", {16, 32, 3, 3}), b4 = mkcvar("b4", {1, 16, 1, 1}),
         conv4 = opr::ConvBias::make(conv3_3, w4, b4, param_conv_bias, {},
                                     OperatorNodeConfig("conv4"));
    auto w4_1 = mkcvar("w4_1", {16, 32, 1, 1}),
         b4_1 = mkcvar("b4_1", {2, 16, 4, 4}),
         conv4_1 =
                 opr::ConvBias::make(conv3_3, w4_1, b4_1, param_conv_bias_pad0,
                                     {}, OperatorNodeConfig("conv4_1"));
    auto conv4_add = conv4 + conv4_1;
    auto w5 = mkcvar("w5", {6, 16, 3, 3}), b5 = mkcvar("b5", {1, 6, 1, 1}),
         conv5 = opr::ConvBias::make(conv4_add, w5, b5, param_conv_bias, {},
                                     OperatorNodeConfig("conv5"));
    auto w6 = mkcvar("w6", {4, 6, 3, 3}), b6 = mkcvar("b6", {1, 4, 1, 1}),
         y = opr::ConvBias::make(conv5, w6, b6, param_conv_bias, {},
                                 OperatorNodeConfig("conv6"));
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_fuse_conv_bias_nonlinearity();
    options.enable_nchw44();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    ASSERT_EQ(opr::Convolution::Param::Format::NCHW44,
              find_opr<opr::Convolution>(y_opt, "conv1").param().format);
    ASSERT_EQ(opr::Convolution::Param::Format::NCHW,
              find_opr<opr::ConvBias>(y_opt, "conv1_f1").param().format);
    ASSERT_EQ(opr::Convolution::Param::Format::NCHW44,
              find_opr<opr::ConvBias>(y_opt, "conv_1_2").param().format);
    ASSERT_EQ(opr::Convolution::Param::Format::NCHW44,
              find_opr<opr::ConvBias>(y_opt, "conv3_2").param().format);
    ASSERT_EQ(opr::Convolution::Param::Format::NCHW44,
              find_opr<opr::ConvBias>(y_opt, "conv_3_3_q").param().format);
    ASSERT_EQ(opr::Convolution::Param::Format::NCHW44,
              find_opr<opr::ConvBias>(y_opt, "conv4").param().format);
    ASSERT_EQ(opr::Convolution::Param::Format::NCHW,
              find_opr<opr::ConvBias>(y_opt, "conv5").param().format);
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(
                    output_file("TestGoptInference.ConvertFormatNCHW44.json"));
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    //! may go to winograd on x86-32, so set error to 1e-1
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-1);
    *host_x = *gen({2, 3, 32, 32}, cn);
    func->execute();
    //! may go to winograd on x86-32, so set error to 1e-1
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-1);
}
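
// Elemwise with an extra non-conv input: the broadcast bias b has shape
// {1, 1, 16, 16}, which does not pack into NCHW44; the conv is still
// expected to convert, and results must match the NCHW reference.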
TEST(TestGoptInference, ConvertFormatNCHW44MultiInput) {
    HostTensorGenerator<> gen;
    auto cn = CompNode::load("cpu0");
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                .rename(name);
    };
    auto host_x1 = gen({1, 8, 16, 16}, cn);
    auto host_x2 = gen({1, 1, 16, 16}, cn);
    auto x = opr::Host2DeviceCopy::make(*graph, host_x1);
    opr::Convolution::Param param_conv;
    param_conv.pad_h = param_conv.pad_w = 1;
    auto w1 = mkcvar("w1", {8, 8, 3, 3}),
         conv1 = opr::Convolution::make(x, w1, param_conv);
    auto b = mkvar("b", {1, 1, 16, 16}),
         y = opr::Elemwise::make({conv1 + b}, opr::Elemwise::Param::Mode::RELU);
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_nchw44();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    ASSERT_EQ(opr::Convolution::Param::Format::NCHW44,
              find_opr<opr::Convolution>(y_opt).param().format);
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(output_file(
                    "TestGoptInference.ConvertFormatNCHW44MultiInput.json"));
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    //! may go to winograd on x86-32, so set error to 1e-1
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-1);
}
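
// A Reshape consumes the conv output: the conv is still expected to
// convert to Format::NCHW44, with the tensor restored to NCHW ahead of
// the reshape so its semantics are preserved (checked numerically).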
TEST(TestGoptInference, ConvertFormatNCHW44Reshape) {
    HostTensorGenerator<> gen;
    auto cn = CompNode::load("cpu0");
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                .rename(name);
    };
    auto host_x1 = gen({1, 8, 16, 16}, cn);
    auto x = opr::Host2DeviceCopy::make(*graph, host_x1);
    opr::Convolution::Param param_conv;
    param_conv.pad_h = param_conv.pad_w = 1;
    auto w1 = mkcvar("w1", {8, 8, 3, 3}),
         conv1 = opr::Convolution::make(x, w1, param_conv);
    auto y = opr::Reshape::make(conv1, {8, 16 * 16});
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_nchw44();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    ASSERT_EQ(opr::Convolution::Param::Format::NCHW44,
              find_opr<opr::Convolution>(y_opt).param().format);
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(output_file(
                    "TestGoptInference.ConvertFormatNCHW44Reshape.json"));
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    //! may go to winograd on x86-32, so set error to 1e-1
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-1);
}
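
// NCHW44_DOT variant (ARM dot-product kernels): only the quantized int8
// convs (conv1_3_q, conv_1_2, conv_3_3_q) should go to
// Format::NCHW44_DOT; float convs fall back to plain NCHW44 or, where
// unsupported, stay in NCHW.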
TEST(TestGoptInference, ConvertFormatNCHW44_DOT) {
    HostTensorGenerator<> gen;
    auto cn = CompNode::load("cpu0");
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                .rename(name);
    };
    auto mkcvar_dtype = [&](const char* name, const TensorShape& shp,
                            const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                        .rename(name),
                dtype);
    };
    auto host_x = gen({2, 3, 16, 16}, cn);
    auto x = opr::Host2DeviceCopy::make(*graph, host_x);
    //! Hybrid nchw44 mode
    opr::Convolution::Param param_conv;
    param_conv.pad_h = param_conv.pad_w = 1;
    auto w1 = mkcvar("w1", {8, 3, 3, 3}),
         conv1 = opr::Convolution::make(x, w1, param_conv, {},
                                        OperatorNodeConfig("conv1"));
    printf("create conv1 %s\n",
           conv1.node()->owner_opr()->dyn_typeinfo()->name);
    param_conv.pad_h = param_conv.pad_w = 1;
    //! hybrid nchw44 not supported
    opr::ConvBias::Param param_conv_bias_pad0;
    param_conv_bias_pad0.pad_h = param_conv_bias_pad0.pad_w = 0;
    auto b1 = mkcvar("b1", {1, 8, 1, 1});
    auto w1_f1 = mkcvar("w1_1", {8, 3, 1, 1});
    auto conv1_f1 = opr::ConvBias::make(x, w1_f1, b1, param_conv_bias_pad0, {},
                                        OperatorNodeConfig("conv1_f1"));
    //! hybrid dot
    auto x_s = opr::TypeCvt::make(x, dtype::QuantizedS8(2.5f));
    auto w1_3 = mkcvar_dtype("w1_3", {8, 3, 3, 3}, dtype::QuantizedS8(2.5f));
    auto conv1_3_q = opr::Convolution::make(
            x_s, w1_3, param_conv, {},
            OperatorNodeConfig{"conv1_3_q", cn, dtype::QuantizedS8{6.25f}});
    auto conv1_3 = opr::TypeCvt::make(conv1_3_q, dtype::Float32());
    auto conv1_add = conv1_f1 * conv1 * conv1_3;
    auto conv_1_q8 = opr::TypeCvt::make(conv1_add, dtype::QuantizedS8(2.5f));
    //! s8 dense conv
    opr::ConvBias::Param param_conv_bias;
    param_conv_bias.pad_h = param_conv_bias.pad_w = 1;
    auto w1_2 = mkcvar_dtype("w1_2", {8, 8, 3, 3}, dtype::QuantizedS8(2.5f));
    auto conv_1_2 = opr::ConvBias::make(
            conv_1_q8, w1_2, param_conv_bias, {},
            OperatorNodeConfig{"conv_1_2", cn, dtype::QuantizedS8{6.25f}});
    auto conv_1_2_fp32 = opr::TypeCvt::make(conv_1_2, dtype::Float32());
    //! channel wise
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::GROUP;
    auto w2 = mkcvar("w2", {8, 1, 1, 3, 3}), b2 = mkcvar("b2", {1, 8, 1, 1}),
         conv2 = opr::ConvBias::make(conv_1_2_fp32, w2, b2, param_conv_bias);
    //! group
    auto w3 = mkcvar("w3", {2, 4, 4, 3, 3}), b3 = mkcvar("b3", {1, 8, 1, 1}),
         conv3 = opr::ConvBias::make(conv2, w3, b3, param_conv_bias);
    auto shape_of = opr::GetVarShape::make(conv3);
    auto subtensor = opr::Subtensor::make(
            shape_of, {opr::Subtensor::AxisIndexer::make_interval(
                              0, x.make_scalar(2), None, x.make_scalar(1))});
    opr::Resize::Param param_resize;
    param_resize.format = opr::Resize::Param::Format::NCHW;
    auto resize = opr::ResizeForward::make(conv3, subtensor * 2, param_resize);
    auto mat = mkcvar("mat", {2, 3, 3}),
         warp = opr::WarpPerspectiveForward::make(
                 resize, mat, nullptr, cg::var_from_tensor_shape(x, {4, 4}));
    auto b = mkvar("b", {1, 8, 1, 1}),
         elem = opr::Elemwise::make({warp + b},
                                    opr::Elemwise::Param::Mode::RELU);
    //! Dense
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::DENSE;
    param_conv_bias.pad_h = param_conv_bias.pad_w = 1;
    auto w3_2 = mkcvar("w3_2", {16, 8, 3, 3}),
         b3_2 = mkcvar("b3_2", {1, 16, 1, 1}),
         conv3_2 = opr::ConvBias::make(elem, w3_2, b3_2, param_conv_bias, {},
                                       OperatorNodeConfig("conv3_2"));
    //! s8 group conv
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::GROUP;
    auto conv3_2_q8 = opr::TypeCvt::make(conv3_2, dtype::QuantizedS8(2.5f));
    auto w3_3 = mkcvar_dtype("w3_3", {4, 8, 4, 3, 3}, dtype::QuantizedS8(2.5f)),
         b3_3 = mkcvar_dtype("b3_3", {1, 32, 1, 1}, dtype::QuantizedS32(6.25f)),
         conv3_3_q = opr::ConvBias::make(
                 conv3_2_q8, w3_3, b3_3, param_conv_bias, {},
                 OperatorNodeConfig{"conv_3_3_q", cn,
                                    dtype::QuantizedS8{6.25f}});
    auto conv3_3 = opr::TypeCvt::make(conv3_3_q, dtype::Float32());
    //! Dense
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::DENSE;
    auto w4 = mkcvar("w4", {4, 32, 3, 3}), b4 = mkcvar("b4", {1, 4, 1, 1}),
         conv4 = opr::ConvBias::make(conv3_3, w4, b4, param_conv_bias, {},
                                     OperatorNodeConfig("conv4"));
    auto w5 = mkcvar("w5", {6, 4, 3, 3}), b5 = mkcvar("b5", {1, 6, 1, 1}),
         conv5 = opr::ConvBias::make(conv4, w5, b5, param_conv_bias, {},
                                     OperatorNodeConfig("conv5"));
    auto w6 = mkcvar("w6", {4, 6, 3, 3}), b6 = mkcvar("b6", {1, 4, 1, 1}),
         y = opr::ConvBias::make(conv5, w6, b6, param_conv_bias, {},
                                 OperatorNodeConfig("conv6"));
    SymbolVar y_opt;
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_fuse_conv_bias_nonlinearity();
    options.enable_nchw44_dot();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
    ASSERT_EQ(opr::Convolution::Param::Format::NCHW44,
              find_opr<opr::Convolution>(y_opt, "conv1").param().format);
    ASSERT_EQ(opr::Convolution::Param::Format::NCHW44_DOT,
              find_opr<opr::Convolution>(y_opt, "conv1_3_q").param().format);
    ASSERT_EQ(opr::Convolution::Param::Format::NCHW,
              find_opr<opr::ConvBias>(y_opt, "conv1_f1").param().format);
    ASSERT_EQ(opr::Convolution::Param::Format::NCHW44_DOT,
              find_opr<opr::ConvBias>(y_opt, "conv_1_2").param().format);
    ASSERT_EQ(opr::Convolution::Param::Format::NCHW44,
              find_opr<opr::ConvBias>(y_opt, "conv3_2").param().format);
    ASSERT_EQ(opr::Convolution::Param::Format::NCHW44_DOT,
              find_opr<opr::ConvBias>(y_opt, "conv_3_3_q").param().format);
    ASSERT_EQ(opr::Convolution::Param::Format::NCHW44,
              find_opr<opr::ConvBias>(y_opt, "conv4").param().format);
    ASSERT_EQ(opr::Convolution::Param::Format::NCHW,
              find_opr<opr::ConvBias>(y_opt, "conv5").param().format);
    graph->compile({{y_opt, {}}})
            ->to_json()
            ->writeto_fpath(output_file(
                    "TestGoptInference.ConvertFormatNCHW44_DOT.json"));
    HostTensorND host_y_opt, host_y;
    auto func = graph->compile({make_callback_copy(y, host_y),
                                make_callback_copy(y_opt, host_y_opt)});
    func->execute();
    //! may go to winograd on x86-32, so set error to 1e-1
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-1);
    *host_x = *gen({2, 3, 32, 32}, cn);
    func->execute();
    //! may go to winograd on x86-32, so set error to 1e-1
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-1);
}

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
