You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

tensor_reformat.cpp 137 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
727782779278027812782278327842785278627872788278927902791279227932794279527962797279827992800280128022803280428052806280728082809281028112812281328142815281628172818281928202821282228232824282528262827282828292830283128322833283428352836283728382839284028412842284328442845284628472848284928502851285228532854285528562857285828592860286128622863286428652866286728682869287028712872287328742875287628772878287928802881288228832884288528862887288828892890289128922893289428952896289728982899290029012902290329042905290629072908290929102911291229132914291529162917291829192920292129222923292429252926292729282929
  1. /**
  2. * \file src/gopt/impl/tensor_reformat.cpp
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
  9. * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
  10. * implied.
  11. */
  12. #include "megbrain/gopt/inference.h"
  13. #include "megbrain/gopt/gtrans.h"
  14. #include "megbrain/gopt/basic_arith.h"
  15. #include "megbrain/graph/event.h"
  16. #include "megbrain/opr/dnn/batch_norm.h"
  17. #include "megbrain/opr/dnn/local.h"
  18. #include "megbrain/utils/shared_set.h"
  19. #include "megbrain/serialization/opr_shallow_copy.h"
  20. #include "megbrain/opr/basic_arith.h"
  21. #include "megbrain/opr/dnn/convolution.h"
  22. #include "megbrain/opr/blas.h"
  23. #include "megbrain/opr/misc.h"
  24. #include "megbrain/opr/utility.h"
  25. #include "megbrain/opr/dnn/pooling.h"
  26. #include "megbrain/opr/tensor_manip.h"
  27. #include "megbrain/opr/imgproc.h"
  28. #include "megbrain/opr/nn_int.h"
  29. #include "megdnn/opr_param_defs.h"
  30. #include "megdnn/tensor_format.h"
  31. #if MGB_ENABLE_TENSOR_RT
  32. #include "megbrain/tensorrt/tensorrt_opr.h"
  33. #endif
  34. #include "megbrain/gopt/misc.h"
  35. using namespace mgb;
  36. using namespace gopt;
  37. /* ================ TensorReformatPass =============== */
  38. /*!
  39. * \brief relayout placeholder opr
  40. *
  41. * RelayoutPlaceholder oprs act as the placeholders of the ComputingGraph
  42. * during graph opt pass `TensorReformatPass`. These oprs are introduced
  43. * into a ComputingGraph for conveniently discovering further optimize
  44. * opportunities (such as fuse consecutive relayouts, translate into
  45. * optimized implementations). They are canonized to have a shape infer, so
  46. * the output's shape can be correctly deduced during the opt pass.
  47. *
  48. * Note that the oprs in the ComputingGraph are only used as intermediate
  49. * representations before being translated to MegBrain oprs, so the
  50. * oprs should not get involved in any actual computing.
  51. */
  52. MGB_DEFINE_OPR_CLASS(TensorReformatPass::RelayoutPlaceholder,
  53. cg::SingleCNOperatorNodeBase) // {
  54. public:
  55. //! relayout type of this opr
  56. enum class LayoutType {
  57. NCHW4_TO_NCHW32, //!< from nchw4 layout to nchw32 layout
  58. NCHW32_TO_NCHW4, //!< from nchw32 layout to nchw4 layout
  59. NCHW4_TO_CHWN4, //!< from nchw4 layout to chwn4 layout
  60. CHWN4_TO_NCHW4, //!< from chwn4 layout to nchw4 layout
  61. NCHW_TO_NCHW4, //!< from nchw layout to nchw4 layout
  62. NCHW4_TO_NCHW, //!< from nchw4 layout to nchw layout
  63. NCHW_TO_NCHW88, //!< from nchw layout to nchw88 layout
  64. NCHW88_TO_NCHW, //!< from nchw88 layout to nchw layout
  65. WEIGHT_NCHW_TO_NCHW4_DENSE, //!< weight from nchw layout to nchw4
  66. //!< layout
  67. WEIGHT_NCHW_TO_NCHW4_GROUP, //!< group weight from nchw layout to
  68. //!< nchw4 layout
  69. WEIGHT_NCHW_TO_NCHW88_DENSE, //!< weight from nchw layout to nchw88
  70. //!< layout
  71. WEIGHT_NCHW_TO_NCHW88_GROUP, //!< group weight from nchw layout to
  72. //!< nchw88 layout
  73. WEIGHT_NCHW_TO_NCHW88_CHAN, //!< channel wise weight from nchw layout
  74. //!< to nchw88 layout
  75. //!< the weight layout of input is nchw output is nchw88, special for
  76. //!< shape weight in nchw like {64, 2, 3, 3} to {8, 3, 3, 2, 8}
  77. WEIGHT_HYBIRD_NCHW_NCHW88,
  78. WEIGHT_NCHW_TO_NCHW44_DENSE, //!< weight from nchw layout to nchw44
  79. //!< layout
  80. WEIGHT_NCHW_TO_NCHW44_GROUP, //!< group weight from nchw layout to
  81. //!< nchw44 layout
  82. WEIGHT_NCHW_TO_NCHW44_CHAN, //!< channel wise weight from nchw layout
  83. //!< to nchw44 layout
  84. //!< the weight layout of input is nchw output is nchw44, special for
  85. //!< shape weight in nchw like {64, 2, 3, 3} to {16, 3, 3, 2, 4}
  86. WEIGHT_HYBIRD_NCHW_NCHW44,
  87. WEIGHT_NCHW_TO_NCHW44_DOT_DENSE, //!< dense weight from nchw layout to
  88. //!< nchw44_dot layout; shape inference is shared with WEIGHT_NCHW_TO_NCHW44_DENSE
  89. WEIGHT_NCHW_TO_NCHW44_DOT_GROUP, //!< group weight from nchw layout to
  90. //!< nchw44_dot layout; shape inference is shared with WEIGHT_NCHW_TO_NCHW44_GROUP
  91. };
  92. RelayoutPlaceholder(VarNode* src_var, LayoutType layout_type); //!< builds a placeholder over \p src_var; parameters as documented for make()
  93. /*!
  94. * \param src_var the input var
  95. * \param layout_type tensor layout transform type of this relayout
  96. * placeholder as described in LayoutType
  97. */
  98. static SymbolVar make(VarNode* src_var, LayoutType layout_type);
  99. LayoutType layout_type() const { return m_layout_type; } //!< layout transform type given at construction
  100. private:
  101. void init_output_static_infer_desc() override; //!< registers output shape inference derived from the layout type
  102. void scn_do_execute() override; //!< always throws InternalError: a placeholder is never executed
  103. void init_output_comp_node() override; //!< output is placed on the comp node of input(0)
  104. const LayoutType m_layout_type; //!< immutable after construction; also registered as an equivalence component
  105. };
  106. MGB_DYN_TYPE_OBJ_FINAL_IMPL(TensorReformatPass::RelayoutPlaceholder);
  //! Construct a placeholder in the owner graph of \p src_var, with \p src_var
  //! as its input and a single output whose dtype equals the input dtype.
  107. TensorReformatPass::RelayoutPlaceholder::RelayoutPlaceholder(
  108. VarNode* src_var, LayoutType layout_type)
  109. : Super(src_var->owner_graph(), {}, "RelayoutPlaceholder", {src_var}),
  110. m_layout_type{layout_type} {
  111. add_input({src_var});
  112. add_equivalence_component<ScalarHash<LayoutType>>(m_layout_type); // presumably keeps placeholders with different layout types from being treated as equivalent -- confirm against the cg API
  113. add_output(None)->dtype(src_var->dtype()); // relayout changes only the shape; the dtype is copied from the input
  114. }
  //! Execution hook: unconditionally throws InternalError, because a
  //! placeholder is an intermediate representation that must not be executed.
  115. void TensorReformatPass::RelayoutPlaceholder::scn_do_execute() {
  116. mgb_throw(InternalError, "RelayoutPlaceholder opr can not be executed");
  117. }
  //! Assign the comp node of the first input to the single output.
  118. void TensorReformatPass::RelayoutPlaceholder::init_output_comp_node() {
  119. output(0)->comp_node(input(0)->comp_node());
  120. }
  //! Register the shape inference of output(0) with the static infer manager.
  //! The output shape is computed from the shape of the first input according
  //! to layout_type(); each branch asserts the input rank / block size it
  //! expects. In the per-branch notes below, s0..s4 denote inp_shape[0..4].
  121. void TensorReformatPass::RelayoutPlaceholder::init_output_static_infer_desc() {
  122. using namespace cg::static_infer;
  123. auto&& mgr = owner_graph()->static_infer_manager();
  124. DepVal deps;
  125. for (auto i : input())
  126. deps.push_back({i, DepType::SHAPE}); // only the shapes of the inputs are needed, not their values
  127. auto infer_shape = [this](TensorShape& dst, const InpVal& inp) {
  128. TensorShape inp_shape = inp.val[0].shape();
  129. dst = inp_shape; // start from the input shape; branches that change the rank overwrite dst.ndim
  130. if (layout_type() == RelayoutPlaceholder::LayoutType::NCHW4_TO_NCHW32) { // {s0, s1, s2, s3, 4} -> {s0, s1/8, s2, s3, 32}
  131. mgb_assert(inp_shape.ndim == 5 && inp_shape[4] == 4);
  132. dst[0] = inp_shape[0];
  133. dst[1] = inp_shape[1] / 8; // NOTE(review): no assert that inp_shape[1] is a multiple of 8, unlike the divisibility asserts in later branches -- confirm callers guarantee it
  134. dst[2] = inp_shape[2];
  135. dst[3] = inp_shape[3];
  136. dst[4] = inp_shape[4] * 8;
  137. } else if (layout_type() ==
  138. RelayoutPlaceholder::LayoutType::NCHW32_TO_NCHW4) { // {s0, s1, s2, s3, 32} -> {s0, s1*8, s2, s3, 4}
  139. mgb_assert(inp_shape.ndim == 5 && inp_shape[4] == 32);
  140. dst[0] = inp_shape[0];
  141. dst[1] = inp_shape[1] * 8;
  142. dst[2] = inp_shape[2];
  143. dst[3] = inp_shape[3];
  144. dst[4] = inp_shape[4] / 8;
  145. } else if (layout_type() ==
  146. RelayoutPlaceholder::LayoutType::NCHW4_TO_CHWN4) { // {s0, s1, s2, s3, 4} -> {s1, s2, s3, s0, 4}
  147. mgb_assert(inp_shape.ndim == 5 && inp_shape[4] == 4);
  148. dst[0] = inp_shape[1];
  149. dst[1] = inp_shape[2];
  150. dst[2] = inp_shape[3];
  151. dst[3] = inp_shape[0];
  152. dst[4] = inp_shape[4];
  153. } else if (layout_type() ==
  154. RelayoutPlaceholder::LayoutType::CHWN4_TO_NCHW4) { // {s0, s1, s2, s3, 4} -> {s3, s0, s1, s2, 4}
  155. mgb_assert(inp_shape.ndim == 5 && inp_shape[4] == 4);
  156. dst[0] = inp_shape[3];
  157. dst[1] = inp_shape[0];
  158. dst[2] = inp_shape[1];
  159. dst[3] = inp_shape[2];
  160. dst[4] = inp_shape[4];
  161. } else if (layout_type() ==
  162. RelayoutPlaceholder::LayoutType::NCHW_TO_NCHW4){ // {s0, s1, s2, s3} -> {s0, s1/4, s2, s3, 4}
  163. mgb_assert(inp_shape.ndim == 4 && inp_shape[1] % 4 == 0);
  164. dst.ndim = 5;
  165. dst[0] = inp_shape[0];
  166. dst[1] = inp_shape[1] / 4;
  167. dst[2] = inp_shape[2];
  168. dst[3] = inp_shape[3];
  169. dst[4] = 4;
  170. } else if (layout_type() ==
  171. RelayoutPlaceholder::LayoutType::NCHW4_TO_NCHW){ // {s0, s1, s2, s3, 4} -> {s0, s1*4, s2, s3}
  172. mgb_assert(inp_shape.ndim == 5 && inp_shape[4] == 4);
  173. dst.ndim = 4;
  174. dst[0] = inp_shape[0];
  175. dst[1] = inp_shape[1] * 4;
  176. dst[2] = inp_shape[2];
  177. dst[3] = inp_shape[3];
  178. } else if (layout_type() == RelayoutPlaceholder::LayoutType::
  179. WEIGHT_NCHW_TO_NCHW4_DENSE) { // {s0, s1, s2, s3} -> {s0, s1/4, s2, s3, 4}
  180. mgb_assert(inp_shape.ndim == 4 && inp_shape[1] % 4 == 0);
  181. dst.ndim = 5;
  182. dst[0] = inp_shape[0];
  183. dst[1] = inp_shape[1] / 4;
  184. dst[2] = inp_shape[2];
  185. dst[3] = inp_shape[3];
  186. dst[4] = 4;
  187. } else if (layout_type() == RelayoutPlaceholder::LayoutType::
  188. WEIGHT_NCHW_TO_NCHW4_GROUP) { // {s0, s1, s2, s3, s4} -> {s0, s1, s2/4, s3, s4, 4}
  189. mgb_assert(inp_shape.ndim == 5 && inp_shape[2] % 4 == 0);
  190. dst.ndim = 6;
  191. dst[0] = inp_shape[0];
  192. dst[1] = inp_shape[1];
  193. dst[2] = inp_shape[2] / 4;
  194. dst[3] = inp_shape[3];
  195. dst[4] = inp_shape[4];
  196. dst[5] = 4;
  197. }else if (layout_type() ==
  198. RelayoutPlaceholder::LayoutType::NCHW_TO_NCHW88) { // {s0, s1, s2, s3} -> {s0, s1/8, s2, s3, 8}
  199. mgb_assert(inp_shape.ndim == 4 && inp_shape[1] % 8 == 0);
  200. dst.ndim = 5;
  201. dst[0] = inp_shape[0];
  202. dst[1] = inp_shape[1] / 8;
  203. dst[2] = inp_shape[2];
  204. dst[3] = inp_shape[3];
  205. dst[4] = 8;
  206. } else if (layout_type() ==
  207. RelayoutPlaceholder::LayoutType::NCHW88_TO_NCHW) { // {s0, s1, s2, s3, 8} -> {s0, s1*8, s2, s3}
  208. mgb_assert(inp_shape.ndim == 5 && inp_shape[4] == 8);
  209. dst.ndim = 4;
  210. dst[0] = inp_shape[0];
  211. dst[1] = inp_shape[1] * 8;
  212. dst[2] = inp_shape[2];
  213. dst[3] = inp_shape[3];
  214. } else if (layout_type() == RelayoutPlaceholder::LayoutType::
  215. WEIGHT_NCHW_TO_NCHW88_DENSE) { // {s0, s1, s2, s3} -> {s0/8, s1/8, s2, s3, 8, 8}
  216. mgb_assert(inp_shape.ndim == 4 && inp_shape[0] % 8 == 0 &&
  217. inp_shape[1] % 8 == 0);
  218. dst.ndim = 6;
  219. dst[0] = inp_shape[0] / 8;
  220. dst[1] = inp_shape[1] / 8;
  221. dst[2] = inp_shape[2];
  222. dst[3] = inp_shape[3];
  223. dst[4] = 8;
  224. dst[5] = 8;
  225. } else if (layout_type() == RelayoutPlaceholder::LayoutType::
  226. WEIGHT_NCHW_TO_NCHW88_GROUP) { // {s0, s1, s2, s3, s4} -> {s0, s1/8, s2/8, s3, s4, 8, 8}
  227. mgb_assert(inp_shape.ndim == 5 && inp_shape[1] % 8 == 0 &&
  228. inp_shape[2] % 8 == 0);
  229. dst.ndim = 7;
  230. dst[0] = inp_shape[0];
  231. dst[1] = inp_shape[1] / 8;
  232. dst[2] = inp_shape[2] / 8;
  233. dst[3] = inp_shape[3];
  234. dst[4] = inp_shape[4];
  235. dst[5] = 8;
  236. dst[6] = 8;
  237. } else if (layout_type() == RelayoutPlaceholder::LayoutType::
  238. WEIGHT_NCHW_TO_NCHW88_CHAN) { // {s0, 1, 1, s3, s4} -> {s0/8, 1, 1, s3, s4, 8}
  239. mgb_assert(inp_shape.ndim == 5 && inp_shape[1] == 1 &&
  240. inp_shape[2] == 1 && inp_shape[0] % 8 == 0);
  241. dst.ndim = 6;
  242. dst[0] = inp_shape[0] / 8;
  243. dst[1] = inp_shape[1];
  244. dst[2] = inp_shape[2];
  245. dst[3] = inp_shape[3];
  246. dst[4] = inp_shape[4];
  247. dst[5] = 8;
  248. } else if (layout_type() ==
  249. RelayoutPlaceholder::LayoutType::WEIGHT_HYBIRD_NCHW_NCHW88) { // {s0, s1, s2, s3} -> {s0/8, s2, s3, s1, 8}
  250. mgb_assert(inp_shape.ndim == 4 && inp_shape[0] % 8 == 0);
  251. dst.ndim = 5;
  252. dst[0] = inp_shape[0] / 8;
  253. dst[1] = inp_shape[2];
  254. dst[2] = inp_shape[3];
  255. dst[3] = inp_shape[1];
  256. dst[4] = 8;
  257. } else if (layout_type() == RelayoutPlaceholder::LayoutType::
  258. WEIGHT_NCHW_TO_NCHW44_DENSE ||
  259. layout_type() == RelayoutPlaceholder::LayoutType::
  260. WEIGHT_NCHW_TO_NCHW44_DOT_DENSE) { // both dense nchw44 variants: {s0, s1, s2, s3} -> {s0/4, s1/4, s2, s3, 4, 4}
  261. mgb_assert(inp_shape.ndim == 4 && inp_shape[0] % 4 == 0 &&
  262. inp_shape[1] % 4 == 0);
  263. dst.ndim = 6;
  264. dst[0] = inp_shape[0] / 4;
  265. dst[1] = inp_shape[1] / 4;
  266. dst[2] = inp_shape[2];
  267. dst[3] = inp_shape[3];
  268. dst[4] = 4;
  269. dst[5] = 4;
  270. } else if (layout_type() == RelayoutPlaceholder::LayoutType::
  271. WEIGHT_NCHW_TO_NCHW44_GROUP ||
  272. layout_type() == RelayoutPlaceholder::LayoutType::
  273. WEIGHT_NCHW_TO_NCHW44_DOT_GROUP) { // both group nchw44 variants: {s0, s1, s2, s3, s4} -> {s0, s1/4, s2/4, s3, s4, 4, 4}
  274. mgb_assert(inp_shape.ndim == 5 && inp_shape[1] % 4 == 0 &&
  275. inp_shape[2] % 4 == 0);
  276. dst.ndim = 7;
  277. dst[0] = inp_shape[0];
  278. dst[1] = inp_shape[1] / 4;
  279. dst[2] = inp_shape[2] / 4;
  280. dst[3] = inp_shape[3];
  281. dst[4] = inp_shape[4];
  282. dst[5] = 4;
  283. dst[6] = 4;
  284. } else if (layout_type() == RelayoutPlaceholder::LayoutType::
  285. WEIGHT_NCHW_TO_NCHW44_CHAN) { // {s0, 1, 1, s3, s4} -> {s0/4, 1, 1, s3, s4, 4}
  286. mgb_assert(inp_shape.ndim == 5 && inp_shape[1] == 1 &&
  287. inp_shape[2] == 1 && inp_shape[0] % 4 == 0);
  288. dst.ndim = 6;
  289. dst[0] = inp_shape[0] / 4;
  290. dst[1] = inp_shape[1];
  291. dst[2] = inp_shape[2];
  292. dst[3] = inp_shape[3];
  293. dst[4] = inp_shape[4];
  294. dst[5] = 4;
  295. } else { // only WEIGHT_HYBIRD_NCHW_NCHW44 may reach here: {s0, s1, s2, s3} -> {s0/4, s2, s3, s1, 4}
  296. mgb_assert(
  297. layout_type() ==
  298. RelayoutPlaceholder::LayoutType::WEIGHT_HYBIRD_NCHW_NCHW44);
  299. mgb_assert(inp_shape.ndim == 4 && inp_shape[0] % 4 == 0);
  300. dst.ndim = 5;
  301. dst[0] = inp_shape[0] / 4;
  302. dst[1] = inp_shape[2];
  303. dst[2] = inp_shape[3];
  304. dst[3] = inp_shape[1];
  305. dst[4] = 4;
  306. }
  307. return true; // returned unconditionally; unsupported inputs trip the asserts above instead
  308. };
  309. mgr.register_shape_infer(output(0), {SourceType::DEP, deps, infer_shape});
  310. }
  //! Create a RelayoutPlaceholder over \p src_var with the given
  //! \p layout_type, insert it into the owner graph of \p src_var and return
  //! output(0) of the opr handed back by insert_opr().
  311. SymbolVar TensorReformatPass::RelayoutPlaceholder::make(
  312. VarNode* src_var, LayoutType layout_type) {
  313. return src_var->owner_graph()
  314. ->insert_opr(
  315. std::make_unique<RelayoutPlaceholder>(src_var, layout_type))
  316. ->output(0);
  317. }
  //! Walk every operator of opt.graph() and rewrite it: operators whose
  //! dynamic type has an entry in m_opr_replace_func are rebuilt by that
  //! function from their rewritten inputs, all others are handled by
  //! rewriter.auto_replace_outputs(). The collected replacements are applied
  //! in place at the end.
  318. void TensorReformatPass::insert_pass(OptState& opt) const {
  319. opt.set_var_replace_check_flag(m_var_replace_check_flag);
  320. auto rewriter = opt.graph().make_rewriter();
  321. VarNodeArray new_inp_cache; // scratch input list shared by all on_opr calls; cleared and refilled per opr
  322. auto on_opr = [this, &opt, &rewriter,
  323. &new_inp_cache](OperatorNodeBase* opr) {
  324. auto it = m_opr_replace_func.find(opr->dyn_typeinfo()); // replace functions are keyed by the opr's dynamic type
  325. if (it != m_opr_replace_func.end()) {
  326. auto& new_inp = new_inp_cache;
  327. new_inp.clear();
  328. new_inp.reserve(opr->input().size());
  329. for (auto&& inp : opr->input()) {
  330. new_inp.push_back(rewriter.get_var(inp)); // the rewriter's current var for this input (presumably the replacement if one was recorded -- confirm)
  331. }
  332. auto new_opr = (it->second)(opr, new_inp);
  333. auto &&out0 = opr->output(), &&out1 = new_opr->output(); // old and new outputs are paired by index below
  334. mgb_assert(out0.size() == out1.size(),
  335. "bad opr replace: src=%s{%s} dst=%s{%s}, src.size=%zu "
  336. "dst.size=%zu",
  337. opr->cname(), opr->dyn_typeinfo()->name,
  338. new_opr->cname(), new_opr->dyn_typeinfo()->name,
  339. out0.size(), out1.size());
  340. for (size_t i = 0; i < out0.size(); ++i) {
  341. if (!out0[i]->contain_flag(VarNode::Flag::VOLATILE_CONTENT)) { // outputs flagged VOLATILE_CONTENT are not replaced
  342. mgb_assert(!out1[i]->contain_flag(
  343. VarNode::Flag::VOLATILE_CONTENT)); // a non-volatile output must map to a non-volatile output
  344. auto src = out0[i];
  345. auto dst = out1[i];
  346. if (opt.graph().endpoint_contain(src)) {
  347. // additional process on endpoint var node
  348. dst = on_graph_endpoint_var(dst, src);
  349. }
  350. rewriter.replace_var(src, dst, nullptr);
  351. }
  352. }
  353. } else {
  354. rewriter.auto_replace_outputs(opr); // no replace function registered for this opr type
  355. }
  356. };
  357. opt.graph().iter(on_opr);
  358. rewriter.apply_inplace();
  359. }
  360. void TensorReformatPass::translate_pass(OptState& opt) const {
  361. ThinHashMap<RelayoutPlaceholder::LayoutType,
  362. thin_function<VarNode*(VarNode*)>>
  363. reformat;
  364. using LayoutType = RelayoutPlaceholder::LayoutType;
  365. reformat[LayoutType::NCHW4_TO_CHWN4] = [](VarNode* inp) -> VarNode* {
  366. megdnn::param::RelayoutFormat param;
  367. param.mode = megdnn::param::RelayoutFormat::Mode::NCHW4_CHWN4;
  368. auto reformat = opr::RelayoutFormat::make(inp, param);
  369. return reformat.node();
  370. };
  371. reformat[LayoutType::CHWN4_TO_NCHW4] = [](VarNode* inp) -> VarNode* {
  372. megdnn::param::RelayoutFormat param;
  373. param.mode = megdnn::param::RelayoutFormat::Mode::CHWN4_NCHW4;
  374. auto reformat = opr::RelayoutFormat::make(inp, param);
  375. return reformat.node();
  376. };
  377. reformat[LayoutType::NCHW4_TO_NCHW32] = [](VarNode* inp) -> VarNode* {
  378. auto x = SymbolVar(inp);
  379. auto xshp = opr::GetVarShape::make(x);
  380. auto cv = [&x](int v) { return x.make_scalar(v); };
  381. auto sub = [&xshp, &cv](int idx) {
  382. return opr::IndexAt::make(xshp, {{0, cv(idx)}});
  383. };
  384. auto tshp0 = opr::Concat::make(
  385. {sub(0), sub(1) / 8, cv(8), sub(2), sub(3), sub(4)}, 0),
  386. tshp1 = opr::Concat::make(
  387. {sub(0), sub(1) / 8, sub(2), sub(3), sub(4) * 8}, 0);
  388. auto y0 = opr::Reshape::make(x, tshp0);
  389. auto y1 = opr::Dimshuffle::make(y0, {0, 1, 3, 4, 2, 5});
  390. auto y2 = opr::Reshape::make(y1, tshp1);
  391. return y2.node();
  392. };
  393. reformat[LayoutType::NCHW32_TO_NCHW4] = [](VarNode* inp) -> VarNode* {
  394. auto x = SymbolVar(inp);
  395. auto xshp = opr::GetVarShape::make(x);
  396. auto cv = [&x](int v) { return x.make_scalar(v); };
  397. auto sub = [&xshp, &cv](int idx) {
  398. return opr::IndexAt::make(xshp, {{0, cv(idx)}});
  399. };
  400. auto tshp0 = opr::Concat::make(
  401. {sub(0), sub(1), sub(2), sub(3), cv(8), sub(4) / 8}, 0),
  402. tshp1 = opr::Concat::make(
  403. {sub(0), sub(1) * 8, sub(2), sub(3), sub(4) / 8}, 0);
  404. auto y0 = opr::Reshape::make(x, tshp0);
  405. auto y1 = opr::Dimshuffle::make(y0, {0, 1, 4, 2, 3, 5});
  406. auto y2 = opr::Reshape::make(y1, tshp1);
  407. return y2.node();
  408. };
  409. reformat[LayoutType::NCHW_TO_NCHW4] = [](VarNode* inp) -> VarNode* {
  410. auto x = SymbolVar(inp);
  411. auto xshp = opr::GetVarShape::make(x);
  412. auto cv = [&x](int v) { return x.make_scalar(v); };
  413. auto sub = [&xshp, &cv](int idx) {
  414. return opr::IndexAt::make(xshp, {{0, cv(idx)}});
  415. };
  416. auto tshp0 = opr::Concat::make(
  417. {sub(0), sub(1) / 4, cv(4), sub(2), sub(3)}, 0),
  418. tshp1 = opr::Concat::make(
  419. {sub(0), sub(1) / 4, sub(2), sub(3), cv(4)}, 0);
  420. auto y0 = opr::Reshape::make(x, tshp0);
  421. auto y1 = opr::Dimshuffle::make(y0, {0, 1, 3, 4, 2});
  422. auto y2 = opr::Reshape::make(y1, tshp1);
  423. return y2.node();
  424. };
  425. reformat[LayoutType::NCHW4_TO_NCHW] = [](VarNode* inp) -> VarNode* {
  426. auto x = SymbolVar(inp);
  427. auto xshp = opr::GetVarShape::make(x);
  428. auto cv = [&x](int v) { return x.make_scalar(v); };
  429. auto sub = [&xshp, &cv](int idx) {
  430. return opr::IndexAt::make(xshp, {{0, cv(idx)}});
  431. };
  432. auto tshp0 = opr::Concat::make({sub(0), sub(1) * 4, sub(2), sub(3)}, 0);
  433. auto y0 = opr::Dimshuffle::make(x, {0, 1, 4, 2, 3});
  434. auto y1 = opr::Reshape::make(y0, tshp0);
  435. return y1.node();
  436. };
  437. reformat[LayoutType::WEIGHT_NCHW_TO_NCHW4_DENSE] = [](VarNode* inp) -> VarNode* {
  438. auto x = SymbolVar(inp);
  439. auto xshp = opr::GetVarShape::make(x);
  440. auto cv = [&x](int v) { return x.make_scalar(v); };
  441. auto sub = [&xshp, &cv](int idx) {
  442. return opr::IndexAt::make(xshp, {{0, cv(idx)}});
  443. };
  444. auto tshp0 = opr::Concat::make(
  445. {sub(0), sub(1) / 4, cv(4), sub(2), sub(3)}, 0),
  446. tshp1 = opr::Concat::make(
  447. {sub(0), sub(1) / 4, sub(2), sub(3), cv(4)}, 0);
  448. auto y0 = opr::Reshape::make(x, tshp0);
  449. auto y1 = opr::Dimshuffle::make(y0, {0, 1, 3, 4, 2});
  450. auto y2 = opr::Reshape::make(y1, tshp1);
  451. return y2.node();
  452. };
  453. reformat[LayoutType::WEIGHT_NCHW_TO_NCHW4_GROUP] = [](VarNode* inp) -> VarNode* {
  454. auto x = SymbolVar(inp);
  455. auto xshp = opr::GetVarShape::make(x);
  456. auto cv = [&x](int v) { return x.make_scalar(v); };
  457. auto sub = [&xshp, &cv](int idx) {
  458. return opr::IndexAt::make(xshp, {{0, cv(idx)}});
  459. };
  460. auto tshp0 = opr::Concat::make(
  461. {sub(0), sub(1), sub(2) / 4, cv(4), sub(3), sub(4)}, 0),
  462. tshp1 = opr::Concat::make(
  463. {sub(0), sub(1), sub(2) / 4, sub(3), sub(4), cv(4)}, 0);
  464. auto y0 = opr::Reshape::make(x, tshp0);
  465. auto y1 = opr::Dimshuffle::make(y0, {0, 1, 2, 4, 5, 3});
  466. auto y2 = opr::Reshape::make(y1, tshp1);
  467. return y2.node();
  468. };
  469. reformat[LayoutType::NCHW_TO_NCHW88] = [](VarNode* inp) -> VarNode* {
  470. auto x = SymbolVar(inp);
  471. auto xshp = opr::GetVarShape::make(x);
  472. auto cv = [&x](int v) { return x.make_scalar(v); };
  473. auto sub = [&xshp, &cv](int idx) {
  474. return opr::IndexAt::make(xshp, {{0, cv(idx)}});
  475. };
  476. auto tshp0 = opr::Concat::make(
  477. {sub(0), sub(1) / 8, cv(8), sub(2), sub(3)}, 0),
  478. tshp1 = opr::Concat::make(
  479. {sub(0), sub(1) / 8, sub(2), sub(3), cv(8)}, 0);
  480. auto y0 = opr::Reshape::make(x, tshp0);
  481. auto y1 = opr::Dimshuffle::make(y0, {0, 1, 3, 4, 2});
  482. auto y2 = opr::Reshape::make(y1, tshp1);
  483. return y2.node();
  484. };
  485. reformat[LayoutType::NCHW88_TO_NCHW] = [](VarNode* inp) -> VarNode* {
  486. auto x = SymbolVar(inp);
  487. auto xshp = opr::GetVarShape::make(x);
  488. auto cv = [&x](int v) { return x.make_scalar(v); };
  489. auto sub = [&xshp, &cv](int idx) {
  490. return opr::IndexAt::make(xshp, {{0, cv(idx)}});
  491. };
  492. auto tshp0 = opr::Concat::make({sub(0), sub(1) * 8, sub(2), sub(3)}, 0);
  493. auto y0 = opr::Dimshuffle::make(x, {0, 1, 4, 2, 3});
  494. auto y1 = opr::Reshape::make(y0, tshp0);
  495. return y1.node();
  496. };
  497. reformat[LayoutType::WEIGHT_NCHW_TO_NCHW88_DENSE] =
  498. [](VarNode* inp) -> VarNode* {
  499. auto x = SymbolVar(inp);
  500. auto xshp = opr::GetVarShape::make(x);
  501. auto cv = [&x](int v) { return x.make_scalar(v); };
  502. auto sub = [&xshp, &cv](int idx) {
  503. return opr::IndexAt::make(xshp, {{0, cv(idx)}});
  504. };
  505. auto tshp0 = opr::Concat::make(
  506. {sub(0) / 8, cv(8), sub(1) / 8, cv(8), sub(2), sub(3)}, 0),
  507. tshp1 = opr::Concat::make(
  508. {sub(0) / 8, sub(1) / 8, sub(2), sub(3), cv(8), cv(8)}, 0);
  509. auto y0 = opr::Reshape::make(x, tshp0);
  510. auto y1 = opr::Dimshuffle::make(y0, {0, 2, 4, 5, 3, 1});
  511. auto y2 = opr::Reshape::make(y1, tshp1);
  512. return y2.node();
  513. };
  514. reformat[LayoutType::WEIGHT_NCHW_TO_NCHW88_GROUP] =
  515. [](VarNode* inp) -> VarNode* {
  516. auto x = SymbolVar(inp);
  517. auto xshp = opr::GetVarShape::make(x);
  518. auto cv = [&x](int v) { return x.make_scalar(v); };
  519. auto sub = [&xshp, &cv](int idx) {
  520. return opr::IndexAt::make(xshp, {{0, cv(idx)}});
  521. };
  522. auto tshp0 = opr::Concat::make({sub(0), sub(1) / 8, cv(8), sub(2) / 8,
  523. cv(8), sub(3), sub(4)},
  524. 0),
  525. tshp1 = opr::Concat::make({sub(0), sub(1) / 8, sub(2) / 8, sub(3),
  526. sub(4), cv(8), cv(8)},
  527. 0);
  528. auto y0 = opr::Reshape::make(x, tshp0);
  529. auto y1 = opr::Dimshuffle::make(y0, {0, 1, 3, 5, 6, 4, 2});
  530. auto y2 = opr::Reshape::make(y1, tshp1);
  531. return y2.node();
  532. };
  // WEIGHT_NCHW_TO_NCHW88_CHAN: for an input with dims (d0, d1, d2, d3, d4),
  // reshape to (d0 / 8, 8, d1, d2, d3, d4), permute with {0, 2, 3, 4, 5, 1}
  // and reshape to (d0 / 8, d1, d2, d3, d4, 8).
  reformat[LayoutType::WEIGHT_NCHW_TO_NCHW88_CHAN] =
          [](VarNode* inp) -> VarNode* {
      SymbolVar src(inp);
      auto src_shape = opr::GetVarShape::make(src);
      // scalar constant living in the same graph as src
      auto scalar = [&src](int value) { return src.make_scalar(value); };
      // symbolic size of one axis of src
      auto dim = [&src_shape, &scalar](int axis) {
          return opr::IndexAt::make(src_shape, {{0, scalar(axis)}});
      };
      auto split_shape = opr::Concat::make(
              {dim(0) / 8, scalar(8), dim(1), dim(2), dim(3), dim(4)}, 0);
      auto packed_shape = opr::Concat::make(
              {dim(0) / 8, dim(1), dim(2), dim(3), dim(4), scalar(8)}, 0);
      auto split = opr::Reshape::make(src, split_shape);
      auto permuted = opr::Dimshuffle::make(split, {0, 2, 3, 4, 5, 1});
      return opr::Reshape::make(permuted, packed_shape).node();
  };
  // WEIGHT_HYBIRD_NCHW_NCHW88: for an input with dims (d0, d1, d2, d3),
  // reshape to (d0 / 8, 8, d1, d2, d3), permute with {0, 3, 4, 2, 1} and
  // reshape to (d0 / 8, d2, d3, d1, 8).
  reformat[LayoutType::WEIGHT_HYBIRD_NCHW_NCHW88] =
          [](VarNode* inp) -> VarNode* {
      SymbolVar src(inp);
      auto src_shape = opr::GetVarShape::make(src);
      // scalar constant living in the same graph as src
      auto scalar = [&src](int value) { return src.make_scalar(value); };
      // symbolic size of one axis of src
      auto dim = [&src_shape, &scalar](int axis) {
          return opr::IndexAt::make(src_shape, {{0, scalar(axis)}});
      };
      auto split_shape = opr::Concat::make(
              {dim(0) / 8, scalar(8), dim(1), dim(2), dim(3)}, 0);
      auto packed_shape = opr::Concat::make(
              {dim(0) / 8, dim(2), dim(3), dim(1), scalar(8)}, 0);
      auto split = opr::Reshape::make(src, split_shape);
      auto permuted = opr::Dimshuffle::make(split, {0, 3, 4, 2, 1});
      return opr::Reshape::make(permuted, packed_shape).node();
  };
  // WEIGHT_NCHW_TO_NCHW44_DENSE: for an input with dims (d0, d1, d2, d3),
  // reshape to (d0 / 4, 4, d1 / 4, 4, d2, d3), permute with
  // {0, 2, 4, 5, 3, 1} and reshape to (d0 / 4, d1 / 4, d2, d3, 4, 4); the
  // two trailing axes are the 4-block of d1 followed by the 4-block of d0.
  reformat[LayoutType::WEIGHT_NCHW_TO_NCHW44_DENSE] =
          [](VarNode* inp) -> VarNode* {
      SymbolVar src(inp);
      auto src_shape = opr::GetVarShape::make(src);
      // scalar constant living in the same graph as src
      auto scalar = [&src](int value) { return src.make_scalar(value); };
      // symbolic size of one axis of src
      auto dim = [&src_shape, &scalar](int axis) {
          return opr::IndexAt::make(src_shape, {{0, scalar(axis)}});
      };
      auto split_shape = opr::Concat::make(
              {dim(0) / 4, scalar(4), dim(1) / 4, scalar(4), dim(2), dim(3)},
              0);
      auto packed_shape = opr::Concat::make(
              {dim(0) / 4, dim(1) / 4, dim(2), dim(3), scalar(4), scalar(4)},
              0);
      auto split = opr::Reshape::make(src, split_shape);
      auto permuted = opr::Dimshuffle::make(split, {0, 2, 4, 5, 3, 1});
      return opr::Reshape::make(permuted, packed_shape).node();
  };
  // WEIGHT_NCHW_TO_NCHW44_GROUP: for an input with dims (d0, d1, d2, d3, d4),
  // reshape to (d0, d1 / 4, 4, d2 / 4, 4, d3, d4), permute with
  // {0, 1, 3, 5, 6, 4, 2} and reshape to (d0, d1 / 4, d2 / 4, d3, d4, 4, 4);
  // the two trailing axes are the 4-block of d2 followed by the 4-block of d1.
  reformat[LayoutType::WEIGHT_NCHW_TO_NCHW44_GROUP] =
          [](VarNode* inp) -> VarNode* {
      SymbolVar src(inp);
      auto src_shape = opr::GetVarShape::make(src);
      // scalar constant living in the same graph as src
      auto scalar = [&src](int value) { return src.make_scalar(value); };
      // symbolic size of one axis of src
      auto dim = [&src_shape, &scalar](int axis) {
          return opr::IndexAt::make(src_shape, {{0, scalar(axis)}});
      };
      auto split_shape = opr::Concat::make(
              {dim(0), dim(1) / 4, scalar(4), dim(2) / 4, scalar(4), dim(3),
               dim(4)},
              0);
      auto packed_shape = opr::Concat::make(
              {dim(0), dim(1) / 4, dim(2) / 4, dim(3), dim(4), scalar(4),
               scalar(4)},
              0);
      auto split = opr::Reshape::make(src, split_shape);
      auto permuted = opr::Dimshuffle::make(split, {0, 1, 3, 5, 6, 4, 2});
      return opr::Reshape::make(permuted, packed_shape).node();
  };
  // WEIGHT_NCHW_TO_NCHW44_CHAN: for an input with dims (d0, d1, d2, d3, d4),
  // reshape to (d0 / 4, 4, d1, d2, d3, d4), permute with {0, 2, 3, 4, 5, 1}
  // and reshape to (d0 / 4, d1, d2, d3, d4, 4).
  reformat[LayoutType::WEIGHT_NCHW_TO_NCHW44_CHAN] =
          [](VarNode* inp) -> VarNode* {
      SymbolVar src(inp);
      auto src_shape = opr::GetVarShape::make(src);
      // scalar constant living in the same graph as src
      auto scalar = [&src](int value) { return src.make_scalar(value); };
      // symbolic size of one axis of src
      auto dim = [&src_shape, &scalar](int axis) {
          return opr::IndexAt::make(src_shape, {{0, scalar(axis)}});
      };
      auto split_shape = opr::Concat::make(
              {dim(0) / 4, scalar(4), dim(1), dim(2), dim(3), dim(4)}, 0);
      auto packed_shape = opr::Concat::make(
              {dim(0) / 4, dim(1), dim(2), dim(3), dim(4), scalar(4)}, 0);
      auto split = opr::Reshape::make(src, split_shape);
      auto permuted = opr::Dimshuffle::make(split, {0, 2, 3, 4, 5, 1});
      return opr::Reshape::make(permuted, packed_shape).node();
  };
  // WEIGHT_HYBIRD_NCHW_NCHW44: for an input with dims (d0, d1, d2, d3),
  // reshape to (d0 / 4, 4, d1, d2, d3), permute with {0, 3, 4, 2, 1} and
  // reshape to (d0 / 4, d2, d3, d1, 4).
  reformat[LayoutType::WEIGHT_HYBIRD_NCHW_NCHW44] =
          [](VarNode* inp) -> VarNode* {
      SymbolVar src(inp);
      auto src_shape = opr::GetVarShape::make(src);
      // scalar constant living in the same graph as src
      auto scalar = [&src](int value) { return src.make_scalar(value); };
      // symbolic size of one axis of src
      auto dim = [&src_shape, &scalar](int axis) {
          return opr::IndexAt::make(src_shape, {{0, scalar(axis)}});
      };
      auto split_shape = opr::Concat::make(
              {dim(0) / 4, scalar(4), dim(1), dim(2), dim(3)}, 0);
      auto packed_shape = opr::Concat::make(
              {dim(0) / 4, dim(2), dim(3), dim(1), scalar(4)}, 0);
      auto split = opr::Reshape::make(src, split_shape);
      auto permuted = opr::Dimshuffle::make(split, {0, 3, 4, 2, 1});
      return opr::Reshape::make(permuted, packed_shape).node();
  };
  // WEIGHT_NCHW_TO_NCHW44_DOT_DENSE: for an input with dims (d0, d1, d2, d3),
  // reshape to (d0 / 4, 4, d1 / 4, 4, d2, d3), permute with
  // {0, 2, 4, 5, 1, 3} and reshape to (d0 / 4, d1 / 4, d2, d3, 4, 4); the
  // two trailing axes are the 4-block of d0 followed by the 4-block of d1.
  reformat[LayoutType::WEIGHT_NCHW_TO_NCHW44_DOT_DENSE] =
          [](VarNode* inp) -> VarNode* {
      SymbolVar src(inp);
      auto src_shape = opr::GetVarShape::make(src);
      // scalar constant living in the same graph as src
      auto scalar = [&src](int value) { return src.make_scalar(value); };
      // symbolic size of one axis of src
      auto dim = [&src_shape, &scalar](int axis) {
          return opr::IndexAt::make(src_shape, {{0, scalar(axis)}});
      };
      auto split_shape = opr::Concat::make(
              {dim(0) / 4, scalar(4), dim(1) / 4, scalar(4), dim(2), dim(3)},
              0);
      auto packed_shape = opr::Concat::make(
              {dim(0) / 4, dim(1) / 4, dim(2), dim(3), scalar(4), scalar(4)},
              0);
      auto split = opr::Reshape::make(src, split_shape);
      auto permuted = opr::Dimshuffle::make(split, {0, 2, 4, 5, 1, 3});
      return opr::Reshape::make(permuted, packed_shape).node();
  };
  // WEIGHT_NCHW_TO_NCHW44_DOT_GROUP: for an input with dims
  // (d0, d1, d2, d3, d4), reshape to (d0, d1 / 4, 4, d2 / 4, 4, d3, d4),
  // permute with {0, 1, 3, 5, 6, 2, 4} and reshape to
  // (d0, d1 / 4, d2 / 4, d3, d4, 4, 4); the two trailing axes are the 4-block
  // of d1 followed by the 4-block of d2.
  reformat[LayoutType::WEIGHT_NCHW_TO_NCHW44_DOT_GROUP] =
          [](VarNode* inp) -> VarNode* {
      SymbolVar src(inp);
      auto src_shape = opr::GetVarShape::make(src);
      // scalar constant living in the same graph as src
      auto scalar = [&src](int value) { return src.make_scalar(value); };
      // symbolic size of one axis of src
      auto dim = [&src_shape, &scalar](int axis) {
          return opr::IndexAt::make(src_shape, {{0, scalar(axis)}});
      };
      auto split_shape = opr::Concat::make(
              {dim(0), dim(1) / 4, scalar(4), dim(2) / 4, scalar(4), dim(3),
               dim(4)},
              0);
      auto packed_shape = opr::Concat::make(
              {dim(0), dim(1) / 4, dim(2) / 4, dim(3), dim(4), scalar(4),
               scalar(4)},
              0);
      auto split = opr::Reshape::make(src, split_shape);
      auto permuted = opr::Dimshuffle::make(split, {0, 1, 3, 5, 6, 2, 4});
      return opr::Reshape::make(permuted, packed_shape).node();
  };
  673. auto rewriter = opt.graph().make_rewriter();
  // Graph visitor: a RelayoutPlaceholder is replaced by the output of the
  // reformat rule registered for its layout type; every other operator has
  // its outputs replaced automatically by the rewriter.
  auto on_opr = [&reformat, &rewriter](OperatorNodeBase* opr) {
      if (!opr->same_type<RelayoutPlaceholder>()) {
          rewriter.auto_replace_outputs(opr);
          return;
      }
      auto placeholder = try_cast_as_op<RelayoutPlaceholder>(opr);
      auto relayout_src = rewriter.get_var(opr->input(0));
      // a placeholder without a registered rule cannot be translated
      mgb_assert(reformat.count(placeholder->layout_type()),
                 "no replace rule can be found for layout_type(%u)",
                 static_cast<uint32_t>(placeholder->layout_type()));
      auto relayout_dst = reformat[placeholder->layout_type()](relayout_src);
      rewriter.replace_var(opr->output(0), relayout_dst,
                           mgb_cstr_log("replace relayout placeholder"));
  };
  688. opt.graph().iter(on_opr);
  689. rewriter.apply_inplace();
  690. }
  // Run the pass on the given optimizer state: insert_pass() first, then
  // translate_pass().
  void TensorReformatPass::apply(OptState& opt) const {
      this->insert_pass(opt);
      this->translate_pass(opt);
  }
  695. /* ================ EnableTensorCorePass =============== */
  // Called for a graph endpoint var. If the replaced var no longer has the
  // shape of the original var, an NCHW32_TO_NCHW4 relayout placeholder is
  // appended; otherwise the replaced var is returned unchanged.
  VarNode* EnableTensorCorePass::on_graph_endpoint_var(VarNode* new_var,
                                                       VarNode* orig_var) const {
      const bool shape_kept = orig_var->shape().eq_shape(new_var->shape());
      if (shape_kept) {
          return new_var;
      }
      auto relayout = RelayoutPlaceholder::make(
              new_var, RelayoutPlaceholder::LayoutType::NCHW32_TO_NCHW4);
      return relayout.node();
  }
  // Build an EnableTensorCorePass and register its per-operator replace
  // rules.
  //
  // All rules detect layouts from shapes: a 5-dim var whose last dim is 4 is
  // treated as NCHW4 and one whose last dim is 32 as NCHW32; an input whose
  // shape differs from the original input is taken to have been relayouted by
  // an earlier rule of this pass.
  //
  // Registered rules:
  //  - ConvBiasForward: switched to Format::NCHW32 when the conv qualifies
  //    (see replace_conv_bias_opr), otherwise kept in NCHW4.
  //  - Elemwise, TypeCvt, ElemwiseMultiType, PowC: inputs are unified to one
  //    of the two layouts.
  //  - PoolingForward: follows the layout of its (single) input.
  //  - WarpAffineForward, WarpPerspectiveForward, ResizeForward, Reduce,
  //    Concat, Reshape, GetVarShape, Dimshuffle: relayouted inputs are
  //    converted back to NCHW4.
  std::unique_ptr<EnableTensorCorePass>
  EnableTensorCorePass::make_tensorcore_converter() {
      using RelayoutMode = RelayoutPlaceholder::LayoutType;

      // replace rule for conv bias opr
      auto replace_conv_bias_opr = [](OperatorNodeBase* opr,
                                      const VarNodeArray& new_inp) {
          using Param = megdnn::param::ConvBias;
          using Format = Param::Format;
          using Sparse = Param::Sparse;
          mgb_assert(opr->input().size() == new_inp.size());
          auto& conv_bias = opr->cast_final_safe<opr::ConvBiasForward>();
          if (conv_bias.param().format != Format::NCHW4 ||
              conv_bias.output(0)->dtype().enumv() != DTypeEnum::QuantizedS8) {
              // this conv is only copied, so none of its inputs may have been
              // relayouted by this pass
              size_t nr_inps = opr->input().size();
              bool shape_has_changed = false;
              for (size_t i = 0; i < nr_inps; ++i) {
                  if (!opr->input(i)->shape().eq_shape(new_inp[i]->shape())) {
                      shape_has_changed = true;
                      break;
                  }
              }
              MGB_MARK_USED_VAR(shape_has_changed);
              mgb_assert(
                      !shape_has_changed,
                      "EnableTensorCorePass assumes that the shape of inputs of "
                      "ConvBias operators whose output dtype is not QuantizedS8 "
                      "can not be changed in this opt pass");
              return serialization::copy_opr_shallow(*opr, new_inp,
                                                     opr->config());
          }
          mgb_assert(opr->input(1)->shape().eq_shape(new_inp[1]->shape()),
                     "EnableTensorCorePass assumes that filter tensor of "
                     "conv_bias operator can not be changed by other operators");
          VarNode* orig_filter = opr->input(1);
          auto is_nchw4 = [](const TensorShape& shape) -> bool {
              return shape.ndim == 5 && shape[4] == 4;
          };
          auto is_nchw32 = [](const TensorShape& shape) -> bool {
              return shape.ndim == 5 && shape[4] == 32;
          };
          bool can_replace_nchw32 = false;
          VarNode* src = nullptr;
          VarNode* weight = nullptr;
          VarNode* bias = nullptr;
          VarNode* z_inp = nullptr;
          // process src tensor
          if (is_nchw4(new_inp[0]->shape())) {  // new input is NCHW4 layout
              size_t group = 1, icpg, ocpg;
              if (conv_bias.param().sparse == Sparse::DENSE) {
                  icpg = orig_filter->shape()[1] * 4;
                  ocpg = orig_filter->shape()[0];
              } else {
                  mgb_assert(conv_bias.param().sparse == Sparse::GROUP);
                  group = orig_filter->shape()[0];
                  icpg = orig_filter->shape()[2];
                  ocpg = orig_filter->shape()[1];
                  if (icpg == 1 && ocpg == 1) {  // channel wise conv
                      group *= 4;
                  } else {
                      icpg *= 4;
                  }
              }
              // nchw32 layout needs input height and width of at least 3
              size_t ih = new_inp[0]->shape()[2], iw = new_inp[0]->shape()[3];
              if (group == 1 && ocpg % 32 == 0 && icpg % 32 == 0 && ih >= 3 &&
                  iw >= 3) {
                  auto symvar = RelayoutPlaceholder::make(
                          new_inp[0], RelayoutMode::NCHW4_TO_NCHW32);
                  src = symvar.node();
                  can_replace_nchw32 = true;
              } else {
                  src = new_inp[0];
              }
          } else {  // new input is NCHW32 layout
              mgb_assert(is_nchw32(new_inp[0]->shape()));
              size_t group = 1, ocpg;
              if (conv_bias.param().sparse == Sparse::DENSE) {
                  ocpg = orig_filter->shape()[0];
              } else {
                  mgb_assert(conv_bias.param().sparse == Sparse::GROUP);
                  // read the group count from the filter exactly like the
                  // NCHW4 branch does, so that group convs are rejected by
                  // the `group == 1` test below instead of being converted
                  group = orig_filter->shape()[0];
                  size_t icpg = orig_filter->shape()[2];
                  ocpg = orig_filter->shape()[1];
                  if (icpg == 1 && ocpg == 1) {  // channel wise conv
                      group *= 4;
                  }
              }
              size_t ih = new_inp[0]->shape()[2], iw = new_inp[0]->shape()[3];
              if (group == 1 && ocpg % 32 == 0 && ih >= 3 && iw >= 3) {
                  can_replace_nchw32 = true;
                  src = new_inp[0];
              } else {
                  auto symvar = RelayoutPlaceholder::make(
                          new_inp[0], RelayoutMode::NCHW32_TO_NCHW4);
                  src = symvar.node();
              }
          }
          // process filter tensor
          if (can_replace_nchw32) {
              auto symvar = RelayoutPlaceholder::make(
                      new_inp[1], RelayoutMode::NCHW4_TO_NCHW32);
              weight = symvar.node();
          } else {
              weight = new_inp[1];
          }
          if (new_inp.size() == 2) {
              if (can_replace_nchw32) {
                  auto param = conv_bias.param();
                  param.format = Format::NCHW32;
                  auto new_opr = opr::ConvBiasForward::make(
                          src, weight, param, conv_bias.execution_policy(),
                          conv_bias.config());
                  return new_opr.node()->owner_opr();
              } else {
                  VarNodeArray inps{src, weight};
                  auto new_opr = serialization::copy_opr_shallow(*opr, inps,
                                                                 opr->config());
                  return new_opr;
              }
          }
          // bring a bias / z input into the layout chosen for this conv
          auto process_inp = [&](VarNode* inp) -> VarNode* {
              if (can_replace_nchw32) {
                  if (is_nchw4(inp->shape())) {
                      auto symvar = RelayoutPlaceholder::make(
                              inp, RelayoutMode::NCHW4_TO_NCHW32);
                      return symvar.node();
                  } else {
                      mgb_assert(is_nchw32(inp->shape()));
                      return inp;
                  }
              } else {
                  if (is_nchw4(inp->shape())) {
                      return inp;
                  } else {
                      mgb_assert(is_nchw32(inp->shape()));
                      auto symvar = RelayoutPlaceholder::make(
                              inp, RelayoutMode::NCHW32_TO_NCHW4);
                      return symvar.node();
                  }
              }
          };
          // process bias tensor
          bias = process_inp(new_inp[2]);
          if (new_inp.size() == 3) {
              if (can_replace_nchw32) {
                  auto param = conv_bias.param();
                  param.format = Format::NCHW32;
                  auto new_opr = opr::ConvBiasForward::make(
                          src, weight, bias, param,
                          conv_bias.execution_policy(), conv_bias.config());
                  return new_opr.node()->owner_opr();
              } else {
                  VarNodeArray inps{src, weight, bias};
                  auto new_opr = serialization::copy_opr_shallow(*opr, inps,
                                                                 opr->config());
                  return new_opr;
              }
          }
          // process z_inp tensor
          z_inp = process_inp(new_inp[3]);
          if (can_replace_nchw32) {
              auto param = conv_bias.param();
              param.format = Format::NCHW32;
              auto new_opr = opr::ConvBiasForward::make(
                      src, weight, bias, z_inp, param,
                      conv_bias.execution_policy(), conv_bias.config());
              return new_opr.node()->owner_opr();
          }
          VarNodeArray inps{src, weight, bias, z_inp};
          auto new_opr =
                  serialization::copy_opr_shallow(*opr, inps, opr->config());
          return new_opr;
      };

      // replace rule for elemwise like opr
      // for oprs support NCHW4 and NCHW32 layout
      auto replace_elemwise_like_opr = [](OperatorNodeBase* opr,
                                          const VarNodeArray& new_inp) {
          mgb_assert(opr->input().size() == new_inp.size());
          size_t nr_inps = new_inp.size();
          size_t nr_shape_changed = 0;
          for (size_t i = 0; i < nr_inps; ++i) {
              if (!opr->input(i)->shape().eq_shape(new_inp[i]->shape())) {
                  nr_shape_changed++;
              }
          }
          if (nr_shape_changed) {
              auto inps = new_inp;
              if (nr_shape_changed >= nr_inps / 2) {
                  // at least nr_inps / 2 (integer division) inputs were
                  // relayouted -> use NCHW32 for the remaining ones as well
                  for (size_t i = 0; i < nr_inps; ++i) {
                      if (opr->input(i)->shape().eq_shape(
                                  new_inp[i]->shape())) {
                          auto symvar = RelayoutPlaceholder::make(
                                  new_inp[i], RelayoutMode::NCHW4_TO_NCHW32);
                          inps[i] = symvar.node();
                      }
                  }
              } else {
                  // fewer relayouted inputs -> convert them back to NCHW4
                  for (size_t i = 0; i < nr_inps; ++i) {
                      if (!opr->input(i)->shape().eq_shape(
                                  new_inp[i]->shape())) {
                          auto symvar = RelayoutPlaceholder::make(
                                  new_inp[i], RelayoutMode::NCHW32_TO_NCHW4);
                          inps[i] = symvar.node();
                      }
                  }
              }
              return serialization::copy_opr_shallow(*opr, inps,
                                                     opr->config());
          }
          return serialization::copy_opr_shallow(*opr, new_inp, opr->config());
      };

      // for oprs only supports NCHW4 layout
      auto replace_inps_to_nchw4 = [](OperatorNodeBase* opr,
                                      const VarNodeArray& new_inp) {
          mgb_assert(opr->input().size() == new_inp.size());
          VarNodeArray inps = new_inp;
          for (size_t i = 0; i < opr->input().size(); ++i) {
              if (!opr->input(i)->shape().eq_shape(new_inp[i]->shape())) {
                  // a changed input must be the NCHW32 form of an NCHW4 var
                  mgb_assert(opr->input(i)->shape().ndim == 5 &&
                             opr->input(i)->shape()[4] == 4);
                  mgb_assert(new_inp[i]->shape().ndim == 5 &&
                             new_inp[i]->shape()[4] == 32);
                  auto symvar = RelayoutPlaceholder::make(
                          new_inp[i], RelayoutMode::NCHW32_TO_NCHW4);
                  inps[i] = symvar.node();
              }
          }
          auto new_opr =
                  serialization::copy_opr_shallow(*opr, inps, opr->config());
          return new_opr;
      };

      // for oprs that are not in NCHW4 format: inputs must be untouched
      auto replace_non_nchw4_opr = [](OperatorNodeBase* opr,
                                      const VarNodeArray& new_inp) {
          mgb_assert(opr->input().size() == new_inp.size());
          size_t nr_inps = opr->input().size();
          bool shape_has_changed = false;
          for (size_t i = 0; i < nr_inps; ++i) {
              if (!opr->input(i)->shape().eq_shape(new_inp[i]->shape())) {
                  shape_has_changed = true;
                  break;
              }
          }
          MGB_MARK_USED_VAR(shape_has_changed);
          mgb_assert(!shape_has_changed,
                     "EnableTensorCorePass assumes that inputs' shape of "
                     "non-nchw4 operators "
                     "can not be changed in this opt "
                     "pass");
          return serialization::copy_opr_shallow(*opr, new_inp, opr->config());
      };

      // the helper lambdas are captured by copy because the rules below are
      // stored in the returned pass and outlive this function
      auto replace_warp_affine_opr =
              [replace_inps_to_nchw4, replace_non_nchw4_opr](
                      OperatorNodeBase* opr, const VarNodeArray& new_inp) {
                  using Param = opr::WarpAffineForward::Param;
                  using Format = Param::Format;
                  mgb_assert(opr->input().size() == new_inp.size());
                  auto& warp = opr->cast_final_safe<opr::WarpAffineForward>();
                  if (warp.param().format != Format::NCHW4) {
                      return replace_non_nchw4_opr(opr, new_inp);
                  }
                  return replace_inps_to_nchw4(opr, new_inp);
              };
      auto replace_warp_perspective_opr =
              [replace_inps_to_nchw4, replace_non_nchw4_opr](
                      OperatorNodeBase* opr, const VarNodeArray& new_inp) {
                  using Param = opr::WarpPerspectiveForward::Param;
                  using Format = Param::Format;
                  mgb_assert(opr->input().size() == new_inp.size());
                  auto& warp =
                          opr->cast_final_safe<opr::WarpPerspectiveForward>();
                  if (warp.param().format != Format::NCHW4) {
                      return replace_non_nchw4_opr(opr, new_inp);
                  }
                  return replace_inps_to_nchw4(opr, new_inp);
              };
      auto replace_resize_opr =
              [replace_inps_to_nchw4, replace_non_nchw4_opr](
                      OperatorNodeBase* opr, const VarNodeArray& new_inp) {
                  using Param = opr::ResizeForward::Param;
                  using Format = Param::Format;
                  mgb_assert(opr->input().size() == new_inp.size());
                  auto& resize = opr->cast_final_safe<opr::ResizeForward>();
                  if (resize.param().format != Format::NCHW4) {
                      return replace_non_nchw4_opr(opr, new_inp);
                  }
                  return replace_inps_to_nchw4(opr, new_inp);
              };
      auto replace_pooling_opr = [replace_non_nchw4_opr](
                                         OperatorNodeBase* opr,
                                         const VarNodeArray& new_inp) {
          using Param = opr::PoolingForward::Param;
          using Format = Param::Format;
          mgb_assert(opr->input().size() == new_inp.size());
          auto& pooling = opr->cast_final_safe<opr::PoolingForward>();
          if (pooling.param().format != Format::NCHW4) {
              return replace_non_nchw4_opr(opr, new_inp);
          }
          size_t nr_inps = opr->input().size();
          MGB_MARK_USED_VAR(nr_inps);
          mgb_assert(nr_inps == 1);
          if (!opr->input(0)->shape().eq_shape(new_inp[0]->shape())) {
              // input was relayouted to NCHW32: pool directly in NCHW32
              mgb_assert(opr->input(0)->shape().ndim == 5 &&
                         opr->input(0)->shape()[4] == 4);
              mgb_assert(new_inp[0]->shape().ndim == 5 &&
                         new_inp[0]->shape()[4] == 32);
              auto new_param = pooling.param();
              new_param.format = Format::NCHW32;
              auto new_pooling = opr::PoolingForward::make(
                      new_inp[0], new_param, opr->config());
              return new_pooling.node()->owner_opr();
          }
          return serialization::copy_opr_shallow(*opr, new_inp, opr->config());
      };

      auto ret = std::make_unique<EnableTensorCorePass>();
      ret->set_var_replace_check_flag(VarReplaceCheckFlag::NOCHECK);
      auto&& replace_func = ret->m_opr_replace_func;
      replace_func[opr::ConvBiasForward::typeinfo()] = replace_conv_bias_opr;
      // elemwise like
      replace_func[opr::Elemwise::typeinfo()] = replace_elemwise_like_opr;
      replace_func[opr::TypeCvt::typeinfo()] = replace_elemwise_like_opr;
      replace_func[opr::ElemwiseMultiType::typeinfo()] =
              replace_elemwise_like_opr;
      replace_func[opr::PowC::typeinfo()] = replace_elemwise_like_opr;
      // format aware
      replace_func[opr::PoolingForward::typeinfo()] = replace_pooling_opr;
      replace_func[opr::WarpAffineForward::typeinfo()] =
              replace_warp_affine_opr;
      replace_func[opr::WarpPerspectiveForward::typeinfo()] =
              replace_warp_perspective_opr;
      replace_func[opr::ResizeForward::typeinfo()] = replace_resize_opr;
      // to nchw4
      replace_func[opr::Reduce::typeinfo()] = replace_inps_to_nchw4;
      replace_func[opr::Concat::typeinfo()] = replace_inps_to_nchw4;
      replace_func[opr::Reshape::typeinfo()] = replace_inps_to_nchw4;
      replace_func[opr::GetVarShape::typeinfo()] = replace_inps_to_nchw4;
      replace_func[opr::Dimshuffle::typeinfo()] = replace_inps_to_nchw4;
      return ret;
  }
  1043. /* ================ EnableCHWN4Pass =============== */
  // Called for a graph endpoint var. A var recorded in m_varshape_changed
  // gets a CHWN4_TO_NCHW4 relayout placeholder appended; any other var is
  // returned unchanged. The original var is not consulted.
  VarNode* EnableCHWN4Pass::on_graph_endpoint_var(VarNode* new_var,
                                                  VarNode* /* orig_var */) const {
      const bool was_relayouted = m_varshape_changed.count(new_var) != 0;
      if (!was_relayouted) {
          return new_var;
      }
      auto relayout = RelayoutPlaceholder::make(
              new_var, RelayoutPlaceholder::LayoutType::CHWN4_TO_NCHW4);
      return relayout.node();
  }
  // Build an EnableCHWN4Pass and register its per-operator replace rules.
  //
  // The pass records every var it produces in CHWN4 layout in
  // m_varshape_changed; the rules test membership in that set (instead of
  // comparing shapes) to decide whether an input has been relayouted.
  //
  // Registered rules:
  //  - ConvBiasForward: quantized NCHW4 convs are switched to Format::CHWN4.
  //  - Elemwise, TypeCvt, ElemwiseMultiType, PowC: inputs are unified to one
  //    of the two layouts.
  //  - PoolingForward: follows the layout of its (single) input.
  //  - WarpAffineForward, WarpPerspectiveForward, ResizeForward, Reduce,
  //    Concat, Reshape, GetVarShape, Dimshuffle, BatchConvBias: relayouted
  //    inputs are converted back to NCHW4.
  std::unique_ptr<EnableCHWN4Pass> EnableCHWN4Pass::make_chwn4_converter() {
      using RelayoutMode = RelayoutPlaceholder::LayoutType;
      auto ret = std::make_unique<EnableCHWN4Pass>();
      ret->set_var_replace_check_flag(VarReplaceCheckFlag::NOCHECK);
      auto&& replace_func = ret->m_opr_replace_func;
      // refers to a member of *ret, so lambdas holding this reference stay
      // valid for as long as the returned pass is alive
      auto&& varshape_changed = ret->m_varshape_changed;

      // replace rule for conv bias opr
      auto replace_conv_bias_opr = [&varshape_changed](
                                           OperatorNodeBase* opr,
                                           const VarNodeArray& new_inp) {
          using Param = megdnn::param::ConvBias;
          using Format = Param::Format;
          mgb_assert(opr->input().size() == new_inp.size());
          auto& conv_bias = opr->cast_final_safe<opr::ConvBiasForward>();
          if (conv_bias.param().format != Format::NCHW4 ||
              conv_bias.output(0)->dtype().enumv() != DTypeEnum::QuantizedS8) {
              // this conv is only copied, so none of its inputs may have been
              // relayouted by this pass
              size_t nr_inps = new_inp.size();
              bool shape_has_changed = false;
              for (size_t i = 0; i < nr_inps; ++i) {
                  if (varshape_changed.count(new_inp[i])) {
                      shape_has_changed = true;
                      break;
                  }
              }
              MGB_MARK_USED_VAR(shape_has_changed);
              mgb_assert(
                      !shape_has_changed,
                      "EnableCHWN4Pass assumes that the shape of inputs of "
                      "ConvBias operators whose output dtype is not QuantizedS8 "
                      "can not be changed in this opt pass");
              return serialization::copy_opr_shallow(*opr, new_inp,
                                                     opr->config());
          }
          mgb_assert(varshape_changed.count(new_inp[1]) == 0,
                     "EnableCHWN4Pass assumes that filter tensor of "
                     "conv_bias operator can not be changed by other operators");
          VarNode* src = nullptr;
          VarNode* weight = nullptr;
          VarNode* bias = nullptr;
          VarNode* z_inp = nullptr;
          // process src tensor
          if (varshape_changed.count(new_inp[0]) ==
              0) {  // new input is NCHW4 layout
              // currently not support group conv
              auto symvar = RelayoutPlaceholder::make(
                      new_inp[0], RelayoutMode::NCHW4_TO_CHWN4);
              src = symvar.node();
          } else {  // new input is CHWN4 layout
              src = new_inp[0];
          }
          // process weight tensor
          {
              auto symvar = RelayoutPlaceholder::make(
                      new_inp[1], RelayoutMode::NCHW4_TO_CHWN4);
              weight = symvar.node();
          }
          if (new_inp.size() == 2) {
              auto param = conv_bias.param();
              param.format = Format::CHWN4;
              auto new_opr = opr::ConvBiasForward::make(
                      src, weight, param, conv_bias.execution_policy(),
                      conv_bias.config());
              varshape_changed.insert(new_opr.node());
              return new_opr.node()->owner_opr();
          }
          // bring a bias / z input into CHWN4 unless it already is
          auto process_inp = [&](VarNode* inp) -> VarNode* {
              if (varshape_changed.count(inp) == 0) {
                  auto symvar = RelayoutPlaceholder::make(
                          inp, RelayoutMode::NCHW4_TO_CHWN4);
                  return symvar.node();
              } else {
                  return inp;
              }
          };
          // process bias tensor
          bias = process_inp(new_inp[2]);
          if (new_inp.size() == 3) {
              auto param = conv_bias.param();
              param.format = Format::CHWN4;
              auto new_opr = opr::ConvBiasForward::make(
                      src, weight, bias, param, conv_bias.execution_policy(),
                      conv_bias.config());
              varshape_changed.insert(new_opr.node());
              return new_opr.node()->owner_opr();
          }
          // process z_inp tensor
          z_inp = process_inp(new_inp[3]);
          auto param = conv_bias.param();
          param.format = Format::CHWN4;
          auto new_opr = opr::ConvBiasForward::make(
                  src, weight, bias, z_inp, param,
                  conv_bias.execution_policy(), conv_bias.config());
          varshape_changed.insert(new_opr.node());
          return new_opr.node()->owner_opr();
      };

      // replace rule for elemwise like opr
      // for oprs support NCHW4 and CHWN4 layout
      auto replace_elemwise_like_opr = [&varshape_changed](
                                               OperatorNodeBase* opr,
                                               const VarNodeArray& new_inp) {
          mgb_assert(opr->input().size() == new_inp.size());
          size_t nr_inps = new_inp.size();
          size_t nr_shape_changed = 0;
          for (size_t i = 0; i < nr_inps; ++i) {
              if (varshape_changed.count(new_inp[i])) {
                  nr_shape_changed++;
              }
          }
          if (nr_shape_changed) {
              auto inps = new_inp;
              if (nr_shape_changed >= nr_inps / 2) {
                  // at least nr_inps / 2 (integer division) inputs are
                  // CHWN4 -> use CHWN4 for the remaining ones as well
                  for (size_t i = 0; i < nr_inps; ++i) {
                      if (varshape_changed.count(new_inp[i]) == 0) {
                          auto symvar = RelayoutPlaceholder::make(
                                  new_inp[i], RelayoutMode::NCHW4_TO_CHWN4);
                          inps[i] = symvar.node();
                      }
                  }
                  auto new_opr = serialization::copy_opr_shallow(
                          *opr, inps, opr->config());
                  // the output is now CHWN4 as well
                  varshape_changed.insert(new_opr->output(0));
                  return new_opr;
              } else {
                  // fewer CHWN4 inputs -> convert them back to NCHW4
                  for (size_t i = 0; i < nr_inps; ++i) {
                      if (varshape_changed.count(new_inp[i])) {
                          auto symvar = RelayoutPlaceholder::make(
                                  new_inp[i], RelayoutMode::CHWN4_TO_NCHW4);
                          inps[i] = symvar.node();
                      }
                  }
                  return serialization::copy_opr_shallow(*opr, inps,
                                                         opr->config());
              }
          }
          return serialization::copy_opr_shallow(*opr, new_inp, opr->config());
      };

      // for oprs only supports NCHW4 layout
      auto replace_inps_to_nchw4 = [&varshape_changed](
                                           OperatorNodeBase* opr,
                                           const VarNodeArray& new_inp) {
          mgb_assert(opr->input().size() == new_inp.size());
          VarNodeArray inps = new_inp;
          for (size_t i = 0; i < opr->input().size(); ++i) {
              if (varshape_changed.count(new_inp[i])) {
                  auto symvar = RelayoutPlaceholder::make(
                          new_inp[i], RelayoutMode::CHWN4_TO_NCHW4);
                  inps[i] = symvar.node();
              }
          }
          auto new_opr =
                  serialization::copy_opr_shallow(*opr, inps, opr->config());
          return new_opr;
      };

      // for oprs that are not in NCHW4 format: inputs must be untouched
      auto replace_non_nchw4_opr = [&varshape_changed](
                                           OperatorNodeBase* opr,
                                           const VarNodeArray& new_inp) {
          mgb_assert(opr->input().size() == new_inp.size());
          size_t nr_inps = opr->input().size();
          bool shape_has_changed = false;
          for (size_t i = 0; i < nr_inps; ++i) {
              if (varshape_changed.count(new_inp[i])) {
                  shape_has_changed = true;
                  break;
              }
          }
          MGB_MARK_USED_VAR(shape_has_changed);
          mgb_assert(!shape_has_changed,
                     "EnableCHWN4Pass assumes that inputs' shape of "
                     "non-nchw4 operators "
                     "can not be changed in this opt "
                     "pass");
          return serialization::copy_opr_shallow(*opr, new_inp, opr->config());
      };

      // capture by copy to avoid use after return
      auto replace_warp_affine_opr =
              [replace_inps_to_nchw4, replace_non_nchw4_opr](
                      OperatorNodeBase* opr, const VarNodeArray& new_inp) {
                  using Param = opr::WarpAffineForward::Param;
                  using Format = Param::Format;
                  mgb_assert(opr->input().size() == new_inp.size());
                  auto& warp = opr->cast_final_safe<opr::WarpAffineForward>();
                  if (warp.param().format != Format::NCHW4) {
                      return replace_non_nchw4_opr(opr, new_inp);
                  }
                  return replace_inps_to_nchw4(opr, new_inp);
              };
      auto replace_warp_perspective_opr =
              [replace_inps_to_nchw4, replace_non_nchw4_opr](
                      OperatorNodeBase* opr, const VarNodeArray& new_inp) {
                  using Param = opr::WarpPerspectiveForward::Param;
                  using Format = Param::Format;
                  mgb_assert(opr->input().size() == new_inp.size());
                  auto& warp =
                          opr->cast_final_safe<opr::WarpPerspectiveForward>();
                  if (warp.param().format != Format::NCHW4) {
                      return replace_non_nchw4_opr(opr, new_inp);
                  }
                  return replace_inps_to_nchw4(opr, new_inp);
              };
      auto replace_resize_opr =
              [replace_inps_to_nchw4, replace_non_nchw4_opr](
                      OperatorNodeBase* opr, const VarNodeArray& new_inp) {
                  using Param = opr::ResizeForward::Param;
                  using Format = Param::Format;
                  mgb_assert(opr->input().size() == new_inp.size());
                  auto& resize = opr->cast_final_safe<opr::ResizeForward>();
                  if (resize.param().format != Format::NCHW4) {
                      return replace_non_nchw4_opr(opr, new_inp);
                  }
                  return replace_inps_to_nchw4(opr, new_inp);
              };
      auto replace_pooling_opr = [&varshape_changed, replace_non_nchw4_opr](
                                         OperatorNodeBase* opr,
                                         const VarNodeArray& new_inp) {
          using Param = opr::PoolingForward::Param;
          using Format = Param::Format;
          mgb_assert(opr->input().size() == new_inp.size());
          auto& pooling = opr->cast_final_safe<opr::PoolingForward>();
          if (pooling.param().format != Format::NCHW4) {
              return replace_non_nchw4_opr(opr, new_inp);
          }
          size_t nr_inps = opr->input().size();
          MGB_MARK_USED_VAR(nr_inps);
          mgb_assert(nr_inps == 1);
          if (varshape_changed.count(new_inp[0])) {
              // input is CHWN4: pool directly in CHWN4 and record the output
              auto new_param = pooling.param();
              new_param.format = Format::CHWN4;
              auto new_pooling = opr::PoolingForward::make(
                      new_inp[0], new_param, opr->config());
              varshape_changed.insert(new_pooling.node());
              return new_pooling.node()->owner_opr();
          }
          return serialization::copy_opr_shallow(*opr, new_inp, opr->config());
      };

      replace_func[opr::ConvBiasForward::typeinfo()] = replace_conv_bias_opr;
      // elemwise like
      replace_func[opr::Elemwise::typeinfo()] = replace_elemwise_like_opr;
      replace_func[opr::TypeCvt::typeinfo()] = replace_elemwise_like_opr;
      replace_func[opr::ElemwiseMultiType::typeinfo()] =
              replace_elemwise_like_opr;
      replace_func[opr::PowC::typeinfo()] = replace_elemwise_like_opr;
      // format aware
      replace_func[opr::PoolingForward::typeinfo()] = replace_pooling_opr;
      replace_func[opr::WarpAffineForward::typeinfo()] =
              replace_warp_affine_opr;
      replace_func[opr::WarpPerspectiveForward::typeinfo()] =
              replace_warp_perspective_opr;
      replace_func[opr::ResizeForward::typeinfo()] = replace_resize_opr;
      // to nchw4
      replace_func[opr::Reduce::typeinfo()] = replace_inps_to_nchw4;
      replace_func[opr::Concat::typeinfo()] = replace_inps_to_nchw4;
      replace_func[opr::Reshape::typeinfo()] = replace_inps_to_nchw4;
      replace_func[opr::GetVarShape::typeinfo()] = replace_inps_to_nchw4;
      replace_func[opr::Dimshuffle::typeinfo()] = replace_inps_to_nchw4;
      replace_func[opr::BatchConvBias::typeinfo()] = replace_inps_to_nchw4;
      return ret;
  }
  1307. /* ================ EnableNCHW4Pass ================ */
  1308. VarNode* EnableNCHW4Pass::on_graph_endpoint_var(VarNode* new_var,
  1309. VarNode* orig_var) const {
  1310. if (!orig_var->shape().eq_shape(new_var->shape())) {
  1311. return RelayoutPlaceholder::make(
  1312. new_var, RelayoutPlaceholder::LayoutType::NCHW4_TO_NCHW)
  1313. .node();
  1314. }
  1315. return new_var;
  1316. }
  1317. std::unique_ptr<EnableNCHW4Pass> EnableNCHW4Pass::make_nchw4_converter(){
  1318. auto ret = std::make_unique<EnableNCHW4Pass>();
  1319. ret->set_var_replace_check_flag(VarReplaceCheckFlag::NOCHECK);
  1320. using RelayoutMode = RelayoutPlaceholder::LayoutType;
  1321. megdnn::param::Convolution::Format conv_format =
  1322. megdnn::param::Convolution::Format::NCHW4;
  1323. megdnn::param::ConvBias::Format conv_bias_format =
  1324. megdnn::param::ConvBias::Format::NCHW4;
  1325. megdnn::param::BatchConvBias::Format batch_conv_bias_format =
  1326. megdnn::param::BatchConvBias::Format::NCHW4;
  1327. RelayoutMode src_to_nchw4_mode = RelayoutMode::NCHW_TO_NCHW4;
  1328. RelayoutMode src_to_nchw_mode = RelayoutMode::NCHW4_TO_NCHW;
  1329. RelayoutMode weight_to_nchw4_mode_dense =
  1330. RelayoutMode::WEIGHT_NCHW_TO_NCHW4_DENSE;
  1331. RelayoutMode weight_to_nchw4_mode_group =
  1332. RelayoutMode::WEIGHT_NCHW_TO_NCHW4_GROUP;
  1333. auto trans_nchw4 = [weight_to_nchw4_mode_dense,
  1334. weight_to_nchw4_mode_group](
  1335. const megdnn::param::Convolution::Sparse conv_mode,
  1336. const VarNode* filter) -> RelayoutMode {
  1337. if (conv_mode == megdnn::param::Convolution::Sparse::DENSE) {
  1338. mgb_assert(filter->shape().ndim == 4,
  1339. "The origin filter is not NCHW mode");
  1340. size_t IC = filter->shape()[1];
  1341. mgb_assert(IC % 4 == 0,
  1342. "The input channel should be divisible by 4");
  1343. return weight_to_nchw4_mode_dense;
  1344. } else {
  1345. mgb_assert(conv_mode == megdnn::param::Convolution::Sparse::GROUP);
  1346. mgb_assert(filter->shape().ndim == 5,
  1347. "The origin filter if not NCHW mode");
  1348. size_t IC = filter->shape()[2];
  1349. mgb_assert(IC % 4 == 0,
  1350. "The input channel should be divisible by 4");
  1351. return weight_to_nchw4_mode_group;
  1352. }
  1353. };
  1354. auto replace_conv_opr = [trans_nchw4, conv_format, src_to_nchw4_mode](
  1355. OperatorNodeBase* opr, const VarNodeArray& new_inp) {
  1356. mgb_assert(opr->input().size() == new_inp.size());
  1357. auto& conv_opr = opr->cast_final_safe<opr::ConvolutionForward>();
  1358. mgb_assert(conv_opr.param().format ==
  1359. megdnn::param::Convolution::Format::NCHW,
  1360. "ConvertFormat Pass only support converting NCHW to NCHW4");
  1361. VarNode *conv_src = new_inp[0], *conv_filter = new_inp[1];
  1362. // src: NCHW --> NCWH4
  1363. if (new_inp[0]->shape().ndim != 5) {
  1364. mgb_assert(new_inp[0]->shape().ndim == 4);
  1365. auto new_src = RelayoutPlaceholder::make(new_inp[0],
  1366. src_to_nchw4_mode);
  1367. conv_src = new_src.node();
  1368. }
  1369. // weight: NCHW --> NCHW4
  1370. auto weight_mode =
  1371. trans_nchw4(conv_opr.param().sparse, new_inp[1]);
  1372. auto new_filter = RelayoutPlaceholder::make(new_inp[1], weight_mode);
  1373. conv_filter = new_filter.node();
  1374. // format: NCHW --> NCHW4
  1375. auto new_param = conv_opr.param();
  1376. new_param.format = conv_format;
  1377. // dst
  1378. auto new_conv_opr = opr::Convolution::make(
  1379. conv_src, conv_filter, new_param,
  1380. conv_opr.execution_policy(), conv_opr.config());
  1381. OperatorNodeBase* new_opr = new_conv_opr.node()->owner_opr();
  1382. mgb_assert(new_conv_opr.shape().ndim == 5,
  1383. "The conv dst dim is not trans to nchw4");
  1384. return new_opr;
  1385. };
  1386. auto replace_batch_conv_bias_opr = [batch_conv_bias_format,
  1387. src_to_nchw4_mode](
  1388. OperatorNodeBase* opr,
  1389. const VarNodeArray& new_inp) {
  1390. mgb_assert(opr->input().size() == new_inp.size());
  1391. auto& batch_conv_bias_opr =
  1392. opr->cast_final_safe<opr::BatchConvBiasForward>();
  1393. mgb_assert(batch_conv_bias_opr.param().format ==
  1394. megdnn::param::BatchConvBias::Format::NCHW,
  1395. "ConvertFormat Pass only support converting NCHW to NCHW4");
  1396. // what should be converted: src, weight
  1397. VarNode *src = new_inp[0], *filter = new_inp[1];
  1398. // src: NCHW --> NCHW4
  1399. if (new_inp[0]->shape().ndim !=5) {
  1400. mgb_assert(new_inp[0]->shape().ndim == 4);
  1401. auto new_src = RelayoutPlaceholder::make(new_inp[0],
  1402. src_to_nchw4_mode);
  1403. src = new_src.node();
  1404. }
  1405. // weight: BNCHW --> BNCHW4
  1406. // only support dense mode, which is similar with conv->group.
  1407. auto weight_mode =
  1408. RelayoutPlaceholder::LayoutType::WEIGHT_NCHW_TO_NCHW4_GROUP;
  1409. auto new_filter = RelayoutPlaceholder::make(new_inp[1], weight_mode);
  1410. filter = new_filter.node();
  1411. // format: NCHW --> NCHW4
  1412. auto new_param = batch_conv_bias_opr.param();
  1413. new_param.format = batch_conv_bias_format;
  1414. if (new_inp.size() == 2) {
  1415. auto dst = opr::BatchConvBias::make(src, filter, new_param,
  1416. batch_conv_bias_opr.execution_policy(),
  1417. batch_conv_bias_opr.config());
  1418. OperatorNodeBase* new_opr = dst.node()->owner_opr();
  1419. mgb_assert(dst.shape().ndim == 5,
  1420. "The conv_bias dst dim is not trans to nchw4");
  1421. return new_opr;
  1422. }
  1423. // bias: NCHW --> NCHW4
  1424. VarNode* bias = new_inp[2];
  1425. if (new_inp[2]->shape().ndim == 4) {
  1426. auto new_bias = RelayoutPlaceholder::make(new_inp[2],
  1427. src_to_nchw4_mode);
  1428. bias = new_bias.node();
  1429. }
  1430. if (new_inp.size() == 3) {
  1431. auto dst = opr::BatchConvBias::make(src, filter, bias, new_param,
  1432. batch_conv_bias_opr.execution_policy(),
  1433. batch_conv_bias_opr.config());
  1434. OperatorNodeBase* new_opr = dst.node()->owner_opr();
  1435. mgb_assert(dst.shape().ndim == 5,
  1436. "The conv_bias dst dim is not trans to nchw4");
  1437. return new_opr;
  1438. }
  1439. // z_inp: NCHW --> NCHW4
  1440. VarNode* z_inp = new_inp[3];
  1441. if (new_inp[3]->shape().ndim == 4) {
  1442. auto new_z = RelayoutPlaceholder::make(new_inp[3],
  1443. src_to_nchw4_mode);
  1444. z_inp = new_z.node();
  1445. }
  1446. auto dst = opr::BatchConvBias::make(src, filter, bias, z_inp,
  1447. new_param,batch_conv_bias_opr.execution_policy(),
  1448. batch_conv_bias_opr.config());
  1449. OperatorNodeBase* new_opr = dst.node()->owner_opr();
  1450. mgb_assert(dst.shape().ndim == 5,
  1451. "The conv_bias dst dim is not trans to nchw4");
  1452. return new_opr;
  1453. };
  1454. auto replace_conv_bias_opr = [trans_nchw4, conv_bias_format,
  1455. src_to_nchw4_mode](
  1456. OperatorNodeBase* opr,
  1457. const VarNodeArray& new_inp) {
  1458. mgb_assert(opr->input().size() == new_inp.size());
  1459. auto& conv_bias_opr = opr->cast_final_safe<opr::ConvBiasForward>();
  1460. mgb_assert(conv_bias_opr.param().format ==
  1461. megdnn::param::ConvBias::Format::NCHW,
  1462. "ConvertFormat Pass only support converting NCHW to NCHW4");
  1463. // what should be converted: src, weight
  1464. VarNode *conv_bias_src = new_inp[0], *conv_bias_filter = new_inp[1];
  1465. // src: NCHW --> NCHW4
  1466. if (new_inp[0]->shape().ndim !=5) {
  1467. mgb_assert(new_inp[0]->shape().ndim == 4);
  1468. auto new_src = RelayoutPlaceholder::make(new_inp[0],
  1469. src_to_nchw4_mode);
  1470. conv_bias_src = new_src.node();
  1471. }
  1472. // weight: NCHW --> NCHW4 or GNCHW --> GNCHW4
  1473. auto weight_mode =
  1474. trans_nchw4(conv_bias_opr.param().sparse, new_inp[1]);
  1475. auto new_filter = RelayoutPlaceholder::make(new_inp[1], weight_mode);
  1476. conv_bias_filter = new_filter.node();
  1477. // format: NCHW --> NCHW4
  1478. auto new_param = conv_bias_opr.param();
  1479. new_param.format = conv_bias_format;
  1480. if (new_inp.size() == 2) {
  1481. auto new_conv_bias_opr = opr::ConvBias::make(
  1482. conv_bias_src, conv_bias_filter, new_param,
  1483. conv_bias_opr.execution_policy(), conv_bias_opr.config());
  1484. OperatorNodeBase* new_opr = new_conv_bias_opr.node()->owner_opr();
  1485. mgb_assert(new_conv_bias_opr.shape().ndim == 5,
  1486. "The conv_bias dst dim is not trans to nchw4");
  1487. return new_opr;
  1488. }
  1489. // bias: NCHW --> NCHW4
  1490. VarNode* conv_bias_bias = new_inp[2];
  1491. if (new_inp[2]->shape().ndim == 4) {
  1492. auto new_bias = RelayoutPlaceholder::make(new_inp[2],
  1493. src_to_nchw4_mode);
  1494. conv_bias_bias = new_bias.node();
  1495. }
  1496. if (new_inp.size() == 3) {
  1497. auto new_conv_bias_opr = opr::ConvBias::make(
  1498. conv_bias_src, conv_bias_filter, conv_bias_bias, new_param,
  1499. conv_bias_opr.execution_policy(), conv_bias_opr.config());
  1500. OperatorNodeBase* new_opr = new_conv_bias_opr.node()->owner_opr();
  1501. mgb_assert(new_conv_bias_opr.shape().ndim == 5,
  1502. "The conv_bias dst dim is not trans to nchw4");
  1503. return new_opr;
  1504. }
  1505. // z_inp: NCHW --> NCHW4
  1506. VarNode* z_inp = new_inp[3];
  1507. if (new_inp[3]->shape().ndim == 4) {
  1508. auto new_z = RelayoutPlaceholder::make(new_inp[3],
  1509. src_to_nchw4_mode);
  1510. z_inp = new_z.node();
  1511. }
  1512. auto new_conv_bias_opr = opr::ConvBias::make(conv_bias_src,
  1513. conv_bias_filter, conv_bias_bias, z_inp, new_param,
  1514. conv_bias_opr.execution_policy(), conv_bias_opr.config());
  1515. OperatorNodeBase* new_opr = new_conv_bias_opr.node()->owner_opr();
  1516. mgb_assert(new_conv_bias_opr.shape().ndim == 5,
  1517. "The conv_bias dst dim is not trans to nchw4");
  1518. return new_opr;
  1519. };
  1520. auto replace_elemwise_opr = [=](OperatorNodeBase* opr,
  1521. const VarNodeArray& new_inp) {
  1522. mgb_assert(opr->input().size() == new_inp.size());
  1523. bool has_inp_changed = false;
  1524. for (size_t i = 0; i < opr->input().size(); i++) {
  1525. if (new_inp[i]->shape().ndim == 5) {
  1526. has_inp_changed = true;
  1527. break;
  1528. }
  1529. }
  1530. if (has_inp_changed) {
  1531. auto temp_inp = new_inp;
  1532. for (size_t i = 0; i < opr->input().size(); i++) {
  1533. if (new_inp[i]->shape().ndim == 4) {
  1534. auto new_var = RelayoutPlaceholder::make(
  1535. new_inp[i], src_to_nchw4_mode);
  1536. temp_inp[i] = new_var.node();
  1537. } else {
  1538. mgb_assert((new_inp[i]->shape().ndim == 5) ||
  1539. new_inp[i]->shape().is_scalar());
  1540. }
  1541. }
  1542. return serialization::copy_opr_shallow(*opr, temp_inp,
  1543. opr->config());
  1544. } else {
  1545. return serialization::copy_opr_shallow(*opr, new_inp,
  1546. opr->config());
  1547. }
  1548. };
  1549. auto relayout_inp_to_nchw = [=](OperatorNodeBase* opr,
  1550. const VarNodeArray& new_inp) {
  1551. mgb_assert(opr->input().size() == new_inp.size());
  1552. VarNodeArray temp_inp = new_inp;
  1553. for (size_t i = 0; i < opr->input().size(); i++) {
  1554. if (!opr->input(i)->shape().eq_shape(new_inp[i]->shape())) {
  1555. mgb_assert(opr->input(i)->shape().ndim == 4);
  1556. mgb_assert(new_inp[i]->shape().ndim == 5);
  1557. auto new_var =
  1558. RelayoutPlaceholder::make(new_inp[i], src_to_nchw_mode);
  1559. temp_inp[i] = new_var.node();
  1560. }
  1561. }
  1562. return serialization::copy_opr_shallow(*opr, temp_inp, opr->config());
  1563. };
  1564. auto replace_pooling_opr = [](OperatorNodeBase* opr,
  1565. const VarNodeArray& new_inp) {
  1566. using Param = opr::PoolingForward::Param;
  1567. using Format = Param::Format;
  1568. mgb_assert(opr->input().size() == new_inp.size());
  1569. auto& pooling = opr->cast_final_safe<opr::PoolingForward>();
  1570. mgb_assert(pooling.param().format == Format::NCHW,
  1571. "ConvertFormat Pass only support converting NCHW to NCHW4.");
  1572. if (new_inp[0]->shape().ndim == 5) {
  1573. mgb_assert(new_inp[0]->dtype().enumv() == DTypeEnum::QuantizedS8);
  1574. auto new_param = pooling.param();
  1575. new_param.format = Format::NCHW4;
  1576. auto new_pooling =
  1577. opr::PoolingForward::make(new_inp[0], new_param, opr->config());
  1578. mgb_assert(new_pooling.shape().ndim == 5,
  1579. "out var of Pooling opr after transform must be 5 (got: "
  1580. "%zu).",
  1581. new_pooling.shape().ndim);
  1582. return new_pooling.node()->owner_opr();
  1583. }
  1584. auto new_opr =
  1585. serialization::copy_opr_shallow(*opr, new_inp, opr->config());
  1586. return new_opr;
  1587. };
  1588. auto replace_resize_opr = [](OperatorNodeBase* opr,
  1589. const VarNodeArray& new_inp) {
  1590. using Param = opr::ResizeForward::Param;
  1591. using Format = Param::Format;
  1592. mgb_assert(opr->input().size() == new_inp.size());
  1593. auto& resize = opr->cast_final_safe<opr::ResizeForward>();
  1594. mgb_assert(resize.param().format == Format::NCHW,
  1595. "ConvertFormat Pass only support converting NCHW to NCHW4.");
  1596. if (new_inp[0]->shape().ndim == 5) {
  1597. mgb_assert(new_inp[0]->dtype().enumv() == DTypeEnum::QuantizedS8);
  1598. auto new_param = resize.param();
  1599. new_param.format = Format::NCHW4;
  1600. auto new_resize = opr::ResizeForward::make(
  1601. new_inp[0], new_inp[1], new_param, opr->config());
  1602. mgb_assert(new_resize.shape().ndim == 5,
  1603. "out var of Resize opr after transform must be 5 (got: "
  1604. "%zu).",
  1605. new_resize.shape().ndim);
  1606. return new_resize.node()->owner_opr();
  1607. }
  1608. auto new_opr =
  1609. serialization::copy_opr_shallow(*opr, new_inp, opr->config());
  1610. return new_opr;
  1611. };
  1612. auto replace_warp_perspective_opr = [](OperatorNodeBase* opr,
  1613. const VarNodeArray& new_inp) {
  1614. using Param = opr::WarpPerspective::Param;
  1615. using Format = Param::Format;
  1616. mgb_assert(opr->input().size() == new_inp.size());
  1617. auto& warp = opr->cast_final_safe<opr::WarpPerspectiveForward>();
  1618. mgb_assert(warp.param().format == Format::NCHW,
  1619. "ConvertFormat Pass only support converting NCHW to NCHW4.");
  1620. if (new_inp[0]->shape().ndim == 5) {
  1621. mgb_assert(new_inp[0]->dtype().enumv() == DTypeEnum::QuantizedS8);
  1622. auto new_param = warp.param();
  1623. new_param.format = Format::NCHW4;
  1624. SymbolVar new_warp;
  1625. if (new_inp.size() == 3) {
  1626. new_warp = opr::WarpPerspectiveForward::make(
  1627. new_inp[0], new_inp[1], nullptr, new_inp[2], new_param,
  1628. opr->config());
  1629. } else {
  1630. mgb_assert(new_inp.size() == 4);
  1631. new_warp = opr::WarpPerspectiveForward::make(
  1632. new_inp[0], new_inp[1], new_inp[2], new_inp[3],
  1633. new_param, opr->config());
  1634. }
  1635. mgb_assert(new_warp.shape().ndim == 5,
  1636. "out var of WarpPerspective opr after transform must be "
  1637. "5 (got: "
  1638. "%zu).",
  1639. new_warp.shape().ndim);
  1640. return new_warp.node()->owner_opr();
  1641. }
  1642. auto new_opr =
  1643. serialization::copy_opr_shallow(*opr, new_inp, opr->config());
  1644. return new_opr;
  1645. };
  1646. auto&& replace_func = ret->m_opr_replace_func;
  1647. //! supportted nchw4
  1648. replace_func[opr::Convolution::typeinfo()] = replace_conv_opr;
  1649. replace_func[opr::ConvBias::typeinfo()] = replace_conv_bias_opr;
  1650. replace_func[opr::BatchConvBias::typeinfo()] =
  1651. replace_batch_conv_bias_opr;
  1652. replace_func[opr::PoolingForward::typeinfo()] = replace_pooling_opr;
  1653. replace_func[opr::ResizeForward::typeinfo()] = replace_resize_opr;
  1654. replace_func[opr::WarpPerspectiveForward::typeinfo()] =
  1655. replace_warp_perspective_opr;
  1656. replace_func[opr::Elemwise::typeinfo()] = replace_elemwise_opr;
  1657. replace_func[opr::TypeCvt::typeinfo()] = replace_elemwise_opr;
  1658. replace_func[opr::ElemwiseMultiType::typeinfo()] = replace_elemwise_opr;
  1659. replace_func[opr::PowC::typeinfo()] = replace_elemwise_opr;
  1660. //! not supported nchw4
  1661. replace_func[opr::Concat::typeinfo()] = relayout_inp_to_nchw;
  1662. replace_func[opr::ConvolutionBackwardData::typeinfo()] =
  1663. relayout_inp_to_nchw;
  1664. replace_func[opr::Subtensor::typeinfo()] = relayout_inp_to_nchw;
  1665. replace_func[opr::GetVarShape::typeinfo()] = relayout_inp_to_nchw;
  1666. replace_func[opr::Dimshuffle::typeinfo()] = relayout_inp_to_nchw;
  1667. replace_func[opr::Reduce::typeinfo()] = relayout_inp_to_nchw;
  1668. replace_func[opr::AssertEqual::typeinfo()] = relayout_inp_to_nchw;
  1669. replace_func[opr::IncrSubtensor::typeinfo()] = relayout_inp_to_nchw;
  1670. replace_func[opr::WarpAffineForward::typeinfo()] = relayout_inp_to_nchw;
  1671. return ret;
  1672. }
  1673. /* ================ EnableNchwxxPass =============== */
  1674. VarNode* EnableNchwxxPass::on_graph_endpoint_var(VarNode* new_var,
  1675. VarNode* orig_var) const {
  1676. if (!orig_var->shape().eq_shape(new_var->shape())) {
  1677. if (m_pack_c_size == 8) {
  1678. return RelayoutPlaceholder::make(
  1679. new_var,
  1680. RelayoutPlaceholder::LayoutType::NCHW88_TO_NCHW)
  1681. .node();
  1682. } else if (m_pack_c_size == 4) {
  1683. return RelayoutPlaceholder::make(
  1684. new_var,
  1685. RelayoutPlaceholder::LayoutType::NCHW4_TO_NCHW)
  1686. .node();
  1687. }
  1688. }
  1689. return new_var;
  1690. }
  1691. void EnableNchwxxPass::fill_opr_convert_fun(size_t pack_c_size){
  1692. using RelayoutMode = RelayoutPlaceholder::LayoutType;
  1693. using TestFilterResult = std::pair<TransType, RelayoutMode>;
  1694. RelayoutMode weight_to_nchwxx_mode_dense =
  1695. RelayoutMode::WEIGHT_NCHW_TO_NCHW88_DENSE;
  1696. RelayoutMode weight_to_nchwxx_mode_group =
  1697. RelayoutMode::WEIGHT_NCHW_TO_NCHW88_GROUP;
  1698. RelayoutMode weight_to_nchwxx_mode_chan =
  1699. RelayoutMode::WEIGHT_NCHW_TO_NCHW88_CHAN;
  1700. RelayoutMode hybrid_nchw_nchwxx = RelayoutMode::WEIGHT_HYBIRD_NCHW_NCHW88;
  1701. RelayoutMode src_to_nchwxx_mode = RelayoutMode::NCHW_TO_NCHW88;
  1702. RelayoutMode src_to_nchw_mode = RelayoutMode::NCHW88_TO_NCHW;
  1703. megdnn::param::ConvBias::Format conv_bias_format =
  1704. megdnn::param::ConvBias::Format::NCHW88;
  1705. megdnn::param::Convolution::Format conv_format =
  1706. megdnn::param::ConvolutionV0::Format::NCHW88;
  1707. megdnn::param::Pooling::Format pooling_format =
  1708. megdnn::param::Pooling::Format::NCHW88;
  1709. std::string convter_pass_name = "conv_format_nchw88";
  1710. if (pack_c_size == 4) {
  1711. weight_to_nchwxx_mode_dense = RelayoutMode::WEIGHT_NCHW_TO_NCHW44_DENSE;
  1712. weight_to_nchwxx_mode_group = RelayoutMode::WEIGHT_NCHW_TO_NCHW44_GROUP;
  1713. weight_to_nchwxx_mode_chan = RelayoutMode::WEIGHT_NCHW_TO_NCHW44_CHAN;
  1714. hybrid_nchw_nchwxx = RelayoutMode::WEIGHT_HYBIRD_NCHW_NCHW44;
  1715. src_to_nchwxx_mode = RelayoutMode::NCHW_TO_NCHW4;
  1716. src_to_nchw_mode = RelayoutMode::NCHW4_TO_NCHW;
  1717. conv_bias_format = megdnn::param::ConvBias::Format::NCHW44;
  1718. conv_format = megdnn::param::ConvolutionV0::Format::NCHW44;
  1719. pooling_format = megdnn::param::Pooling::Format::NCHW44;
  1720. convter_pass_name = "conv_format_nchw44";
  1721. }
  1722. auto test_trans_nchwxx =
  1723. [pack_c_size, weight_to_nchwxx_mode_dense,
  1724. weight_to_nchwxx_mode_group, weight_to_nchwxx_mode_chan,
  1725. hybrid_nchw_nchwxx](
  1726. const megdnn::param::Convolution::Sparse conv_mode,
  1727. const VarNode* filter) -> TestFilterResult {
  1728. TestFilterResult ret{TransType::TRANS_NONE, {}};
  1729. if (conv_mode == megdnn::param::Convolution::Sparse::DENSE) {
  1730. size_t IC = filter->shape()[1];
  1731. size_t OC = filter->shape()[0];
  1732. if ((IC % pack_c_size == 0) && (OC % pack_c_size == 0)) {
  1733. ret.first = TransType::TRANS_PURE_NCHWXX;
  1734. ret.second = weight_to_nchwxx_mode_dense;
  1735. } else if (IC < pack_c_size && OC % pack_c_size == 0) {
  1736. ret.first = TransType::TRANS_HYBIRD_NCHWXX;
  1737. ret.second = hybrid_nchw_nchwxx;
  1738. }
  1739. } else {
  1740. mgb_assert(conv_mode == megdnn::param::Convolution::Sparse::GROUP);
  1741. size_t group = filter->shape()[0];
  1742. size_t ocpg = filter->shape()[1];
  1743. size_t icpg = filter->shape()[2];
  1744. if (icpg == 1 && ocpg == 1 && (group % pack_c_size == 0)) {
  1745. ret.first = TransType::TRANS_PURE_NCHWXX;
  1746. ret.second = weight_to_nchwxx_mode_chan;
  1747. } else if ((icpg % pack_c_size == 0) && (ocpg % pack_c_size == 0)) {
  1748. ret.first = TransType::TRANS_PURE_NCHWXX;
  1749. ret.second = weight_to_nchwxx_mode_group;
  1750. }
  1751. }
  1752. return ret;
  1753. };
  1754. auto replace_conv_opr = [test_trans_nchwxx, conv_format, src_to_nchwxx_mode,
  1755. src_to_nchw_mode](OperatorNodeBase* opr,
  1756. const VarNodeArray& new_inp) {
  1757. mgb_assert(opr->input().size() == new_inp.size());
  1758. auto& conv_opr = opr->cast_final_safe<opr::ConvolutionForward>();
  1759. mgb_assert(conv_opr.param().format ==
  1760. megdnn::param::Convolution::Format::NCHW,
  1761. "ConvertFormat Pass only support converting NCHW to NCHWXX");
  1762. auto is_trans = test_trans_nchwxx(conv_opr.param().sparse, new_inp[1]);
  1763. //! can not trans to nchwxx
  1764. if (is_trans.first == TransType::TRANS_NONE) {
  1765. mgb_assert(new_inp[1]->shape().ndim == 4 ||
  1766. new_inp[1]->shape().ndim == 5,
  1767. "The origin filter is not NCHW mode");
  1768. VarNodeArray temp_inp = new_inp;
  1769. //! if src is nchwxx, should RelayoutPlaceholder to nchw
  1770. if (temp_inp[0]->shape().ndim == 5) {
  1771. auto new_src =
  1772. RelayoutPlaceholder::make(new_inp[0], src_to_nchw_mode);
  1773. temp_inp[0] = new_src.node();
  1774. }
  1775. auto new_opr = serialization::copy_opr_shallow(*opr, temp_inp,
  1776. opr->config());
  1777. return new_opr;
  1778. } else if (is_trans.first == TransType::TRANS_PURE_NCHWXX) {
  1779. //! filter trans to nchwxx mode
  1780. mgb_assert(new_inp[1]->shape().ndim == 4 ||
  1781. new_inp[1]->shape().ndim == 5,
  1782. "The origin filter is not NCHW mode");
  1783. VarNode *conv_src = new_inp[0], *conv_filter = new_inp[1];
  1784. auto new_filter =
  1785. RelayoutPlaceholder::make(new_inp[1], is_trans.second);
  1786. conv_filter = new_filter.node();
  1787. //! src trans to nchwxx mode
  1788. if (new_inp[0]->shape().ndim != 5) {
  1789. mgb_assert(new_inp[0]->shape().ndim == 4);
  1790. auto new_src = RelayoutPlaceholder::make(new_inp[0],
  1791. src_to_nchwxx_mode);
  1792. conv_src = new_src.node();
  1793. }
  1794. auto new_param = conv_opr.param();
  1795. new_param.format = conv_format;
  1796. mgb_assert(conv_src->shape().ndim == 5 &&
  1797. conv_filter->shape().ndim >= 6,
  1798. "The conv src dim is not trans to nchwxx");
  1799. auto new_conv_opr = opr::Convolution::make(
  1800. conv_src, conv_filter, new_param,
  1801. conv_opr.execution_policy(), conv_opr.config());
  1802. OperatorNodeBase* new_opr = new_conv_opr.node()->owner_opr();
  1803. mgb_assert(new_conv_opr.shape().ndim == 5,
  1804. "The conv dst dim is not trans to nchwxx");
  1805. return new_opr;
  1806. } else {
  1807. mgb_assert(is_trans.first == TransType::TRANS_HYBIRD_NCHWXX);
  1808. VarNode *conv_src = new_inp[0], *conv_filter = new_inp[1];
  1809. auto new_filter =
  1810. RelayoutPlaceholder::make(new_inp[1], is_trans.second);
  1811. conv_filter = new_filter.node();
  1812. mgb_assert(conv_src->shape().ndim == 4 &&
  1813. conv_filter->shape().ndim == 5,
  1814. "The src and filter is OK");
  1815. auto new_param = conv_opr.param();
  1816. new_param.format = conv_format;
  1817. auto new_conv_opr = opr::Convolution::make(
  1818. conv_src, conv_filter, new_param,
  1819. conv_opr.execution_policy(), conv_opr.config());
  1820. OperatorNodeBase* new_opr = new_conv_opr.node()->owner_opr();
  1821. mgb_assert(new_conv_opr.shape().ndim == 5,
  1822. "The conv dst dim is not trans to nchwxx");
  1823. return new_opr;
  1824. }
  1825. };
  1826. auto replace_conv_bias_opr = [test_trans_nchwxx, conv_bias_format,
  1827. src_to_nchwxx_mode, src_to_nchw_mode](
  1828. OperatorNodeBase* opr,
  1829. const VarNodeArray& new_inp) {
  1830. mgb_assert(opr->input().size() == new_inp.size());
  1831. auto& conv_bias_opr = opr->cast_final_safe<opr::ConvBiasForward>();
  1832. mgb_assert(conv_bias_opr.param().format ==
  1833. megdnn::param::ConvBias::Format::NCHW,
  1834. "ConvertFormat Pass only support converting NCHW to NCHWXX");
  1835. auto is_trans =
  1836. test_trans_nchwxx(conv_bias_opr.param().sparse, new_inp[1]);
  1837. //! can not trans to nchwxx
  1838. if (is_trans.first == TransType::TRANS_NONE) {
  1839. mgb_assert(new_inp[1]->shape().ndim == 4 ||
  1840. new_inp[1]->shape().ndim == 5,
  1841. "The origin filter is not NCHW mode");
  1842. VarNodeArray temp_inp = new_inp;
  1843. //! if src is nchwxx, should RelayoutPlaceholder to nchw
  1844. if (temp_inp[0]->shape().ndim == 5) {
  1845. auto new_src =
  1846. RelayoutPlaceholder::make(new_inp[0], src_to_nchw_mode);
  1847. temp_inp[0] = new_src.node();
  1848. }
  1849. //! the bias is nchwxx
  1850. if (temp_inp[2]->shape().ndim == 5) {
  1851. auto new_bias =
  1852. RelayoutPlaceholder::make(new_inp[2], src_to_nchw_mode);
  1853. temp_inp[2] = new_bias.node();
  1854. }
  1855. auto new_opr = serialization::copy_opr_shallow(*opr, temp_inp,
  1856. opr->config());
  1857. return new_opr;
  1858. } else if (is_trans.first == TransType::TRANS_PURE_NCHWXX) {
  1859. VarNode *conv_bias_src = new_inp[0], *conv_bias_filter = new_inp[1],
  1860. *conv_bias_bias = new_inp[2];
  1861. //! filter trans to nchwxx mode
  1862. mgb_assert(new_inp[1]->shape().ndim == 4 ||
  1863. new_inp[1]->shape().ndim == 5,
  1864. "The origin filter is not NCHW mode");
  1865. auto new_filter =
  1866. RelayoutPlaceholder::make(new_inp[1], is_trans.second);
  1867. conv_bias_filter = new_filter.node();
  1868. //! src trans to nchwxx mode
  1869. if (new_inp[0]->shape().ndim != 5) {
  1870. mgb_assert(new_inp[0]->shape().ndim == 4);
  1871. auto new_src = RelayoutPlaceholder::make(new_inp[0],
  1872. src_to_nchwxx_mode);
  1873. conv_bias_src = new_src.node();
  1874. }
  1875. //! bias trans to nchwxx mode, bias may be scale
  1876. if (new_inp[2]->shape().ndim == 4) {
  1877. auto new_bias = RelayoutPlaceholder::make(new_inp[2],
  1878. src_to_nchwxx_mode);
  1879. conv_bias_bias = new_bias.node();
  1880. }
  1881. auto new_param = conv_bias_opr.param();
  1882. new_param.format = conv_bias_format;
  1883. mgb_assert(conv_bias_src->shape().ndim == 5 &&
  1884. conv_bias_filter->shape().ndim >= 6,
  1885. "The conv_bias src dim is not trans to nchwxx");
  1886. auto new_conv_bias_opr = opr::ConvBias::make(
  1887. conv_bias_src, conv_bias_filter, conv_bias_bias, new_param,
  1888. conv_bias_opr.execution_policy(), conv_bias_opr.config());
  1889. OperatorNodeBase* new_opr = new_conv_bias_opr.node()->owner_opr();
  1890. mgb_assert(new_conv_bias_opr.shape().ndim == 5,
  1891. "The conv_bias dst dim is not trans to nchwxx");
  1892. return new_opr;
  1893. } else {
  1894. mgb_assert(is_trans.first == TransType::TRANS_HYBIRD_NCHWXX);
  1895. VarNode *conv_bias_src = new_inp[0], *conv_bias_filter = new_inp[1],
  1896. *conv_bias_bias = new_inp[2];
  1897. auto new_filter =
  1898. RelayoutPlaceholder::make(new_inp[1], is_trans.second);
  1899. conv_bias_filter = new_filter.node();
  1900. //! bias trans to nchwxx mode, bias may be scale
  1901. if (new_inp[2]->shape().ndim == 4) {
  1902. auto new_bias = RelayoutPlaceholder::make(new_inp[2],
  1903. src_to_nchwxx_mode);
  1904. conv_bias_bias = new_bias.node();
  1905. }
  1906. mgb_assert(conv_bias_src->shape().ndim == 4 &&
  1907. conv_bias_filter->shape().ndim == 5);
  1908. mgb_assert((conv_bias_bias->shape().ndim == 5) ||
  1909. conv_bias_bias->shape().is_scalar());
  1910. auto new_param = conv_bias_opr.param();
  1911. new_param.format = conv_bias_format;
  1912. auto new_conv_bias_opr = opr::ConvBias::make(
  1913. conv_bias_src, conv_bias_filter, conv_bias_bias, new_param,
  1914. conv_bias_opr.execution_policy(), conv_bias_opr.config());
  1915. OperatorNodeBase* new_opr = new_conv_bias_opr.node()->owner_opr();
  1916. mgb_assert(new_conv_bias_opr.shape().ndim == 5,
  1917. "The conv dst dim is not trans to nchwxx");
  1918. return new_opr;
  1919. }
  1920. };
  1921. auto replace_pooling_opr = [=](OperatorNodeBase* opr,
  1922. const VarNodeArray& new_inp) {
  1923. mgb_assert(opr->input().size() == new_inp.size());
  1924. auto& pooling_opr = opr->cast_final_safe<opr::PoolingForward>();
  1925. mgb_assert(pooling_opr.param().format ==
  1926. megdnn::param::Pooling::Format::NCHW,
  1927. "ConvertFormat Pass only support converting NCHW to NCHWxx");
  1928. VarNode* inp = new_inp[0];
  1929. //! if input is nchwxx
  1930. if (inp->shape().ndim == 5) {
  1931. auto new_param = pooling_opr.param();
  1932. new_param.format = pooling_format;
  1933. auto new_pooling_opr =
  1934. opr::PoolingForward::make(inp, new_param, opr->config());
  1935. mgb_assert(new_pooling_opr.shape().ndim == 5,
  1936. "The pooling dst dim is not trans to nchwxx");
  1937. return new_pooling_opr.node()->owner_opr();
  1938. } else {
  1939. auto new_opr = serialization::copy_opr_shallow(*opr, new_inp,
  1940. opr->config());
  1941. return new_opr;
  1942. }
  1943. };
  1944. auto replace_concat_opr = [=](OperatorNodeBase* opr,
  1945. const VarNodeArray& new_inp) {
  1946. mgb_assert(opr->input().size() == new_inp.size());
  1947. bool has_inp_changed = false;
  1948. bool can_exec_ncwxx = true;
  1949. for (size_t i = 0; i < opr->input().size(); i++) {
  1950. if (new_inp[i]->shape().ndim == 5) {
  1951. has_inp_changed = true;
  1952. break;
  1953. } else if (new_inp[i]->shape().ndim == 4) {
  1954. if (new_inp[i]->shape()[1] % pack_c_size != 0) {
  1955. can_exec_ncwxx = false;
  1956. }
  1957. }
  1958. }
  1959. if (has_inp_changed) {
  1960. auto temp_inp = new_inp;
  1961. if (can_exec_ncwxx) {
  1962. for (size_t i = 0; i < opr->input().size(); i++) {
  1963. if (new_inp[i]->shape().ndim == 4) {
  1964. auto new_var = RelayoutPlaceholder::make(
  1965. new_inp[i], src_to_nchwxx_mode);
  1966. temp_inp[i] = new_var.node();
  1967. } else {
  1968. mgb_assert((new_inp[i]->shape().ndim == 5) ||
  1969. new_inp[i]->shape().is_scalar());
  1970. }
  1971. }
  1972. } else {
  1973. for (size_t i = 0; i < opr->input().size(); i++) {
  1974. if (new_inp[i]->shape().ndim == 5) {
  1975. auto new_var = RelayoutPlaceholder::make(
  1976. new_inp[i], src_to_nchw_mode);
  1977. temp_inp[i] = new_var.node();
  1978. }
  1979. }
  1980. }
  1981. return serialization::copy_opr_shallow(*opr, temp_inp,
  1982. opr->config());
  1983. } else {
  1984. return serialization::copy_opr_shallow(*opr, new_inp,
  1985. opr->config());
  1986. }
  1987. };
  1988. auto replace_elemwise_opr = [=](OperatorNodeBase* opr,
  1989. const VarNodeArray& new_inp) {
  1990. mgb_assert(opr->input().size() == new_inp.size());
  1991. bool has_inp_changed = false;
  1992. for (size_t i = 0; i < opr->input().size(); i++) {
  1993. if (new_inp[i]->shape().ndim == 5) {
  1994. has_inp_changed = true;
  1995. break;
  1996. }
  1997. }
  1998. if (has_inp_changed) {
  1999. auto temp_inp = new_inp;
  2000. for (size_t i = 0; i < opr->input().size(); i++) {
  2001. if (new_inp[i]->shape().ndim == 4) {
  2002. auto new_var = RelayoutPlaceholder::make(
  2003. new_inp[i], src_to_nchwxx_mode);
  2004. temp_inp[i] = new_var.node();
  2005. } else {
  2006. mgb_assert((new_inp[i]->shape().ndim == 5) ||
  2007. new_inp[i]->shape().is_scalar());
  2008. }
  2009. }
  2010. return serialization::copy_opr_shallow(*opr, temp_inp,
  2011. opr->config());
  2012. } else {
  2013. return serialization::copy_opr_shallow(*opr, new_inp,
  2014. opr->config());
  2015. }
  2016. };
  2017. auto relayout_inp_to_nchw = [=](OperatorNodeBase* opr,
  2018. const VarNodeArray& new_inp) {
  2019. mgb_assert(opr->input().size() == new_inp.size());
  2020. VarNodeArray temp_inp = new_inp;
  2021. for (size_t i = 0; i < opr->input().size(); i++) {
  2022. if (!opr->input(i)->shape().eq_shape(new_inp[i]->shape())) {
  2023. mgb_assert(opr->input(i)->shape().ndim == 4);
  2024. mgb_assert(new_inp[i]->shape().ndim == 5);
  2025. auto new_var =
  2026. RelayoutPlaceholder::make(new_inp[i], src_to_nchw_mode);
  2027. temp_inp[i] = new_var.node();
  2028. }
  2029. }
  2030. return serialization::copy_opr_shallow(*opr, temp_inp, opr->config());
  2031. };
  2032. auto&& replace_func = m_opr_replace_func;
  2033. //! supportted nchwxx
  2034. replace_func[opr::Convolution::typeinfo()] = replace_conv_opr;
  2035. replace_func[opr::ConvBias::typeinfo()] = replace_conv_bias_opr;
  2036. replace_func[opr::PoolingForward::typeinfo()] = replace_pooling_opr;
  2037. replace_func[opr::Concat::typeinfo()] = replace_concat_opr;
  2038. replace_func[opr::Elemwise::typeinfo()] = replace_elemwise_opr;
  2039. replace_func[opr::TypeCvt::typeinfo()] = replace_elemwise_opr;
  2040. replace_func[opr::ElemwiseMultiType::typeinfo()] = replace_elemwise_opr;
  2041. replace_func[opr::PowC::typeinfo()] = replace_elemwise_opr;
  2042. //! not support yet
  2043. replace_func[opr::ConvolutionBackwardData::typeinfo()] =
  2044. relayout_inp_to_nchw;
  2045. replace_func[opr::Subtensor::typeinfo()] = relayout_inp_to_nchw;
  2046. replace_func[opr::GetVarShape::typeinfo()] = relayout_inp_to_nchw;
  2047. replace_func[opr::Dimshuffle::typeinfo()] = relayout_inp_to_nchw;
  2048. replace_func[opr::Reduce::typeinfo()] = relayout_inp_to_nchw;
  2049. replace_func[opr::AssertEqual::typeinfo()] = relayout_inp_to_nchw;
  2050. replace_func[opr::IncrSubtensor::typeinfo()] = relayout_inp_to_nchw;
  2051. replace_func[opr::ResizeForward::typeinfo()] = relayout_inp_to_nchw;
  2052. replace_func[opr::WarpPerspectiveForward::typeinfo()] =
  2053. relayout_inp_to_nchw;
  2054. replace_func[opr::WarpAffineForward::typeinfo()] = relayout_inp_to_nchw;
  2055. }
  2056. std::unique_ptr<EnableNchwxxPass> EnableNchwxxPass::make_nchwxx_converter(
  2057. size_t pack_c_size) {
  2058. auto ret = std::make_unique<EnableNchwxxPass>(pack_c_size);
  2059. ret->set_var_replace_check_flag(VarReplaceCheckFlag::NOCHECK);
  2060. std::string convter_pass_name = "conv_format_nchw88";
  2061. if (pack_c_size == 4) {
  2062. convter_pass_name = "conv_format_nchw44";
  2063. }
  2064. ret->fill_opr_convert_fun(pack_c_size);
  2065. ret->set_name(convter_pass_name);
  2066. return ret;
  2067. }
  2068. /* ================ EnableNchw44DotPass =============== */
  2069. VarNode* EnableNchw44DotPass::on_graph_endpoint_var(VarNode* new_var,
  2070. VarNode* orig_var) const {
  2071. if (!orig_var->shape().eq_shape(new_var->shape())) {
  2072. return RelayoutPlaceholder::make(
  2073. new_var, RelayoutPlaceholder::LayoutType::NCHW4_TO_NCHW)
  2074. .node();
  2075. }
  2076. return new_var;
  2077. }
  2078. std::unique_ptr<EnableNchw44DotPass>
  2079. EnableNchw44DotPass::make_nchw44_dot_converter() {
  2080. auto ret = std::make_unique<EnableNchw44DotPass>();
  2081. ret->set_var_replace_check_flag(VarReplaceCheckFlag::NOCHECK);
  2082. //! First is whether the conv can trans to nchwxx, second is the filter
  2083. //! trans mode
  2084. using RelayoutMode = RelayoutPlaceholder::LayoutType;
  2085. struct TestTransResult {
  2086. TransType trans_type;
  2087. RelayoutMode relayout_mod;
  2088. megdnn::param::ConvolutionV0::Format conv_format;
  2089. };
  2090. constexpr size_t pack_c_size = 4_z;
  2091. auto test_trans_nchw44_dot =
  2092. [](const megdnn::param::Convolution::Sparse conv_mode,
  2093. const VarNode* filter) -> TestTransResult {
  2094. TestTransResult ret{TransType::TRANS_NONE, {}, {}};
  2095. if (conv_mode == megdnn::param::Convolution::Sparse::DENSE) {
  2096. size_t IC = filter->shape()[1];
  2097. size_t OC = filter->shape()[0];
  2098. if ((IC % pack_c_size == 0) && (OC % pack_c_size == 0)) {
  2099. ret.trans_type = TransType::TRANS_PURE_NCHWXX;
  2100. ret.relayout_mod = RelayoutMode::WEIGHT_NCHW_TO_NCHW44_DOT_DENSE;
  2101. ret.conv_format = megdnn::param::ConvBias::Format::NCHW44_DOT;
  2102. } else if (IC < pack_c_size && OC % pack_c_size == 0) {
  2103. ret.trans_type = TransType::TRANS_HYBIRD_NCHWXX;
  2104. ret.relayout_mod = RelayoutMode::WEIGHT_HYBIRD_NCHW_NCHW44;
  2105. ret.conv_format = megdnn::param::ConvBias::Format::NCHW44_DOT;
  2106. }
  2107. } else {
  2108. mgb_assert(conv_mode == megdnn::param::Convolution::Sparse::GROUP);
  2109. size_t group = filter->shape()[0];
  2110. size_t ocpg = filter->shape()[1];
  2111. size_t icpg = filter->shape()[2];
  2112. if (icpg == 1 && ocpg == 1 && (group % pack_c_size == 0)) {
  2113. ret.trans_type = TransType::TRANS_PURE_NCHWXX;
  2114. ret.relayout_mod = RelayoutMode::WEIGHT_NCHW_TO_NCHW44_CHAN;
  2115. ret.conv_format = megdnn::param::ConvBias::Format::NCHW44;
  2116. } else if ((icpg % pack_c_size == 0) && (ocpg % pack_c_size == 0)) {
  2117. ret.trans_type = TransType::TRANS_PURE_NCHWXX;
  2118. ret.relayout_mod = RelayoutMode::WEIGHT_NCHW_TO_NCHW44_DOT_GROUP;
  2119. ret.conv_format = megdnn::param::ConvBias::Format::NCHW44_DOT;
  2120. }
  2121. }
  2122. return ret;
  2123. };
  2124. auto replace_conv_opr = [test_trans_nchw44_dot](
  2125. OperatorNodeBase* opr,
  2126. const VarNodeArray& new_inp) {
  2127. mgb_assert(opr->input().size() == new_inp.size());
  2128. auto& conv_opr = opr->cast_final_safe<opr::ConvolutionForward>();
  2129. mgb_assert(conv_opr.param().format ==
  2130. megdnn::param::Convolution::Format::NCHW,
  2131. "ConvertFormat Pass only support converting NCHW to "
  2132. "NCHW44_DOT");
  2133. auto is_trans =
  2134. test_trans_nchw44_dot(conv_opr.param().sparse, new_inp[1]);
  2135. //! can not trans to nchwxx
  2136. if (is_trans.trans_type == TransType::TRANS_NONE) {
  2137. mgb_assert(new_inp[1]->shape().ndim == 4 ||
  2138. new_inp[1]->shape().ndim == 5,
  2139. "The origin filter is not NCHW mode");
  2140. VarNodeArray temp_inp = new_inp;
  2141. //! if src is nchwxx, should RelayoutPlaceholder to nchw
  2142. if (temp_inp[0]->shape().ndim == 5) {
  2143. auto new_src = RelayoutPlaceholder::make(
  2144. new_inp[0], RelayoutMode::NCHW4_TO_NCHW);
  2145. temp_inp[0] = new_src.node();
  2146. }
  2147. auto new_opr = serialization::copy_opr_shallow(*opr, temp_inp,
  2148. opr->config());
  2149. return new_opr;
  2150. } else if (is_trans.trans_type == TransType::TRANS_PURE_NCHWXX) {
  2151. //! filter trans to nchwxx mode
  2152. mgb_assert(new_inp[1]->shape().ndim == 4 ||
  2153. new_inp[1]->shape().ndim == 5,
  2154. "The origin filter is not NCHW mode");
  2155. VarNode *conv_src = new_inp[0], *conv_filter = new_inp[1];
  2156. auto new_filter = RelayoutPlaceholder::make(new_inp[1],
  2157. is_trans.relayout_mod);
  2158. conv_filter = new_filter.node();
  2159. //! src trans to nchwxx mode
  2160. if (new_inp[0]->shape().ndim != 5) {
  2161. mgb_assert(new_inp[0]->shape().ndim == 4);
  2162. auto new_src = RelayoutPlaceholder::make(
  2163. new_inp[0], RelayoutMode::NCHW_TO_NCHW4);
  2164. conv_src = new_src.node();
  2165. }
  2166. auto new_param = conv_opr.param();
  2167. new_param.format = is_trans.conv_format;
  2168. mgb_assert(conv_src->shape().ndim == 5 &&
  2169. conv_filter->shape().ndim >= 6,
  2170. "The conv src dim is not trans to nchwxx");
  2171. auto new_conv_opr = opr::Convolution::make(
  2172. conv_src, conv_filter, new_param,
  2173. conv_opr.execution_policy(), conv_opr.config());
  2174. OperatorNodeBase* new_opr = new_conv_opr.node()->owner_opr();
  2175. mgb_assert(new_conv_opr.shape().ndim == 5,
  2176. "The conv dst dim is not trans to nchwxx");
  2177. return new_opr;
  2178. } else {
  2179. mgb_assert(is_trans.trans_type == TransType::TRANS_HYBIRD_NCHWXX);
  2180. VarNode *conv_src = new_inp[0], *conv_filter = new_inp[1];
  2181. auto new_filter = RelayoutPlaceholder::make(new_inp[1],
  2182. is_trans.relayout_mod);
  2183. conv_filter = new_filter.node();
  2184. mgb_assert(conv_src->shape().ndim == 4 &&
  2185. conv_filter->shape().ndim == 5,
  2186. "The src and filter is OK");
  2187. auto new_param = conv_opr.param();
  2188. new_param.format = is_trans.conv_format;
  2189. auto new_conv_opr = opr::Convolution::make(
  2190. conv_src, conv_filter, new_param,
  2191. conv_opr.execution_policy(), conv_opr.config());
  2192. OperatorNodeBase* new_opr = new_conv_opr.node()->owner_opr();
  2193. mgb_assert(new_conv_opr.shape().ndim == 5,
  2194. "The conv dst dim is not trans to nchwxx");
  2195. return new_opr;
  2196. }
  2197. };
  2198. auto replace_conv_bias_opr = [test_trans_nchw44_dot](
  2199. OperatorNodeBase* opr,
  2200. const VarNodeArray& new_inp) {
  2201. mgb_assert(opr->input().size() == new_inp.size());
  2202. auto& conv_bias_opr = opr->cast_final_safe<opr::ConvBiasForward>();
  2203. mgb_assert(conv_bias_opr.param().format ==
  2204. megdnn::param::ConvBias::Format::NCHW,
  2205. "ConvertFormat Pass only support converting NCHW to NCHWXX");
  2206. auto is_trans =
  2207. test_trans_nchw44_dot(conv_bias_opr.param().sparse, new_inp[1]);
  2208. //! can not trans to nchwxx
  2209. if (is_trans.trans_type == TransType::TRANS_NONE) {
  2210. mgb_assert(new_inp[1]->shape().ndim == 4 ||
  2211. new_inp[1]->shape().ndim == 5,
  2212. "The origin filter is not NCHW mode");
  2213. VarNodeArray temp_inp = new_inp;
  2214. //! if src is nchwxx, should RelayoutPlaceholder to nchw
  2215. if (temp_inp[0]->shape().ndim == 5) {
  2216. auto new_src = RelayoutPlaceholder::make(
  2217. new_inp[0], RelayoutMode::NCHW4_TO_NCHW);
  2218. temp_inp[0] = new_src.node();
  2219. }
  2220. //! the bias is nchwxx
  2221. if (temp_inp[2]->shape().ndim == 5) {
  2222. auto new_bias = RelayoutPlaceholder::make(
  2223. new_inp[2], RelayoutMode::NCHW4_TO_NCHW);
  2224. temp_inp[2] = new_bias.node();
  2225. }
  2226. auto new_opr = serialization::copy_opr_shallow(*opr, temp_inp,
  2227. opr->config());
  2228. return new_opr;
  2229. } else if (is_trans.trans_type == TransType::TRANS_PURE_NCHWXX) {
  2230. VarNode *conv_bias_src = new_inp[0], *conv_bias_filter = new_inp[1],
  2231. *conv_bias_bias = new_inp[2];
  2232. //! filter trans to nchwxx mode
  2233. mgb_assert(new_inp[1]->shape().ndim == 4 ||
  2234. new_inp[1]->shape().ndim == 5,
  2235. "The origin filter is not NCHW mode");
  2236. auto new_filter = RelayoutPlaceholder::make(new_inp[1],
  2237. is_trans.relayout_mod);
  2238. conv_bias_filter = new_filter.node();
  2239. //! src trans to nchwxx mode
  2240. if (new_inp[0]->shape().ndim != 5) {
  2241. mgb_assert(new_inp[0]->shape().ndim == 4);
  2242. auto new_src = RelayoutPlaceholder::make(
  2243. new_inp[0], RelayoutMode::NCHW_TO_NCHW4);
  2244. conv_bias_src = new_src.node();
  2245. }
  2246. //! bias trans to nchwxx mode, bias may be scale
  2247. if (new_inp[2]->shape().ndim == 4) {
  2248. auto new_bias = RelayoutPlaceholder::make(
  2249. new_inp[2], RelayoutMode::NCHW_TO_NCHW4);
  2250. conv_bias_bias = new_bias.node();
  2251. }
  2252. auto new_param = conv_bias_opr.param();
  2253. new_param.format = is_trans.conv_format;
  2254. mgb_assert(conv_bias_src->shape().ndim == 5 &&
  2255. conv_bias_filter->shape().ndim >= 6,
  2256. "The conv_bias src dim is not trans to nchwxx");
  2257. auto new_conv_bias_opr = opr::ConvBias::make(
  2258. conv_bias_src, conv_bias_filter, conv_bias_bias, new_param,
  2259. conv_bias_opr.execution_policy(), conv_bias_opr.config());
  2260. OperatorNodeBase* new_opr = new_conv_bias_opr.node()->owner_opr();
  2261. mgb_assert(new_conv_bias_opr.shape().ndim == 5,
  2262. "The conv_bias dst dim is not trans to nchwxx");
  2263. return new_opr;
  2264. } else {
  2265. mgb_assert(is_trans.trans_type == TransType::TRANS_HYBIRD_NCHWXX);
  2266. VarNode *conv_bias_src = new_inp[0], *conv_bias_filter = new_inp[1],
  2267. *conv_bias_bias = new_inp[2];
  2268. auto new_filter = RelayoutPlaceholder::make(new_inp[1],
  2269. is_trans.relayout_mod);
  2270. conv_bias_filter = new_filter.node();
  2271. //! bias trans to nchwxx mode, bias may be scale
  2272. if (new_inp[2]->shape().ndim == 4) {
  2273. auto new_bias = RelayoutPlaceholder::make(
  2274. new_inp[2], RelayoutMode::NCHW_TO_NCHW4);
  2275. conv_bias_bias = new_bias.node();
  2276. }
  2277. mgb_assert(conv_bias_src->shape().ndim == 4 &&
  2278. conv_bias_filter->shape().ndim == 5);
  2279. mgb_assert((conv_bias_bias->shape().ndim == 5) ||
  2280. conv_bias_bias->shape().is_scalar());
  2281. auto new_param = conv_bias_opr.param();
  2282. new_param.format = is_trans.conv_format;
  2283. auto new_conv_bias_opr = opr::ConvBias::make(
  2284. conv_bias_src, conv_bias_filter, conv_bias_bias, new_param,
  2285. conv_bias_opr.execution_policy(), conv_bias_opr.config());
  2286. OperatorNodeBase* new_opr = new_conv_bias_opr.node()->owner_opr();
  2287. mgb_assert(new_conv_bias_opr.shape().ndim == 5,
  2288. "The conv dst dim is not trans to nchwxx");
  2289. return new_opr;
  2290. }
  2291. };
  2292. ret->fill_opr_convert_fun(4);
  2293. auto&& replace_func = ret->m_opr_replace_func;
  2294. //! supportted nchwxx
  2295. replace_func[opr::Convolution::typeinfo()] = replace_conv_opr;
  2296. replace_func[opr::ConvBias::typeinfo()] = replace_conv_bias_opr;
  2297. return ret;
  2298. }
  2299. /* ==================== ShuffleShuffleRemovePass ================= */
  2300. class ShuffleShuffleRemovePass::Impl {
  2301. using TensorFormat = opr::ConvBias::Param::Format;
  2302. OptState& m_opt_state;
  2303. ThinHashMap<std::pair<TensorFormat, TensorFormat>,
  2304. thin_function<VarNode*(VarNode*)>>
  2305. m_reformat;
  2306. class AbstractShuffleOpr;
  2307. void detect_shuffle_operations();
  2308. void do_replace();
  2309. public:
  2310. Impl(OptState& opt_state) : m_opt_state{opt_state} {
  2311. m_reformat[std::make_pair(TensorFormat::NCHW, TensorFormat::NCHW4)] =
  2312. [](VarNode* inp) -> VarNode* {
  2313. auto x = SymbolVar(inp);
  2314. auto xshp = opr::GetVarShape::make(x);
  2315. auto cv = [&x](int v) { return x.make_scalar(v); };
  2316. auto sub = [&xshp, &cv](int idx) {
  2317. return opr::IndexAt::make(xshp, {{0, cv(idx)}});
  2318. };
  2319. auto tshp = opr::Concat::make(
  2320. {sub(0), sub(1) / 4, cv(4), sub(2), sub(3)}, 0);
  2321. auto y0 = opr::Reshape::make(x, tshp);
  2322. auto y1 = opr::Dimshuffle::make(y0, {0, 1, 3, 4, 2});
  2323. return y1.node();
  2324. };
  2325. m_reformat[std::make_pair(TensorFormat::NCHW, TensorFormat::NCHW32)] =
  2326. [](VarNode* inp) -> VarNode* {
  2327. auto x = SymbolVar(inp);
  2328. auto xshp = opr::GetVarShape::make(x);
  2329. auto cv = [&x](int v) { return x.make_scalar(v); };
  2330. auto sub = [&xshp, &cv](int idx) {
  2331. return opr::IndexAt::make(xshp, {{0, cv(idx)}});
  2332. };
  2333. auto tshp = opr::Concat::make(
  2334. {sub(0), sub(1) / 32, cv(32), sub(2), sub(3)}, 0);
  2335. auto y0 = opr::Reshape::make(x, tshp);
  2336. auto y1 = opr::Dimshuffle::make(y0, {0, 1, 3, 4, 2});
  2337. return y1.node();
  2338. };
  2339. m_reformat[std::make_pair(TensorFormat::NCHW4, TensorFormat::NCHW)] =
  2340. [](VarNode* inp) -> VarNode* {
  2341. mgb_assert(inp->shape().ndim == 5 && inp->shape()[4] == 4);
  2342. auto x = SymbolVar(inp);
  2343. auto xshp = opr::GetVarShape::make(x);
  2344. auto cv = [&x](int v) { return x.make_scalar(v); };
  2345. auto sub = [&xshp, &cv](int idx) {
  2346. return opr::IndexAt::make(xshp, {{0, cv(idx)}});
  2347. };
  2348. auto tshp =
  2349. opr::Concat::make({sub(0), sub(1) * 4, sub(2), sub(3)}, 0);
  2350. auto y0 = opr::Dimshuffle::make(x, {0, 1, 4, 2, 3});
  2351. auto y1 = opr::Reshape::make(y0, tshp);
  2352. return y1.node();
  2353. };
  2354. m_reformat[std::make_pair(TensorFormat::NCHW32, TensorFormat::NCHW)] =
  2355. [](VarNode* inp) -> VarNode* {
  2356. mgb_assert(inp->shape().ndim == 5 && inp->shape()[4] == 32);
  2357. auto x = SymbolVar(inp);
  2358. auto xshp = opr::GetVarShape::make(x);
  2359. auto cv = [&x](int v) { return x.make_scalar(v); };
  2360. auto sub = [&xshp, &cv](int idx) {
  2361. return opr::IndexAt::make(xshp, {{0, cv(idx)}});
  2362. };
  2363. auto tshp =
  2364. opr::Concat::make({sub(0), sub(1) * 32, sub(2), sub(3)}, 0);
  2365. auto y0 = opr::Dimshuffle::make(x, {0, 1, 4, 2, 3});
  2366. auto y1 = opr::Reshape::make(y0, tshp);
  2367. return y1.node();
  2368. };
  2369. m_reformat[std::make_pair(TensorFormat::NCHW4, TensorFormat::NCHW32)] =
  2370. [](VarNode* inp) -> VarNode* {
  2371. mgb_assert(inp->shape().ndim == 5 && inp->shape()[4] == 4);
  2372. auto x = SymbolVar(inp);
  2373. auto xshp = opr::GetVarShape::make(x);
  2374. auto cv = [&x](int v) { return x.make_scalar(v); };
  2375. auto sub = [&xshp, &cv](int idx) {
  2376. return opr::IndexAt::make(xshp, {{0, cv(idx)}});
  2377. };
  2378. auto tshp0 = opr::Concat::make(
  2379. {sub(0), sub(1) / 8, cv(8), sub(2), sub(3), sub(4)},
  2380. 0),
  2381. tshp1 = opr::Concat::make(
  2382. {sub(0), sub(1) / 8, sub(2), sub(3), sub(4) * 8}, 0);
  2383. auto y0 = opr::Reshape::make(x, tshp0);
  2384. auto y1 = opr::Dimshuffle::make(y0, {0, 1, 3, 4, 2, 5});
  2385. auto y2 = opr::Reshape::make(y1, tshp1);
  2386. return y2.node();
  2387. };
  2388. m_reformat[std::make_pair(TensorFormat::NCHW32, TensorFormat::NCHW4)] =
  2389. [](VarNode* inp) -> VarNode* {
  2390. mgb_assert(inp->shape().ndim == 5 && inp->shape()[4] == 32);
  2391. auto x = SymbolVar(inp);
  2392. auto xshp = opr::GetVarShape::make(x);
  2393. auto cv = [&x](int v) { return x.make_scalar(v); };
  2394. auto sub = [&xshp, &cv](int idx) {
  2395. return opr::IndexAt::make(xshp, {{0, cv(idx)}});
  2396. };
  2397. auto tshp0 = opr::Concat::make(
  2398. {sub(0), sub(1), sub(2), sub(3), cv(8), sub(4) / 8},
  2399. 0),
  2400. tshp1 = opr::Concat::make(
  2401. {sub(0), sub(1) * 8, sub(2), sub(3), sub(4) / 8}, 0);
  2402. auto y0 = opr::Reshape::make(x, tshp0);
  2403. auto y1 = opr::Dimshuffle::make(y0, {0, 1, 4, 2, 3, 5});
  2404. auto y2 = opr::Reshape::make(y1, tshp1);
  2405. return y2.node();
  2406. };
  2407. m_reformat[std::make_pair(TensorFormat::NCHW4, TensorFormat::CHWN4)] =
  2408. [](VarNode* inp) -> VarNode* {
  2409. megdnn::param::RelayoutFormat param;
  2410. param.mode = megdnn::param::RelayoutFormat::Mode::NCHW4_CHWN4;
  2411. auto reformat = opr::RelayoutFormat::make(inp, param);
  2412. return reformat.node();
  2413. };
  2414. m_reformat[std::make_pair(TensorFormat::CHWN4, TensorFormat::NCHW4)] =
  2415. [](VarNode* inp) -> VarNode* {
  2416. megdnn::param::RelayoutFormat param;
  2417. param.mode = megdnn::param::RelayoutFormat::Mode::CHWN4_NCHW4;
  2418. auto reformat = opr::RelayoutFormat::make(inp, param);
  2419. return reformat.node();
  2420. };
  2421. m_reformat[std::make_pair(TensorFormat::NCHW, TensorFormat::CHWN4)] =
  2422. [](VarNode* inp) -> VarNode* {
  2423. auto x = SymbolVar(inp);
  2424. auto xshp = opr::GetVarShape::make(x);
  2425. auto cv = [&x](int v) { return x.make_scalar(v); };
  2426. auto sub = [&xshp, &cv](int idx) {
  2427. return opr::IndexAt::make(xshp, {{0, cv(idx)}});
  2428. };
  2429. auto tshp = opr::Concat::make(
  2430. {sub(0), sub(1) / 4, cv(4), sub(2), sub(3)}, 0);
  2431. auto y0 = opr::Reshape::make(x, tshp);
  2432. auto y1 = opr::Dimshuffle::make(y0, {1, 3, 4, 0, 2});
  2433. return y1.node();
  2434. };
  2435. m_reformat[std::make_pair(TensorFormat::CHWN4, TensorFormat::NCHW)] =
  2436. [](VarNode* inp) -> VarNode* {
  2437. mgb_assert(inp->shape().ndim == 5 && inp->shape()[4] == 4);
  2438. auto x = SymbolVar(inp);
  2439. auto xshp = opr::GetVarShape::make(x);
  2440. auto cv = [&x](int v) { return x.make_scalar(v); };
  2441. auto sub = [&xshp, &cv](int idx) {
  2442. return opr::IndexAt::make(xshp, {{0, cv(idx)}});
  2443. };
  2444. auto tshp =
  2445. opr::Concat::make({sub(3), sub(0) * 4, sub(1), sub(2)}, 0);
  2446. auto y0 = opr::Dimshuffle::make(x, {3, 0, 4, 1, 2});
  2447. auto y1 = opr::Reshape::make(y0, tshp);
  2448. return y1.node();
  2449. };
  2450. detect_shuffle_operations();
  2451. do_replace();
  2452. }
  2453. };
  2454. /*!
  2455. * \brief abstract operator representation of shuffle operation
  2456. */
  2457. MGB_DEFINE_OPR_CLASS(ShuffleShuffleRemovePass::Impl::AbstractShuffleOpr,
  2458. cg::SingleCNOperatorNodeBase) // {
  2459. public:
  2460. AbstractShuffleOpr(VarNode* inpvar, TensorFormat inp_format,
  2461. TensorFormat out_format);
  2462. static SymbolVar make(VarNode* inpvar, TensorFormat inp_format,
  2463. TensorFormat out_format);
  2464. TensorFormat inp_format() const { return m_inp_format; }
  2465. TensorFormat out_format() const { return m_out_format; }
  2466. private:
  2467. void init_output_static_infer_desc() override;
  2468. void scn_do_execute() override;
  2469. const TensorFormat m_inp_format;
  2470. const TensorFormat m_out_format;
  2471. };
  2472. MGB_DYN_TYPE_OBJ_FINAL_IMPL(ShuffleShuffleRemovePass::Impl::AbstractShuffleOpr);
  2473. void ShuffleShuffleRemovePass::Impl::AbstractShuffleOpr::scn_do_execute() {
  2474. mgb_throw(InternalError, "AbstractShuffleOpr cannot be executed");
  2475. }
  2476. void ShuffleShuffleRemovePass::Impl::AbstractShuffleOpr::
  2477. init_output_static_infer_desc() {
  2478. using namespace cg::static_infer;
  2479. auto&& mgr = owner_graph()->static_infer_manager();
  2480. DepVal deps;
  2481. for (auto i : input())
  2482. deps.push_back({i, DepType::SHAPE});
  2483. auto infer_shape = [this](TensorShape& dst, const InpVal& inp) {
  2484. TensorShape inp_shape = inp.val[0].shape();
  2485. if (m_inp_format == TensorFormat::NCHW4 &&
  2486. m_out_format == TensorFormat::NCHW32) {
  2487. mgb_assert(inp_shape.ndim == 5 && inp_shape[4] == 4);
  2488. dst = inp_shape;
  2489. dst[0] = inp_shape[0];
  2490. dst[1] = inp_shape[1] / 8;
  2491. dst[2] = inp_shape[2];
  2492. dst[3] = inp_shape[3];
  2493. dst[4] = inp_shape[4] * 8;
  2494. } else if (m_inp_format == TensorFormat::NCHW32 &&
  2495. m_out_format == TensorFormat::NCHW4) {
  2496. mgb_assert(inp_shape.ndim == 5 && inp_shape[4] == 32);
  2497. dst = inp_shape;
  2498. dst[0] = inp_shape[0];
  2499. dst[1] = inp_shape[1] * 8;
  2500. dst[2] = inp_shape[2];
  2501. dst[3] = inp_shape[3];
  2502. dst[4] = inp_shape[4] / 8;
  2503. } else if (m_inp_format == TensorFormat::NCHW &&
  2504. m_out_format == TensorFormat::NCHW4) {
  2505. mgb_assert(inp_shape.ndim == 4);
  2506. dst.ndim = 5;
  2507. dst[0] = inp_shape[0];
  2508. dst[1] = inp_shape[1] / 4;
  2509. dst[2] = inp_shape[2];
  2510. dst[3] = inp_shape[3];
  2511. dst[4] = 4;
  2512. } else if (m_inp_format == TensorFormat::NCHW4 &&
  2513. m_out_format == TensorFormat::NCHW) {
  2514. mgb_assert(inp_shape.ndim == 5 && inp_shape[4] == 4);
  2515. dst.ndim = 4;
  2516. dst[0] = inp_shape[0];
  2517. dst[1] = inp_shape[1] * 4;
  2518. dst[2] = inp_shape[2];
  2519. dst[3] = inp_shape[3];
  2520. } else if (m_inp_format == TensorFormat::NCHW4 &&
  2521. m_out_format == TensorFormat::CHWN4) {
  2522. dst.ndim = 5;
  2523. dst[0] = inp_shape[1];
  2524. dst[1] = inp_shape[2];
  2525. dst[2] = inp_shape[3];
  2526. dst[3] = inp_shape[0];
  2527. dst[4] = inp_shape[4];
  2528. } else if (m_inp_format == TensorFormat::CHWN4 &&
  2529. m_out_format == TensorFormat::NCHW4) {
  2530. dst.ndim = 5;
  2531. dst[0] = inp_shape[3];
  2532. dst[1] = inp_shape[0];
  2533. dst[2] = inp_shape[1];
  2534. dst[3] = inp_shape[2];
  2535. dst[4] = inp_shape[4];
  2536. } else {
  2537. mgb_throw(InternalError,
  2538. "Unsupported input format and output format.");
  2539. }
  2540. return true;
  2541. };
  2542. mgr.register_shape_infer(output(0), {SourceType::DEP, deps, infer_shape});
  2543. }
  2544. ShuffleShuffleRemovePass::Impl::AbstractShuffleOpr::AbstractShuffleOpr(
  2545. VarNode* inpvar, TensorFormat inp_format, TensorFormat out_format)
  2546. : Super(inpvar->owner_graph(), {}, "AbstractShuffleOpr", {inpvar}),
  2547. m_inp_format{inp_format},
  2548. m_out_format{out_format} {
  2549. add_input({inpvar});
  2550. add_equivalence_component<ScalarHash<TensorFormat>>(m_inp_format);
  2551. add_equivalence_component<ScalarHash<TensorFormat>>(m_out_format);
  2552. add_output(None)->dtype(inpvar->dtype());
  2553. }
  2554. SymbolVar ShuffleShuffleRemovePass::Impl::AbstractShuffleOpr::make(
  2555. VarNode* inpvar, TensorFormat inp_format, TensorFormat out_format) {
  2556. return inpvar->owner_graph()
  2557. ->insert_opr(std::make_unique<AbstractShuffleOpr>(
  2558. inpvar, inp_format, out_format))
  2559. ->output(0);
  2560. }
  2561. void ShuffleShuffleRemovePass::Impl::detect_shuffle_operations() {
  2562. auto rewriter = m_opt_state.graph().make_rewriter();
  2563. auto uniq_reader_check = UniqReaderCheck{m_opt_state.graph()};
  2564. auto try_reshape_shuffle = [&rewriter,
  2565. &uniq_reader_check](OperatorNodeBase* opr) {
  2566. // check shuffle
  2567. auto shuffle = try_cast_as_op<opr::Dimshuffle>(opr);
  2568. if (shuffle == nullptr)
  2569. return false;
  2570. auto&& param = shuffle->param();
  2571. if (param.pattern_len != 5)
  2572. return false;
  2573. bool is_nchw2nchw4 = param.pattern[0] == 0 && param.pattern[1] == 1 &&
  2574. param.pattern[2] == 3 && param.pattern[3] == 4 &&
  2575. param.pattern[4] == 2 &&
  2576. opr->output(0)->shape()[4] == 4;
  2577. if (!is_nchw2nchw4)
  2578. return false;
  2579. if (!uniq_reader_check(shuffle->input(0)))
  2580. return false;
  2581. // check reshape
  2582. auto reshape = try_cast_as_op<opr::Reshape>(opr->input(0)->owner_opr());
  2583. if (reshape == nullptr)
  2584. return false;
  2585. auto inp_var = rewriter.get_var(reshape->input(0));
  2586. auto abstract_shuffle = AbstractShuffleOpr::make(
  2587. inp_var, TensorFormat::NCHW, TensorFormat::NCHW4);
  2588. rewriter.replace_var(
  2589. opr->output(0), abstract_shuffle.node(),
  2590. mgb_cstr_log("replace reformat(nchw -> nchw4) to "
  2591. "AbstractShuffleOpr(nchw -> nchw4)."));
  2592. return true;
  2593. };
  2594. auto try_reshape_shuffle_reshape = [&rewriter, &uniq_reader_check](
  2595. OperatorNodeBase* opr) {
  2596. // check reshape
  2597. auto reshape1 = try_cast_as_op<opr::Reshape>(opr);
  2598. if (reshape1 == nullptr)
  2599. return false;
  2600. if (!uniq_reader_check(reshape1->input(0)))
  2601. return false;
  2602. // check shuffle
  2603. auto shuffle =
  2604. try_cast_as_op<opr::Dimshuffle>(opr->input(0)->owner_opr());
  2605. if (shuffle == nullptr)
  2606. return false;
  2607. auto&& param = shuffle->param();
  2608. if (param.pattern_len != 6)
  2609. return false;
  2610. bool is_nchw42nchw32 = param.pattern[0] == 0 && param.pattern[1] == 1 &&
  2611. param.pattern[2] == 3 && param.pattern[3] == 4 &&
  2612. param.pattern[4] == 2 && param.pattern[5] == 5 &&
  2613. shuffle->input(0)->shape()[5] == 4 &&
  2614. shuffle->input(0)->shape()[2] == 8;
  2615. bool is_nchw322nchw4 = param.pattern[0] == 0 && param.pattern[1] == 1 &&
  2616. param.pattern[2] == 4 && param.pattern[3] == 2 &&
  2617. param.pattern[4] == 3 && param.pattern[5] == 5 &&
  2618. shuffle->input(0)->shape()[4] == 8 &&
  2619. shuffle->input(0)->shape()[5] == 4;
  2620. if (!is_nchw42nchw32 && !is_nchw322nchw4)
  2621. return false;
  2622. if (!uniq_reader_check(shuffle->input(0)))
  2623. return false;
  2624. // check reshape
  2625. auto reshape2 =
  2626. try_cast_as_op<opr::Reshape>(shuffle->input(0)->owner_opr());
  2627. if (reshape2 == nullptr)
  2628. return false;
  2629. auto inp_var = rewriter.get_var(reshape2->input(0));
  2630. TensorFormat inp_format = is_nchw42nchw32 ? TensorFormat::NCHW4
  2631. : TensorFormat::NCHW32,
  2632. out_format = is_nchw42nchw32 ? TensorFormat::NCHW32
  2633. : TensorFormat::NCHW4;
  2634. auto abstract_shuffle =
  2635. AbstractShuffleOpr::make(inp_var, inp_format, out_format);
  2636. std::string reformat_type =
  2637. is_nchw42nchw32 ? "nchw4 -> nchw32" : "nchw32 -> nchw4";
  2638. rewriter.replace_var(opr->output(0), abstract_shuffle.node(),
  2639. mgb_cstr_log(ssprintf("replace reformat(%s) to "
  2640. "AbstractShuffleOpr(%s).",
  2641. reformat_type.c_str(),
  2642. reformat_type.c_str())
  2643. .c_str()));
  2644. return true;
  2645. };
  2646. auto try_shuffle_reshape = [&rewriter,
  2647. &uniq_reader_check](OperatorNodeBase* opr) {
  2648. // check reshape
  2649. auto reshape = try_cast_as_op<opr::Reshape>(opr);
  2650. if (reshape == nullptr)
  2651. return false;
  2652. if (!uniq_reader_check(reshape->input(0)))
  2653. return false;
  2654. // check shuffle
  2655. auto shuffle =
  2656. try_cast_as_op<opr::Dimshuffle>(opr->input(0)->owner_opr());
  2657. if (shuffle == nullptr)
  2658. return false;
  2659. auto&& param = shuffle->param();
  2660. if (param.pattern_len != 5)
  2661. return false;
  2662. bool is_nchw42nchw = param.pattern[0] == 0 && param.pattern[1] == 1 &&
  2663. param.pattern[2] == 4 && param.pattern[3] == 2 &&
  2664. param.pattern[4] == 3 &&
  2665. shuffle->input(0)->shape()[4] == 4;
  2666. if (!is_nchw42nchw)
  2667. return false;
  2668. auto inp_var = rewriter.get_var(shuffle->input(0));
  2669. auto abstract_shuffle = AbstractShuffleOpr::make(
  2670. inp_var, TensorFormat::NCHW4, TensorFormat::NCHW);
  2671. rewriter.replace_var(
  2672. opr->output(0), abstract_shuffle.node(),
  2673. mgb_cstr_log("replace reformat(nchw4 -> nchw) to "
  2674. "AbstractShuffleOpr(nchw4 -> nchw)."));
  2675. return true;
  2676. };
  2677. auto try_relayout_format = [&rewriter](OperatorNodeBase* opr) {
  2678. // check relayout format
  2679. auto reformat = try_cast_as_op<opr::RelayoutFormat>(opr);
  2680. if (reformat == nullptr)
  2681. return false;
  2682. auto&& param = reformat->param();
  2683. if (param.mode != opr::RelayoutFormat::Param::Mode::CHWN4_NCHW4 &&
  2684. param.mode != opr::RelayoutFormat::Param::Mode::NCHW4_CHWN4)
  2685. return false;
  2686. auto inp_var = rewriter.get_var(reformat->input(0));
  2687. cg::SymbolVar abstract_shuffle;
  2688. if (param.mode == opr::RelayoutFormat::Param::Mode::NCHW4_CHWN4) {
  2689. abstract_shuffle = AbstractShuffleOpr::make(
  2690. inp_var, TensorFormat::NCHW4, TensorFormat::CHWN4);
  2691. } else {
  2692. abstract_shuffle = AbstractShuffleOpr::make(
  2693. inp_var, TensorFormat::CHWN4, TensorFormat::NCHW4);
  2694. }
  2695. rewriter.replace_var(
  2696. opr->output(0), abstract_shuffle.node(),
  2697. mgb_cstr_log("replace reformat(nchw4 -> nchw) to "
  2698. "AbstractShuffleOpr(nchw4 -> nchw)."));
  2699. return true;
  2700. };
  2701. auto on_opr = [&try_reshape_shuffle, &try_shuffle_reshape,
  2702. &try_reshape_shuffle_reshape, &try_relayout_format,
  2703. &rewriter, &uniq_reader_check](OperatorNodeBase* opr) {
  2704. if (!try_reshape_shuffle_reshape(opr) && !try_reshape_shuffle(opr) &&
  2705. !try_shuffle_reshape(opr) && !try_relayout_format(opr)) {
  2706. auto new_opr = rewriter.auto_replace_outputs(opr);
  2707. uniq_reader_check.update_on_opr_auto_replace(opr, new_opr);
  2708. }
  2709. };
  2710. m_opt_state.graph().iter(on_opr);
  2711. rewriter.apply_inplace();
  2712. }
  2713. void ShuffleShuffleRemovePass::Impl::do_replace() {
  2714. auto rewriter = m_opt_state.graph().make_rewriter();
  2715. auto uniq_reader_check = UniqReaderCheck{m_opt_state.graph()};
  2716. ThinHashMap<VarNode*, VarNode*> var2endpoint;
  2717. ThinHashSet<VarNode*> trt_opr_inps;
  2718. SmallVector<OperatorNodeBase*> topo_order;
  2719. auto cb = [&topo_order, &trt_opr_inps](OperatorNodeBase* opr) {
  2720. topo_order.push_back(opr);
  2721. MGB_MARK_USED_VAR(trt_opr_inps);
  2722. #if MGB_ENABLE_TENSOR_RT
  2723. if (opr->same_type<opr::TensorRTOpr>()) {
  2724. for (auto&& inp : opr->input())
  2725. trt_opr_inps.insert(inp);
  2726. }
  2727. #endif
  2728. };
  2729. m_opt_state.graph().iter(cb);
  2730. for (auto&& opr : reverse_adaptor(topo_order)) {
  2731. if (opr->same_type<opr::TypeCvt>() ||
  2732. opr->same_type<AbstractShuffleOpr>()) {
  2733. auto find = var2endpoint.find(opr->output(0));
  2734. if (find != var2endpoint.end()) {
  2735. if (uniq_reader_check(opr->output(0))) {
  2736. var2endpoint[opr->input(0)] = find->second;
  2737. } else {
  2738. var2endpoint[opr->input(0)] = opr->output(0);
  2739. }
  2740. } else {
  2741. var2endpoint[opr->input(0)] = opr->output(0);
  2742. }
  2743. }
  2744. }
  2745. auto on_opr = [this, &rewriter, &uniq_reader_check, &trt_opr_inps,
  2746. &var2endpoint](OperatorNodeBase* opr) {
  2747. MGB_MARK_USED_VAR(trt_opr_inps);
  2748. bool cond_opr = opr->same_type<opr::TypeCvt>() ||
  2749. opr->same_type<AbstractShuffleOpr>();
  2750. if (cond_opr) {
  2751. bool cond_endpoint = var2endpoint[opr->input(0)] == opr->output(0);
  2752. if (!cond_endpoint)
  2753. return;
  2754. auto cur = opr;
  2755. auto var = opr->output(0), inp_var = opr->input(0);
  2756. bool force_folding_typecvt = false;
  2757. bool first_shuffle = false;
  2758. // initialize inp_format and out_format
  2759. TensorFormat out_format = TensorFormat::NCHW, inp_format = out_format;
  2760. megdnn::DType inp_dtype = cur->input(0)->dtype(),
  2761. out_dtype = cur->output(0)->dtype();
  2762. SmallVector<megdnn::DType> out_dtype_vec;
  2763. while (cond_opr) {
  2764. if (cur->same_type<AbstractShuffleOpr>()) {
  2765. auto shuffle = try_cast_as_op<AbstractShuffleOpr>(cur);
  2766. inp_format = shuffle->inp_format();
  2767. if (!first_shuffle) {
  2768. out_format = shuffle->out_format();
  2769. first_shuffle = true;
  2770. }
  2771. } else {
  2772. mgb_assert(cur->same_type<opr::TypeCvt>());
  2773. out_dtype_vec.push_back(cur->output(0)->dtype());
  2774. }
  2775. inp_var = cur->input(0);
  2776. bool cond_reader = uniq_reader_check(inp_var);
  2777. if (!cond_reader)
  2778. break;
  2779. cur = cur->input(0)->owner_opr();
  2780. cond_opr = cur->same_type<opr::TypeCvt>() ||
  2781. cur->same_type<AbstractShuffleOpr>();
  2782. }
  2783. std::reverse(out_dtype_vec.begin(), out_dtype_vec.end());
  2784. #if MGB_ENABLE_TENSOR_RT
  2785. force_folding_typecvt =
  2786. inp_var->owner_opr()->same_type<opr::TensorRTOpr>() ||
  2787. trt_opr_inps.count(var);
  2788. #endif
  2789. auto new_var = rewriter.get_var(inp_var);
  2790. if (inp_format != out_format) {
  2791. new_var = m_reformat[std::make_pair(inp_format, out_format)](
  2792. new_var);
  2793. }
  2794. if (force_folding_typecvt) {
  2795. inp_dtype = inp_var->dtype();
  2796. if (inp_dtype != out_dtype) {
  2797. auto type_cvt = opr::TypeCvt::make(new_var, out_dtype);
  2798. new_var = type_cvt.node();
  2799. }
  2800. } else {
  2801. if (out_dtype_vec.back() != var->dtype())
  2802. out_dtype_vec.push_back(var->dtype());
  2803. for (auto&& dtype : out_dtype_vec) {
  2804. auto type_cvt = opr::TypeCvt::make(new_var, dtype);
  2805. new_var = type_cvt.node();
  2806. }
  2807. }
  2808. rewriter.replace_var(
  2809. var, new_var,
  2810. mgb_cstr_log("replace Dimshuffle and TypeCvt chain"));
  2811. } else {
  2812. auto new_opr = rewriter.auto_replace_outputs(opr);
  2813. uniq_reader_check.update_on_opr_auto_replace(opr, new_opr);
  2814. }
  2815. };
  2816. m_opt_state.graph().iter(on_opr);
  2817. rewriter.apply_inplace();
  2818. }
  2819. const char* ShuffleShuffleRemovePass::name() const {
  2820. return mgb_cstr_log("shuffle shuffle remove pass");
  2821. }
  2822. void ShuffleShuffleRemovePass::apply(OptState& opt) const {
  2823. opt.set_var_replace_check_flag(VarReplaceCheckFlag::CHECK_SHAPE |
  2824. VarReplaceCheckFlag::CHECK_DTYPE);
  2825. Impl{opt};
  2826. }
  2827. // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

MegEngine 安装包中集成了使用 GPU 运行代码所需的 CUDA 环境,不用区分 CPU 和 GPU 版。如果想要运行 GPU 程序,请确保机器本身配有 GPU 硬件设备并安装好驱动。如果你想体验在云端 GPU 算力平台进行深度学习开发的感觉,欢迎访问 MegStudio 平台。