You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

testutil.cpp 48 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806
  1. // Copyright 2019 Tencent
  2. // SPDX-License-Identifier: BSD-3-Clause
  3. #include "testutil.h"
  4. #include "cpu.h"
  5. #include "layer.h"
  6. #include "mat.h"
  7. #include "prng.h"
  8. #include <limits.h>
  9. #include <stdio.h>
  10. #include <stdlib.h>
  11. #if NCNN_VULKAN
  12. #include "command.h"
  13. #include "gpu.h"
  14. #endif // NCNN_VULKAN
  15. static struct prng_rand_t g_prng_rand_state;
  16. void SRAND(int seed)
  17. {
  18. prng_srand(seed, &g_prng_rand_state);
  19. }
  20. uint64_t RAND()
  21. {
  22. return prng_rand(&g_prng_rand_state);
  23. }
  24. float RandomFloat(float a, float b)
  25. {
  26. float random = ((float)RAND()) / (float)uint64_t(-1); //RAND_MAX;
  27. float diff = b - a;
  28. float r = random * diff;
  29. float v = a + r;
  30. // generate denormal as zero
  31. if (v < 0.0001 && v > -0.0001)
  32. v = 0.f;
  33. return v;
  34. }
  35. int RandomInt(int a, int b)
  36. {
  37. float random = ((float)RAND()) / (float)uint64_t(-1); //RAND_MAX;
  38. int diff = b - a;
  39. float r = random * diff;
  40. return a + (int)r;
  41. }
  42. signed char RandomS8()
  43. {
  44. return (signed char)RandomInt(-127, 127);
  45. }
  46. void Randomize(ncnn::Mat& m, float a, float b)
  47. {
  48. for (size_t i = 0; i < m.total(); i++)
  49. {
  50. m[i] = RandomFloat(a, b);
  51. }
  52. }
  53. void RandomizeInt(ncnn::Mat& m, int a, int b)
  54. {
  55. for (size_t i = 0; i < m.total(); i++)
  56. {
  57. ((int*)m)[i] = RandomInt(a, b);
  58. }
  59. }
  60. void RandomizeS8(ncnn::Mat& m)
  61. {
  62. for (size_t i = 0; i < m.total(); i++)
  63. {
  64. ((signed char*)m)[i] = RandomS8();
  65. }
  66. }
  67. ncnn::Mat RandomMat(int w, float a, float b)
  68. {
  69. ncnn::Mat m(w);
  70. Randomize(m, a, b);
  71. return m;
  72. }
  73. ncnn::Mat RandomMat(int w, int h, float a, float b)
  74. {
  75. ncnn::Mat m(w, h);
  76. Randomize(m, a, b);
  77. return m;
  78. }
  79. ncnn::Mat RandomMat(int w, int h, int c, float a, float b)
  80. {
  81. ncnn::Mat m(w, h, c);
  82. Randomize(m, a, b);
  83. return m;
  84. }
  85. ncnn::Mat RandomMat(int w, int h, int d, int c, float a, float b)
  86. {
  87. ncnn::Mat m(w, h, d, c);
  88. Randomize(m, a, b);
  89. return m;
  90. }
  91. ncnn::Mat RandomIntMat(int w)
  92. {
  93. ncnn::Mat m(w);
  94. RandomizeInt(m);
  95. return m;
  96. }
  97. ncnn::Mat RandomIntMat(int w, int h)
  98. {
  99. ncnn::Mat m(w, h);
  100. RandomizeInt(m);
  101. return m;
  102. }
  103. ncnn::Mat RandomIntMat(int w, int h, int c)
  104. {
  105. ncnn::Mat m(w, h, c);
  106. RandomizeInt(m);
  107. return m;
  108. }
  109. ncnn::Mat RandomIntMat(int w, int h, int d, int c)
  110. {
  111. ncnn::Mat m(w, h, d, c);
  112. RandomizeInt(m);
  113. return m;
  114. }
  115. ncnn::Mat RandomS8Mat(int w)
  116. {
  117. ncnn::Mat m(w, (size_t)1u);
  118. RandomizeS8(m);
  119. return m;
  120. }
  121. ncnn::Mat RandomS8Mat(int w, int h)
  122. {
  123. ncnn::Mat m(w, h, (size_t)1u);
  124. RandomizeS8(m);
  125. return m;
  126. }
  127. ncnn::Mat RandomS8Mat(int w, int h, int c)
  128. {
  129. ncnn::Mat m(w, h, c, (size_t)1u);
  130. RandomizeS8(m);
  131. return m;
  132. }
  133. ncnn::Mat RandomS8Mat(int w, int h, int d, int c)
  134. {
  135. ncnn::Mat m(w, h, d, c, (size_t)1u);
  136. RandomizeS8(m);
  137. return m;
  138. }
  139. ncnn::Mat scales_mat(const ncnn::Mat& mat, int m, int k, int ldx)
  140. {
  141. ncnn::Mat weight_scales(m);
  142. for (int i = 0; i < m; ++i)
  143. {
  144. float min = mat[0], _max = mat[0];
  145. const float* ptr = (const float*)(mat.data) + i * ldx;
  146. for (int j = 0; j < k; ++j)
  147. {
  148. if (min > ptr[j])
  149. {
  150. min = ptr[j];
  151. }
  152. if (_max < ptr[j])
  153. {
  154. _max = ptr[j];
  155. }
  156. }
  157. const float abs_min = abs(min), abs_max = abs(_max);
  158. weight_scales[i] = 127.f / (abs_min > abs_max ? abs_min : abs_max);
  159. }
  160. return weight_scales;
  161. }
  162. bool NearlyEqual(float a, float b, float epsilon)
  163. {
  164. if (a == b)
  165. return true;
  166. float diff = (float)fabs(a - b);
  167. if (diff <= epsilon)
  168. return true;
  169. // relative error
  170. return diff < epsilon * std::max(fabs(a), fabs(b));
  171. }
  172. int Compare(const ncnn::Mat& a, const ncnn::Mat& b, float epsilon)
  173. {
  174. #define CHECK_MEMBER(m) \
  175. if (a.m != b.m) \
  176. { \
  177. fprintf(stderr, #m " not match expect %d but got %d\n", (int)a.m, (int)b.m); \
  178. return -1; \
  179. }
  180. CHECK_MEMBER(dims)
  181. CHECK_MEMBER(w)
  182. CHECK_MEMBER(h)
  183. CHECK_MEMBER(d)
  184. CHECK_MEMBER(c)
  185. CHECK_MEMBER(elemsize)
  186. CHECK_MEMBER(elempack)
  187. #undef CHECK_MEMBER
  188. for (int q = 0; q < a.c; q++)
  189. {
  190. const ncnn::Mat ma = a.channel(q);
  191. const ncnn::Mat mb = b.channel(q);
  192. for (int z = 0; z < a.d; z++)
  193. {
  194. const ncnn::Mat da = ma.depth(z);
  195. const ncnn::Mat db = mb.depth(z);
  196. for (int i = 0; i < a.h; i++)
  197. {
  198. const float* pa = da.row(i);
  199. const float* pb = db.row(i);
  200. for (int j = 0; j < a.w; j++)
  201. {
  202. if (!NearlyEqual(pa[j], pb[j], epsilon))
  203. {
  204. fprintf(stderr, "value not match at c:%d d:%d h:%d w:%d expect %f but got %f\n", q, z, i, j, pa[j], pb[j]);
  205. return -1;
  206. }
  207. }
  208. }
  209. }
  210. }
  211. return 0;
  212. }
  213. int CompareMat(const ncnn::Mat& a, const ncnn::Mat& b, float epsilon)
  214. {
  215. ncnn::Option opt;
  216. opt.num_threads = 1;
  217. if (a.elempack != 1)
  218. {
  219. ncnn::Mat a1;
  220. ncnn::convert_packing(a, a1, 1, opt);
  221. return CompareMat(a1, b, epsilon);
  222. }
  223. if (b.elempack != 1)
  224. {
  225. ncnn::Mat b1;
  226. ncnn::convert_packing(b, b1, 1, opt);
  227. return CompareMat(a, b1, epsilon);
  228. }
  229. if (a.elemsize == 2u)
  230. {
  231. ncnn::Mat a32;
  232. cast_float16_to_float32(a, a32, opt);
  233. return CompareMat(a32, b, epsilon);
  234. }
  235. if (a.elemsize == 1u)
  236. {
  237. ncnn::Mat a32;
  238. cast_int8_to_float32(a, a32, opt);
  239. return CompareMat(a32, b, epsilon);
  240. }
  241. if (b.elemsize == 2u)
  242. {
  243. ncnn::Mat b32;
  244. cast_float16_to_float32(b, b32, opt);
  245. return CompareMat(a, b32, epsilon);
  246. }
  247. if (b.elemsize == 1u)
  248. {
  249. ncnn::Mat b32;
  250. cast_int8_to_float32(b, b32, opt);
  251. return CompareMat(a, b32, epsilon);
  252. }
  253. return Compare(a, b, epsilon);
  254. }
  255. int CompareMat(const std::vector<ncnn::Mat>& a, const std::vector<ncnn::Mat>& b, float epsilon)
  256. {
  257. if (a.size() != b.size())
  258. {
  259. fprintf(stderr, "output blob count not match %zu %zu\n", a.size(), b.size());
  260. return -1;
  261. }
  262. for (size_t i = 0; i < a.size(); i++)
  263. {
  264. if (CompareMat(a[i], b[i], epsilon))
  265. {
  266. fprintf(stderr, "output blob %zu not match\n", i);
  267. return -1;
  268. }
  269. }
  270. return 0;
  271. }
  272. static int convert_to_optimal_layout(const ncnn::Mat& a, ncnn::Mat& a4, const ncnn::Option& opt, const ncnn::Layer* op, int flag)
  273. {
  274. // clang-format off
  275. // *INDENT-OFF*
  276. #if NCNN_ARM82
  277. if (opt.use_fp16_storage && ncnn::cpu_support_arm_asimdhp() && op->support_fp16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING))
  278. {
  279. ncnn::cast_float32_to_float16(a, a4, opt);
  280. }
  281. else
  282. #endif // NCNN_ARM82
  283. #if NCNN_VFPV4
  284. if (opt.use_fp16_storage && !opt.use_bf16_storage && ncnn::cpu_support_arm_vfpv4() && op->support_fp16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING))
  285. {
  286. ncnn::cast_float32_to_float16(a, a4, opt);
  287. }
  288. else
  289. #endif // NCNN_VFPV4
  290. #if NCNN_ZFH
  291. if (opt.use_fp16_storage && (ncnn::cpu_support_riscv_zvfh() || (!ncnn::cpu_support_riscv_v() && ncnn::cpu_support_riscv_zfh())) && op->support_fp16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING))
  292. {
  293. ncnn::cast_float32_to_float16(a, a4, opt);
  294. }
  295. else
  296. #endif // NCNN_ZFH
  297. #if NCNN_BF16
  298. if (opt.use_bf16_storage && op->support_bf16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING))
  299. {
  300. ncnn::cast_float32_to_bfloat16(a, a4, opt);
  301. }
  302. else
  303. #endif // NCNN_BF16
  304. if (opt.use_fp16_storage && op->support_fp16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING))
  305. {
  306. ncnn::cast_float32_to_float16(a, a4, opt);
  307. }
  308. else
  309. {
  310. a4 = a;
  311. }
  312. // *INDENT-ON*
  313. // clang-format on
  314. if (opt.use_packing_layout && op->support_packing && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_PACKING))
  315. {
  316. // resolve dst_elempack
  317. int dims = a4.dims;
  318. int elemcount = 0;
  319. if (dims == 1) elemcount = a4.elempack * a4.w;
  320. if (dims == 2) elemcount = a4.elempack * a4.h;
  321. if (dims == 3 || dims == 4) elemcount = a4.elempack * a4.c;
  322. int elembits = a4.elembits();
  323. int dst_elempack = 1;
  324. if (elembits == 32)
  325. {
  326. #if NCNN_AVX512
  327. if (elemcount % 16 == 0 && ncnn::cpu_support_x86_avx512())
  328. dst_elempack = 16;
  329. else if (elemcount % 8 == 0 && ncnn::cpu_support_x86_avx())
  330. dst_elempack = 8;
  331. else if (elemcount % 4 == 0)
  332. dst_elempack = 4;
  333. #elif NCNN_AVX
  334. if (elemcount % 8 == 0 && ncnn::cpu_support_x86_avx())
  335. dst_elempack = 8;
  336. else if (elemcount % 4 == 0)
  337. dst_elempack = 4;
  338. #elif NCNN_RVV || NCNN_XTHEADVECTOR
  339. const int packn = ncnn::cpu_riscv_vlenb() / 4;
  340. if (elemcount % packn == 0)
  341. dst_elempack = packn;
  342. #else
  343. if (elemcount % 4 == 0)
  344. dst_elempack = 4;
  345. #endif
  346. }
  347. if (elembits == 16)
  348. {
  349. #if NCNN_ARM82
  350. if (elemcount % 8 == 0 && ncnn::cpu_support_arm_asimdhp() && opt.use_fp16_arithmetic && op->support_fp16_storage)
  351. dst_elempack = 8;
  352. else if (elemcount % 4 == 0)
  353. dst_elempack = 4;
  354. #elif NCNN_RVV || NCNN_XTHEADVECTOR
  355. const int packn = ncnn::cpu_riscv_vlenb() / 2;
  356. if (elemcount % packn == 0)
  357. dst_elempack = packn;
  358. #else
  359. if (elemcount % 4 == 0)
  360. dst_elempack = 4;
  361. #endif
  362. }
  363. if (elembits == 8)
  364. {
  365. #if NCNN_RVV || NCNN_XTHEADVECTOR
  366. const int packn = ncnn::cpu_riscv_vlenb() / 1;
  367. if (elemcount % packn == 0)
  368. dst_elempack = packn;
  369. #else
  370. if (elemcount % 8 == 0)
  371. dst_elempack = 8;
  372. #endif
  373. }
  374. if (flag & TEST_LAYER_ENABLE_FORCE_INPUT_PACK8)
  375. dst_elempack = 8;
  376. ncnn::Mat a4_packed;
  377. ncnn::convert_packing(a4, a4_packed, dst_elempack, opt);
  378. a4 = a4_packed;
  379. }
  380. return 0;
  381. }
  382. static int convert_to_vanilla_layout(const ncnn::Mat& c4, ncnn::Mat& c, const ncnn::Option& opt, const ncnn::Layer* op, int flag)
  383. {
  384. ncnn::Mat c4_unpacked;
  385. if (c4.elempack != 1)
  386. {
  387. ncnn::convert_packing(c4, c4_unpacked, 1, opt);
  388. }
  389. else
  390. {
  391. c4_unpacked = c4;
  392. }
  393. // clang-format off
  394. // *INDENT-OFF*
  395. #if NCNN_ARM82
  396. if (opt.use_fp16_storage && ncnn::cpu_support_arm_asimdhp() && op->support_fp16_storage && c4_unpacked.elembits() == 16)
  397. {
  398. ncnn::cast_float16_to_float32(c4_unpacked, c, opt);
  399. }
  400. else
  401. #endif // NCNN_ARM82
  402. #if NCNN_VFPV4
  403. if (opt.use_fp16_storage && !opt.use_bf16_storage && ncnn::cpu_support_arm_vfpv4() && op->support_fp16_storage && c4_unpacked.elembits() == 16)
  404. {
  405. ncnn::cast_float16_to_float32(c4_unpacked, c, opt);
  406. }
  407. else
  408. #endif // NCNN_VFPV4
  409. #if NCNN_ZFH
  410. if (opt.use_fp16_storage && (ncnn::cpu_support_riscv_zvfh() || (!ncnn::cpu_support_riscv_v() && ncnn::cpu_support_riscv_zfh())) && op->support_fp16_storage && c4_unpacked.elembits() == 16)
  411. {
  412. ncnn::cast_float16_to_float32(c4_unpacked, c, opt);
  413. }
  414. else
  415. #endif // NCNN_ZFH
  416. #if NCNN_BF16
  417. if (opt.use_bf16_storage && op->support_bf16_storage && c4_unpacked.elembits() == 16)
  418. {
  419. ncnn::cast_bfloat16_to_float32(c4_unpacked, c, opt);
  420. }
  421. else
  422. #endif // NCNN_BF16
  423. if (opt.use_fp16_storage && op->support_fp16_storage && c4_unpacked.elembits() == 16)
  424. {
  425. ncnn::cast_float16_to_float32(c4_unpacked, c, opt);
  426. }
  427. else
  428. {
  429. c = c4_unpacked;
  430. }
  431. // *INDENT-ON*
  432. // clang-format on
  433. return 0;
  434. }
  435. int test_layer_naive(int typeindex, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const std::vector<ncnn::Mat>& a, int top_blob_count, std::vector<ncnn::Mat>& b, void (*func)(ncnn::Layer*), int flag)
  436. {
  437. ncnn::Layer* op = ncnn::create_layer_naive(typeindex);
  438. if (func)
  439. {
  440. (*func)((ncnn::Layer*)op);
  441. }
  442. op->load_param(pd);
  443. if (op->one_blob_only && a.size() != 1)
  444. {
  445. fprintf(stderr, "layer with one_blob_only but consume multiple inputs\n");
  446. delete op;
  447. return -1;
  448. }
  449. ncnn::ModelBinFromMatArray mb(weights.data());
  450. op->load_model(mb);
  451. ncnn::Option opt;
  452. opt.num_threads = 1;
  453. opt.lightmode = false;
  454. opt.use_packing_layout = false;
  455. opt.use_fp16_packed = false;
  456. opt.use_fp16_storage = false;
  457. opt.use_fp16_arithmetic = false;
  458. opt.use_shader_pack8 = false;
  459. opt.use_bf16_storage = false;
  460. opt.use_vulkan_compute = false;
  461. op->create_pipeline(opt);
  462. b.resize(top_blob_count);
  463. if (op->support_inplace)
  464. {
  465. for (size_t i = 0; i < a.size(); i++)
  466. {
  467. b[i] = a[i].clone();
  468. }
  469. op->forward_inplace(b, opt);
  470. }
  471. else
  472. {
  473. op->forward(a, b, opt);
  474. }
  475. op->destroy_pipeline(opt);
  476. delete op;
  477. return 0;
  478. }
  479. int test_layer_cpu(int typeindex, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const ncnn::Option& _opt, const std::vector<ncnn::Mat>& a, int top_blob_count, std::vector<ncnn::Mat>& c, const std::vector<ncnn::Mat>& top_shapes, void (*func)(ncnn::Layer*), int flag)
  480. {
  481. ncnn::Layer* op = ncnn::create_layer_cpu(typeindex);
  482. if (!op->support_packing && _opt.use_packing_layout)
  483. {
  484. delete op;
  485. return 233;
  486. }
  487. if (!op->support_bf16_storage && !op->support_fp16_storage && (_opt.use_bf16_storage || _opt.use_fp16_arithmetic))
  488. {
  489. delete op;
  490. return 233;
  491. }
  492. if (func)
  493. {
  494. (*func)((ncnn::Layer*)op);
  495. }
  496. if (!top_shapes.empty())
  497. {
  498. op->bottom_shapes = a;
  499. op->top_shapes = top_shapes;
  500. }
  501. op->load_param(pd);
  502. if (op->one_blob_only && a.size() != 1)
  503. {
  504. fprintf(stderr, "layer with one_blob_only but consume multiple inputs\n");
  505. delete op;
  506. return -1;
  507. }
  508. ncnn::ModelBinFromMatArray mb(weights.data());
  509. op->load_model(mb);
  510. ncnn::Option opt = _opt;
  511. opt.num_threads = 1;
  512. opt.use_vulkan_compute = false;
  513. op->create_pipeline(opt);
  514. if (!op->support_packing && _opt.use_packing_layout)
  515. {
  516. op->destroy_pipeline(opt);
  517. delete op;
  518. return 233;
  519. }
  520. if (!op->support_bf16_storage && !op->support_fp16_storage && (_opt.use_bf16_storage || _opt.use_fp16_arithmetic))
  521. {
  522. op->destroy_pipeline(opt);
  523. delete op;
  524. return 233;
  525. }
  526. std::vector<ncnn::Mat> a4(a.size());
  527. for (size_t i = 0; i < a4.size(); i++)
  528. {
  529. convert_to_optimal_layout(a[i], a4[i], opt, op, flag);
  530. }
  531. c.resize(top_blob_count);
  532. if (op->support_inplace)
  533. {
  534. for (size_t i = 0; i < a4.size(); i++)
  535. {
  536. c[i] = a4[i].clone();
  537. }
  538. op->forward_inplace(c, opt);
  539. }
  540. else
  541. {
  542. op->forward(a4, c, opt);
  543. }
  544. for (size_t i = 0; i < c.size(); i++)
  545. {
  546. convert_to_vanilla_layout(c[i], c[i], opt, op, flag);
  547. }
  548. op->destroy_pipeline(opt);
  549. delete op;
  550. return 0;
  551. }
  552. #if NCNN_VULKAN
  553. int test_layer_gpu(int typeindex, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const ncnn::Option& _opt, const std::vector<ncnn::Mat>& a, int top_blob_count, std::vector<ncnn::Mat>& d, const std::vector<ncnn::Mat>& top_shapes, void (*func)(ncnn::Layer*), int flag)
  554. {
  555. if (!_opt.use_packing_layout)
  556. {
  557. // pack1 test is useless for gpu
  558. return 233;
  559. }
  560. ncnn::Layer* op = ncnn::create_layer_vulkan(typeindex);
  561. if (!op)
  562. {
  563. return 233;
  564. }
  565. op->load_param(pd);
  566. if (!op->support_vulkan)
  567. {
  568. delete op;
  569. return 233;
  570. }
  571. ncnn::VulkanDevice* vkdev = ncnn::get_gpu_device();
  572. op->vkdev = vkdev;
  573. if (func)
  574. {
  575. (*func)((ncnn::Layer*)op);
  576. }
  577. if (!top_shapes.empty())
  578. {
  579. op->bottom_shapes = a;
  580. op->top_shapes = top_shapes;
  581. }
  582. if (op->one_blob_only && a.size() != 1)
  583. {
  584. fprintf(stderr, "layer with one_blob_only but consume multiple inputs\n");
  585. delete op;
  586. return -1;
  587. }
  588. ncnn::ModelBinFromMatArray mb(weights.data());
  589. op->load_model(mb);
  590. ncnn::VkWeightAllocator g_weight_vkallocator(vkdev);
  591. ncnn::VkWeightStagingAllocator g_weight_staging_vkallocator(vkdev);
  592. ncnn::VkAllocator* blob_vkallocator = vkdev->acquire_blob_allocator();
  593. ncnn::VkAllocator* staging_vkallocator = vkdev->acquire_staging_allocator();
  594. ncnn::Option opt = _opt;
  595. opt.num_threads = 1;
  596. opt.use_vulkan_compute = true;
  597. opt.blob_vkallocator = blob_vkallocator;
  598. opt.workspace_vkallocator = blob_vkallocator;
  599. opt.staging_vkallocator = staging_vkallocator;
  600. if (!vkdev->info.support_fp16_packed()) opt.use_fp16_packed = false;
  601. if (!vkdev->info.support_fp16_storage()) opt.use_fp16_storage = false;
  602. if (!vkdev->info.support_fp16_uniform()) opt.use_fp16_uniform = false;
  603. if (!vkdev->info.support_fp16_arithmetic()) opt.use_fp16_arithmetic = false;
  604. if (!vkdev->info.support_int8_packed()) opt.use_int8_packed = false;
  605. if (!vkdev->info.support_int8_storage()) opt.use_int8_storage = false;
  606. if (!vkdev->info.support_int8_uniform()) opt.use_int8_uniform = false;
  607. if (!vkdev->info.support_int8_arithmetic()) opt.use_int8_arithmetic = false;
  608. if (!vkdev->info.support_cooperative_matrix()) opt.use_cooperative_matrix = false;
  609. if (!vkdev->info.support_subgroup_ops()) opt.use_subgroup_ops = false;
  610. // FIXME fp16a may produce large error
  611. opt.use_fp16_arithmetic = false;
  612. op->create_pipeline(opt);
  613. if (!op->support_vulkan)
  614. {
  615. op->destroy_pipeline(opt);
  616. delete op;
  617. return 233;
  618. }
  619. {
  620. ncnn::VkTransfer cmd(vkdev);
  621. ncnn::Option opt_upload = opt;
  622. opt_upload.blob_vkallocator = &g_weight_vkallocator;
  623. opt_upload.workspace_vkallocator = &g_weight_vkallocator;
  624. opt_upload.staging_vkallocator = &g_weight_staging_vkallocator;
  625. op->upload_model(cmd, opt_upload);
  626. cmd.submit_and_wait();
  627. }
  628. d.resize(top_blob_count);
  629. {
  630. // forward
  631. ncnn::VkCompute cmd(vkdev);
  632. {
  633. // upload
  634. std::vector<ncnn::VkMat> a_gpu(a.size());
  635. for (size_t i = 0; i < a_gpu.size(); i++)
  636. {
  637. if (flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING)
  638. {
  639. // resolve dst_elempack
  640. int dims = a[i].dims;
  641. int elemcount = 0;
  642. if (dims == 1) elemcount = a[i].elempack * a[i].w;
  643. if (dims == 2) elemcount = a[i].elempack * a[i].h;
  644. if (dims == 3 || dims == 4) elemcount = a[i].elempack * a[i].c;
  645. const int dst_elempack = (opt.use_shader_pack8 && elemcount % 8 == 0) ? 8 : elemcount % 4 == 0 ? 4 : 1;
  646. ncnn::Mat a4;
  647. ncnn::convert_packing(a[i], a4, dst_elempack, opt);
  648. ncnn::Option opt_upload = opt;
  649. opt_upload.use_fp16_packed = false;
  650. opt_upload.use_fp16_storage = false;
  651. opt_upload.use_int8_packed = false;
  652. opt_upload.use_int8_storage = false;
  653. cmd.record_clone(a4, a_gpu[i], opt_upload);
  654. }
  655. else
  656. {
  657. cmd.record_upload(a[i], a_gpu[i], opt);
  658. }
  659. }
  660. std::vector<ncnn::VkMat> d_gpu(top_blob_count);
  661. if (op->support_inplace)
  662. {
  663. op->forward_inplace(a_gpu, cmd, opt);
  664. d_gpu = a_gpu;
  665. }
  666. else
  667. {
  668. op->forward(a_gpu, d_gpu, cmd, opt);
  669. }
  670. // download
  671. for (size_t i = 0; i < d_gpu.size(); i++)
  672. {
  673. cmd.record_download(d_gpu[i], d[i], opt);
  674. }
  675. }
  676. cmd.submit_and_wait();
  677. }
  678. op->destroy_pipeline(opt);
  679. delete op;
  680. vkdev->reclaim_blob_allocator(blob_vkallocator);
  681. vkdev->reclaim_staging_allocator(staging_vkallocator);
  682. g_weight_vkallocator.clear();
  683. g_weight_staging_vkallocator.clear();
  684. return 0;
  685. }
  686. #endif // NCNN_VULKAN
  687. int test_layer(int typeindex, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const ncnn::Option& _opt, const std::vector<ncnn::Mat>& a, int top_blob_count, const std::vector<ncnn::Mat>& top_shapes, float epsilon, void (*func)(ncnn::Layer*), int flag)
  688. {
  689. // naive
  690. std::vector<ncnn::Mat> b;
  691. {
  692. int ret = test_layer_naive(typeindex, pd, weights, a, top_blob_count, b, func, flag);
  693. if (ret != 233 && ret != 0)
  694. {
  695. fprintf(stderr, "test_layer_naive failed\n");
  696. return -1;
  697. }
  698. }
  699. // cpu
  700. {
  701. std::vector<ncnn::Mat> c;
  702. int ret = test_layer_cpu(typeindex, pd, weights, _opt, a, top_blob_count, c, std::vector<ncnn::Mat>(), func, flag);
  703. if (ret != 233 && (ret != 0 || CompareMat(b, c, epsilon) != 0))
  704. {
  705. fprintf(stderr, "test_layer_cpu failed\n");
  706. return -1;
  707. }
  708. }
  709. // cpu shape hint
  710. {
  711. std::vector<ncnn::Mat> c;
  712. int ret = test_layer_cpu(typeindex, pd, weights, _opt, a, top_blob_count, c, b, func, flag);
  713. if (ret != 233 && (ret != 0 || CompareMat(b, c, epsilon) != 0))
  714. {
  715. fprintf(stderr, "test_layer_cpu failed with shape hint\n");
  716. return -1;
  717. }
  718. }
  719. #if NCNN_VULKAN
  720. // gpu
  721. if (!(flag & TEST_LAYER_DISABLE_GPU_TESTING))
  722. {
  723. std::vector<ncnn::Mat> d;
  724. int ret = test_layer_gpu(typeindex, pd, weights, _opt, a, top_blob_count, d, std::vector<ncnn::Mat>(), func, flag);
  725. if (ret != 233 && (ret != 0 || CompareMat(b, d, epsilon) != 0))
  726. {
  727. fprintf(stderr, "test_layer_gpu failed\n");
  728. return -1;
  729. }
  730. }
  731. // gpu shape hint
  732. if (!(flag & TEST_LAYER_DISABLE_GPU_TESTING))
  733. {
  734. std::vector<ncnn::Mat> d;
  735. int ret = test_layer_gpu(typeindex, pd, weights, _opt, a, top_blob_count, d, b, func, flag);
  736. if (ret != 233 && (ret != 0 || CompareMat(b, d, epsilon) != 0))
  737. {
  738. fprintf(stderr, "test_layer_gpu failed with shape hint\n");
  739. return -1;
  740. }
  741. }
  742. #endif // NCNN_VULKAN
  743. return 0;
  744. }
  745. int test_layer_naive(int typeindex, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const ncnn::Mat& a, ncnn::Mat& b, void (*func)(ncnn::Layer*), int flag)
  746. {
  747. ncnn::Layer* op = ncnn::create_layer_naive(typeindex);
  748. if (func)
  749. {
  750. (*func)((ncnn::Layer*)op);
  751. }
  752. op->load_param(pd);
  753. ncnn::ModelBinFromMatArray mb(weights.data());
  754. op->load_model(mb);
  755. ncnn::Option opt;
  756. opt.num_threads = 1;
  757. opt.lightmode = false;
  758. opt.use_packing_layout = false;
  759. opt.use_fp16_packed = false;
  760. opt.use_fp16_storage = false;
  761. opt.use_fp16_arithmetic = false;
  762. opt.use_shader_pack8 = false;
  763. opt.use_bf16_storage = false;
  764. opt.use_vulkan_compute = false;
  765. op->create_pipeline(opt);
  766. if (op->support_inplace)
  767. {
  768. b = a.clone();
  769. op->forward_inplace(b, opt);
  770. }
  771. else
  772. {
  773. op->forward(a, b, opt);
  774. }
  775. op->destroy_pipeline(opt);
  776. delete op;
  777. return 0;
  778. }
  779. int test_layer_cpu(int typeindex, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const ncnn::Option& _opt, const ncnn::Mat& a, ncnn::Mat& c, const ncnn::Mat& top_shape, void (*func)(ncnn::Layer*), int flag)
  780. {
  781. ncnn::Layer* op = ncnn::create_layer_cpu(typeindex);
  782. if (!op->support_packing && _opt.use_packing_layout)
  783. {
  784. delete op;
  785. return 233;
  786. }
  787. if (!op->support_bf16_storage && !op->support_fp16_storage && (_opt.use_bf16_storage || _opt.use_fp16_arithmetic))
  788. {
  789. delete op;
  790. return 233;
  791. }
  792. if (func)
  793. {
  794. (*func)((ncnn::Layer*)op);
  795. }
  796. if (top_shape.dims)
  797. {
  798. op->bottom_shapes.resize(1);
  799. op->top_shapes.resize(1);
  800. op->bottom_shapes[0] = a;
  801. op->top_shapes[0] = top_shape;
  802. }
  803. op->load_param(pd);
  804. ncnn::ModelBinFromMatArray mb(weights.data());
  805. op->load_model(mb);
  806. ncnn::Option opt = _opt;
  807. opt.num_threads = 1;
  808. opt.use_vulkan_compute = false;
  809. op->create_pipeline(opt);
  810. if (!op->support_packing && _opt.use_packing_layout)
  811. {
  812. op->destroy_pipeline(opt);
  813. delete op;
  814. return 233;
  815. }
  816. if (!op->support_bf16_storage && !op->support_fp16_storage && (_opt.use_bf16_storage || _opt.use_fp16_arithmetic))
  817. {
  818. op->destroy_pipeline(opt);
  819. delete op;
  820. return 233;
  821. }
  822. ncnn::Mat a4;
  823. convert_to_optimal_layout(a, a4, opt, op, flag);
  824. if (op->support_inplace)
  825. {
  826. c = a4.clone();
  827. op->forward_inplace(c, opt);
  828. }
  829. else
  830. {
  831. op->forward(a4, c, opt);
  832. }
  833. convert_to_vanilla_layout(c, c, opt, op, flag);
  834. op->destroy_pipeline(opt);
  835. delete op;
  836. return 0;
  837. }
  #if NCNN_VULKAN
  // Run a single-input layer through its Vulkan implementation.
  //
  //   typeindex  layer type index handed to ncnn::create_layer_vulkan
  //   pd         parameters passed to load_param
  //   weights    weight blobs served to load_model via ModelBinFromMatArray
  //   _opt       requested options; a copy is adjusted to what the device
  //              reports as supported, and fp16 arithmetic is always disabled
  //   a          input blob, uploaded to the device
  //   d          receives the downloaded output
  //   top_shape  optional shape hint; when top_shape.dims is non-zero, a and
  //              top_shape are stored as bottom_shapes[0] / top_shapes[0]
  //   func       optional hook invoked on the layer after vkdev is assigned
  //   flag       TEST_LAYER_DISABLE_AUTO_INPUT_CASTING selects the manual
  //              pack-and-clone upload path instead of record_upload
  //
  // Returns 233 when packing layout is not requested, when no Vulkan layer can
  // be created, or when the layer reports no Vulkan support (checked after
  // load_param and again after create_pipeline). Returns 0 otherwise.
  int test_layer_gpu(int typeindex, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const ncnn::Option& _opt, const ncnn::Mat& a, ncnn::Mat& d, const ncnn::Mat& top_shape, void (*func)(ncnn::Layer*), int flag)
  {
      if (!_opt.use_packing_layout)
      {
          // pack1 test is useless for gpu
          return 233;
      }

      ncnn::Layer* op = ncnn::create_layer_vulkan(typeindex);
      if (!op)
      {
          return 233;
      }

      op->load_param(pd);

      if (!op->support_vulkan)
      {
          delete op;
          return 233;
      }

      ncnn::VulkanDevice* vkdev = ncnn::get_gpu_device();

      op->vkdev = vkdev;

      if (func)
      {
          (*func)((ncnn::Layer*)op);
      }

      if (top_shape.dims)
      {
          op->bottom_shapes.resize(1);
          op->top_shapes.resize(1);
          op->bottom_shapes[0] = a;
          op->top_shapes[0] = top_shape;
      }

      ncnn::ModelBinFromMatArray mb(weights.data());

      op->load_model(mb);

      ncnn::VkWeightAllocator g_weight_vkallocator(vkdev);
      ncnn::VkWeightStagingAllocator g_weight_staging_vkallocator(vkdev);

      // these two are borrowed from the device and must be reclaimed on every
      // exit path below
      ncnn::VkAllocator* blob_vkallocator = vkdev->acquire_blob_allocator();
      ncnn::VkAllocator* staging_vkallocator = vkdev->acquire_staging_allocator();

      ncnn::Option opt = _opt;
      opt.num_threads = 1;
      opt.use_vulkan_compute = true;

      opt.blob_vkallocator = blob_vkallocator;
      opt.workspace_vkallocator = blob_vkallocator;
      opt.staging_vkallocator = staging_vkallocator;

      // drop every feature the device does not report as supported
      if (!vkdev->info.support_fp16_packed()) opt.use_fp16_packed = false;
      if (!vkdev->info.support_fp16_storage()) opt.use_fp16_storage = false;
      if (!vkdev->info.support_fp16_uniform()) opt.use_fp16_uniform = false;
      if (!vkdev->info.support_fp16_arithmetic()) opt.use_fp16_arithmetic = false;
      if (!vkdev->info.support_int8_packed()) opt.use_int8_packed = false;
      if (!vkdev->info.support_int8_storage()) opt.use_int8_storage = false;
      if (!vkdev->info.support_int8_uniform()) opt.use_int8_uniform = false;
      if (!vkdev->info.support_int8_arithmetic()) opt.use_int8_arithmetic = false;
      if (!vkdev->info.support_cooperative_matrix()) opt.use_cooperative_matrix = false;
      if (!vkdev->info.support_subgroup_ops()) opt.use_subgroup_ops = false;

      // FIXME fp16a may produce large error
      opt.use_fp16_arithmetic = false;

      op->create_pipeline(opt);

      if (!op->support_vulkan)
      {
          op->destroy_pipeline(opt);
          delete op;

          // hand the borrowed allocators back before skipping; previously this
          // path returned without reclaiming them
          vkdev->reclaim_blob_allocator(blob_vkallocator);
          vkdev->reclaim_staging_allocator(staging_vkallocator);
          return 233;
      }

      {
          // upload the weights through the dedicated weight allocators
          ncnn::VkTransfer cmd(vkdev);

          ncnn::Option opt_upload = opt;
          opt_upload.blob_vkallocator = &g_weight_vkallocator;
          opt_upload.workspace_vkallocator = &g_weight_vkallocator;
          opt_upload.staging_vkallocator = &g_weight_staging_vkallocator;

          op->upload_model(cmd, opt_upload);

          cmd.submit_and_wait();
      }

      {
          // forward
          ncnn::VkCompute cmd(vkdev);

          {
              // upload
              ncnn::VkMat a_gpu;
              if (flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING)
              {
                  // resolve dst_elempack
                  int dims = a.dims;
                  int elemcount = 0;
                  if (dims == 1) elemcount = a.elempack * a.w;
                  if (dims == 2) elemcount = a.elempack * a.h;
                  if (dims == 3 || dims == 4) elemcount = a.elempack * a.c;

                  const int dst_elempack = (opt.use_shader_pack8 && elemcount % 8 == 0) ? 8 : elemcount % 4 == 0 ? 4 : 1;

                  ncnn::Mat a4;
                  ncnn::convert_packing(a, a4, dst_elempack, opt);

                  // clone with the fp16 / int8 storage and packed options off
                  ncnn::Option opt_upload = opt;
                  opt_upload.use_fp16_packed = false;
                  opt_upload.use_fp16_storage = false;
                  opt_upload.use_int8_packed = false;
                  opt_upload.use_int8_storage = false;

                  cmd.record_clone(a4, a_gpu, opt_upload);
              }
              else
              {
                  cmd.record_upload(a, a_gpu, opt);
              }

              ncnn::VkMat d_gpu;
              if (op->support_inplace)
              {
                  op->forward_inplace(a_gpu, cmd, opt);

                  d_gpu = a_gpu;
              }
              else
              {
                  op->forward(a_gpu, d_gpu, cmd, opt);
              }

              // download
              cmd.record_download(d_gpu, d, opt);
          }

          cmd.submit_and_wait();
      }

      op->destroy_pipeline(opt);

      delete op;

      vkdev->reclaim_blob_allocator(blob_vkallocator);
      vkdev->reclaim_staging_allocator(staging_vkallocator);
      g_weight_vkallocator.clear();
      g_weight_staging_vkallocator.clear();

      return 0;
  }
  #endif // NCNN_VULKAN
  962. int test_layer(int typeindex, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const ncnn::Option& _opt, const ncnn::Mat& a, const ncnn::Mat& top_shape, float epsilon, void (*func)(ncnn::Layer*), int flag)
  963. {
  964. // naive
  965. ncnn::Mat b;
  966. {
  967. int ret = test_layer_naive(typeindex, pd, weights, a, b, func, flag);
  968. if (ret != 233 && ret != 0)
  969. {
  970. fprintf(stderr, "test_layer_naive failed\n");
  971. return -1;
  972. }
  973. }
  974. // cpu
  975. {
  976. ncnn::Mat c;
  977. int ret = test_layer_cpu(typeindex, pd, weights, _opt, a, c, ncnn::Mat(), func, flag);
  978. if (ret != 233 && (ret != 0 || CompareMat(b, c, epsilon) != 0))
  979. {
  980. fprintf(stderr, "test_layer_cpu failed\n");
  981. return -1;
  982. }
  983. }
  984. // cpu shape hint
  985. {
  986. ncnn::Mat c;
  987. int ret = test_layer_cpu(typeindex, pd, weights, _opt, a, c, b, func, flag);
  988. if (ret != 233 && (ret != 0 || CompareMat(b, c, epsilon) != 0))
  989. {
  990. fprintf(stderr, "test_layer_cpu failed with shape hint\n");
  991. return -1;
  992. }
  993. }
  994. #if NCNN_VULKAN
  995. // gpu
  996. if (!(flag & TEST_LAYER_DISABLE_GPU_TESTING))
  997. {
  998. ncnn::Mat d;
  999. int ret = test_layer_gpu(typeindex, pd, weights, _opt, a, d, ncnn::Mat(), func, flag);
  1000. if (ret != 233 && (ret != 0 || CompareMat(b, d, epsilon) != 0))
  1001. {
  1002. fprintf(stderr, "test_layer_gpu failed\n");
  1003. return -1;
  1004. }
  1005. }
  1006. // gpu shape hint
  1007. if (!(flag & TEST_LAYER_DISABLE_GPU_TESTING))
  1008. {
  1009. ncnn::Mat d;
  1010. int ret = test_layer_gpu(typeindex, pd, weights, _opt, a, d, b, func, flag);
  1011. if (ret != 233 && (ret != 0 || CompareMat(b, d, epsilon) != 0))
  1012. {
  1013. fprintf(stderr, "test_layer_gpu failed with shape hint\n");
  1014. return -1;
  1015. }
  1016. }
  1017. #endif // NCNN_VULKAN
  1018. return 0;
  1019. }
  1020. int test_layer_opt(const char* layer_type, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const ncnn::Option& opt, const std::vector<ncnn::Mat>& a, int top_blob_count, float epsilon, void (*func)(ncnn::Layer*), int flag)
  1021. {
  1022. // fp16 representation
  1023. std::vector<ncnn::Mat> a_fp16;
  1024. if (opt.use_bf16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING))
  1025. {
  1026. a_fp16.resize(a.size());
  1027. for (size_t j = 0; j < a.size(); j++)
  1028. {
  1029. ncnn::Mat tmp;
  1030. ncnn::cast_float32_to_bfloat16(a[j], tmp, opt);
  1031. ncnn::cast_bfloat16_to_float32(tmp, a_fp16[j], opt);
  1032. }
  1033. }
  1034. else if ((opt.use_fp16_packed || opt.use_fp16_storage) && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING))
  1035. {
  1036. a_fp16.resize(a.size());
  1037. for (size_t j = 0; j < a.size(); j++)
  1038. {
  1039. ncnn::Mat tmp;
  1040. ncnn::cast_float32_to_float16(a[j], tmp, opt);
  1041. ncnn::cast_float16_to_float32(tmp, a_fp16[j], opt);
  1042. }
  1043. }
  1044. else
  1045. {
  1046. a_fp16 = a;
  1047. }
  1048. std::vector<ncnn::Mat> weights_fp16;
  1049. float epsilon_fp16;
  1050. if (opt.use_bf16_storage)
  1051. {
  1052. weights_fp16.resize(weights.size());
  1053. for (size_t j = 0; j < weights.size(); j++)
  1054. {
  1055. if (weights[j].elembits() != 32)
  1056. {
  1057. weights_fp16[j] = weights[j];
  1058. continue;
  1059. }
  1060. ncnn::Mat tmp;
  1061. ncnn::cast_float32_to_bfloat16(weights[j], tmp, opt);
  1062. ncnn::cast_bfloat16_to_float32(tmp, weights_fp16[j], opt);
  1063. }
  1064. epsilon_fp16 = epsilon * 100; // 0.1
  1065. }
  1066. else if (opt.use_fp16_packed || opt.use_fp16_storage)
  1067. {
  1068. weights_fp16.resize(weights.size());
  1069. for (size_t j = 0; j < weights.size(); j++)
  1070. {
  1071. if (weights[j].elembits() != 32)
  1072. {
  1073. weights_fp16[j] = weights[j];
  1074. continue;
  1075. }
  1076. ncnn::Mat tmp;
  1077. ncnn::cast_float32_to_float16(weights[j], tmp, opt);
  1078. ncnn::cast_float16_to_float32(tmp, weights_fp16[j], opt);
  1079. }
  1080. epsilon_fp16 = epsilon * 100; // 0.1
  1081. }
  1082. else
  1083. {
  1084. weights_fp16 = weights;
  1085. epsilon_fp16 = epsilon;
  1086. }
  1087. if (opt.use_fp16_arithmetic)
  1088. {
  1089. epsilon_fp16 = epsilon * 1000; // 1.0
  1090. }
  1091. std::vector<ncnn::Mat> top_shapes;
  1092. int ret = test_layer(ncnn::layer_to_index(layer_type), pd, weights_fp16, opt, a_fp16, top_blob_count, top_shapes, epsilon_fp16, func, flag);
  1093. if (ret != 0)
  1094. {
  1095. fprintf(stderr, "test_layer %s failed use_packing_layout=%d use_fp16_packed=%d use_fp16_storage=%d use_fp16_arithmetic=%d use_shader_pack8=%d use_bf16_storage=%d use_sgemm_convolution=%d use_winograd_convolution=%d\n", layer_type, opt.use_packing_layout, opt.use_fp16_packed, opt.use_fp16_storage, opt.use_fp16_arithmetic, opt.use_shader_pack8, opt.use_bf16_storage, opt.use_sgemm_convolution, opt.use_winograd_convolution);
  1096. return ret;
  1097. }
  1098. return 0;
  1099. }
  1100. int test_layer_opt(const char* layer_type, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const ncnn::Option& opt, const ncnn::Mat& a, float epsilon, void (*func)(ncnn::Layer*), int flag)
  1101. {
  1102. // fp16 representation
  1103. ncnn::Mat a_fp16;
  1104. if (opt.use_bf16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING))
  1105. {
  1106. ncnn::Mat tmp;
  1107. ncnn::cast_float32_to_bfloat16(a, tmp, opt);
  1108. ncnn::cast_bfloat16_to_float32(tmp, a_fp16, opt);
  1109. }
  1110. else if ((opt.use_fp16_packed || opt.use_fp16_storage) && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING))
  1111. {
  1112. ncnn::Mat tmp;
  1113. ncnn::cast_float32_to_float16(a, tmp, opt);
  1114. ncnn::cast_float16_to_float32(tmp, a_fp16, opt);
  1115. }
  1116. else
  1117. {
  1118. a_fp16 = a;
  1119. }
  1120. std::vector<ncnn::Mat> weights_fp16;
  1121. float epsilon_fp16;
  1122. if (opt.use_bf16_storage)
  1123. {
  1124. weights_fp16.resize(weights.size());
  1125. for (size_t j = 0; j < weights.size(); j++)
  1126. {
  1127. if (weights[j].elembits() != 32)
  1128. {
  1129. weights_fp16[j] = weights[j];
  1130. continue;
  1131. }
  1132. ncnn::Mat tmp;
  1133. ncnn::cast_float32_to_bfloat16(weights[j], tmp, opt);
  1134. ncnn::cast_bfloat16_to_float32(tmp, weights_fp16[j], opt);
  1135. }
  1136. epsilon_fp16 = epsilon * 100; // 0.1
  1137. }
  1138. else if (opt.use_fp16_packed || opt.use_fp16_storage)
  1139. {
  1140. weights_fp16.resize(weights.size());
  1141. for (size_t j = 0; j < weights.size(); j++)
  1142. {
  1143. if (weights[j].elembits() != 32)
  1144. {
  1145. weights_fp16[j] = weights[j];
  1146. continue;
  1147. }
  1148. ncnn::Mat tmp;
  1149. ncnn::cast_float32_to_float16(weights[j], tmp, opt);
  1150. ncnn::cast_float16_to_float32(tmp, weights_fp16[j], opt);
  1151. }
  1152. epsilon_fp16 = epsilon * 100; // 0.1
  1153. }
  1154. else
  1155. {
  1156. weights_fp16 = weights;
  1157. epsilon_fp16 = epsilon;
  1158. }
  1159. if (opt.use_fp16_arithmetic)
  1160. {
  1161. epsilon_fp16 = epsilon * 1000; // 1.0
  1162. }
  1163. ncnn::Mat top_shape;
  1164. int ret = test_layer(ncnn::layer_to_index(layer_type), pd, weights_fp16, opt, a_fp16, top_shape, epsilon_fp16, func, flag);
  1165. if (ret != 0)
  1166. {
  1167. fprintf(stderr, "test_layer %s failed use_packing_layout=%d use_fp16_packed=%d use_fp16_storage=%d use_fp16_arithmetic=%d use_shader_pack8=%d use_bf16_storage=%d use_sgemm_convolution=%d use_winograd_convolution=%d\n", layer_type, opt.use_packing_layout, opt.use_fp16_packed, opt.use_fp16_storage, opt.use_fp16_arithmetic, opt.use_shader_pack8, opt.use_bf16_storage, opt.use_sgemm_convolution, opt.use_winograd_convolution);
  1168. return ret;
  1169. }
  1170. return 0;
  1171. }
  1172. int test_layer(const char* layer_type, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const std::vector<ncnn::Mat>& a, int top_blob_count, float epsilon, void (*func)(ncnn::Layer*), int flag)
  1173. {
  1174. // pack fp16p fp16s fp16a bf16s shader8
  1175. const int options[][6] = {
  1176. {0, 0, 0, 0, 0, 0},
  1177. {0, 0, 1, 0, 0, 0},
  1178. {0, 0, 1, 1, 1, 0},
  1179. {1, 0, 0, 0, 0, 0},
  1180. {1, 1, 0, 0, 1, 0},
  1181. {1, 0, 1, 0, 0, 1},
  1182. {1, 1, 1, 1, 0, 0},
  1183. {1, 1, 1, 1, 1, 1},
  1184. };
  1185. const int opt_count = sizeof(options) / sizeof(options[0]);
  1186. for (int i = 0; i < opt_count; i++)
  1187. {
  1188. ncnn::Option opt;
  1189. opt.num_threads = 1;
  1190. opt.use_packing_layout = options[i][0];
  1191. opt.use_fp16_packed = options[i][1];
  1192. opt.use_fp16_storage = options[i][2];
  1193. opt.use_fp16_arithmetic = options[i][3];
  1194. opt.use_bf16_storage = options[i][4];
  1195. opt.use_shader_pack8 = options[i][5];
  1196. int ret = test_layer_opt(layer_type, pd, weights, opt, a, top_blob_count, epsilon, func, flag);
  1197. if (ret != 0)
  1198. return ret;
  1199. }
  1200. return 0;
  1201. }
  1202. int test_layer(const char* layer_type, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const ncnn::Mat& a, float epsilon, void (*func)(ncnn::Layer*), int flag)
  1203. {
  1204. // pack fp16p fp16s fp16a bf16s shader8
  1205. const int options[][6] = {
  1206. {0, 0, 0, 0, 0, 0},
  1207. {0, 0, 1, 0, 0, 0},
  1208. {0, 0, 1, 1, 1, 0},
  1209. {1, 0, 0, 0, 0, 0},
  1210. {1, 1, 0, 0, 1, 0},
  1211. {1, 0, 1, 0, 0, 1},
  1212. {1, 1, 1, 1, 0, 0},
  1213. {1, 1, 1, 1, 1, 1},
  1214. };
  1215. const int opt_count = sizeof(options) / sizeof(options[0]);
  1216. for (int i = 0; i < opt_count; i++)
  1217. {
  1218. ncnn::Option opt;
  1219. opt.num_threads = 1;
  1220. opt.use_packing_layout = options[i][0];
  1221. opt.use_fp16_packed = options[i][1];
  1222. opt.use_fp16_storage = options[i][2];
  1223. opt.use_fp16_arithmetic = options[i][3];
  1224. opt.use_bf16_storage = options[i][4];
  1225. opt.use_shader_pack8 = options[i][5];
  1226. int ret = test_layer_opt(layer_type, pd, weights, opt, a, epsilon, func, flag);
  1227. if (ret != 0)
  1228. return ret;
  1229. }
  1230. return 0;
  1231. }
  1232. class TestOOMAllocator : public ncnn::UnlockedPoolAllocator
  1233. {
  1234. public:
  1235. TestOOMAllocator();
  1236. virtual void* fastMalloc(size_t size);
  1237. virtual void fastFree(void* ptr);
  1238. ncnn::Mutex lock;
  1239. int counter;
  1240. int failid;
  1241. };
  1242. TestOOMAllocator::TestOOMAllocator()
  1243. {
  1244. counter = 0;
  1245. failid = INT_MAX;
  1246. }
  1247. void* TestOOMAllocator::fastMalloc(size_t size)
  1248. {
  1249. lock.lock();
  1250. void* ptr;
  1251. if (counter == failid)
  1252. {
  1253. ptr = 0;
  1254. }
  1255. else
  1256. {
  1257. ptr = ncnn::UnlockedPoolAllocator::fastMalloc(size);
  1258. }
  1259. counter++;
  1260. lock.unlock();
  1261. return ptr;
  1262. }
  1263. void TestOOMAllocator::fastFree(void* ptr)
  1264. {
  1265. lock.lock();
  1266. ncnn::UnlockedPoolAllocator::fastFree(ptr);
  1267. lock.unlock();
  1268. }
  1269. int test_layer_oom_opt(const char* layer_type, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const ncnn::Option& _opt, const std::vector<ncnn::Mat>& a, int top_blob_count, int flag)
  1270. {
  1271. int typeindex = ncnn::layer_to_index(layer_type);
  1272. if (typeindex == -1)
  1273. return -1;
  1274. ncnn::Layer* op = ncnn::create_layer_cpu(typeindex);
  1275. if (!op->support_packing && _opt.use_packing_layout)
  1276. {
  1277. delete op;
  1278. return 233;
  1279. }
  1280. if (!op->support_bf16_storage && !op->support_fp16_storage && (_opt.use_bf16_storage || _opt.use_fp16_arithmetic))
  1281. {
  1282. delete op;
  1283. return 233;
  1284. }
  1285. op->load_param(pd);
  1286. if (op->one_blob_only && a.size() != 1)
  1287. {
  1288. fprintf(stderr, "layer with one_blob_only but consume multiple inputs\n");
  1289. delete op;
  1290. return -1;
  1291. }
  1292. ncnn::ModelBinFromMatArray mb(weights.data());
  1293. op->load_model(mb);
  1294. ncnn::Option opt = _opt;
  1295. opt.num_threads = 1;
  1296. opt.use_vulkan_compute = false;
  1297. op->create_pipeline(opt);
  1298. if (!op->support_packing && _opt.use_packing_layout)
  1299. {
  1300. op->destroy_pipeline(opt);
  1301. delete op;
  1302. return 233;
  1303. }
  1304. if (!op->support_bf16_storage && !op->support_fp16_storage && (_opt.use_bf16_storage || _opt.use_fp16_arithmetic))
  1305. {
  1306. op->destroy_pipeline(opt);
  1307. delete op;
  1308. return 233;
  1309. }
  1310. std::vector<ncnn::Mat> a4(a.size());
  1311. for (size_t i = 0; i < a4.size(); i++)
  1312. {
  1313. convert_to_optimal_layout(a[i], a4[i], opt, op, flag);
  1314. }
  1315. TestOOMAllocator test_oom_allocator;
  1316. opt.blob_allocator = &test_oom_allocator;
  1317. opt.workspace_allocator = &test_oom_allocator;
  1318. std::vector<ncnn::Mat> c;
  1319. c.resize(top_blob_count);
  1320. if (op->support_inplace)
  1321. {
  1322. for (size_t i = 0; i < a4.size(); i++)
  1323. {
  1324. c[i] = a4[i].clone();
  1325. }
  1326. op->forward_inplace(c, opt);
  1327. }
  1328. else
  1329. {
  1330. op->forward(a4, c, opt);
  1331. }
  1332. for (int i = 0; i < top_blob_count; i++)
  1333. {
  1334. c[i].release();
  1335. }
  1336. const int alloc_count = test_oom_allocator.counter;
  1337. for (int i = 0; i < alloc_count; i++)
  1338. {
  1339. test_oom_allocator.counter = 0;
  1340. test_oom_allocator.failid = i;
  1341. int ret = 0;
  1342. if (op->support_inplace)
  1343. {
  1344. for (size_t i = 0; i < a4.size(); i++)
  1345. {
  1346. c[i] = a4[i].clone();
  1347. }
  1348. ret = op->forward_inplace(c, opt);
  1349. }
  1350. else
  1351. {
  1352. ret = op->forward(a4, c, opt);
  1353. }
  1354. for (int i = 0; i < top_blob_count; i++)
  1355. {
  1356. c[i].release();
  1357. }
  1358. if (ret != -100)
  1359. {
  1360. fprintf(stderr, "oom not catched %d/%d\n", i, alloc_count);
  1361. op->destroy_pipeline(opt);
  1362. delete op;
  1363. return -1;
  1364. }
  1365. }
  1366. op->destroy_pipeline(opt);
  1367. delete op;
  1368. return 0;
  1369. }
  1370. int test_layer_oom_opt(const char* layer_type, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const ncnn::Option& _opt, const ncnn::Mat& a, int flag)
  1371. {
  1372. int typeindex = ncnn::layer_to_index(layer_type);
  1373. if (typeindex == -1)
  1374. return -1;
  1375. ncnn::Layer* op = ncnn::create_layer_cpu(typeindex);
  1376. if (!op->support_packing && _opt.use_packing_layout)
  1377. {
  1378. delete op;
  1379. return 233;
  1380. }
  1381. if (!op->support_bf16_storage && !op->support_fp16_storage && (_opt.use_bf16_storage || _opt.use_fp16_arithmetic))
  1382. {
  1383. delete op;
  1384. return 233;
  1385. }
  1386. op->load_param(pd);
  1387. ncnn::ModelBinFromMatArray mb(weights.data());
  1388. op->load_model(mb);
  1389. ncnn::Option opt = _opt;
  1390. opt.num_threads = 1;
  1391. opt.use_vulkan_compute = false;
  1392. op->create_pipeline(opt);
  1393. if (!op->support_packing && _opt.use_packing_layout)
  1394. {
  1395. op->destroy_pipeline(opt);
  1396. delete op;
  1397. return 233;
  1398. }
  1399. if (!op->support_bf16_storage && !op->support_fp16_storage && (_opt.use_bf16_storage || _opt.use_fp16_arithmetic))
  1400. {
  1401. op->destroy_pipeline(opt);
  1402. delete op;
  1403. return 233;
  1404. }
  1405. ncnn::Mat a4;
  1406. convert_to_optimal_layout(a, a4, opt, op, flag);
  1407. TestOOMAllocator test_oom_allocator;
  1408. opt.blob_allocator = &test_oom_allocator;
  1409. opt.workspace_allocator = &test_oom_allocator;
  1410. ncnn::Mat c;
  1411. if (op->support_inplace)
  1412. {
  1413. c = a4.clone();
  1414. op->forward_inplace(c, opt);
  1415. }
  1416. else
  1417. {
  1418. op->forward(a4, c, opt);
  1419. }
  1420. c.release();
  1421. const int alloc_count = test_oom_allocator.counter;
  1422. for (int i = 0; i < alloc_count; i++)
  1423. {
  1424. test_oom_allocator.counter = 0;
  1425. test_oom_allocator.failid = i;
  1426. int ret = 0;
  1427. if (op->support_inplace)
  1428. {
  1429. c = a4.clone();
  1430. ret = op->forward_inplace(c, opt);
  1431. }
  1432. else
  1433. {
  1434. ret = op->forward(a4, c, opt);
  1435. }
  1436. c.release();
  1437. if (ret != -100)
  1438. {
  1439. fprintf(stderr, "oom not catched %d/%d\n", i, alloc_count);
  1440. op->destroy_pipeline(opt);
  1441. delete op;
  1442. return -1;
  1443. }
  1444. }
  1445. op->destroy_pipeline(opt);
  1446. delete op;
  1447. return 0;
  1448. }
  1449. int test_layer_oom(const char* layer_type, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const std::vector<ncnn::Mat>& a, int top_blob_count, int flag)
  1450. {
  1451. // pack fp16p fp16s fp16a bf16s shader8
  1452. const int options[][6] = {
  1453. {0, 0, 0, 0, 0, 0},
  1454. {0, 0, 1, 0, 0, 0},
  1455. {0, 0, 1, 1, 1, 0},
  1456. {1, 0, 0, 0, 0, 0},
  1457. {1, 1, 0, 0, 1, 0},
  1458. {1, 0, 1, 0, 0, 1},
  1459. {1, 1, 1, 1, 0, 0},
  1460. {1, 1, 1, 1, 1, 1},
  1461. };
  1462. const int opt_count = sizeof(options) / sizeof(options[0]);
  1463. for (int i = 0; i < opt_count; i++)
  1464. {
  1465. ncnn::Option opt;
  1466. opt.num_threads = 1;
  1467. opt.use_packing_layout = options[i][0];
  1468. opt.use_fp16_packed = options[i][1];
  1469. opt.use_fp16_storage = options[i][2];
  1470. opt.use_fp16_arithmetic = options[i][3];
  1471. opt.use_bf16_storage = options[i][4];
  1472. opt.use_shader_pack8 = options[i][5];
  1473. int ret = test_layer_oom_opt(layer_type, pd, weights, opt, a, top_blob_count, flag);
  1474. if (ret != 233 && ret != 0)
  1475. return ret;
  1476. }
  1477. return 0;
  1478. }
  1479. int test_layer_oom(const char* layer_type, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const ncnn::Mat& a, int flag)
  1480. {
  1481. // pack fp16p fp16s fp16a bf16s shader8
  1482. const int options[][6] = {
  1483. {0, 0, 0, 0, 0, 0},
  1484. {0, 0, 1, 0, 0, 0},
  1485. {0, 0, 1, 1, 1, 0},
  1486. {1, 0, 0, 0, 0, 0},
  1487. {1, 1, 0, 0, 1, 0},
  1488. {1, 0, 1, 0, 0, 1},
  1489. {1, 1, 1, 1, 0, 0},
  1490. {1, 1, 1, 1, 1, 1},
  1491. };
  1492. const int opt_count = sizeof(options) / sizeof(options[0]);
  1493. for (int i = 0; i < opt_count; i++)
  1494. {
  1495. ncnn::Option opt;
  1496. opt.num_threads = 1;
  1497. opt.use_packing_layout = options[i][0];
  1498. opt.use_fp16_packed = options[i][1];
  1499. opt.use_fp16_storage = options[i][2];
  1500. opt.use_fp16_arithmetic = options[i][3];
  1501. opt.use_bf16_storage = options[i][4];
  1502. opt.use_shader_pack8 = options[i][5];
  1503. int ret = test_layer_oom_opt(layer_type, pd, weights, opt, a, flag);
  1504. if (ret != 233 && ret != 0)
  1505. return ret;
  1506. }
  1507. return 0;
  1508. }