You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

ncnn2table.cpp 58 kB

5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
7177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828
  1. // Copyright 2019 BUG1989 (https://github.com/BUG1989/) Long-term support.
  2. // Copyright 2019 JansonZhu (https://github.com/JansonZhu) Implemented the function of entropy calibration.
  3. // Copyright 2021 Tencent
  4. // SPDX-License-Identifier: BSD-3-Clause
  5. #ifdef _MSC_VER
  6. #define _CRT_SECURE_NO_DEPRECATE
  7. #endif
  8. #include <float.h>
  9. #include <limits.h>
  10. #include <math.h>
  11. #include <stdio.h>
  12. #include <stdint.h>
  13. #include <stdlib.h>
  14. #include <string.h>
  15. #if defined(USE_NCNN_SIMPLEOCV)
  16. #include "simpleocv.h"
  17. #elif defined(USE_LOCAL_IMREADWRITE)
  18. #include "imreadwrite.h"
  19. #else
  20. #include <opencv2/core/core.hpp>
  21. #include <opencv2/highgui/highgui.hpp>
  22. #endif
  23. #include <string>
  24. #include <vector>
  25. // npy format header
  26. #include "npy.hpp"
  27. // ncnn public header
  28. #include "benchmark.h"
  29. #include "cpu.h"
  30. #include "net.h"
  31. // ncnn private header
  32. #include "layer/convolution.h"
  33. #include "layer/convolutiondepthwise.h"
  34. #include "layer/innerproduct.h"
  35. class QuantBlobStat
  36. {
  37. public:
  38. QuantBlobStat()
  39. {
  40. threshold = 0.f;
  41. absmax = 0.f;
  42. total = 0;
  43. }
  44. public:
  45. float threshold;
  46. float absmax;
  47. // ACIQ
  48. int total;
  49. // KL
  50. std::vector<uint64_t> histogram;
  51. std::vector<float> histogram_normed;
  52. };
  53. class QuantNet : public ncnn::Net
  54. {
  55. public:
  56. QuantNet();
  57. std::vector<ncnn::Blob>& blobs;
  58. std::vector<ncnn::Layer*>& layers;
  59. public:
  60. std::vector<std::vector<std::string> > listspaths;
  61. std::vector<std::vector<float> > means;
  62. std::vector<std::vector<float> > norms;
  63. std::vector<std::vector<int> > shapes;
  64. std::vector<int> type_to_pixels;
  65. int quantize_num_threads;
  66. int file_type;
  67. public:
  68. int init();
  69. void print_quant_info() const;
  70. int save_table(const char* tablepath);
  71. int quantize_KL();
  72. int quantize_ACIQ();
  73. int quantize_EQ();
  74. public:
  75. std::vector<int> input_blobs;
  76. std::vector<int> conv_layers;
  77. std::vector<int> conv_bottom_blobs;
  78. std::vector<int> conv_top_blobs;
  79. // result
  80. std::vector<QuantBlobStat> quant_blob_stats;
  81. std::vector<ncnn::Mat> weight_scales;
  82. std::vector<ncnn::Mat> bottom_blob_scales;
  83. };
  84. QuantNet::QuantNet()
  85. : blobs(mutable_blobs()), layers(mutable_layers())
  86. {
  87. quantize_num_threads = ncnn::get_cpu_count();
  88. }
  89. int QuantNet::init()
  90. {
  91. // find all input layers
  92. for (int i = 0; i < (int)layers.size(); i++)
  93. {
  94. const ncnn::Layer* layer = layers[i];
  95. if (layer->type == "Input")
  96. {
  97. input_blobs.push_back(layer->tops[0]);
  98. }
  99. }
  100. // find all conv layers
  101. for (int i = 0; i < (int)layers.size(); i++)
  102. {
  103. const ncnn::Layer* layer = layers[i];
  104. if (layer->type == "Convolution" || layer->type == "ConvolutionDepthWise" || layer->type == "InnerProduct")
  105. {
  106. conv_layers.push_back(i);
  107. conv_bottom_blobs.push_back(layer->bottoms[0]);
  108. conv_top_blobs.push_back(layer->tops[0]);
  109. }
  110. }
  111. const int conv_layer_count = (int)conv_layers.size();
  112. const int conv_bottom_blob_count = (int)conv_bottom_blobs.size();
  113. quant_blob_stats.resize(conv_bottom_blob_count);
  114. weight_scales.resize(conv_layer_count);
  115. bottom_blob_scales.resize(conv_bottom_blob_count);
  116. return 0;
  117. }
  118. int QuantNet::save_table(const char* tablepath)
  119. {
  120. FILE* fp = fopen(tablepath, "wb");
  121. if (!fp)
  122. {
  123. fprintf(stderr, "fopen %s failed\n", tablepath);
  124. return -1;
  125. }
  126. const int conv_layer_count = (int)conv_layers.size();
  127. const int conv_bottom_blob_count = (int)conv_bottom_blobs.size();
  128. fprintf(stdout, "param:%d\n", conv_layer_count);
  129. for (int i = 0; i < conv_layer_count; i++)
  130. {
  131. const ncnn::Mat& weight_scale = weight_scales[i];
  132. fprintf(fp, "%s_param_0 ", layers[conv_layers[i]]->name.c_str());
  133. for (int j = 0; j < weight_scale.w; j++)
  134. {
  135. fprintf(fp, "%f ", weight_scale[j]);
  136. }
  137. fprintf(fp, "\n");
  138. }
  139. for (int i = 0; i < conv_bottom_blob_count; i++)
  140. {
  141. const ncnn::Mat& bottom_blob_scale = bottom_blob_scales[i];
  142. fprintf(fp, "%s ", layers[conv_layers[i]]->name.c_str());
  143. for (int j = 0; j < bottom_blob_scale.w; j++)
  144. {
  145. fprintf(fp, "%f ", bottom_blob_scale[j]);
  146. }
  147. fprintf(fp, "\n");
  148. }
  149. fclose(fp);
  150. fprintf(stderr, "ncnn int8 calibration table create success, best wish for your int8 inference has a low accuracy loss...\\(^0^)/...233...\n");
  151. return 0;
  152. }
  153. void QuantNet::print_quant_info() const
  154. {
  155. for (int i = 0; i < (int)conv_bottom_blobs.size(); i++)
  156. {
  157. const QuantBlobStat& stat = quant_blob_stats[i];
  158. float scale = 127 / stat.threshold;
  159. fprintf(stderr, "%-40s : max = %-15f threshold = %-15f scale = %-15f\n", layers[conv_layers[i]]->name.c_str(), stat.absmax, stat.threshold, scale);
  160. }
  161. }
  162. /**
  163. * Read npy file
  164. * shape is input as [w,h,...]
  165. * @return ncnn::Mat
  166. */
  167. inline ncnn::Mat read_npy(const std::vector<int>& shape, const std::string& npypath)
  168. {
  169. npy::npy_data<float> d;
  170. try
  171. {
  172. d = npy::read_npy<float>(npypath);
  173. }
  174. catch (const std::exception& e)
  175. {
  176. fprintf(stderr, "npy::read_npy exception: %s\n", e.what());
  177. std::exit(EXIT_FAILURE);
  178. }
  179. std::vector<unsigned long> npy_shape = d.shape;
  180. size_t dims = shape.size();
  181. if (dims != npy_shape.size())
  182. {
  183. fprintf(stderr, "expect %d dims, but got: %d\n", (int)dims, (int)npy_shape.size());
  184. std::exit(EXIT_FAILURE);
  185. }
  186. for (size_t i = 0; i < dims; ++i)
  187. {
  188. if (static_cast<unsigned long>(shape[i]) != npy_shape[dims - 1 - i])
  189. {
  190. fprintf(stderr, "shape mismatch!\n");
  191. std::exit(EXIT_FAILURE);
  192. }
  193. }
  194. switch (dims)
  195. {
  196. case 1:
  197. return ncnn::Mat(shape[0], (void*)(d.data.data())).reshape(shape[0]).clone();
  198. case 2:
  199. return ncnn::Mat(shape[0] * shape[1], (void*)(d.data.data())).reshape(shape[0], shape[1]).clone();
  200. case 3:
  201. return ncnn::Mat(shape[0] * shape[1] * shape[2], (void*)(d.data.data())).reshape(shape[0], shape[1], shape[2]).clone();
  202. case 4:
  203. return ncnn::Mat(shape[0] * shape[1] * shape[2] * shape[3], (void*)(d.data.data())).reshape(shape[0], shape[1], shape[2], shape[3]).clone();
  204. default:
  205. fprintf(stderr, "dims:%d illegal!", (int)dims);
  206. return ncnn::Mat();
  207. }
  208. }
  209. /**
  210. * Read and resize image
  211. * shape is input as [w,h,...]
  212. * if w and h both are given, image will be resized to exactly size.
  213. * if w and h both are zero or negative, image will not be resized.
  214. * if only h is zero or negative, image's width will scaled resize to w, keeping aspect ratio.
  215. * if only w is zero or negative, image's height will scaled resize to h
  216. * @return ncnn::Mat
  217. */
  218. inline ncnn::Mat read_and_resize_image(const std::vector<int>& shape, const std::string& imagepath, int pixel_convert_type)
  219. {
  220. int target_w = shape[0];
  221. int target_h = shape[1];
  222. cv::Mat bgr = cv::imread(imagepath, 1);
  223. if (target_h <= 0 && target_w <= 0)
  224. {
  225. return ncnn::Mat::from_pixels(bgr.data, pixel_convert_type, bgr.cols, bgr.rows);
  226. }
  227. if (target_h <= 0 || target_w <= 0)
  228. {
  229. float scale = 1.0;
  230. if (target_h <= 0)
  231. {
  232. scale = 1.0 * bgr.cols / target_w;
  233. target_h = int(1.0 * bgr.rows / scale);
  234. }
  235. if (target_w <= 0)
  236. {
  237. scale = 1.0 * bgr.rows / target_h;
  238. target_w = int(1.0 * bgr.cols / scale);
  239. }
  240. }
  241. return ncnn::Mat::from_pixels_resize(bgr.data, pixel_convert_type, bgr.cols, bgr.rows, target_w, target_h);
  242. }
  243. static float compute_kl_divergence(const std::vector<float>& a, const std::vector<float>& b)
  244. {
  245. const size_t length = a.size();
  246. float result = 0;
  247. for (size_t i = 0; i < length; i++)
  248. {
  249. result += a[i] * log(a[i] / b[i]);
  250. }
  251. return result;
  252. }
  253. int QuantNet::quantize_KL()
  254. {
  255. const int input_blob_count = (int)input_blobs.size();
  256. const int conv_layer_count = (int)conv_layers.size();
  257. const int conv_bottom_blob_count = (int)conv_bottom_blobs.size();
  258. const int file_count = (int)listspaths[0].size();
  259. const int num_histogram_bins = 2048;
  260. std::vector<ncnn::UnlockedPoolAllocator> blob_allocators(quantize_num_threads);
  261. std::vector<ncnn::UnlockedPoolAllocator> workspace_allocators(quantize_num_threads);
  262. // initialize conv weight scales
  263. #pragma omp parallel for num_threads(quantize_num_threads)
  264. for (int i = 0; i < conv_layer_count; i++)
  265. {
  266. const ncnn::Layer* layer = layers[conv_layers[i]];
  267. if (layer->type == "Convolution")
  268. {
  269. const ncnn::Convolution* convolution = (const ncnn::Convolution*)layer;
  270. const int num_output = convolution->num_output;
  271. const int kernel_w = convolution->kernel_w;
  272. const int kernel_h = convolution->kernel_h;
  273. const int dilation_w = convolution->dilation_w;
  274. const int dilation_h = convolution->dilation_h;
  275. const int stride_w = convolution->stride_w;
  276. const int stride_h = convolution->stride_h;
  277. const int weight_data_size_output = convolution->weight_data_size / num_output;
  278. // int8 winograd F43 needs weight data to use 6bit quantization
  279. // TODO proper condition for winograd 3x3 int8
  280. bool quant_6bit = false;
  281. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  282. quant_6bit = true;
  283. weight_scales[i].create(num_output);
  284. for (int n = 0; n < num_output; n++)
  285. {
  286. const ncnn::Mat weight_data_n = convolution->weight_data.range(weight_data_size_output * n, weight_data_size_output);
  287. float absmax = 0.f;
  288. for (int k = 0; k < weight_data_size_output; k++)
  289. {
  290. absmax = std::max(absmax, (float)fabs(weight_data_n[k]));
  291. }
  292. if (quant_6bit)
  293. {
  294. weight_scales[i][n] = 31 / absmax;
  295. }
  296. else
  297. {
  298. weight_scales[i][n] = 127 / absmax;
  299. }
  300. }
  301. }
  302. if (layer->type == "ConvolutionDepthWise")
  303. {
  304. const ncnn::ConvolutionDepthWise* convolutiondepthwise = (const ncnn::ConvolutionDepthWise*)layer;
  305. const int group = convolutiondepthwise->group;
  306. const int weight_data_size_output = convolutiondepthwise->weight_data_size / group;
  307. std::vector<float> scales;
  308. weight_scales[i].create(group);
  309. for (int n = 0; n < group; n++)
  310. {
  311. const ncnn::Mat weight_data_n = convolutiondepthwise->weight_data.range(weight_data_size_output * n, weight_data_size_output);
  312. float absmax = 0.f;
  313. for (int k = 0; k < weight_data_size_output; k++)
  314. {
  315. absmax = std::max(absmax, (float)fabs(weight_data_n[k]));
  316. }
  317. weight_scales[i][n] = 127 / absmax;
  318. }
  319. }
  320. if (layer->type == "InnerProduct")
  321. {
  322. const ncnn::InnerProduct* innerproduct = (const ncnn::InnerProduct*)layer;
  323. const int num_output = innerproduct->num_output;
  324. const int weight_data_size_output = innerproduct->weight_data_size / num_output;
  325. weight_scales[i].create(num_output);
  326. for (int n = 0; n < num_output; n++)
  327. {
  328. const ncnn::Mat weight_data_n = innerproduct->weight_data.range(weight_data_size_output * n, weight_data_size_output);
  329. float absmax = 0.f;
  330. for (int k = 0; k < weight_data_size_output; k++)
  331. {
  332. absmax = std::max(absmax, (float)fabs(weight_data_n[k]));
  333. }
  334. weight_scales[i][n] = 127 / absmax;
  335. }
  336. }
  337. }
  338. // count the absmax
  339. #pragma omp parallel for num_threads(quantize_num_threads) schedule(static, 1)
  340. for (int i = 0; i < file_count; i++)
  341. {
  342. if (i % 100 == 0)
  343. {
  344. fprintf(stderr, "count the absmax %.2f%% [ %d / %d ]\n", i * 100.f / file_count, i, file_count);
  345. }
  346. ncnn::Extractor ex = create_extractor();
  347. ex.set_light_mode(true);
  348. const int thread_num = ncnn::get_omp_thread_num();
  349. ex.set_blob_allocator(&blob_allocators[thread_num]);
  350. ex.set_workspace_allocator(&workspace_allocators[thread_num]);
  351. for (int j = 0; j < input_blob_count; j++)
  352. {
  353. ncnn::Mat in;
  354. if (0 == file_type)
  355. {
  356. const int type_to_pixel = type_to_pixels[j];
  357. const std::vector<float>& mean_vals = means[j];
  358. const std::vector<float>& norm_vals = norms[j];
  359. int pixel_convert_type = ncnn::Mat::PIXEL_BGR;
  360. if (type_to_pixel != pixel_convert_type)
  361. {
  362. pixel_convert_type = pixel_convert_type | (type_to_pixel << ncnn::Mat::PIXEL_CONVERT_SHIFT);
  363. }
  364. in = read_and_resize_image(shapes[j], listspaths[j][i], pixel_convert_type);
  365. in.substract_mean_normalize(mean_vals.data(), norm_vals.data());
  366. }
  367. else
  368. {
  369. in = read_npy(shapes[j], listspaths[j][i]);
  370. }
  371. ex.input(input_blobs[j], in);
  372. }
  373. for (int j = 0; j < conv_bottom_blob_count; j++)
  374. {
  375. ncnn::Mat out;
  376. ex.extract(conv_bottom_blobs[j], out);
  377. // count absmax
  378. {
  379. float absmax = 0.f;
  380. const int outc = out.c;
  381. const int outsize = out.w * out.h;
  382. for (int p = 0; p < outc; p++)
  383. {
  384. const float* ptr = out.channel(p);
  385. for (int k = 0; k < outsize; k++)
  386. {
  387. absmax = std::max(absmax, (float)fabs(ptr[k]));
  388. }
  389. }
  390. #pragma omp critical
  391. {
  392. QuantBlobStat& stat = quant_blob_stats[j];
  393. stat.absmax = std::max(stat.absmax, absmax);
  394. }
  395. }
  396. }
  397. }
  398. // initialize histogram
  399. #pragma omp parallel for num_threads(quantize_num_threads)
  400. for (int i = 0; i < conv_bottom_blob_count; i++)
  401. {
  402. QuantBlobStat& stat = quant_blob_stats[i];
  403. stat.histogram.resize(num_histogram_bins, 0);
  404. stat.histogram_normed.resize(num_histogram_bins, 0);
  405. }
  406. // build histogram
  407. #pragma omp parallel for num_threads(quantize_num_threads) schedule(static, 1)
  408. for (int i = 0; i < file_count; i++)
  409. {
  410. if (i % 100 == 0)
  411. {
  412. fprintf(stderr, "build histogram %.2f%% [ %d / %d ]\n", i * 100.f / file_count, i, file_count);
  413. }
  414. ncnn::Extractor ex = create_extractor();
  415. ex.set_light_mode(true);
  416. const int thread_num = ncnn::get_omp_thread_num();
  417. ex.set_blob_allocator(&blob_allocators[thread_num]);
  418. ex.set_workspace_allocator(&workspace_allocators[thread_num]);
  419. for (int j = 0; j < input_blob_count; j++)
  420. {
  421. ncnn::Mat in;
  422. if (0 == file_type)
  423. {
  424. const int type_to_pixel = type_to_pixels[j];
  425. const std::vector<float>& mean_vals = means[j];
  426. const std::vector<float>& norm_vals = norms[j];
  427. int pixel_convert_type = ncnn::Mat::PIXEL_BGR;
  428. if (type_to_pixel != pixel_convert_type)
  429. {
  430. pixel_convert_type = pixel_convert_type | (type_to_pixel << ncnn::Mat::PIXEL_CONVERT_SHIFT);
  431. }
  432. in = read_and_resize_image(shapes[j], listspaths[j][i], pixel_convert_type);
  433. in.substract_mean_normalize(mean_vals.data(), norm_vals.data());
  434. }
  435. else
  436. {
  437. in = read_npy(shapes[j], listspaths[j][i]);
  438. }
  439. ex.input(input_blobs[j], in);
  440. }
  441. for (int j = 0; j < conv_bottom_blob_count; j++)
  442. {
  443. ncnn::Mat out;
  444. ex.extract(conv_bottom_blobs[j], out);
  445. // count histogram bin
  446. {
  447. const float absmax = quant_blob_stats[j].absmax;
  448. std::vector<uint64_t> histogram(num_histogram_bins, 0);
  449. const int outc = out.c;
  450. const int outsize = out.w * out.h;
  451. for (int p = 0; p < outc; p++)
  452. {
  453. const float* ptr = out.channel(p);
  454. for (int k = 0; k < outsize; k++)
  455. {
  456. if (ptr[k] == 0.f)
  457. continue;
  458. const int index = std::min((int)(fabs(ptr[k]) / absmax * num_histogram_bins), (num_histogram_bins - 1));
  459. histogram[index] += 1;
  460. }
  461. }
  462. #pragma omp critical
  463. {
  464. QuantBlobStat& stat = quant_blob_stats[j];
  465. for (int k = 0; k < num_histogram_bins; k++)
  466. {
  467. stat.histogram[k] += histogram[k];
  468. }
  469. }
  470. }
  471. }
  472. }
  473. // using kld to find the best threshold value
  474. #pragma omp parallel for num_threads(quantize_num_threads)
  475. for (int i = 0; i < conv_bottom_blob_count; i++)
  476. {
  477. QuantBlobStat& stat = quant_blob_stats[i];
  478. // normalize histogram bin
  479. {
  480. uint64_t sum = 0;
  481. for (int j = 0; j < num_histogram_bins; j++)
  482. {
  483. sum += stat.histogram[j];
  484. }
  485. for (int j = 0; j < num_histogram_bins; j++)
  486. {
  487. stat.histogram_normed[j] = (float)(stat.histogram[j] / (double)sum);
  488. }
  489. }
  490. const int target_bin = 128;
  491. int target_threshold = target_bin;
  492. float min_kl_divergence = FLT_MAX;
  493. for (int threshold = target_bin; threshold < num_histogram_bins; threshold++)
  494. {
  495. const float kl_eps = 0.0001f;
  496. std::vector<float> clip_distribution(threshold, kl_eps);
  497. {
  498. for (int j = 0; j < threshold; j++)
  499. {
  500. clip_distribution[j] += stat.histogram_normed[j];
  501. }
  502. for (int j = threshold; j < num_histogram_bins; j++)
  503. {
  504. clip_distribution[threshold - 1] += stat.histogram_normed[j];
  505. }
  506. }
  507. const float num_per_bin = (float)threshold / target_bin;
  508. std::vector<float> quantize_distribution(target_bin, 0.f);
  509. {
  510. {
  511. const float end = num_per_bin;
  512. const int right_lower = (int)floor(end);
  513. const float right_scale = end - right_lower;
  514. if (right_scale > 0)
  515. {
  516. quantize_distribution[0] += right_scale * stat.histogram_normed[right_lower];
  517. }
  518. for (int k = 0; k < right_lower; k++)
  519. {
  520. quantize_distribution[0] += stat.histogram_normed[k];
  521. }
  522. quantize_distribution[0] /= right_lower + right_scale;
  523. }
  524. for (int j = 1; j < target_bin - 1; j++)
  525. {
  526. const float start = j * num_per_bin;
  527. const float end = (j + 1) * num_per_bin;
  528. const int left_upper = (int)ceil(start);
  529. const float left_scale = left_upper - start;
  530. const int right_lower = (int)floor(end);
  531. const float right_scale = end - right_lower;
  532. if (left_scale > 0)
  533. {
  534. quantize_distribution[j] += left_scale * stat.histogram_normed[left_upper - 1];
  535. }
  536. if (right_scale > 0)
  537. {
  538. quantize_distribution[j] += right_scale * stat.histogram_normed[right_lower];
  539. }
  540. for (int k = left_upper; k < right_lower; k++)
  541. {
  542. quantize_distribution[j] += stat.histogram_normed[k];
  543. }
  544. quantize_distribution[j] /= right_lower - left_upper + left_scale + right_scale;
  545. }
  546. {
  547. const float start = threshold - num_per_bin;
  548. const int left_upper = (int)ceil(start);
  549. const float left_scale = left_upper - start;
  550. if (left_scale > 0)
  551. {
  552. quantize_distribution[target_bin - 1] += left_scale * stat.histogram_normed[left_upper - 1];
  553. }
  554. for (int k = left_upper; k < threshold; k++)
  555. {
  556. quantize_distribution[target_bin - 1] += stat.histogram_normed[k];
  557. }
  558. quantize_distribution[target_bin - 1] /= threshold - left_upper + left_scale;
  559. }
  560. }
  561. std::vector<float> expand_distribution(threshold, kl_eps);
  562. {
  563. {
  564. const float end = num_per_bin;
  565. const int right_lower = (int)floor(end);
  566. const float right_scale = end - right_lower;
  567. if (right_scale > 0)
  568. {
  569. expand_distribution[right_lower] += right_scale * quantize_distribution[0];
  570. }
  571. for (int k = 0; k < right_lower; k++)
  572. {
  573. expand_distribution[k] += quantize_distribution[0];
  574. }
  575. }
  576. for (int j = 1; j < target_bin - 1; j++)
  577. {
  578. const float start = j * num_per_bin;
  579. const float end = (j + 1) * num_per_bin;
  580. const int left_upper = (int)ceil(start);
  581. const float left_scale = left_upper - start;
  582. const int right_lower = (int)floor(end);
  583. const float right_scale = end - right_lower;
  584. if (left_scale > 0)
  585. {
  586. expand_distribution[left_upper - 1] += left_scale * quantize_distribution[j];
  587. }
  588. if (right_scale > 0)
  589. {
  590. expand_distribution[right_lower] += right_scale * quantize_distribution[j];
  591. }
  592. for (int k = left_upper; k < right_lower; k++)
  593. {
  594. expand_distribution[k] += quantize_distribution[j];
  595. }
  596. }
  597. {
  598. const float start = threshold - num_per_bin;
  599. const int left_upper = (int)ceil(start);
  600. const float left_scale = left_upper - start;
  601. if (left_scale > 0)
  602. {
  603. expand_distribution[left_upper - 1] += left_scale * quantize_distribution[target_bin - 1];
  604. }
  605. for (int k = left_upper; k < threshold; k++)
  606. {
  607. expand_distribution[k] += quantize_distribution[target_bin - 1];
  608. }
  609. }
  610. }
  611. // kl
  612. const float kl_divergence = compute_kl_divergence(clip_distribution, expand_distribution);
  613. // the best num of bin
  614. if (kl_divergence < min_kl_divergence)
  615. {
  616. min_kl_divergence = kl_divergence;
  617. target_threshold = threshold;
  618. }
  619. }
  620. stat.threshold = (target_threshold + 0.5f) * stat.absmax / num_histogram_bins;
  621. float scale = 127 / stat.threshold;
  622. bottom_blob_scales[i].create(1);
  623. bottom_blob_scales[i][0] = scale;
  624. }
  625. return 0;
  626. }
  627. static float compute_aciq_gaussian_clip(float absmax, int N, int num_bits = 8)
  628. {
  629. const float alpha_gaussian[8] = {0, 1.71063519, 2.15159277, 2.55913646, 2.93620062, 3.28691474, 3.6151146, 3.92403714};
  630. const double gaussian_const = (0.5 * 0.35) * (1 + sqrt(3.14159265358979323846 * log(4)));
  631. double std = (absmax * 2 * gaussian_const) / sqrt(2 * log(N));
  632. return (float)(alpha_gaussian[num_bits - 1] * std);
  633. }
  634. int QuantNet::quantize_ACIQ()
  635. {
  636. const int input_blob_count = (int)input_blobs.size();
  637. const int conv_layer_count = (int)conv_layers.size();
  638. const int conv_bottom_blob_count = (int)conv_bottom_blobs.size();
  639. const int file_count = (int)listspaths[0].size();
  640. std::vector<ncnn::UnlockedPoolAllocator> blob_allocators(quantize_num_threads);
  641. std::vector<ncnn::UnlockedPoolAllocator> workspace_allocators(quantize_num_threads);
  642. // initialize conv weight scales
  643. #pragma omp parallel for num_threads(quantize_num_threads)
  644. for (int i = 0; i < conv_layer_count; i++)
  645. {
  646. const ncnn::Layer* layer = layers[conv_layers[i]];
  647. if (layer->type == "Convolution")
  648. {
  649. const ncnn::Convolution* convolution = (const ncnn::Convolution*)layer;
  650. const int num_output = convolution->num_output;
  651. const int kernel_w = convolution->kernel_w;
  652. const int kernel_h = convolution->kernel_h;
  653. const int dilation_w = convolution->dilation_w;
  654. const int dilation_h = convolution->dilation_h;
  655. const int stride_w = convolution->stride_w;
  656. const int stride_h = convolution->stride_h;
  657. const int weight_data_size_output = convolution->weight_data_size / num_output;
  658. // int8 winograd F43 needs weight data to use 6bit quantization
  659. // TODO proper condition for winograd 3x3 int8
  660. bool quant_6bit = false;
  661. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  662. quant_6bit = true;
  663. weight_scales[i].create(num_output);
  664. for (int n = 0; n < num_output; n++)
  665. {
  666. const ncnn::Mat weight_data_n = convolution->weight_data.range(weight_data_size_output * n, weight_data_size_output);
  667. float absmax = 0.f;
  668. for (int k = 0; k < weight_data_size_output; k++)
  669. {
  670. absmax = std::max(absmax, (float)fabs(weight_data_n[k]));
  671. }
  672. if (quant_6bit)
  673. {
  674. const float threshold = compute_aciq_gaussian_clip(absmax, weight_data_size_output, 6);
  675. weight_scales[i][n] = 31 / threshold;
  676. }
  677. else
  678. {
  679. const float threshold = compute_aciq_gaussian_clip(absmax, weight_data_size_output);
  680. weight_scales[i][n] = 127 / threshold;
  681. }
  682. }
  683. }
  684. if (layer->type == "ConvolutionDepthWise")
  685. {
  686. const ncnn::ConvolutionDepthWise* convolutiondepthwise = (const ncnn::ConvolutionDepthWise*)layer;
  687. const int group = convolutiondepthwise->group;
  688. const int weight_data_size_output = convolutiondepthwise->weight_data_size / group;
  689. std::vector<float> scales;
  690. weight_scales[i].create(group);
  691. for (int n = 0; n < group; n++)
  692. {
  693. const ncnn::Mat weight_data_n = convolutiondepthwise->weight_data.range(weight_data_size_output * n, weight_data_size_output);
  694. float absmax = 0.f;
  695. for (int k = 0; k < weight_data_size_output; k++)
  696. {
  697. absmax = std::max(absmax, (float)fabs(weight_data_n[k]));
  698. }
  699. const float threshold = compute_aciq_gaussian_clip(absmax, weight_data_size_output);
  700. weight_scales[i][n] = 127 / threshold;
  701. }
  702. }
  703. if (layer->type == "InnerProduct")
  704. {
  705. const ncnn::InnerProduct* innerproduct = (const ncnn::InnerProduct*)layer;
  706. const int num_output = innerproduct->num_output;
  707. const int weight_data_size_output = innerproduct->weight_data_size / num_output;
  708. weight_scales[i].create(num_output);
  709. for (int n = 0; n < num_output; n++)
  710. {
  711. const ncnn::Mat weight_data_n = innerproduct->weight_data.range(weight_data_size_output * n, weight_data_size_output);
  712. float absmax = 0.f;
  713. for (int k = 0; k < weight_data_size_output; k++)
  714. {
  715. absmax = std::max(absmax, (float)fabs(weight_data_n[k]));
  716. }
  717. const float threshold = compute_aciq_gaussian_clip(absmax, weight_data_size_output);
  718. weight_scales[i][n] = 127 / threshold;
  719. }
  720. }
  721. }
  722. // count the absmax
  723. #pragma omp parallel for num_threads(quantize_num_threads) schedule(static, 1)
  724. for (int i = 0; i < file_count; i++)
  725. {
  726. if (i % 100 == 0)
  727. {
  728. fprintf(stderr, "count the absmax %.2f%% [ %d / %d ]\n", i * 100.f / file_count, i, file_count);
  729. }
  730. ncnn::Extractor ex = create_extractor();
  731. ex.set_light_mode(true);
  732. const int thread_num = ncnn::get_omp_thread_num();
  733. ex.set_blob_allocator(&blob_allocators[thread_num]);
  734. ex.set_workspace_allocator(&workspace_allocators[thread_num]);
  735. for (int j = 0; j < input_blob_count; j++)
  736. {
  737. ncnn::Mat in;
  738. if (0 == file_type)
  739. {
  740. const int type_to_pixel = type_to_pixels[j];
  741. const std::vector<float>& mean_vals = means[j];
  742. const std::vector<float>& norm_vals = norms[j];
  743. int pixel_convert_type = ncnn::Mat::PIXEL_BGR;
  744. if (type_to_pixel != pixel_convert_type)
  745. {
  746. pixel_convert_type = pixel_convert_type | (type_to_pixel << ncnn::Mat::PIXEL_CONVERT_SHIFT);
  747. }
  748. in = read_and_resize_image(shapes[j], listspaths[j][i], pixel_convert_type);
  749. in.substract_mean_normalize(mean_vals.data(), norm_vals.data());
  750. }
  751. else
  752. {
  753. in = read_npy(shapes[j], listspaths[j][i]);
  754. }
  755. ex.input(input_blobs[j], in);
  756. }
  757. for (int j = 0; j < conv_bottom_blob_count; j++)
  758. {
  759. ncnn::Mat out;
  760. ex.extract(conv_bottom_blobs[j], out);
  761. // count absmax
  762. {
  763. float absmax = 0.f;
  764. const int outc = out.c;
  765. const int outsize = out.w * out.h;
  766. for (int p = 0; p < outc; p++)
  767. {
  768. const float* ptr = out.channel(p);
  769. for (int k = 0; k < outsize; k++)
  770. {
  771. absmax = std::max(absmax, (float)fabs(ptr[k]));
  772. }
  773. }
  774. #pragma omp critical
  775. {
  776. QuantBlobStat& stat = quant_blob_stats[j];
  777. stat.absmax = std::max(stat.absmax, absmax);
  778. stat.total = outc * outsize;
  779. }
  780. }
  781. }
  782. }
  783. // alpha gaussian
  784. #pragma omp parallel for num_threads(quantize_num_threads)
  785. for (int i = 0; i < conv_bottom_blob_count; i++)
  786. {
  787. QuantBlobStat& stat = quant_blob_stats[i];
  788. stat.threshold = compute_aciq_gaussian_clip(stat.absmax, stat.total);
  789. float scale = 127 / stat.threshold;
  790. bottom_blob_scales[i].create(1);
  791. bottom_blob_scales[i][0] = scale;
  792. }
  793. return 0;
  794. }
  795. static float cosine_similarity(const ncnn::Mat& a, const ncnn::Mat& b)
  796. {
  797. const int chanenls = a.c;
  798. const int size = a.w * a.h;
  799. float sa = 0;
  800. float sb = 0;
  801. float sum = 0;
  802. for (int p = 0; p < chanenls; p++)
  803. {
  804. const float* pa = a.channel(p);
  805. const float* pb = b.channel(p);
  806. for (int i = 0; i < size; i++)
  807. {
  808. sa += pa[i] * pa[i];
  809. sb += pb[i] * pb[i];
  810. sum += pa[i] * pb[i];
  811. }
  812. }
  813. float sim = (float)sum / sqrt(sa) / sqrt(sb);
  814. return sim;
  815. }
  816. static int get_layer_param(const ncnn::Layer* layer, ncnn::ParamDict& pd)
  817. {
  818. if (layer->type == "Convolution")
  819. {
  820. ncnn::Convolution* convolution = (ncnn::Convolution*)layer;
  821. pd.set(0, convolution->num_output);
  822. pd.set(1, convolution->kernel_w);
  823. pd.set(11, convolution->kernel_h);
  824. pd.set(2, convolution->dilation_w);
  825. pd.set(12, convolution->dilation_h);
  826. pd.set(3, convolution->stride_w);
  827. pd.set(13, convolution->stride_h);
  828. pd.set(4, convolution->pad_left);
  829. pd.set(15, convolution->pad_right);
  830. pd.set(14, convolution->pad_top);
  831. pd.set(16, convolution->pad_bottom);
  832. pd.set(18, convolution->pad_value);
  833. pd.set(5, convolution->bias_term);
  834. pd.set(6, convolution->weight_data_size);
  835. pd.set(8, convolution->int8_scale_term);
  836. pd.set(9, convolution->activation_type);
  837. pd.set(10, convolution->activation_params);
  838. }
  839. else if (layer->type == "ConvolutionDepthWise")
  840. {
  841. ncnn::ConvolutionDepthWise* convolutiondepthwise = (ncnn::ConvolutionDepthWise*)layer;
  842. pd.set(0, convolutiondepthwise->num_output);
  843. pd.set(1, convolutiondepthwise->kernel_w);
  844. pd.set(11, convolutiondepthwise->kernel_h);
  845. pd.set(2, convolutiondepthwise->dilation_w);
  846. pd.set(12, convolutiondepthwise->dilation_h);
  847. pd.set(3, convolutiondepthwise->stride_w);
  848. pd.set(13, convolutiondepthwise->stride_h);
  849. pd.set(4, convolutiondepthwise->pad_left);
  850. pd.set(15, convolutiondepthwise->pad_right);
  851. pd.set(14, convolutiondepthwise->pad_top);
  852. pd.set(16, convolutiondepthwise->pad_bottom);
  853. pd.set(18, convolutiondepthwise->pad_value);
  854. pd.set(5, convolutiondepthwise->bias_term);
  855. pd.set(6, convolutiondepthwise->weight_data_size);
  856. pd.set(7, convolutiondepthwise->group);
  857. pd.set(8, convolutiondepthwise->int8_scale_term);
  858. pd.set(9, convolutiondepthwise->activation_type);
  859. pd.set(10, convolutiondepthwise->activation_params);
  860. }
  861. else if (layer->type == "InnerProduct")
  862. {
  863. ncnn::InnerProduct* innerproduct = (ncnn::InnerProduct*)layer;
  864. pd.set(0, innerproduct->num_output);
  865. pd.set(1, innerproduct->bias_term);
  866. pd.set(2, innerproduct->weight_data_size);
  867. pd.set(8, innerproduct->int8_scale_term);
  868. pd.set(9, innerproduct->activation_type);
  869. pd.set(10, innerproduct->activation_params);
  870. }
  871. else
  872. {
  873. fprintf(stderr, "unexpected layer type %s in get_layer_param\n", layer->type.c_str());
  874. return -1;
  875. }
  876. return 0;
  877. }
  878. static int get_layer_weights(const ncnn::Layer* layer, std::vector<ncnn::Mat>& weights)
  879. {
  880. if (layer->type == "Convolution")
  881. {
  882. ncnn::Convolution* convolution = (ncnn::Convolution*)layer;
  883. weights.push_back(convolution->weight_data);
  884. if (convolution->bias_term)
  885. weights.push_back(convolution->bias_data);
  886. }
  887. else if (layer->type == "ConvolutionDepthWise")
  888. {
  889. ncnn::ConvolutionDepthWise* convolutiondepthwise = (ncnn::ConvolutionDepthWise*)layer;
  890. weights.push_back(convolutiondepthwise->weight_data);
  891. if (convolutiondepthwise->bias_term)
  892. weights.push_back(convolutiondepthwise->bias_data);
  893. }
  894. else if (layer->type == "InnerProduct")
  895. {
  896. ncnn::InnerProduct* innerproduct = (ncnn::InnerProduct*)layer;
  897. weights.push_back(innerproduct->weight_data);
  898. if (innerproduct->bias_term)
  899. weights.push_back(innerproduct->bias_data);
  900. }
  901. else
  902. {
  903. fprintf(stderr, "unexpected layer type %s in get_layer_weights\n", layer->type.c_str());
  904. return -1;
  905. }
  906. return 0;
  907. }
  908. int QuantNet::quantize_EQ()
  909. {
  910. // find the initial scale via KL
  911. quantize_KL();
  912. print_quant_info();
  913. const int input_blob_count = (int)input_blobs.size();
  914. const int conv_layer_count = (int)conv_layers.size();
  915. const int conv_bottom_blob_count = (int)conv_bottom_blobs.size();
  916. std::vector<ncnn::UnlockedPoolAllocator> blob_allocators(quantize_num_threads);
  917. std::vector<ncnn::UnlockedPoolAllocator> workspace_allocators(quantize_num_threads);
  918. // max 50 images for EQ
  919. const int file_count = std::min((int)listspaths[0].size(), 50);
  920. const float scale_range_lower = 0.5f;
  921. const float scale_range_upper = 2.0f;
  922. const int search_steps = 100;
  923. for (int i = 0; i < conv_layer_count; i++)
  924. {
  925. ncnn::Mat& weight_scale = weight_scales[i];
  926. ncnn::Mat& bottom_blob_scale = bottom_blob_scales[i];
  927. const ncnn::Layer* layer = layers[conv_layers[i]];
  928. // search weight scale
  929. for (int j = 0; j < weight_scale.w; j++)
  930. {
  931. const float scale = weight_scale[j];
  932. const float scale_lower = scale * scale_range_lower;
  933. const float scale_upper = scale * scale_range_upper;
  934. const float scale_step = (scale_upper - scale_lower) / search_steps;
  935. std::vector<double> avgsims(search_steps, 0.0);
  936. #pragma omp parallel for num_threads(quantize_num_threads) schedule(static, 1)
  937. for (int ii = 0; ii < file_count; ii++)
  938. {
  939. if (ii % 100 == 0)
  940. {
  941. fprintf(stderr, "search weight scale %.2f%% [ %d / %d ] for %d / %d of %d / %d\n", ii * 100.f / file_count, ii, file_count, j, weight_scale.w, i, conv_layer_count);
  942. }
  943. ncnn::Extractor ex = create_extractor();
  944. ex.set_light_mode(true);
  945. const int thread_num = ncnn::get_omp_thread_num();
  946. ex.set_blob_allocator(&blob_allocators[thread_num]);
  947. ex.set_workspace_allocator(&workspace_allocators[thread_num]);
  948. for (int jj = 0; jj < input_blob_count; jj++)
  949. {
  950. ncnn::Mat in;
  951. if (0 == file_type)
  952. {
  953. const int type_to_pixel = type_to_pixels[j];
  954. const std::vector<float>& mean_vals = means[j];
  955. const std::vector<float>& norm_vals = norms[j];
  956. int pixel_convert_type = ncnn::Mat::PIXEL_BGR;
  957. if (type_to_pixel != pixel_convert_type)
  958. {
  959. pixel_convert_type = pixel_convert_type | (type_to_pixel << ncnn::Mat::PIXEL_CONVERT_SHIFT);
  960. }
  961. in = read_and_resize_image(shapes[j], listspaths[j][i], pixel_convert_type);
  962. in.substract_mean_normalize(mean_vals.data(), norm_vals.data());
  963. }
  964. else
  965. {
  966. in = read_npy(shapes[j], listspaths[j][i]);
  967. }
  968. ex.input(input_blobs[j], in);
  969. }
  970. ncnn::Mat in;
  971. ex.extract(conv_bottom_blobs[i], in);
  972. ncnn::Mat out;
  973. ex.extract(conv_top_blobs[i], out);
  974. ncnn::Layer* layer_int8 = ncnn::create_layer_cpu(layer->typeindex);
  975. ncnn::ParamDict pd;
  976. get_layer_param(layer, pd);
  977. pd.set(8, 1); //int8_scale_term
  978. layer_int8->load_param(pd);
  979. std::vector<float> sims(search_steps);
  980. for (int k = 0; k < search_steps; k++)
  981. {
  982. ncnn::Mat new_weight_scale = weight_scale.clone();
  983. new_weight_scale[j] = scale_lower + k * scale_step;
  984. std::vector<ncnn::Mat> weights;
  985. get_layer_weights(layer, weights);
  986. weights.push_back(new_weight_scale);
  987. weights.push_back(bottom_blob_scale);
  988. layer_int8->load_model(ncnn::ModelBinFromMatArray(weights.data()));
  989. ncnn::Option opt_int8;
  990. opt_int8.use_packing_layout = false;
  991. layer_int8->create_pipeline(opt_int8);
  992. ncnn::Mat out_int8;
  993. layer_int8->forward(in, out_int8, opt_int8);
  994. layer_int8->destroy_pipeline(opt_int8);
  995. sims[k] = cosine_similarity(out, out_int8);
  996. }
  997. delete layer_int8;
  998. #pragma omp critical
  999. {
  1000. for (int k = 0; k < search_steps; k++)
  1001. {
  1002. avgsims[k] += sims[k];
  1003. }
  1004. }
  1005. }
  1006. double max_avgsim = 0.0;
  1007. float new_scale = scale;
  1008. // find the scale with min cosine distance
  1009. for (int k = 0; k < search_steps; k++)
  1010. {
  1011. if (max_avgsim < avgsims[k])
  1012. {
  1013. max_avgsim = avgsims[k];
  1014. new_scale = scale_lower + k * scale_step;
  1015. }
  1016. }
  1017. fprintf(stderr, "%s w %d = %f -> %f\n", layer->name.c_str(), j, scale, new_scale);
  1018. weight_scale[j] = new_scale;
  1019. }
  1020. // search bottom blob scale
  1021. for (int j = 0; j < bottom_blob_scale.w; j++)
  1022. {
  1023. const float scale = bottom_blob_scale[j];
  1024. const float scale_lower = scale * scale_range_lower;
  1025. const float scale_upper = scale * scale_range_upper;
  1026. const float scale_step = (scale_upper - scale_lower) / search_steps;
  1027. std::vector<double> avgsims(search_steps, 0.0);
  1028. #pragma omp parallel for num_threads(quantize_num_threads) schedule(static, 1)
  1029. for (int ii = 0; ii < file_count; ii++)
  1030. {
  1031. if (ii % 100 == 0)
  1032. {
  1033. fprintf(stderr, "search bottom blob scale %.2f%% [ %d / %d ] for %d / %d of %d / %d\n", ii * 100.f / file_count, ii, file_count, j, bottom_blob_scale.w, i, conv_layer_count);
  1034. }
  1035. ncnn::Extractor ex = create_extractor();
  1036. ex.set_light_mode(true);
  1037. const int thread_num = ncnn::get_omp_thread_num();
  1038. ex.set_blob_allocator(&blob_allocators[thread_num]);
  1039. ex.set_workspace_allocator(&workspace_allocators[thread_num]);
  1040. for (int jj = 0; jj < input_blob_count; jj++)
  1041. {
  1042. ncnn::Mat in;
  1043. if (0 == file_type)
  1044. {
  1045. const int type_to_pixel = type_to_pixels[j];
  1046. const std::vector<float>& mean_vals = means[j];
  1047. const std::vector<float>& norm_vals = norms[j];
  1048. int pixel_convert_type = ncnn::Mat::PIXEL_BGR;
  1049. if (type_to_pixel != pixel_convert_type)
  1050. {
  1051. pixel_convert_type = pixel_convert_type | (type_to_pixel << ncnn::Mat::PIXEL_CONVERT_SHIFT);
  1052. }
  1053. in = read_and_resize_image(shapes[j], listspaths[j][i], pixel_convert_type);
  1054. in.substract_mean_normalize(mean_vals.data(), norm_vals.data());
  1055. }
  1056. else
  1057. {
  1058. in = read_npy(shapes[j], listspaths[j][i]);
  1059. }
  1060. ex.input(input_blobs[j], in);
  1061. }
  1062. ncnn::Mat in;
  1063. ex.extract(conv_bottom_blobs[i], in);
  1064. ncnn::Mat out;
  1065. ex.extract(conv_top_blobs[i], out);
  1066. ncnn::Layer* layer_int8 = ncnn::create_layer_cpu(layer->typeindex);
  1067. ncnn::ParamDict pd;
  1068. get_layer_param(layer, pd);
  1069. pd.set(8, 1); //int8_scale_term
  1070. layer_int8->load_param(pd);
  1071. std::vector<float> sims(search_steps);
  1072. for (int k = 0; k < search_steps; k++)
  1073. {
  1074. ncnn::Mat new_bottom_blob_scale = bottom_blob_scale.clone();
  1075. new_bottom_blob_scale[j] = scale_lower + k * scale_step;
  1076. std::vector<ncnn::Mat> weights;
  1077. get_layer_weights(layer, weights);
  1078. weights.push_back(weight_scale);
  1079. weights.push_back(new_bottom_blob_scale);
  1080. layer_int8->load_model(ncnn::ModelBinFromMatArray(weights.data()));
  1081. ncnn::Option opt_int8;
  1082. opt_int8.use_packing_layout = false;
  1083. layer_int8->create_pipeline(opt_int8);
  1084. ncnn::Mat out_int8;
  1085. layer_int8->forward(in, out_int8, opt_int8);
  1086. layer_int8->destroy_pipeline(opt_int8);
  1087. sims[k] = cosine_similarity(out, out_int8);
  1088. }
  1089. delete layer_int8;
  1090. #pragma omp critical
  1091. {
  1092. for (int k = 0; k < search_steps; k++)
  1093. {
  1094. avgsims[k] += sims[k];
  1095. }
  1096. }
  1097. }
  1098. double max_avgsim = 0.0;
  1099. float new_scale = scale;
  1100. // find the scale with min cosine distance
  1101. for (int k = 0; k < search_steps; k++)
  1102. {
  1103. if (max_avgsim < avgsims[k])
  1104. {
  1105. max_avgsim = avgsims[k];
  1106. new_scale = scale_lower + k * scale_step;
  1107. }
  1108. }
  1109. fprintf(stderr, "%s b %d = %f -> %f\n", layer->name.c_str(), j, scale, new_scale);
  1110. bottom_blob_scale[j] = new_scale;
  1111. }
  1112. // update quant info
  1113. QuantBlobStat& stat = quant_blob_stats[i];
  1114. stat.threshold = 127 / bottom_blob_scale[0];
  1115. }
  1116. return 0;
  1117. }
  // Split s on ',' into list-file names and read each file into a vector of
  // paths: the first whitespace-delimited token (at most 255 characters) of
  // every line, lines without a token being skipped.
  // s is modified in place by strtok. If a list file cannot be opened an error
  // is logged and parsing stops; the lists read so far are returned.
  static std::vector<std::vector<std::string> > parse_comma_path_list(char* s)
  {
      std::vector<std::vector<std::string> > all_lists;

      for (char* list_file = strtok(s, ","); list_file != NULL; list_file = strtok(NULL, ","))
      {
          FILE* fp = fopen(list_file, "rb");
          if (fp == NULL)
          {
              fprintf(stderr, "fopen %s failed\n", list_file);
              break;
          }

          std::vector<std::string> entries;

          // one filepath per line
          char line[1024];
          while (!feof(fp) && fgets(line, 1024, fp) != NULL)
          {
              char filepath[256];
              if (sscanf(line, "%255s", filepath) == 1)
                  entries.push_back(std::string(filepath));
          }

          fclose(fp);

          all_lists.push_back(entries);
      }

      return all_lists;
  }
  // Parse a decimal floating-point literal of the form
  //   [+|-] digits [. digits] [(e|E) [+|-] digits]
  // from the NUL-terminated string vstr and return it as a float.
  // Parsing stops silently at the first character that does not fit this
  // grammar, so a string without leading digits yields 0 (-0 after a '-').
  // The integer and fraction parts are accumulated in uint64_t; the callers in
  // this file pass at most 19 characters, which cannot overflow them.
  static float vstr_to_float(const char vstr[20])
  {
      double v = 0.0;

      const char* p = vstr;

      // sign
      const bool negative = (*p == '-');
      if (*p == '+' || *p == '-')
      {
          p++;
      }

      // digits before decimal point or exponent
      // (isdigit requires an unsigned char value; passing a negative plain
      // char is undefined behavior, hence the casts)
      uint64_t v1 = 0;
      while (isdigit((unsigned char)*p))
      {
          v1 = v1 * 10 + (*p - '0');
          p++;
      }

      v = (double)v1;

      // digits after decimal point
      if (*p == '.')
      {
          p++;

          uint64_t pow10 = 1;
          uint64_t v2 = 0;

          while (isdigit((unsigned char)*p))
          {
              v2 = v2 * 10 + (*p - '0');
              pow10 *= 10;
              p++;
          }

          v += v2 / (double)pow10;
      }

      // exponent
      if (*p == 'e' || *p == 'E')
      {
          p++;

          // sign of exponent
          const bool exp_positive = (*p != '-');
          if (*p == '+' || *p == '-')
          {
              p++;
          }

          // digits of exponent
          uint64_t expon = 0;
          while (isdigit((unsigned char)*p))
          {
              expon = expon * 10 + (*p - '0');
              p++;
          }

          // The loops below run once per 8 (or 1) units of exponent, so a huge
          // exponent would spin for an unbounded time. The scale is already
          // infinite well before 10^512, so clamping does not change the result.
          if (expon > 512)
          {
              expon = 512;
          }

          double scale = 1.0;
          while (expon >= 8)
          {
              scale *= 1e8;
              expon -= 8;
          }
          while (expon > 0)
          {
              scale *= 10.0;
              expon -= 1;
          }

          v = exp_positive ? v * scale : v / scale;
      }

      return negative ? (float)-v : (float)v;
  }
  1215. static std::vector<std::vector<float> > parse_comma_float_array_list(char* s)
  1216. {
  1217. std::vector<std::vector<float> > aaf;
  1218. char* pch = strtok(s, "[]");
  1219. while (pch != NULL)
  1220. {
  1221. // parse a,b,c
  1222. char vstr[20];
  1223. int nconsumed = 0;
  1224. int nscan = sscanf(pch, "%19[^,]%n", vstr, &nconsumed);
  1225. if (nscan == 1)
  1226. {
  1227. // ok we get array
  1228. pch += nconsumed;
  1229. std::vector<float> af;
  1230. float v = vstr_to_float(vstr);
  1231. af.push_back(v);
  1232. nscan = sscanf(pch, ",%19[^,]%n", vstr, &nconsumed);
  1233. while (nscan == 1)
  1234. {
  1235. pch += nconsumed;
  1236. float v = vstr_to_float(vstr);
  1237. af.push_back(v);
  1238. nscan = sscanf(pch, ",%19[^,]%n", vstr, &nconsumed);
  1239. }
  1240. // array end
  1241. aaf.push_back(af);
  1242. }
  1243. pch = strtok(NULL, "[]");
  1244. }
  1245. return aaf;
  1246. }
  // Parse a list of bracketed integer arrays such as "[224,224,3],[0,0]" into
  // one vector per array. s is split on '[' and ']' (modified in place by
  // strtok); within each piece, comma-separated integers are read with sscanf.
  // Pieces that do not start with an integer, such as the "," between two
  // arrays, produce no array.
  static std::vector<std::vector<int> > parse_comma_int_array_list(char* s)
  {
      std::vector<std::vector<int> > arrays;

      for (char* piece = strtok(s, "[]"); piece != NULL; piece = strtok(NULL, "[]"))
      {
          // parse a,b,c
          int value;
          int used = 0;

          // the first number has no leading comma
          if (sscanf(piece, "%d%n", &value, &used) != 1)
              continue;

          // ok we get array
          std::vector<int> values;
          do
          {
              piece += used;
              values.push_back(value);
          } while (sscanf(piece, ",%d%n", &value, &used) == 1);

          // array end
          arrays.push_back(values);
      }

      return arrays;
  }
  1277. static std::vector<int> parse_comma_pixel_type_list(char* s)
  1278. {
  1279. std::vector<int> aps;
  1280. char* pch = strtok(s, ",");
  1281. while (pch != NULL)
  1282. {
  1283. // RAW/RGB/BGR/GRAY/RGBA/BGRA
  1284. if (strcmp(pch, "RAW") == 0)
  1285. aps.push_back(-233);
  1286. if (strcmp(pch, "RGB") == 0)
  1287. aps.push_back(ncnn::Mat::PIXEL_RGB);
  1288. if (strcmp(pch, "BGR") == 0)
  1289. aps.push_back(ncnn::Mat::PIXEL_BGR);
  1290. if (strcmp(pch, "GRAY") == 0)
  1291. aps.push_back(ncnn::Mat::PIXEL_GRAY);
  1292. if (strcmp(pch, "RGBA") == 0)
  1293. aps.push_back(ncnn::Mat::PIXEL_RGBA);
  1294. if (strcmp(pch, "BGRA") == 0)
  1295. aps.push_back(ncnn::Mat::PIXEL_BGRA);
  1296. pch = strtok(NULL, ",");
  1297. }
  1298. return aps;
  1299. }
  // Print list to stderr as "[a,b,c],[d,e]" using %f for every value.
  // No trailing newline is written.
  static void print_float_array_list(const std::vector<std::vector<float> >& list)
  {
      const size_t array_count = list.size();

      for (size_t a = 0; a < array_count; a++)
      {
          // separators go between arrays and between values, never at the ends
          if (a > 0)
              fprintf(stderr, ",");

          const std::vector<float>& values = list[a];
          const size_t value_count = values.size();

          fprintf(stderr, "[");
          for (size_t v = 0; v < value_count; v++)
          {
              if (v > 0)
                  fprintf(stderr, ",");
              fprintf(stderr, "%f", values[v]);
          }
          fprintf(stderr, "]");
      }
  }
  // Print list to stderr as "[a,b,c],[d,e]" using %d for every value.
  // No trailing newline is written.
  static void print_int_array_list(const std::vector<std::vector<int> >& list)
  {
      const size_t array_count = list.size();

      for (size_t a = 0; a < array_count; a++)
      {
          // separators go between arrays and between values, never at the ends
          if (a > 0)
              fprintf(stderr, ",");

          const std::vector<int>& values = list[a];
          const size_t value_count = values.size();

          fprintf(stderr, "[");
          for (size_t v = 0; v < value_count; v++)
          {
              if (v > 0)
                  fprintf(stderr, ",");
              fprintf(stderr, "%d", values[v]);
          }
          fprintf(stderr, "]");
      }
  }
  1334. static void print_pixel_type_list(const std::vector<int>& list)
  1335. {
  1336. for (size_t i = 0; i < list.size(); i++)
  1337. {
  1338. const int type = list[i];
  1339. if (type == -233)
  1340. fprintf(stderr, "RAW");
  1341. if (type == ncnn::Mat::PIXEL_RGB)
  1342. fprintf(stderr, "RGB");
  1343. if (type == ncnn::Mat::PIXEL_BGR)
  1344. fprintf(stderr, "BGR");
  1345. if (type == ncnn::Mat::PIXEL_GRAY)
  1346. fprintf(stderr, "GRAY");
  1347. if (type == ncnn::Mat::PIXEL_RGBA)
  1348. fprintf(stderr, "RGBA");
  1349. if (type == ncnn::Mat::PIXEL_BGRA)
  1350. fprintf(stderr, "BGRA");
  1351. if (i != list.size() - 1)
  1352. fprintf(stderr, ",");
  1353. }
  1354. }
  // Print the command-line synopsis, the supported key=value options and two
  // sample invocations to stderr.
  static void show_usage()
  {
      static const char* const usage_lines[] = {
          "Usage: ncnn2table [ncnnparam] [ncnnbin] [list,...] [ncnntable] [(key=value)...]\n",
          " mean=[104.0,117.0,123.0],...\n",
          " norm=[1.0,1.0,1.0],...\n",
          " shape=[224,224,3],...[w,h,c] or [w,h] **[0,0] will not resize\n",
          " pixel=RAW/RGB/BGR/GRAY/RGBA/BGRA,...\n",
          " thread=8\n",
          " method=kl/aciq/eq\n",
          " type=0/1, 0:image,1:npy\n",
          "Sample usage:\n",
          " ncnn2table squeezenet.param squeezenet.bin filelist.txt squeezenet.table mean=[104.0,117.0,123.0] norm=[1.0,1.0,1.0] shape=[227,227,3] pixel=BGR method=kl\n",
          " ncnn2table test.param test.bin filelist.txt squeezenet.table shape=[227,227,3] method=kl type=1\n"
      };
      const size_t line_count = sizeof(usage_lines) / sizeof(usage_lines[0]);

      // none of the lines contains a conversion, so they are written verbatim
      for (size_t i = 0; i < line_count; i++)
      {
          fputs(usage_lines[i], stderr);
      }
  }
  1369. int main(int argc, char** argv)
  1370. {
  1371. if (argc < 5)
  1372. {
  1373. show_usage();
  1374. return -1;
  1375. }
  1376. for (int i = 1; i < argc; i++)
  1377. {
  1378. if (argv[i][0] == '-')
  1379. {
  1380. show_usage();
  1381. return -1;
  1382. }
  1383. }
  1384. const char* inparam = argv[1];
  1385. const char* inbin = argv[2];
  1386. char* lists = argv[3];
  1387. const char* outtable = argv[4];
  1388. ncnn::Option opt;
  1389. opt.num_threads = 1;
  1390. opt.lightmode = false;
  1391. opt.use_fp16_packed = false;
  1392. opt.use_fp16_storage = false;
  1393. opt.use_fp16_arithmetic = false;
  1394. QuantNet net;
  1395. net.opt = opt;
  1396. net.load_param(inparam);
  1397. net.load_model(inbin);
  1398. net.init();
  1399. // load lists
  1400. net.listspaths = parse_comma_path_list(lists);
  1401. std::string method = "kl";
  1402. net.file_type = 0;
  1403. for (int i = 5; i < argc; i++)
  1404. {
  1405. // key=value
  1406. char* kv = argv[i];
  1407. char* eqs = strchr(kv, '=');
  1408. if (eqs == NULL)
  1409. {
  1410. fprintf(stderr, "unrecognized arg %s\n", kv);
  1411. continue;
  1412. }
  1413. // split k v
  1414. eqs[0] = '\0';
  1415. const char* key = kv;
  1416. char* value = eqs + 1;
  1417. // load mean norm shape
  1418. if (memcmp(key, "mean", 4) == 0)
  1419. net.means = parse_comma_float_array_list(value);
  1420. if (memcmp(key, "norm", 4) == 0)
  1421. net.norms = parse_comma_float_array_list(value);
  1422. if (memcmp(key, "shape", 5) == 0)
  1423. net.shapes = parse_comma_int_array_list(value);
  1424. if (memcmp(key, "pixel", 5) == 0)
  1425. net.type_to_pixels = parse_comma_pixel_type_list(value);
  1426. if (memcmp(key, "thread", 6) == 0)
  1427. net.quantize_num_threads = atoi(value);
  1428. if (memcmp(key, "method", 6) == 0)
  1429. method = std::string(value);
  1430. if (memcmp(key, "type", 4) == 0)
  1431. net.file_type = atoi(value);
  1432. }
  1433. // sanity check
  1434. const size_t input_blob_count = net.input_blobs.size();
  1435. if (net.listspaths.size() != input_blob_count)
  1436. {
  1437. fprintf(stderr, "expect %d lists, but got %d\n", (int)input_blob_count, (int)net.listspaths.size());
  1438. return -1;
  1439. }
  1440. if ((0 == net.file_type) && (net.means.size() != input_blob_count))
  1441. {
  1442. fprintf(stderr, "expect %d means, but got %d\n", (int)input_blob_count, (int)net.means.size());
  1443. return -1;
  1444. }
  1445. if ((0 == net.file_type) && (net.norms.size() != input_blob_count))
  1446. {
  1447. fprintf(stderr, "expect %d norms, but got %d\n", (int)input_blob_count, (int)net.norms.size());
  1448. return -1;
  1449. }
  1450. if (net.shapes.size() != input_blob_count)
  1451. {
  1452. fprintf(stderr, "expect %d shapes, but got %d\n", (int)input_blob_count, (int)net.shapes.size());
  1453. return -1;
  1454. }
  1455. if ((0 == net.file_type) && (net.type_to_pixels.size() != input_blob_count))
  1456. {
  1457. fprintf(stderr, "expect %d pixels, but got %d\n", (int)input_blob_count, (int)net.type_to_pixels.size());
  1458. return -1;
  1459. }
  1460. if (net.quantize_num_threads < 0)
  1461. {
  1462. fprintf(stderr, "malformed thread %d\n", net.quantize_num_threads);
  1463. return -1;
  1464. }
  1465. // print quantnet config
  1466. {
  1467. fprintf(stderr, "mean = ");
  1468. print_float_array_list(net.means);
  1469. fprintf(stderr, "\n");
  1470. fprintf(stderr, "norm = ");
  1471. print_float_array_list(net.norms);
  1472. fprintf(stderr, "\n");
  1473. fprintf(stderr, "shape = ");
  1474. print_int_array_list(net.shapes);
  1475. fprintf(stderr, "\n");
  1476. fprintf(stderr, "pixel = ");
  1477. print_pixel_type_list(net.type_to_pixels);
  1478. fprintf(stderr, "\n");
  1479. fprintf(stderr, "thread = %d\n", net.quantize_num_threads);
  1480. fprintf(stderr, "method = %s\n", method.c_str());
  1481. fprintf(stderr, "---------------------------------------\n");
  1482. }
  1483. if (method == "kl")
  1484. {
  1485. net.quantize_KL();
  1486. }
  1487. else if (method == "aciq")
  1488. {
  1489. net.quantize_ACIQ();
  1490. }
  1491. else if (method == "eq")
  1492. {
  1493. net.quantize_EQ();
  1494. }
  1495. else
  1496. {
  1497. fprintf(stderr, "not implemented yet !\n");
  1498. fprintf(stderr, "unknown method %s, expect kl / aciq / eq\n", method.c_str());
  1499. return -1;
  1500. }
  1501. net.print_quant_info();
  1502. net.save_table(outtable);
  1503. return 0;
  1504. }