You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

ncnn2table.cpp 56 kB

5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735
  // Tencent is pleased to support the open source community by making ncnn available.
  //
  // author:BUG1989 (https://github.com/BUG1989/) Long-term support.
  // author:JansonZhu (https://github.com/JansonZhu) Implemented the function of entropy calibration.
  //
  // Copyright (C) 2019 BUG1989. All rights reserved.
  // Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
  //
  // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  // in compliance with the License. You may obtain a copy of the License at
  //
  // https://opensource.org/licenses/BSD-3-Clause
  //
  // Unless required by applicable law or agreed to in writing, software distributed
  // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
  // CONDITIONS OF ANY KIND, either express or implied. See the License for the
  // specific language governing permissions and limitations under the License.
  #ifdef _MSC_VER
  #define _CRT_SECURE_NO_DEPRECATE
  #endif

  #include <float.h>
  #include <limits.h>
  #include <math.h>
  #include <stdio.h>
  #include <stdint.h>
  #include <stdlib.h>
  #include <string.h>

  #if defined(USE_NCNN_SIMPLEOCV)
  #include "simpleocv.h"
  #elif defined(USE_LOCAL_IMREADWRITE)
  #include "imreadwrite.h"
  #else
  #include <opencv2/core/core.hpp>
  #include <opencv2/highgui/highgui.hpp>
  #endif

  #include <algorithm>
  #include <string>
  #include <vector>

  // ncnn public header
  #include "benchmark.h"
  #include "cpu.h"
  #include "net.h"

  // ncnn private header
  #include "layer/convolution.h"
  #include "layer/convolutiondepthwise.h"
  #include "layer/innerproduct.h"
  // Calibration statistics collected for a single quantized blob.
  class QuantBlobStat
  {
  public:
      // All scalar fields start at zero; both histogram vectors start empty.
      QuantBlobStat()
          : threshold(0.f), absmax(0.f), total(0)
      {
      }

  public:
      float threshold; // clipping threshold; the blob scale is computed as 127 / threshold
      float absmax;    // largest absolute value seen over the calibration images

      // ACIQ
      int total; // element count of the blob (channels * w * h)

      // KL
      std::vector<uint64_t> histogram;     // per-bin counts of |value|
      std::vector<float> histogram_normed; // histogram divided by its total count
  };
  64. class QuantNet : public ncnn::Net
  65. {
  66. public:
  67. QuantNet();
  68. std::vector<ncnn::Blob>& blobs;
  69. std::vector<ncnn::Layer*>& layers;
  70. public:
  71. std::vector<std::vector<std::string> > listspaths;
  72. std::vector<std::vector<float> > means;
  73. std::vector<std::vector<float> > norms;
  74. std::vector<std::vector<int> > shapes;
  75. std::vector<int> type_to_pixels;
  76. int quantize_num_threads;
  77. public:
  78. int init();
  79. void print_quant_info() const;
  80. int save_table(const char* tablepath);
  81. int quantize_KL();
  82. int quantize_ACIQ();
  83. int quantize_EQ();
  84. public:
  85. std::vector<int> input_blobs;
  86. std::vector<int> conv_layers;
  87. std::vector<int> conv_bottom_blobs;
  88. std::vector<int> conv_top_blobs;
  89. // result
  90. std::vector<QuantBlobStat> quant_blob_stats;
  91. std::vector<ncnn::Mat> weight_scales;
  92. std::vector<ncnn::Mat> bottom_blob_scales;
  93. };
  94. QuantNet::QuantNet()
  95. : blobs(mutable_blobs()), layers(mutable_layers())
  96. {
  97. quantize_num_threads = ncnn::get_cpu_count();
  98. }
  99. int QuantNet::init()
  100. {
  101. // find all input layers
  102. for (int i = 0; i < (int)layers.size(); i++)
  103. {
  104. const ncnn::Layer* layer = layers[i];
  105. if (layer->type == "Input")
  106. {
  107. input_blobs.push_back(layer->tops[0]);
  108. }
  109. }
  110. // find all conv layers
  111. for (int i = 0; i < (int)layers.size(); i++)
  112. {
  113. const ncnn::Layer* layer = layers[i];
  114. if (layer->type == "Convolution" || layer->type == "ConvolutionDepthWise" || layer->type == "InnerProduct")
  115. {
  116. conv_layers.push_back(i);
  117. conv_bottom_blobs.push_back(layer->bottoms[0]);
  118. conv_top_blobs.push_back(layer->tops[0]);
  119. }
  120. }
  121. const int conv_layer_count = (int)conv_layers.size();
  122. const int conv_bottom_blob_count = (int)conv_bottom_blobs.size();
  123. quant_blob_stats.resize(conv_bottom_blob_count);
  124. weight_scales.resize(conv_layer_count);
  125. bottom_blob_scales.resize(conv_bottom_blob_count);
  126. return 0;
  127. }
  128. int QuantNet::save_table(const char* tablepath)
  129. {
  130. FILE* fp = fopen(tablepath, "wb");
  131. if (!fp)
  132. {
  133. fprintf(stderr, "fopen %s failed\n", tablepath);
  134. return -1;
  135. }
  136. const int conv_layer_count = (int)conv_layers.size();
  137. const int conv_bottom_blob_count = (int)conv_bottom_blobs.size();
  138. for (int i = 0; i < conv_layer_count; i++)
  139. {
  140. const ncnn::Mat& weight_scale = weight_scales[i];
  141. fprintf(fp, "%s_param_0 ", layers[conv_layers[i]]->name.c_str());
  142. for (int j = 0; j < weight_scale.w; j++)
  143. {
  144. fprintf(fp, "%f ", weight_scale[j]);
  145. }
  146. fprintf(fp, "\n");
  147. }
  148. for (int i = 0; i < conv_bottom_blob_count; i++)
  149. {
  150. const ncnn::Mat& bottom_blob_scale = bottom_blob_scales[i];
  151. fprintf(fp, "%s ", layers[conv_layers[i]]->name.c_str());
  152. for (int j = 0; j < bottom_blob_scale.w; j++)
  153. {
  154. fprintf(fp, "%f ", bottom_blob_scale[j]);
  155. }
  156. fprintf(fp, "\n");
  157. }
  158. fclose(fp);
  159. fprintf(stderr, "ncnn int8 calibration table create success, best wish for your int8 inference has a low accuracy loss...\\(^0^)/...233...\n");
  160. return 0;
  161. }
  162. void QuantNet::print_quant_info() const
  163. {
  164. for (int i = 0; i < (int)conv_bottom_blobs.size(); i++)
  165. {
  166. const QuantBlobStat& stat = quant_blob_stats[i];
  167. float scale = 127 / stat.threshold;
  168. fprintf(stderr, "%-40s : max = %-15f threshold = %-15f scale = %-15f\n", layers[conv_layers[i]]->name.c_str(), stat.absmax, stat.threshold, scale);
  169. }
  170. }
  171. /**
  172. * Read and resize image
  173. * shape is input as [w,h,...]
  174. * if w and h both are given, image will be resized to exactly size.
  175. * if w and h both are zero or negative, image will not be resized.
  176. * if only h is zero or negative, image's width will scaled resize to w, keeping aspect ratio.
  177. * if only w is zero or negative, image's height will scaled resize to h
  178. * @return ncnn::Mat
  179. */
  180. inline ncnn::Mat read_and_resize_image(const std::vector<int>& shape, const std::string& imagepath, int pixel_convert_type)
  181. {
  182. int target_w = shape[0];
  183. int target_h = shape[1];
  184. cv::Mat bgr = cv::imread(imagepath, 1);
  185. if (target_h <= 0 && target_w <= 0)
  186. {
  187. return ncnn::Mat::from_pixels(bgr.data, pixel_convert_type, bgr.cols, bgr.rows);
  188. }
  189. if (target_h <= 0 || target_w <= 0)
  190. {
  191. float scale = 1.0;
  192. if (target_h <= 0)
  193. {
  194. scale = 1.0 * bgr.cols / target_w;
  195. target_h = int(1.0 * bgr.rows / scale);
  196. }
  197. if (target_w <= 0)
  198. {
  199. scale = 1.0 * bgr.rows / target_h;
  200. target_w = int(1.0 * bgr.cols / scale);
  201. }
  202. }
  203. return ncnn::Mat::from_pixels_resize(bgr.data, pixel_convert_type, bgr.cols, bgr.rows, target_w, target_h);
  204. }
  /**
   * Kullback-Leibler divergence sum(a[i] * log(a[i] / b[i])) of two discrete
   * distributions.
   *
   * Terms with a[i] == 0 contribute nothing (the usual 0 * log 0 = 0
   * convention) instead of producing NaN. If the vectors differ in length only
   * the common prefix is compared, so neither is read out of bounds.
   *
   * @param a reference distribution
   * @param b approximating distribution; a zero entry paired with a[i] > 0
   *          makes the result infinite
   * @return the divergence accumulated in single precision, 0 for empty input
   */
  static float compute_kl_divergence(const std::vector<float>& a, const std::vector<float>& b)
  {
      const size_t length = std::min(a.size(), b.size());

      float result = 0;
      for (size_t i = 0; i < length; i++)
      {
          if (a[i] == 0.f)
              continue;

          result += a[i] * log(a[i] / b[i]);
      }

      return result;
  }
  // Computes weight and activation scales using KL-divergence (entropy) calibration.
  //
  // Steps, in order:
  //   1. per-output-channel (or per-group) weight scales from the channel's absmax
  //   2. one pass over the calibration images recording each conv bottom blob's absmax
  //   3. a second pass building a 2048-bin histogram of |value| / absmax per blob
  //   4. per blob, a search over candidate thresholds for the one whose 128-bin
  //      quantized distribution has the smallest KL divergence from the clipped
  //      original; the activation scale is then 127 / threshold
  //
  // Results are stored in weight_scales, quant_blob_stats and bottom_blob_scales.
  // Always returns 0.
  //
  // NOTE(review): listspaths[0] is read without checking that listspaths is non-empty.
  // NOTE(review): every "127 / absmax" style division below yields inf when the
  // absmax (or threshold) is 0, e.g. an all-zero weight channel or blob.
  int QuantNet::quantize_KL()
  {
      const int input_blob_count = (int)input_blobs.size();
      const int conv_layer_count = (int)conv_layers.size();
      const int conv_bottom_blob_count = (int)conv_bottom_blobs.size();

      // the image count is taken from the first input's list
      const int image_count = (int)listspaths[0].size();

      const int num_histogram_bins = 2048;

      // one allocator per worker, selected below by the OpenMP thread number
      std::vector<ncnn::UnlockedPoolAllocator> blob_allocators(quantize_num_threads);
      std::vector<ncnn::UnlockedPoolAllocator> workspace_allocators(quantize_num_threads);

      // initialize conv weight scales
      #pragma omp parallel for num_threads(quantize_num_threads)
      for (int i = 0; i < conv_layer_count; i++)
      {
          const ncnn::Layer* layer = layers[conv_layers[i]];

          if (layer->type == "Convolution")
          {
              const ncnn::Convolution* convolution = (const ncnn::Convolution*)layer;

              const int num_output = convolution->num_output;
              const int kernel_w = convolution->kernel_w;
              const int kernel_h = convolution->kernel_h;
              const int dilation_w = convolution->dilation_w;
              const int dilation_h = convolution->dilation_h;
              const int stride_w = convolution->stride_w;
              const int stride_h = convolution->stride_h;

              // number of weights belonging to one output channel
              const int weight_data_size_output = convolution->weight_data_size / num_output;

              // int8 winograd F43 needs weight data to use 6bit quantization
              // TODO proper condition for winograd 3x3 int8
              bool quant_6bit = false;
              if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
                  quant_6bit = true;

              weight_scales[i].create(num_output);

              for (int n = 0; n < num_output; n++)
              {
                  const ncnn::Mat weight_data_n = convolution->weight_data.range(weight_data_size_output * n, weight_data_size_output);

                  float absmax = 0.f;
                  for (int k = 0; k < weight_data_size_output; k++)
                  {
                      absmax = std::max(absmax, (float)fabs(weight_data_n[k]));
                  }

                  // 6-bit range is [-31, 31], 8-bit range is [-127, 127]
                  if (quant_6bit)
                  {
                      weight_scales[i][n] = 31 / absmax;
                  }
                  else
                  {
                      weight_scales[i][n] = 127 / absmax;
                  }
              }
          }

          if (layer->type == "ConvolutionDepthWise")
          {
              const ncnn::ConvolutionDepthWise* convolutiondepthwise = (const ncnn::ConvolutionDepthWise*)layer;

              const int group = convolutiondepthwise->group;

              // number of weights belonging to one group; one scale is stored per group
              const int weight_data_size_output = convolutiondepthwise->weight_data_size / group;

              // NOTE(review): this vector is never used in this branch
              std::vector<float> scales;

              weight_scales[i].create(group);

              for (int n = 0; n < group; n++)
              {
                  const ncnn::Mat weight_data_n = convolutiondepthwise->weight_data.range(weight_data_size_output * n, weight_data_size_output);

                  float absmax = 0.f;
                  for (int k = 0; k < weight_data_size_output; k++)
                  {
                      absmax = std::max(absmax, (float)fabs(weight_data_n[k]));
                  }

                  weight_scales[i][n] = 127 / absmax;
              }
          }

          if (layer->type == "InnerProduct")
          {
              const ncnn::InnerProduct* innerproduct = (const ncnn::InnerProduct*)layer;

              const int num_output = innerproduct->num_output;

              // number of weights belonging to one output
              const int weight_data_size_output = innerproduct->weight_data_size / num_output;

              weight_scales[i].create(num_output);

              for (int n = 0; n < num_output; n++)
              {
                  const ncnn::Mat weight_data_n = innerproduct->weight_data.range(weight_data_size_output * n, weight_data_size_output);

                  float absmax = 0.f;
                  for (int k = 0; k < weight_data_size_output; k++)
                  {
                      absmax = std::max(absmax, (float)fabs(weight_data_n[k]));
                  }

                  weight_scales[i][n] = 127 / absmax;
              }
          }
      }

      // count the absmax
      #pragma omp parallel for num_threads(quantize_num_threads) schedule(static, 1)
      for (int i = 0; i < image_count; i++)
      {
          // progress report every 100 images
          if (i % 100 == 0)
          {
              fprintf(stderr, "count the absmax %.2f%% [ %d / %d ]\n", i * 100.f / image_count, i, image_count);
          }

          ncnn::Extractor ex = create_extractor();

          // each worker uses its own allocator pair
          const int thread_num = ncnn::get_omp_thread_num();
          ex.set_blob_allocator(&blob_allocators[thread_num]);
          ex.set_workspace_allocator(&workspace_allocators[thread_num]);

          // feed image i of every input blob
          for (int j = 0; j < input_blob_count; j++)
          {
              const int type_to_pixel = type_to_pixels[j];
              const std::vector<float>& mean_vals = means[j];
              const std::vector<float>& norm_vals = norms[j];

              // images are read as BGR; request a conversion when the input wants another pixel type
              int pixel_convert_type = ncnn::Mat::PIXEL_BGR;
              if (type_to_pixel != pixel_convert_type)
              {
                  pixel_convert_type = pixel_convert_type | (type_to_pixel << ncnn::Mat::PIXEL_CONVERT_SHIFT);
              }

              ncnn::Mat in = read_and_resize_image(shapes[j], listspaths[j][i], pixel_convert_type);

              in.substract_mean_normalize(mean_vals.data(), norm_vals.data());

              ex.input(input_blobs[j], in);
          }

          for (int j = 0; j < conv_bottom_blob_count; j++)
          {
              ncnn::Mat out;
              ex.extract(conv_bottom_blobs[j], out);

              // count absmax
              {
                  // absmax of this blob for this image
                  float absmax = 0.f;

                  const int outc = out.c;
                  const int outsize = out.w * out.h;
                  for (int p = 0; p < outc; p++)
                  {
                      const float* ptr = out.channel(p);
                      for (int k = 0; k < outsize; k++)
                      {
                          absmax = std::max(absmax, (float)fabs(ptr[k]));
                      }
                  }

                  // quant_blob_stats is shared between workers, so the merge is serialized
                  #pragma omp critical
                  {
                      QuantBlobStat& stat = quant_blob_stats[j];
                      stat.absmax = std::max(stat.absmax, absmax);
                  }
              }
          }
      }

      // initialize histogram
      #pragma omp parallel for num_threads(quantize_num_threads)
      for (int i = 0; i < conv_bottom_blob_count; i++)
      {
          QuantBlobStat& stat = quant_blob_stats[i];

          stat.histogram.resize(num_histogram_bins, 0);
          stat.histogram_normed.resize(num_histogram_bins, 0);
      }

      // build histogram
      #pragma omp parallel for num_threads(quantize_num_threads) schedule(static, 1)
      for (int i = 0; i < image_count; i++)
      {
          // progress report every 100 images
          if (i % 100 == 0)
          {
              fprintf(stderr, "build histogram %.2f%% [ %d / %d ]\n", i * 100.f / image_count, i, image_count);
          }

          ncnn::Extractor ex = create_extractor();

          // each worker uses its own allocator pair
          const int thread_num = ncnn::get_omp_thread_num();
          ex.set_blob_allocator(&blob_allocators[thread_num]);
          ex.set_workspace_allocator(&workspace_allocators[thread_num]);

          // feed image i of every input blob (same preprocessing as the absmax pass)
          for (int j = 0; j < input_blob_count; j++)
          {
              const int type_to_pixel = type_to_pixels[j];
              const std::vector<float>& mean_vals = means[j];
              const std::vector<float>& norm_vals = norms[j];

              int pixel_convert_type = ncnn::Mat::PIXEL_BGR;
              if (type_to_pixel != pixel_convert_type)
              {
                  pixel_convert_type = pixel_convert_type | (type_to_pixel << ncnn::Mat::PIXEL_CONVERT_SHIFT);
              }

              ncnn::Mat in = read_and_resize_image(shapes[j], listspaths[j][i], pixel_convert_type);

              in.substract_mean_normalize(mean_vals.data(), norm_vals.data());

              ex.input(input_blobs[j], in);
          }

          for (int j = 0; j < conv_bottom_blob_count; j++)
          {
              ncnn::Mat out;
              ex.extract(conv_bottom_blobs[j], out);

              // count histogram bin
              {
                  // absmax over all images, recorded by the previous pass
                  const float absmax = quant_blob_stats[j].absmax;

                  // per-image histogram, merged into the shared one afterwards
                  std::vector<uint64_t> histogram(num_histogram_bins, 0);

                  const int outc = out.c;
                  const int outsize = out.w * out.h;
                  for (int p = 0; p < outc; p++)
                  {
                      const float* ptr = out.channel(p);
                      for (int k = 0; k < outsize; k++)
                      {
                          // exact zeros are not counted
                          if (ptr[k] == 0.f)
                              continue;

                          // map |value| in (0, absmax] onto a bin, clamping the top edge into the last bin
                          const int index = std::min((int)(fabs(ptr[k]) / absmax * num_histogram_bins), (num_histogram_bins - 1));

                          histogram[index] += 1;
                      }
                  }

                  // quant_blob_stats is shared between workers, so the merge is serialized
                  #pragma omp critical
                  {
                      QuantBlobStat& stat = quant_blob_stats[j];

                      for (int k = 0; k < num_histogram_bins; k++)
                      {
                          stat.histogram[k] += histogram[k];
                      }
                  }
              }
          }
      }

      // using kld to find the best threshold value
      #pragma omp parallel for num_threads(quantize_num_threads)
      for (int i = 0; i < conv_bottom_blob_count; i++)
      {
          QuantBlobStat& stat = quant_blob_stats[i];

          // normalize histogram bin
          // NOTE(review): a blob whose histogram is all zero makes sum 0 and the division below 0 / 0
          {
              uint64_t sum = 0;
              for (int j = 0; j < num_histogram_bins; j++)
              {
                  sum += stat.histogram[j];
              }

              for (int j = 0; j < num_histogram_bins; j++)
              {
                  stat.histogram_normed[j] = (float)(stat.histogram[j] / (double)sum);
              }
          }

          // number of quantized bins each candidate range is squeezed into
          const int target_bin = 128;

          int target_threshold = target_bin;
          float min_kl_divergence = FLT_MAX;

          // try every candidate clipping point, expressed as a number of histogram bins to keep
          for (int threshold = target_bin; threshold < num_histogram_bins; threshold++)
          {
              // small floor added to every bin so the log in the KL computation never sees zero
              const float kl_eps = 0.0001f;

              // reference distribution: the first `threshold` bins, with everything beyond folded into the last kept bin
              std::vector<float> clip_distribution(threshold, kl_eps);
              {
                  for (int j = 0; j < threshold; j++)
                  {
                      clip_distribution[j] += stat.histogram_normed[j];
                  }
                  for (int j = threshold; j < num_histogram_bins; j++)
                  {
                      clip_distribution[threshold - 1] += stat.histogram_normed[j];
                  }
              }

              // how many source bins fall into one quantized bin (fractional)
              const float num_per_bin = (float)threshold / target_bin;

              // quantized distribution: each entry ends up as the average mass per source bin covered by that quantized bin
              std::vector<float> quantize_distribution(target_bin, 0.f);
              {
                  // first quantized bin: covers [0, num_per_bin), only a right-hand partial source bin
                  {
                      const float end = num_per_bin;

                      const int right_lower = (int)floor(end);
                      const float right_scale = end - right_lower;

                      if (right_scale > 0)
                      {
                          quantize_distribution[0] += right_scale * stat.histogram_normed[right_lower];
                      }

                      for (int k = 0; k < right_lower; k++)
                      {
                          quantize_distribution[0] += stat.histogram_normed[k];
                      }

                      quantize_distribution[0] /= right_lower + right_scale;
                  }

                  // interior quantized bins: partial source bins on both sides plus the whole bins in between
                  for (int j = 1; j < target_bin - 1; j++)
                  {
                      const float start = j * num_per_bin;
                      const float end = (j + 1) * num_per_bin;

                      const int left_upper = (int)ceil(start);
                      const float left_scale = left_upper - start;

                      const int right_lower = (int)floor(end);
                      const float right_scale = end - right_lower;

                      if (left_scale > 0)
                      {
                          quantize_distribution[j] += left_scale * stat.histogram_normed[left_upper - 1];
                      }

                      if (right_scale > 0)
                      {
                          quantize_distribution[j] += right_scale * stat.histogram_normed[right_lower];
                      }

                      for (int k = left_upper; k < right_lower; k++)
                      {
                          quantize_distribution[j] += stat.histogram_normed[k];
                      }

                      quantize_distribution[j] /= right_lower - left_upper + left_scale + right_scale;
                  }

                  // last quantized bin: covers [threshold - num_per_bin, threshold), only a left-hand partial source bin
                  {
                      const float start = threshold - num_per_bin;

                      const int left_upper = (int)ceil(start);
                      const float left_scale = left_upper - start;

                      if (left_scale > 0)
                      {
                          quantize_distribution[target_bin - 1] += left_scale * stat.histogram_normed[left_upper - 1];
                      }

                      for (int k = left_upper; k < threshold; k++)
                      {
                          quantize_distribution[target_bin - 1] += stat.histogram_normed[k];
                      }

                      quantize_distribution[target_bin - 1] /= threshold - left_upper + left_scale;
                  }
              }

              // expanded distribution: the quantized bins spread back over the `threshold` source bins,
              // using the same boundaries and partial weights as above, on top of the kl_eps floor
              std::vector<float> expand_distribution(threshold, kl_eps);
              {
                  // first quantized bin
                  {
                      const float end = num_per_bin;

                      const int right_lower = (int)floor(end);
                      const float right_scale = end - right_lower;

                      if (right_scale > 0)
                      {
                          expand_distribution[right_lower] += right_scale * quantize_distribution[0];
                      }

                      for (int k = 0; k < right_lower; k++)
                      {
                          expand_distribution[k] += quantize_distribution[0];
                      }
                  }

                  // interior quantized bins
                  for (int j = 1; j < target_bin - 1; j++)
                  {
                      const float start = j * num_per_bin;
                      const float end = (j + 1) * num_per_bin;

                      const int left_upper = (int)ceil(start);
                      const float left_scale = left_upper - start;

                      const int right_lower = (int)floor(end);
                      const float right_scale = end - right_lower;

                      if (left_scale > 0)
                      {
                          expand_distribution[left_upper - 1] += left_scale * quantize_distribution[j];
                      }

                      if (right_scale > 0)
                      {
                          expand_distribution[right_lower] += right_scale * quantize_distribution[j];
                      }

                      for (int k = left_upper; k < right_lower; k++)
                      {
                          expand_distribution[k] += quantize_distribution[j];
                      }
                  }

                  // last quantized bin
                  {
                      const float start = threshold - num_per_bin;

                      const int left_upper = (int)ceil(start);
                      const float left_scale = left_upper - start;

                      if (left_scale > 0)
                      {
                          expand_distribution[left_upper - 1] += left_scale * quantize_distribution[target_bin - 1];
                      }

                      for (int k = left_upper; k < threshold; k++)
                      {
                          expand_distribution[k] += quantize_distribution[target_bin - 1];
                      }
                  }
              }

              // kl
              const float kl_divergence = compute_kl_divergence(clip_distribution, expand_distribution);

              // the best num of bin
              if (kl_divergence < min_kl_divergence)
              {
                  min_kl_divergence = kl_divergence;
                  target_threshold = threshold;
              }
          }

          // convert the winning bin count back to a value: bin index plus half a bin, times the bin width
          stat.threshold = (target_threshold + 0.5f) * stat.absmax / num_histogram_bins;
          float scale = 127 / stat.threshold;

          // a single scale per bottom blob
          bottom_blob_scales[i].create(1);
          bottom_blob_scales[i][0] = scale;
      }

      return 0;
  }
  /**
   * ACIQ clipping threshold under a Gaussian assumption.
   *
   * The standard deviation is estimated from the observed range as
   * (2 * absmax * gaussian_const) / sqrt(2 * ln(N)) and multiplied by the
   * tabulated clipping factor for the requested bit width.
   *
   * @param absmax   largest absolute value observed
   * @param N        number of samples the absmax was taken over
   * @param num_bits quantization bit width; values outside [1, 8] are clamped
   *                 to that range because the factor table has 8 entries
   * @return the clipping threshold. When N <= 1 the estimate is undefined
   *         (ln(N) <= 0), so absmax itself is returned, i.e. no clipping.
   */
  static float compute_aciq_gaussian_clip(float absmax, int N, int num_bits = 8)
  {
      // clipping factor per bit width, indexed by num_bits - 1
      const float alpha_gaussian[8] = {0, 1.71063519, 2.15159277, 2.55913646, 2.93620062, 3.28691474, 3.6151146, 3.92403714};

      // keep the table lookup in bounds
      if (num_bits < 1)
          num_bits = 1;
      if (num_bits > 8)
          num_bits = 8;

      // sqrt(2 * ln(N)) is zero or undefined here; dividing by it would give inf or NaN
      if (N <= 1)
          return absmax;

      const double gaussian_const = (0.5 * 0.35) * (1 + sqrt(3.14159265358979323846 * log(4.0)));

      const double sigma = (absmax * 2 * gaussian_const) / sqrt(2 * log((double)N));

      return (float)(alpha_gaussian[num_bits - 1] * sigma);
  }
  578. int QuantNet::quantize_ACIQ()
  579. {
  580. const int input_blob_count = (int)input_blobs.size();
  581. const int conv_layer_count = (int)conv_layers.size();
  582. const int conv_bottom_blob_count = (int)conv_bottom_blobs.size();
  583. const int image_count = (int)listspaths[0].size();
  584. std::vector<ncnn::UnlockedPoolAllocator> blob_allocators(quantize_num_threads);
  585. std::vector<ncnn::UnlockedPoolAllocator> workspace_allocators(quantize_num_threads);
  586. // initialize conv weight scales
  587. #pragma omp parallel for num_threads(quantize_num_threads)
  588. for (int i = 0; i < conv_layer_count; i++)
  589. {
  590. const ncnn::Layer* layer = layers[conv_layers[i]];
  591. if (layer->type == "Convolution")
  592. {
  593. const ncnn::Convolution* convolution = (const ncnn::Convolution*)layer;
  594. const int num_output = convolution->num_output;
  595. const int kernel_w = convolution->kernel_w;
  596. const int kernel_h = convolution->kernel_h;
  597. const int dilation_w = convolution->dilation_w;
  598. const int dilation_h = convolution->dilation_h;
  599. const int stride_w = convolution->stride_w;
  600. const int stride_h = convolution->stride_h;
  601. const int weight_data_size_output = convolution->weight_data_size / num_output;
  602. // int8 winograd F43 needs weight data to use 6bit quantization
  603. // TODO proper condition for winograd 3x3 int8
  604. bool quant_6bit = false;
  605. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  606. quant_6bit = true;
  607. weight_scales[i].create(num_output);
  608. for (int n = 0; n < num_output; n++)
  609. {
  610. const ncnn::Mat weight_data_n = convolution->weight_data.range(weight_data_size_output * n, weight_data_size_output);
  611. float absmax = 0.f;
  612. for (int k = 0; k < weight_data_size_output; k++)
  613. {
  614. absmax = std::max(absmax, (float)fabs(weight_data_n[k]));
  615. }
  616. if (quant_6bit)
  617. {
  618. const float threshold = compute_aciq_gaussian_clip(absmax, weight_data_size_output, 6);
  619. weight_scales[i][n] = 31 / threshold;
  620. }
  621. else
  622. {
  623. const float threshold = compute_aciq_gaussian_clip(absmax, weight_data_size_output);
  624. weight_scales[i][n] = 127 / threshold;
  625. }
  626. }
  627. }
  628. if (layer->type == "ConvolutionDepthWise")
  629. {
  630. const ncnn::ConvolutionDepthWise* convolutiondepthwise = (const ncnn::ConvolutionDepthWise*)layer;
  631. const int group = convolutiondepthwise->group;
  632. const int weight_data_size_output = convolutiondepthwise->weight_data_size / group;
  633. std::vector<float> scales;
  634. weight_scales[i].create(group);
  635. for (int n = 0; n < group; n++)
  636. {
  637. const ncnn::Mat weight_data_n = convolutiondepthwise->weight_data.range(weight_data_size_output * n, weight_data_size_output);
  638. float absmax = 0.f;
  639. for (int k = 0; k < weight_data_size_output; k++)
  640. {
  641. absmax = std::max(absmax, (float)fabs(weight_data_n[k]));
  642. }
  643. const float threshold = compute_aciq_gaussian_clip(absmax, weight_data_size_output);
  644. weight_scales[i][n] = 127 / threshold;
  645. }
  646. }
  647. if (layer->type == "InnerProduct")
  648. {
  649. const ncnn::InnerProduct* innerproduct = (const ncnn::InnerProduct*)layer;
  650. const int num_output = innerproduct->num_output;
  651. const int weight_data_size_output = innerproduct->weight_data_size / num_output;
  652. weight_scales[i].create(num_output);
  653. for (int n = 0; n < num_output; n++)
  654. {
  655. const ncnn::Mat weight_data_n = innerproduct->weight_data.range(weight_data_size_output * n, weight_data_size_output);
  656. float absmax = 0.f;
  657. for (int k = 0; k < weight_data_size_output; k++)
  658. {
  659. absmax = std::max(absmax, (float)fabs(weight_data_n[k]));
  660. }
  661. const float threshold = compute_aciq_gaussian_clip(absmax, weight_data_size_output);
  662. weight_scales[i][n] = 127 / threshold;
  663. }
  664. }
  665. }
  666. // count the absmax
  667. #pragma omp parallel for num_threads(quantize_num_threads) schedule(static, 1)
  668. for (int i = 0; i < image_count; i++)
  669. {
  670. if (i % 100 == 0)
  671. {
  672. fprintf(stderr, "count the absmax %.2f%% [ %d / %d ]\n", i * 100.f / image_count, i, image_count);
  673. }
  674. ncnn::Extractor ex = create_extractor();
  675. const int thread_num = ncnn::get_omp_thread_num();
  676. ex.set_blob_allocator(&blob_allocators[thread_num]);
  677. ex.set_workspace_allocator(&workspace_allocators[thread_num]);
  678. for (int j = 0; j < input_blob_count; j++)
  679. {
  680. const int type_to_pixel = type_to_pixels[j];
  681. const std::vector<float>& mean_vals = means[j];
  682. const std::vector<float>& norm_vals = norms[j];
  683. int pixel_convert_type = ncnn::Mat::PIXEL_BGR;
  684. if (type_to_pixel != pixel_convert_type)
  685. {
  686. pixel_convert_type = pixel_convert_type | (type_to_pixel << ncnn::Mat::PIXEL_CONVERT_SHIFT);
  687. }
  688. ncnn::Mat in = read_and_resize_image(shapes[j], listspaths[j][i], pixel_convert_type);
  689. in.substract_mean_normalize(mean_vals.data(), norm_vals.data());
  690. ex.input(input_blobs[j], in);
  691. }
  692. for (int j = 0; j < conv_bottom_blob_count; j++)
  693. {
  694. ncnn::Mat out;
  695. ex.extract(conv_bottom_blobs[j], out);
  696. // count absmax
  697. {
  698. float absmax = 0.f;
  699. const int outc = out.c;
  700. const int outsize = out.w * out.h;
  701. for (int p = 0; p < outc; p++)
  702. {
  703. const float* ptr = out.channel(p);
  704. for (int k = 0; k < outsize; k++)
  705. {
  706. absmax = std::max(absmax, (float)fabs(ptr[k]));
  707. }
  708. }
  709. #pragma omp critical
  710. {
  711. QuantBlobStat& stat = quant_blob_stats[j];
  712. stat.absmax = std::max(stat.absmax, absmax);
  713. stat.total = outc * outsize;
  714. }
  715. }
  716. }
  717. }
  718. // alpha gaussian
  719. #pragma omp parallel for num_threads(quantize_num_threads)
  720. for (int i = 0; i < conv_bottom_blob_count; i++)
  721. {
  722. QuantBlobStat& stat = quant_blob_stats[i];
  723. stat.threshold = compute_aciq_gaussian_clip(stat.absmax, stat.total);
  724. float scale = 127 / stat.threshold;
  725. bottom_blob_scales[i].create(1);
  726. bottom_blob_scales[i][0] = scale;
  727. }
  728. return 0;
  729. }
  730. static float cosine_similarity(const ncnn::Mat& a, const ncnn::Mat& b)
  731. {
  732. const int chanenls = a.c;
  733. const int size = a.w * a.h;
  734. float sa = 0;
  735. float sb = 0;
  736. float sum = 0;
  737. for (int p = 0; p < chanenls; p++)
  738. {
  739. const float* pa = a.channel(p);
  740. const float* pb = b.channel(p);
  741. for (int i = 0; i < size; i++)
  742. {
  743. sa += pa[i] * pa[i];
  744. sb += pb[i] * pb[i];
  745. sum += pa[i] * pb[i];
  746. }
  747. }
  748. float sim = (float)sum / sqrt(sa) / sqrt(sb);
  749. return sim;
  750. }
  751. static int get_layer_param(const ncnn::Layer* layer, ncnn::ParamDict& pd)
  752. {
  753. if (layer->type == "Convolution")
  754. {
  755. ncnn::Convolution* convolution = (ncnn::Convolution*)layer;
  756. pd.set(0, convolution->num_output);
  757. pd.set(1, convolution->kernel_w);
  758. pd.set(11, convolution->kernel_h);
  759. pd.set(2, convolution->dilation_w);
  760. pd.set(12, convolution->dilation_h);
  761. pd.set(3, convolution->stride_w);
  762. pd.set(13, convolution->stride_h);
  763. pd.set(4, convolution->pad_left);
  764. pd.set(15, convolution->pad_right);
  765. pd.set(14, convolution->pad_top);
  766. pd.set(16, convolution->pad_bottom);
  767. pd.set(18, convolution->pad_value);
  768. pd.set(5, convolution->bias_term);
  769. pd.set(6, convolution->weight_data_size);
  770. pd.set(8, convolution->int8_scale_term);
  771. pd.set(9, convolution->activation_type);
  772. pd.set(10, convolution->activation_params);
  773. }
  774. else if (layer->type == "ConvolutionDepthWise")
  775. {
  776. ncnn::ConvolutionDepthWise* convolutiondepthwise = (ncnn::ConvolutionDepthWise*)layer;
  777. pd.set(0, convolutiondepthwise->num_output);
  778. pd.set(1, convolutiondepthwise->kernel_w);
  779. pd.set(11, convolutiondepthwise->kernel_h);
  780. pd.set(2, convolutiondepthwise->dilation_w);
  781. pd.set(12, convolutiondepthwise->dilation_h);
  782. pd.set(3, convolutiondepthwise->stride_w);
  783. pd.set(13, convolutiondepthwise->stride_h);
  784. pd.set(4, convolutiondepthwise->pad_left);
  785. pd.set(15, convolutiondepthwise->pad_right);
  786. pd.set(14, convolutiondepthwise->pad_top);
  787. pd.set(16, convolutiondepthwise->pad_bottom);
  788. pd.set(18, convolutiondepthwise->pad_value);
  789. pd.set(5, convolutiondepthwise->bias_term);
  790. pd.set(6, convolutiondepthwise->weight_data_size);
  791. pd.set(7, convolutiondepthwise->group);
  792. pd.set(8, convolutiondepthwise->int8_scale_term);
  793. pd.set(9, convolutiondepthwise->activation_type);
  794. pd.set(10, convolutiondepthwise->activation_params);
  795. }
  796. else if (layer->type == "InnerProduct")
  797. {
  798. ncnn::InnerProduct* innerproduct = (ncnn::InnerProduct*)layer;
  799. pd.set(0, innerproduct->num_output);
  800. pd.set(1, innerproduct->bias_term);
  801. pd.set(2, innerproduct->weight_data_size);
  802. pd.set(8, innerproduct->int8_scale_term);
  803. pd.set(9, innerproduct->activation_type);
  804. pd.set(10, innerproduct->activation_params);
  805. }
  806. else
  807. {
  808. fprintf(stderr, "unexpected layer type %s in get_layer_param\n", layer->type.c_str());
  809. return -1;
  810. }
  811. return 0;
  812. }
  813. static int get_layer_weights(const ncnn::Layer* layer, std::vector<ncnn::Mat>& weights)
  814. {
  815. if (layer->type == "Convolution")
  816. {
  817. ncnn::Convolution* convolution = (ncnn::Convolution*)layer;
  818. weights.push_back(convolution->weight_data);
  819. if (convolution->bias_term)
  820. weights.push_back(convolution->bias_data);
  821. }
  822. else if (layer->type == "ConvolutionDepthWise")
  823. {
  824. ncnn::ConvolutionDepthWise* convolutiondepthwise = (ncnn::ConvolutionDepthWise*)layer;
  825. weights.push_back(convolutiondepthwise->weight_data);
  826. if (convolutiondepthwise->bias_term)
  827. weights.push_back(convolutiondepthwise->bias_data);
  828. }
  829. else if (layer->type == "InnerProduct")
  830. {
  831. ncnn::InnerProduct* innerproduct = (ncnn::InnerProduct*)layer;
  832. weights.push_back(innerproduct->weight_data);
  833. if (innerproduct->bias_term)
  834. weights.push_back(innerproduct->bias_data);
  835. }
  836. else
  837. {
  838. fprintf(stderr, "unexpected layer type %s in get_layer_weights\n", layer->type.c_str());
  839. return -1;
  840. }
  841. return 0;
  842. }
  843. int QuantNet::quantize_EQ()
  844. {
  845. // find the initial scale via KL
  846. quantize_KL();
  847. print_quant_info();
  848. const int input_blob_count = (int)input_blobs.size();
  849. const int conv_layer_count = (int)conv_layers.size();
  850. const int conv_bottom_blob_count = (int)conv_bottom_blobs.size();
  851. std::vector<ncnn::UnlockedPoolAllocator> blob_allocators(quantize_num_threads);
  852. std::vector<ncnn::UnlockedPoolAllocator> workspace_allocators(quantize_num_threads);
  853. // max 50 images for EQ
  854. const int image_count = std::min((int)listspaths[0].size(), 50);
  855. const float scale_range_lower = 0.5f;
  856. const float scale_range_upper = 2.0f;
  857. const int search_steps = 100;
  858. for (int i = 0; i < conv_layer_count; i++)
  859. {
  860. ncnn::Mat& weight_scale = weight_scales[i];
  861. ncnn::Mat& bottom_blob_scale = bottom_blob_scales[i];
  862. const ncnn::Layer* layer = layers[conv_layers[i]];
  863. // search weight scale
  864. for (int j = 0; j < weight_scale.w; j++)
  865. {
  866. const float scale = weight_scale[j];
  867. const float scale_lower = scale * scale_range_lower;
  868. const float scale_upper = scale * scale_range_upper;
  869. const float scale_step = (scale_upper - scale_lower) / search_steps;
  870. std::vector<double> avgsims(search_steps, 0.0);
  871. #pragma omp parallel for num_threads(quantize_num_threads) schedule(static, 1)
  872. for (int ii = 0; ii < image_count; ii++)
  873. {
  874. if (ii % 100 == 0)
  875. {
  876. fprintf(stderr, "search weight scale %.2f%% [ %d / %d ] for %d / %d of %d / %d\n", ii * 100.f / image_count, ii, image_count, j, weight_scale.w, i, conv_layer_count);
  877. }
  878. ncnn::Extractor ex = create_extractor();
  879. const int thread_num = ncnn::get_omp_thread_num();
  880. ex.set_blob_allocator(&blob_allocators[thread_num]);
  881. ex.set_workspace_allocator(&workspace_allocators[thread_num]);
  882. for (int jj = 0; jj < input_blob_count; jj++)
  883. {
  884. const int type_to_pixel = type_to_pixels[jj];
  885. const std::vector<float>& mean_vals = means[jj];
  886. const std::vector<float>& norm_vals = norms[jj];
  887. int pixel_convert_type = ncnn::Mat::PIXEL_BGR;
  888. if (type_to_pixel != pixel_convert_type)
  889. {
  890. pixel_convert_type = pixel_convert_type | (type_to_pixel << ncnn::Mat::PIXEL_CONVERT_SHIFT);
  891. }
  892. ncnn::Mat in = read_and_resize_image(shapes[jj], listspaths[jj][ii], pixel_convert_type);
  893. in.substract_mean_normalize(mean_vals.data(), norm_vals.data());
  894. ex.input(input_blobs[jj], in);
  895. }
  896. ncnn::Mat in;
  897. ex.extract(conv_bottom_blobs[i], in);
  898. ncnn::Mat out;
  899. ex.extract(conv_top_blobs[i], out);
  900. ncnn::Layer* layer_int8 = ncnn::create_layer(layer->typeindex);
  901. ncnn::ParamDict pd;
  902. get_layer_param(layer, pd);
  903. pd.set(8, 1); //int8_scale_term
  904. layer_int8->load_param(pd);
  905. std::vector<float> sims(search_steps);
  906. for (int k = 0; k < search_steps; k++)
  907. {
  908. ncnn::Mat new_weight_scale = weight_scale.clone();
  909. new_weight_scale[j] = scale_lower + k * scale_step;
  910. std::vector<ncnn::Mat> weights;
  911. get_layer_weights(layer, weights);
  912. weights.push_back(new_weight_scale);
  913. weights.push_back(bottom_blob_scale);
  914. layer_int8->load_model(ncnn::ModelBinFromMatArray(weights.data()));
  915. ncnn::Option opt_int8;
  916. opt_int8.use_packing_layout = false;
  917. layer_int8->create_pipeline(opt_int8);
  918. ncnn::Mat out_int8;
  919. layer_int8->forward(in, out_int8, opt_int8);
  920. layer_int8->destroy_pipeline(opt_int8);
  921. sims[k] = cosine_similarity(out, out_int8);
  922. }
  923. delete layer_int8;
  924. #pragma omp critical
  925. {
  926. for (int k = 0; k < search_steps; k++)
  927. {
  928. avgsims[k] += sims[k];
  929. }
  930. }
  931. }
  932. double max_avgsim = 0.0;
  933. float new_scale = scale;
  934. // find the scale with min cosine distance
  935. for (int k = 0; k < search_steps; k++)
  936. {
  937. if (max_avgsim < avgsims[k])
  938. {
  939. max_avgsim = avgsims[k];
  940. new_scale = scale_lower + k * scale_step;
  941. }
  942. }
  943. fprintf(stderr, "%s w %d = %f -> %f\n", layer->name.c_str(), j, scale, new_scale);
  944. weight_scale[j] = new_scale;
  945. }
  946. // search bottom blob scale
  947. for (int j = 0; j < bottom_blob_scale.w; j++)
  948. {
  949. const float scale = bottom_blob_scale[j];
  950. const float scale_lower = scale * scale_range_lower;
  951. const float scale_upper = scale * scale_range_upper;
  952. const float scale_step = (scale_upper - scale_lower) / search_steps;
  953. std::vector<double> avgsims(search_steps, 0.0);
  954. #pragma omp parallel for num_threads(quantize_num_threads) schedule(static, 1)
  955. for (int ii = 0; ii < image_count; ii++)
  956. {
  957. if (ii % 100 == 0)
  958. {
  959. fprintf(stderr, "search bottom blob scale %.2f%% [ %d / %d ] for %d / %d of %d / %d\n", ii * 100.f / image_count, ii, image_count, j, bottom_blob_scale.w, i, conv_layer_count);
  960. }
  961. ncnn::Extractor ex = create_extractor();
  962. const int thread_num = ncnn::get_omp_thread_num();
  963. ex.set_blob_allocator(&blob_allocators[thread_num]);
  964. ex.set_workspace_allocator(&workspace_allocators[thread_num]);
  965. for (int jj = 0; jj < input_blob_count; jj++)
  966. {
  967. const int type_to_pixel = type_to_pixels[jj];
  968. const std::vector<float>& mean_vals = means[jj];
  969. const std::vector<float>& norm_vals = norms[jj];
  970. int pixel_convert_type = ncnn::Mat::PIXEL_BGR;
  971. if (type_to_pixel != pixel_convert_type)
  972. {
  973. pixel_convert_type = pixel_convert_type | (type_to_pixel << ncnn::Mat::PIXEL_CONVERT_SHIFT);
  974. }
  975. ncnn::Mat in = read_and_resize_image(shapes[jj], listspaths[jj][ii], pixel_convert_type);
  976. in.substract_mean_normalize(mean_vals.data(), norm_vals.data());
  977. ex.input(input_blobs[jj], in);
  978. }
  979. ncnn::Mat in;
  980. ex.extract(conv_bottom_blobs[i], in);
  981. ncnn::Mat out;
  982. ex.extract(conv_top_blobs[i], out);
  983. ncnn::Layer* layer_int8 = ncnn::create_layer(layer->typeindex);
  984. ncnn::ParamDict pd;
  985. get_layer_param(layer, pd);
  986. pd.set(8, 1); //int8_scale_term
  987. layer_int8->load_param(pd);
  988. std::vector<float> sims(search_steps);
  989. for (int k = 0; k < search_steps; k++)
  990. {
  991. ncnn::Mat new_bottom_blob_scale = bottom_blob_scale.clone();
  992. new_bottom_blob_scale[j] = scale_lower + k * scale_step;
  993. std::vector<ncnn::Mat> weights;
  994. get_layer_weights(layer, weights);
  995. weights.push_back(weight_scale);
  996. weights.push_back(new_bottom_blob_scale);
  997. layer_int8->load_model(ncnn::ModelBinFromMatArray(weights.data()));
  998. ncnn::Option opt_int8;
  999. opt_int8.use_packing_layout = false;
  1000. layer_int8->create_pipeline(opt_int8);
  1001. ncnn::Mat out_int8;
  1002. layer_int8->forward(in, out_int8, opt_int8);
  1003. layer_int8->destroy_pipeline(opt_int8);
  1004. sims[k] = cosine_similarity(out, out_int8);
  1005. }
  1006. delete layer_int8;
  1007. #pragma omp critical
  1008. {
  1009. for (int k = 0; k < search_steps; k++)
  1010. {
  1011. avgsims[k] += sims[k];
  1012. }
  1013. }
  1014. }
  1015. double max_avgsim = 0.0;
  1016. float new_scale = scale;
  1017. // find the scale with min cosine distance
  1018. for (int k = 0; k < search_steps; k++)
  1019. {
  1020. if (max_avgsim < avgsims[k])
  1021. {
  1022. max_avgsim = avgsims[k];
  1023. new_scale = scale_lower + k * scale_step;
  1024. }
  1025. }
  1026. fprintf(stderr, "%s b %d = %f -> %f\n", layer->name.c_str(), j, scale, new_scale);
  1027. bottom_blob_scale[j] = new_scale;
  1028. }
  1029. // update quant info
  1030. QuantBlobStat& stat = quant_blob_stats[i];
  1031. stat.threshold = 127 / bottom_blob_scale[0];
  1032. }
  1033. return 0;
  1034. }
  // Loads one path list per comma separated list file named in s.
  //
  // s : comma separated list file names; modified in place by strtok.
  // Each list file holds one entry per line.  The first whitespace delimited
  // token of a line is taken as the path, blank lines are skipped.  Lines and
  // paths may be of any length.
  // Returns one vector of paths per list file, in the order given.  Parsing
  // stops at the first list file that cannot be opened (reported on stderr),
  // so the result may hold fewer lists than were named.
  static std::vector<std::vector<std::string> > parse_comma_path_list(char* s)
  {
      std::vector<std::vector<std::string> > aps;

      char* pch = strtok(s, ",");
      while (pch != NULL)
      {
          FILE* fp = fopen(pch, "rb");
          if (!fp)
          {
              fprintf(stderr, "fopen %s failed\n", pch);
              break;
          }

          std::vector<std::string> paths;

          // one filepath per line
          std::string line;
          char chunk[1024];
          bool at_end = false;
          while (!at_end)
          {
              // assemble a whole line, however long, from fixed size chunks
              line.clear();
              for (;;)
              {
                  if (!fgets(chunk, sizeof(chunk), fp))
                  {
                      at_end = true;
                      break;
                  }

                  line += chunk;
                  if (line[line.size() - 1] == '\n')
                      break;
              }

              // first whitespace delimited token
              size_t begin = 0;
              while (begin < line.size() && isspace((unsigned char)line[begin]))
                  begin++;

              size_t end = begin;
              while (end < line.size() && !isspace((unsigned char)line[end]))
                  end++;

              if (end > begin)
                  paths.push_back(line.substr(begin, end - begin));
          }

          fclose(fp);

          aps.push_back(paths);

          pch = strtok(NULL, ",");
      }

      return aps;
  }
  // Converts a decimal number string to float without going through the
  // C library number parsers.
  //
  // vstr : NUL terminated text of the form [+-]digits[.digits][(e|E)[+-]digits]
  // Parsing stops at the first character that does not fit this form; missing
  // parts count as 0.  An exponent too large for a double yields +-inf (or 0
  // for a negative exponent) in bounded time.
  static float vstr_to_float(const char vstr[20])
  {
      const char* p = vstr;

      // sign
      const bool negative = *p == '-';
      if (*p == '+' || *p == '-')
      {
          p++;
      }

      // digits before decimal point or exponent
      // isdigit is only defined for unsigned char values, hence the casts
      uint64_t v1 = 0;
      while (isdigit((unsigned char)*p))
      {
          v1 = v1 * 10 + (*p - '0');
          p++;
      }

      double v = (double)v1;

      // digits after decimal point
      if (*p == '.')
      {
          p++;

          uint64_t pow10 = 1;
          uint64_t v2 = 0;
          while (isdigit((unsigned char)*p))
          {
              v2 = v2 * 10 + (*p - '0');
              pow10 *= 10;
              p++;
          }

          v += v2 / (double)pow10;
      }

      // exponent
      if (*p == 'e' || *p == 'E')
      {
          p++;

          // sign of exponent
          const bool positive_exponent = *p != '-';
          if (*p == '+' || *p == '-')
          {
              p++;
          }

          // digits of exponent
          // 10^400 is already infinite as a double, so larger exponents are
          // clamped; this keeps the scaling loops below short
          const uint64_t max_expon = 400;
          uint64_t expon = 0;
          while (isdigit((unsigned char)*p))
          {
              if (expon <= max_expon)
              {
                  expon = expon * 10 + (*p - '0');
              }
              p++;
          }
          if (expon > max_expon)
          {
              expon = max_expon;
          }

          double scale = 1.0;
          while (expon >= 8)
          {
              scale *= 1e8;
              expon -= 8;
          }
          while (expon > 0)
          {
              scale *= 10.0;
              expon -= 1;
          }

          v = positive_exponent ? v * scale : v / scale;
      }

      return negative ? (float)-v : (float)v;
  }
  1132. static std::vector<std::vector<float> > parse_comma_float_array_list(char* s)
  1133. {
  1134. std::vector<std::vector<float> > aaf;
  1135. char* pch = strtok(s, "[]");
  1136. while (pch != NULL)
  1137. {
  1138. // parse a,b,c
  1139. char vstr[20];
  1140. int nconsumed = 0;
  1141. int nscan = sscanf(pch, "%19[^,]%n", vstr, &nconsumed);
  1142. if (nscan == 1)
  1143. {
  1144. // ok we get array
  1145. pch += nconsumed;
  1146. std::vector<float> af;
  1147. float v = vstr_to_float(vstr);
  1148. af.push_back(v);
  1149. nscan = sscanf(pch, ",%19[^,]%n", vstr, &nconsumed);
  1150. while (nscan == 1)
  1151. {
  1152. pch += nconsumed;
  1153. float v = vstr_to_float(vstr);
  1154. af.push_back(v);
  1155. nscan = sscanf(pch, ",%19[^,]%n", vstr, &nconsumed);
  1156. }
  1157. // array end
  1158. aaf.push_back(af);
  1159. }
  1160. pch = strtok(NULL, "[]");
  1161. }
  1162. return aaf;
  1163. }
  // Parses a list of bracketed int arrays such as "[224,224,3],[0,0]".
  //
  // s : text to parse; modified in place by strtok.
  // The text is split on '[' and ']'.  Every piece that starts with an integer
  // becomes one array of its comma separated integers.  Pieces that do not
  // start with an integer (such as the "," between two arrays) are skipped.
  // Returns the arrays in order of appearance.
  static std::vector<std::vector<int> > parse_comma_int_array_list(char* s)
  {
      std::vector<std::vector<int> > arrays;

      for (char* piece = strtok(s, "[]"); piece != NULL; piece = strtok(NULL, "[]"))
      {
          // parse a,b,c
          int value;
          int nconsumed = 0;
          if (sscanf(piece, "%d%n", &value, &nconsumed) != 1)
              continue;

          // ok we get array
          std::vector<int> values;
          do
          {
              piece += nconsumed;
              values.push_back(value);
          } while (sscanf(piece, ",%d%n", &value, &nconsumed) == 1);

          // array end
          arrays.push_back(values);
      }

      return arrays;
  }
  1194. static std::vector<int> parse_comma_pixel_type_list(char* s)
  1195. {
  1196. std::vector<int> aps;
  1197. char* pch = strtok(s, ",");
  1198. while (pch != NULL)
  1199. {
  1200. // RAW/RGB/BGR/GRAY/RGBA/BGRA
  1201. if (strcmp(pch, "RAW") == 0)
  1202. aps.push_back(-233);
  1203. if (strcmp(pch, "RGB") == 0)
  1204. aps.push_back(ncnn::Mat::PIXEL_RGB);
  1205. if (strcmp(pch, "BGR") == 0)
  1206. aps.push_back(ncnn::Mat::PIXEL_BGR);
  1207. if (strcmp(pch, "GRAY") == 0)
  1208. aps.push_back(ncnn::Mat::PIXEL_GRAY);
  1209. if (strcmp(pch, "RGBA") == 0)
  1210. aps.push_back(ncnn::Mat::PIXEL_RGBA);
  1211. if (strcmp(pch, "BGRA") == 0)
  1212. aps.push_back(ncnn::Mat::PIXEL_BGRA);
  1213. pch = strtok(NULL, ",");
  1214. }
  1215. return aps;
  1216. }
  // Writes list to stderr as "[a,b,c],[d,e]" with every value printed via %f.
  // No trailing newline is written.
  static void print_float_array_list(const std::vector<std::vector<float> >& list)
  {
      const size_t array_count = list.size();
      for (size_t i = 0; i < array_count; i++)
      {
          // separators go in front of every element but the first
          if (i > 0)
              fprintf(stderr, ",");

          const std::vector<float>& values = list[i];
          const size_t value_count = values.size();

          fprintf(stderr, "[");
          for (size_t j = 0; j < value_count; j++)
          {
              if (j > 0)
                  fprintf(stderr, ",");
              fprintf(stderr, "%f", values[j]);
          }
          fprintf(stderr, "]");
      }
  }
  // Writes list to stderr as "[a,b,c],[d,e]" with every value printed via %d.
  // No trailing newline is written.
  static void print_int_array_list(const std::vector<std::vector<int> >& list)
  {
      const size_t array_count = list.size();
      for (size_t i = 0; i < array_count; i++)
      {
          // separators go in front of every element but the first
          if (i > 0)
              fprintf(stderr, ",");

          const std::vector<int>& values = list[i];
          const size_t value_count = values.size();

          fprintf(stderr, "[");
          for (size_t j = 0; j < value_count; j++)
          {
              if (j > 0)
                  fprintf(stderr, ",");
              fprintf(stderr, "%d", values[j]);
          }
          fprintf(stderr, "]");
      }
  }
  1251. static void print_pixel_type_list(const std::vector<int>& list)
  1252. {
  1253. for (size_t i = 0; i < list.size(); i++)
  1254. {
  1255. const int type = list[i];
  1256. if (type == -233)
  1257. fprintf(stderr, "RAW");
  1258. if (type == ncnn::Mat::PIXEL_RGB)
  1259. fprintf(stderr, "RGB");
  1260. if (type == ncnn::Mat::PIXEL_BGR)
  1261. fprintf(stderr, "BGR");
  1262. if (type == ncnn::Mat::PIXEL_GRAY)
  1263. fprintf(stderr, "GRAY");
  1264. if (type == ncnn::Mat::PIXEL_RGBA)
  1265. fprintf(stderr, "RGBA");
  1266. if (type == ncnn::Mat::PIXEL_BGRA)
  1267. fprintf(stderr, "BGRA");
  1268. if (i != list.size() - 1)
  1269. fprintf(stderr, ",");
  1270. }
  1271. }
  // Writes the command line help text to stderr.
  static void show_usage()
  {
      static const char* const lines[] = {
          "Usage: ncnn2table [ncnnparam] [ncnnbin] [list,...] [ncnntable] [(key=value)...]\n",
          " mean=[104.0,117.0,123.0],...\n",
          " norm=[1.0,1.0,1.0],...\n",
          " shape=[224,224,3],...[w,h,c] or [w,h] **[0,0] will not resize\n",
          " pixel=RAW/RGB/BGR/GRAY/RGBA/BGRA,...\n",
          " thread=8\n",
          " method=kl/aciq/eq\n",
          "Sample usage: ncnn2table squeezenet.param squeezenet.bin imagelist.txt squeezenet.table mean=[104.0,117.0,123.0] norm=[1.0,1.0,1.0] shape=[227,227,3] pixel=BGR method=kl\n"
      };

      const size_t line_count = sizeof(lines) / sizeof(lines[0]);
      for (size_t i = 0; i < line_count; i++)
      {
          fputs(lines[i], stderr);
      }
  }
  1283. int main(int argc, char** argv)
  1284. {
  1285. if (argc < 5)
  1286. {
  1287. show_usage();
  1288. return -1;
  1289. }
  1290. for (int i = 1; i < argc; i++)
  1291. {
  1292. if (argv[i][0] == '-')
  1293. {
  1294. show_usage();
  1295. return -1;
  1296. }
  1297. }
  1298. const char* inparam = argv[1];
  1299. const char* inbin = argv[2];
  1300. char* lists = argv[3];
  1301. const char* outtable = argv[4];
  1302. ncnn::Option opt;
  1303. opt.num_threads = 1;
  1304. opt.use_fp16_packed = false;
  1305. opt.use_fp16_storage = false;
  1306. opt.use_fp16_arithmetic = false;
  1307. QuantNet net;
  1308. net.opt = opt;
  1309. net.load_param(inparam);
  1310. net.load_model(inbin);
  1311. net.init();
  1312. // load lists
  1313. net.listspaths = parse_comma_path_list(lists);
  1314. std::string method = "kl";
  1315. for (int i = 5; i < argc; i++)
  1316. {
  1317. // key=value
  1318. char* kv = argv[i];
  1319. char* eqs = strchr(kv, '=');
  1320. if (eqs == NULL)
  1321. {
  1322. fprintf(stderr, "unrecognized arg %s\n", kv);
  1323. continue;
  1324. }
  1325. // split k v
  1326. eqs[0] = '\0';
  1327. const char* key = kv;
  1328. char* value = eqs + 1;
  1329. // load mean norm shape
  1330. if (memcmp(key, "mean", 4) == 0)
  1331. net.means = parse_comma_float_array_list(value);
  1332. if (memcmp(key, "norm", 4) == 0)
  1333. net.norms = parse_comma_float_array_list(value);
  1334. if (memcmp(key, "shape", 5) == 0)
  1335. net.shapes = parse_comma_int_array_list(value);
  1336. if (memcmp(key, "pixel", 5) == 0)
  1337. net.type_to_pixels = parse_comma_pixel_type_list(value);
  1338. if (memcmp(key, "thread", 6) == 0)
  1339. net.quantize_num_threads = atoi(value);
  1340. if (memcmp(key, "method", 6) == 0)
  1341. method = std::string(value);
  1342. }
  1343. // sanity check
  1344. const size_t input_blob_count = net.input_blobs.size();
  1345. if (net.listspaths.size() != input_blob_count)
  1346. {
  1347. fprintf(stderr, "expect %d lists, but got %d\n", (int)input_blob_count, (int)net.listspaths.size());
  1348. return -1;
  1349. }
  1350. if (net.means.size() != input_blob_count)
  1351. {
  1352. fprintf(stderr, "expect %d means, but got %d\n", (int)input_blob_count, (int)net.means.size());
  1353. return -1;
  1354. }
  1355. if (net.norms.size() != input_blob_count)
  1356. {
  1357. fprintf(stderr, "expect %d norms, but got %d\n", (int)input_blob_count, (int)net.norms.size());
  1358. return -1;
  1359. }
  1360. if (net.shapes.size() != input_blob_count)
  1361. {
  1362. fprintf(stderr, "expect %d shapes, but got %d\n", (int)input_blob_count, (int)net.shapes.size());
  1363. return -1;
  1364. }
  1365. if (net.type_to_pixels.size() != input_blob_count)
  1366. {
  1367. fprintf(stderr, "expect %d pixels, but got %d\n", (int)input_blob_count, (int)net.type_to_pixels.size());
  1368. return -1;
  1369. }
  1370. if (net.quantize_num_threads < 0)
  1371. {
  1372. fprintf(stderr, "malformed thread %d\n", net.quantize_num_threads);
  1373. return -1;
  1374. }
  1375. // print quantnet config
  1376. {
  1377. fprintf(stderr, "mean = ");
  1378. print_float_array_list(net.means);
  1379. fprintf(stderr, "\n");
  1380. fprintf(stderr, "norm = ");
  1381. print_float_array_list(net.norms);
  1382. fprintf(stderr, "\n");
  1383. fprintf(stderr, "shape = ");
  1384. print_int_array_list(net.shapes);
  1385. fprintf(stderr, "\n");
  1386. fprintf(stderr, "pixel = ");
  1387. print_pixel_type_list(net.type_to_pixels);
  1388. fprintf(stderr, "\n");
  1389. fprintf(stderr, "thread = %d\n", net.quantize_num_threads);
  1390. fprintf(stderr, "method = %s\n", method.c_str());
  1391. fprintf(stderr, "---------------------------------------\n");
  1392. }
  1393. if (method == "kl")
  1394. {
  1395. net.quantize_KL();
  1396. }
  1397. else if (method == "aciq")
  1398. {
  1399. net.quantize_ACIQ();
  1400. }
  1401. else if (method == "eq")
  1402. {
  1403. net.quantize_EQ();
  1404. }
  1405. else
  1406. {
  1407. fprintf(stderr, "not implemented yet !\n");
  1408. fprintf(stderr, "unknown method %s, expect kl / aciq / eq\n", method.c_str());
  1409. return -1;
  1410. }
  1411. net.print_quant_info();
  1412. net.save_table(outtable);
  1413. return 0;
  1414. }