You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

ncnn2table.cpp 56 kB

5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729
  1. // Tencent is pleased to support the open source community by making ncnn available.
  2. //
  3. // author:BUG1989 (https://github.com/BUG1989/) Long-term support.
  4. // author:JansonZhu (https://github.com/JansonZhu) Implemented the function of entropy calibration.
  5. //
  6. // Copyright (C) 2019 BUG1989. All rights reserved.
  7. // Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
  8. //
  9. // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  10. // in compliance with the License. You may obtain a copy of the License at
  11. //
  12. // https://opensource.org/licenses/BSD-3-Clause
  13. //
  14. // Unless required by applicable law or agreed to in writing, software distributed
  15. // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
  16. // CONDITIONS OF ANY KIND, either express or implied. See the License for the
  17. // specific language governing permissions and limitations under the License.
  18. #ifdef _MSC_VER
  19. #define _CRT_SECURE_NO_DEPRECATE
  20. #endif
  21. #include <float.h>
  22. #include <limits.h>
  23. #include <math.h>
  24. #include <stdio.h>
  25. #include <stdint.h>
  26. #include <stdlib.h>
  27. #include <string.h>
  28. #if defined(USE_NCNN_SIMPLEOCV)
  29. #include "simpleocv.h"
  30. #elif defined(USE_LOCAL_IMREADWRITE)
  31. #include "imreadwrite.h"
  32. #else
  33. #include <opencv2/core/core.hpp>
  34. #include <opencv2/highgui/highgui.hpp>
  35. #endif
  36. #include <string>
  37. #include <vector>
  38. // ncnn public header
  39. #include "benchmark.h"
  40. #include "cpu.h"
  41. #include "net.h"
  42. // ncnn private header
  43. #include "layer/convolution.h"
  44. #include "layer/convolutiondepthwise.h"
  45. #include "layer/innerproduct.h"
  // Calibration statistics collected for one convolution-like layer's bottom blob.
  class QuantBlobStat
  {
  public:
      // Starts with zeroed scalars and empty histograms.
      QuantBlobStat()
          : threshold(0.f), absmax(0.f), total(0)
      {
      }

  public:
      // clipping threshold chosen by the calibration method; the blob scale is 127 / threshold
      float threshold;

      // largest absolute activation value seen over the calibration images
      float absmax;

      // ACIQ: element count of the blob (channels * width * height)
      int total;

      // KL: per-bin counts of |activation| and the same counts normalized to sum to 1
      std::vector<uint64_t> histogram;
      std::vector<float> histogram_normed;
  };
  64. class QuantNet : public ncnn::Net
  65. {
  66. public:
  67. QuantNet();
  68. std::vector<ncnn::Blob>& blobs;
  69. std::vector<ncnn::Layer*>& layers;
  70. public:
  71. std::vector<std::vector<std::string> > listspaths;
  72. std::vector<std::vector<float> > means;
  73. std::vector<std::vector<float> > norms;
  74. std::vector<std::vector<int> > shapes;
  75. std::vector<int> type_to_pixels;
  76. int quantize_num_threads;
  77. public:
  78. int init();
  79. void print_quant_info() const;
  80. int save_table(const char* tablepath);
  81. int quantize_KL();
  82. int quantize_ACIQ();
  83. int quantize_EQ();
  84. public:
  85. std::vector<int> input_blobs;
  86. std::vector<int> conv_layers;
  87. std::vector<int> conv_bottom_blobs;
  88. std::vector<int> conv_top_blobs;
  89. // result
  90. std::vector<QuantBlobStat> quant_blob_stats;
  91. std::vector<ncnn::Mat> weight_scales;
  92. std::vector<ncnn::Mat> bottom_blob_scales;
  93. };
  94. QuantNet::QuantNet()
  95. : blobs(mutable_blobs()), layers(mutable_layers())
  96. {
  97. quantize_num_threads = ncnn::get_cpu_count();
  98. }
  99. int QuantNet::init()
  100. {
  101. // find all input layers
  102. for (int i = 0; i < (int)layers.size(); i++)
  103. {
  104. const ncnn::Layer* layer = layers[i];
  105. if (layer->type == "Input")
  106. {
  107. input_blobs.push_back(layer->tops[0]);
  108. }
  109. }
  110. // find all conv layers
  111. for (int i = 0; i < (int)layers.size(); i++)
  112. {
  113. const ncnn::Layer* layer = layers[i];
  114. if (layer->type == "Convolution" || layer->type == "ConvolutionDepthWise" || layer->type == "InnerProduct")
  115. {
  116. conv_layers.push_back(i);
  117. conv_bottom_blobs.push_back(layer->bottoms[0]);
  118. conv_top_blobs.push_back(layer->tops[0]);
  119. }
  120. }
  121. const int conv_layer_count = (int)conv_layers.size();
  122. const int conv_bottom_blob_count = (int)conv_bottom_blobs.size();
  123. quant_blob_stats.resize(conv_bottom_blob_count);
  124. weight_scales.resize(conv_layer_count);
  125. bottom_blob_scales.resize(conv_bottom_blob_count);
  126. return 0;
  127. }
  128. int QuantNet::save_table(const char* tablepath)
  129. {
  130. FILE* fp = fopen(tablepath, "wb");
  131. if (!fp)
  132. {
  133. fprintf(stderr, "fopen %s failed\n", tablepath);
  134. return -1;
  135. }
  136. const int conv_layer_count = (int)conv_layers.size();
  137. const int conv_bottom_blob_count = (int)conv_bottom_blobs.size();
  138. for (int i = 0; i < conv_layer_count; i++)
  139. {
  140. const ncnn::Mat& weight_scale = weight_scales[i];
  141. fprintf(fp, "%s_param_0 ", layers[conv_layers[i]]->name.c_str());
  142. for (int j = 0; j < weight_scale.w; j++)
  143. {
  144. fprintf(fp, "%f ", weight_scale[j]);
  145. }
  146. fprintf(fp, "\n");
  147. }
  148. for (int i = 0; i < conv_bottom_blob_count; i++)
  149. {
  150. const ncnn::Mat& bottom_blob_scale = bottom_blob_scales[i];
  151. fprintf(fp, "%s ", layers[conv_layers[i]]->name.c_str());
  152. for (int j = 0; j < bottom_blob_scale.w; j++)
  153. {
  154. fprintf(fp, "%f ", bottom_blob_scale[j]);
  155. }
  156. fprintf(fp, "\n");
  157. }
  158. fclose(fp);
  159. fprintf(stderr, "ncnn int8 calibration table create success, best wish for your int8 inference has a low accuracy loss...\\(^0^)/...233...\n");
  160. return 0;
  161. }
  162. void QuantNet::print_quant_info() const
  163. {
  164. for (int i = 0; i < (int)conv_bottom_blobs.size(); i++)
  165. {
  166. const QuantBlobStat& stat = quant_blob_stats[i];
  167. float scale = 127 / stat.threshold;
  168. fprintf(stderr, "%-40s : max = %-15f threshold = %-15f scale = %-15f\n", layers[conv_layers[i]]->name.c_str(), stat.absmax, stat.threshold, scale);
  169. }
  170. }
  // Returns sum_i a[i] * log(a[i] / b[i]) over the elements of a.
  // b is indexed with the same indices as a, so it must be at least as long.
  static float compute_kl_divergence(const std::vector<float>& a, const std::vector<float>& b)
  {
      float divergence = 0;

      const size_t count = a.size();
      size_t idx = 0;
      while (idx < count)
      {
          const float p = a[idx];
          const float q = b[idx];

          divergence += p * log(p / q);

          ++idx;
      }

      return divergence;
  }
  181. int QuantNet::quantize_KL()
  182. {
  183. const int input_blob_count = (int)input_blobs.size();
  184. const int conv_layer_count = (int)conv_layers.size();
  185. const int conv_bottom_blob_count = (int)conv_bottom_blobs.size();
  186. const int image_count = (int)listspaths[0].size();
  187. const int num_histogram_bins = 2048;
  188. std::vector<ncnn::UnlockedPoolAllocator> blob_allocators(quantize_num_threads);
  189. std::vector<ncnn::UnlockedPoolAllocator> workspace_allocators(quantize_num_threads);
  190. // initialize conv weight scales
  191. #pragma omp parallel for num_threads(quantize_num_threads)
  192. for (int i = 0; i < conv_layer_count; i++)
  193. {
  194. const ncnn::Layer* layer = layers[conv_layers[i]];
  195. if (layer->type == "Convolution")
  196. {
  197. const ncnn::Convolution* convolution = (const ncnn::Convolution*)layer;
  198. const int num_output = convolution->num_output;
  199. const int kernel_w = convolution->kernel_w;
  200. const int kernel_h = convolution->kernel_h;
  201. const int dilation_w = convolution->dilation_w;
  202. const int dilation_h = convolution->dilation_h;
  203. const int stride_w = convolution->stride_w;
  204. const int stride_h = convolution->stride_h;
  205. const int weight_data_size_output = convolution->weight_data_size / num_output;
  206. // int8 winograd F43 needs weight data to use 6bit quantization
  207. // TODO proper condition for winograd 3x3 int8
  208. bool quant_6bit = false;
  209. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  210. quant_6bit = true;
  211. weight_scales[i].create(num_output);
  212. for (int n = 0; n < num_output; n++)
  213. {
  214. const ncnn::Mat weight_data_n = convolution->weight_data.range(weight_data_size_output * n, weight_data_size_output);
  215. float absmax = 0.f;
  216. for (int k = 0; k < weight_data_size_output; k++)
  217. {
  218. absmax = std::max(absmax, (float)fabs(weight_data_n[k]));
  219. }
  220. if (quant_6bit)
  221. {
  222. weight_scales[i][n] = 31 / absmax;
  223. }
  224. else
  225. {
  226. weight_scales[i][n] = 127 / absmax;
  227. }
  228. }
  229. }
  230. if (layer->type == "ConvolutionDepthWise")
  231. {
  232. const ncnn::ConvolutionDepthWise* convolutiondepthwise = (const ncnn::ConvolutionDepthWise*)layer;
  233. const int group = convolutiondepthwise->group;
  234. const int weight_data_size_output = convolutiondepthwise->weight_data_size / group;
  235. std::vector<float> scales;
  236. weight_scales[i].create(group);
  237. for (int n = 0; n < group; n++)
  238. {
  239. const ncnn::Mat weight_data_n = convolutiondepthwise->weight_data.range(weight_data_size_output * n, weight_data_size_output);
  240. float absmax = 0.f;
  241. for (int k = 0; k < weight_data_size_output; k++)
  242. {
  243. absmax = std::max(absmax, (float)fabs(weight_data_n[k]));
  244. }
  245. weight_scales[i][n] = 127 / absmax;
  246. }
  247. }
  248. if (layer->type == "InnerProduct")
  249. {
  250. const ncnn::InnerProduct* innerproduct = (const ncnn::InnerProduct*)layer;
  251. const int num_output = innerproduct->num_output;
  252. const int weight_data_size_output = innerproduct->weight_data_size / num_output;
  253. weight_scales[i].create(num_output);
  254. for (int n = 0; n < num_output; n++)
  255. {
  256. const ncnn::Mat weight_data_n = innerproduct->weight_data.range(weight_data_size_output * n, weight_data_size_output);
  257. float absmax = 0.f;
  258. for (int k = 0; k < weight_data_size_output; k++)
  259. {
  260. absmax = std::max(absmax, (float)fabs(weight_data_n[k]));
  261. }
  262. weight_scales[i][n] = 127 / absmax;
  263. }
  264. }
  265. }
  266. // count the absmax
  267. #pragma omp parallel for num_threads(quantize_num_threads) schedule(static, 1)
  268. for (int i = 0; i < image_count; i++)
  269. {
  270. if (i % 100 == 0)
  271. {
  272. fprintf(stderr, "count the absmax %.2f%% [ %d / %d ]\n", i * 100.f / image_count, i, image_count);
  273. }
  274. ncnn::Extractor ex = create_extractor();
  275. const int thread_num = ncnn::get_omp_thread_num();
  276. ex.set_blob_allocator(&blob_allocators[thread_num]);
  277. ex.set_workspace_allocator(&workspace_allocators[thread_num]);
  278. for (int j = 0; j < input_blob_count; j++)
  279. {
  280. const std::string& imagepath = listspaths[j][i];
  281. const std::vector<int>& shape = shapes[j];
  282. const int type_to_pixel = type_to_pixels[j];
  283. const std::vector<float>& mean_vals = means[j];
  284. const std::vector<float>& norm_vals = norms[j];
  285. int pixel_convert_type = ncnn::Mat::PIXEL_BGR;
  286. if (type_to_pixel != pixel_convert_type)
  287. {
  288. pixel_convert_type = pixel_convert_type | (type_to_pixel << ncnn::Mat::PIXEL_CONVERT_SHIFT);
  289. }
  290. const int target_w = shape[0];
  291. const int target_h = shape[1];
  292. cv::Mat bgr = cv::imread(imagepath, 1);
  293. ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, pixel_convert_type, bgr.cols, bgr.rows, target_w, target_h);
  294. in.substract_mean_normalize(mean_vals.data(), norm_vals.data());
  295. ex.input(input_blobs[j], in);
  296. }
  297. for (int j = 0; j < conv_bottom_blob_count; j++)
  298. {
  299. ncnn::Mat out;
  300. ex.extract(conv_bottom_blobs[j], out);
  301. // count absmax
  302. {
  303. float absmax = 0.f;
  304. const int outc = out.c;
  305. const int outsize = out.w * out.h;
  306. for (int p = 0; p < outc; p++)
  307. {
  308. const float* ptr = out.channel(p);
  309. for (int k = 0; k < outsize; k++)
  310. {
  311. absmax = std::max(absmax, (float)fabs(ptr[k]));
  312. }
  313. }
  314. #pragma omp critical
  315. {
  316. QuantBlobStat& stat = quant_blob_stats[j];
  317. stat.absmax = std::max(stat.absmax, absmax);
  318. }
  319. }
  320. }
  321. }
  322. // initialize histogram
  323. #pragma omp parallel for num_threads(quantize_num_threads)
  324. for (int i = 0; i < conv_bottom_blob_count; i++)
  325. {
  326. QuantBlobStat& stat = quant_blob_stats[i];
  327. stat.histogram.resize(num_histogram_bins, 0);
  328. stat.histogram_normed.resize(num_histogram_bins, 0);
  329. }
  330. // build histogram
  331. #pragma omp parallel for num_threads(quantize_num_threads) schedule(static, 1)
  332. for (int i = 0; i < image_count; i++)
  333. {
  334. if (i % 100 == 0)
  335. {
  336. fprintf(stderr, "build histogram %.2f%% [ %d / %d ]\n", i * 100.f / image_count, i, image_count);
  337. }
  338. ncnn::Extractor ex = create_extractor();
  339. const int thread_num = ncnn::get_omp_thread_num();
  340. ex.set_blob_allocator(&blob_allocators[thread_num]);
  341. ex.set_workspace_allocator(&workspace_allocators[thread_num]);
  342. for (int j = 0; j < input_blob_count; j++)
  343. {
  344. const std::string& imagepath = listspaths[j][i];
  345. const std::vector<int>& shape = shapes[j];
  346. const int type_to_pixel = type_to_pixels[j];
  347. const std::vector<float>& mean_vals = means[j];
  348. const std::vector<float>& norm_vals = norms[j];
  349. int pixel_convert_type = ncnn::Mat::PIXEL_BGR;
  350. if (type_to_pixel != pixel_convert_type)
  351. {
  352. pixel_convert_type = pixel_convert_type | (type_to_pixel << ncnn::Mat::PIXEL_CONVERT_SHIFT);
  353. }
  354. const int target_w = shape[0];
  355. const int target_h = shape[1];
  356. cv::Mat bgr = cv::imread(imagepath, 1);
  357. ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, pixel_convert_type, bgr.cols, bgr.rows, target_w, target_h);
  358. in.substract_mean_normalize(mean_vals.data(), norm_vals.data());
  359. ex.input(input_blobs[j], in);
  360. }
  361. for (int j = 0; j < conv_bottom_blob_count; j++)
  362. {
  363. ncnn::Mat out;
  364. ex.extract(conv_bottom_blobs[j], out);
  365. // count histogram bin
  366. {
  367. const float absmax = quant_blob_stats[j].absmax;
  368. std::vector<uint64_t> histogram(num_histogram_bins, 0);
  369. const int outc = out.c;
  370. const int outsize = out.w * out.h;
  371. for (int p = 0; p < outc; p++)
  372. {
  373. const float* ptr = out.channel(p);
  374. for (int k = 0; k < outsize; k++)
  375. {
  376. if (ptr[k] == 0.f)
  377. continue;
  378. const int index = std::min((int)(fabs(ptr[k]) / absmax * num_histogram_bins), (num_histogram_bins - 1));
  379. histogram[index] += 1;
  380. }
  381. }
  382. #pragma omp critical
  383. {
  384. QuantBlobStat& stat = quant_blob_stats[j];
  385. for (int k = 0; k < num_histogram_bins; k++)
  386. {
  387. stat.histogram[k] += histogram[k];
  388. }
  389. }
  390. }
  391. }
  392. }
  393. // using kld to find the best threshold value
  394. #pragma omp parallel for num_threads(quantize_num_threads)
  395. for (int i = 0; i < conv_bottom_blob_count; i++)
  396. {
  397. QuantBlobStat& stat = quant_blob_stats[i];
  398. // normalize histogram bin
  399. {
  400. uint64_t sum = 0;
  401. for (int j = 0; j < num_histogram_bins; j++)
  402. {
  403. sum += stat.histogram[j];
  404. }
  405. for (int j = 0; j < num_histogram_bins; j++)
  406. {
  407. stat.histogram_normed[j] = (float)(stat.histogram[j] / (double)sum);
  408. }
  409. }
  410. const int target_bin = 128;
  411. int target_threshold = target_bin;
  412. float min_kl_divergence = FLT_MAX;
  413. for (int threshold = target_bin; threshold < num_histogram_bins; threshold++)
  414. {
  415. const float kl_eps = 0.0001f;
  416. std::vector<float> clip_distribution(threshold, kl_eps);
  417. {
  418. for (int j = 0; j < threshold; j++)
  419. {
  420. clip_distribution[j] += stat.histogram_normed[j];
  421. }
  422. for (int j = threshold; j < num_histogram_bins; j++)
  423. {
  424. clip_distribution[threshold - 1] += stat.histogram_normed[j];
  425. }
  426. }
  427. const float num_per_bin = (float)threshold / target_bin;
  428. std::vector<float> quantize_distribution(target_bin, 0.f);
  429. {
  430. {
  431. const float end = num_per_bin;
  432. const int right_lower = (int)floor(end);
  433. const float right_scale = end - right_lower;
  434. if (right_scale > 0)
  435. {
  436. quantize_distribution[0] += right_scale * stat.histogram_normed[right_lower];
  437. }
  438. for (int k = 0; k < right_lower; k++)
  439. {
  440. quantize_distribution[0] += stat.histogram_normed[k];
  441. }
  442. quantize_distribution[0] /= right_lower + right_scale;
  443. }
  444. for (int j = 1; j < target_bin - 1; j++)
  445. {
  446. const float start = j * num_per_bin;
  447. const float end = (j + 1) * num_per_bin;
  448. const int left_upper = (int)ceil(start);
  449. const float left_scale = left_upper - start;
  450. const int right_lower = (int)floor(end);
  451. const float right_scale = end - right_lower;
  452. if (left_scale > 0)
  453. {
  454. quantize_distribution[j] += left_scale * stat.histogram_normed[left_upper - 1];
  455. }
  456. if (right_scale > 0)
  457. {
  458. quantize_distribution[j] += right_scale * stat.histogram_normed[right_lower];
  459. }
  460. for (int k = left_upper; k < right_lower; k++)
  461. {
  462. quantize_distribution[j] += stat.histogram_normed[k];
  463. }
  464. quantize_distribution[j] /= right_lower - left_upper + left_scale + right_scale;
  465. }
  466. {
  467. const float start = threshold - num_per_bin;
  468. const int left_upper = (int)ceil(start);
  469. const float left_scale = left_upper - start;
  470. if (left_scale > 0)
  471. {
  472. quantize_distribution[target_bin - 1] += left_scale * stat.histogram_normed[left_upper - 1];
  473. }
  474. for (int k = left_upper; k < threshold; k++)
  475. {
  476. quantize_distribution[target_bin - 1] += stat.histogram_normed[k];
  477. }
  478. quantize_distribution[target_bin - 1] /= threshold - left_upper + left_scale;
  479. }
  480. }
  481. std::vector<float> expand_distribution(threshold, kl_eps);
  482. {
  483. {
  484. const float end = num_per_bin;
  485. const int right_lower = (int)floor(end);
  486. const float right_scale = end - right_lower;
  487. if (right_scale > 0)
  488. {
  489. expand_distribution[right_lower] += right_scale * quantize_distribution[0];
  490. }
  491. for (int k = 0; k < right_lower; k++)
  492. {
  493. expand_distribution[k] += quantize_distribution[0];
  494. }
  495. }
  496. for (int j = 1; j < target_bin - 1; j++)
  497. {
  498. const float start = j * num_per_bin;
  499. const float end = (j + 1) * num_per_bin;
  500. const int left_upper = (int)ceil(start);
  501. const float left_scale = left_upper - start;
  502. const int right_lower = (int)floor(end);
  503. const float right_scale = end - right_lower;
  504. if (left_scale > 0)
  505. {
  506. expand_distribution[left_upper - 1] += left_scale * quantize_distribution[j];
  507. }
  508. if (right_scale > 0)
  509. {
  510. expand_distribution[right_lower] += right_scale * quantize_distribution[j];
  511. }
  512. for (int k = left_upper; k < right_lower; k++)
  513. {
  514. expand_distribution[k] += quantize_distribution[j];
  515. }
  516. }
  517. {
  518. const float start = threshold - num_per_bin;
  519. const int left_upper = (int)ceil(start);
  520. const float left_scale = left_upper - start;
  521. if (left_scale > 0)
  522. {
  523. expand_distribution[left_upper - 1] += left_scale * quantize_distribution[target_bin - 1];
  524. }
  525. for (int k = left_upper; k < threshold; k++)
  526. {
  527. expand_distribution[k] += quantize_distribution[target_bin - 1];
  528. }
  529. }
  530. }
  531. // kl
  532. const float kl_divergence = compute_kl_divergence(clip_distribution, expand_distribution);
  533. // the best num of bin
  534. if (kl_divergence < min_kl_divergence)
  535. {
  536. min_kl_divergence = kl_divergence;
  537. target_threshold = threshold;
  538. }
  539. }
  540. stat.threshold = (target_threshold + 0.5f) * stat.absmax / num_histogram_bins;
  541. float scale = 127 / stat.threshold;
  542. bottom_blob_scales[i].create(1);
  543. bottom_blob_scales[i][0] = scale;
  544. }
  545. return 0;
  546. }
  // Returns the ACIQ clipping threshold for a tensor with the given absmax and
  // element count N: alpha_gaussian[num_bits - 1] times a standard deviation
  // derived from absmax and N.
  // num_bits must be in [1, 8]; it indexes the table without a bounds check.
  static float compute_aciq_gaussian_clip(float absmax, int N, int num_bits = 8)
  {
      // per-bit-width multipliers, looked up at index num_bits - 1
      const float alpha_gaussian[8] = {0, 1.71063519, 2.15159277, 2.55913646, 2.93620062, 3.28691474, 3.6151146, 3.92403714};

      const double pi = 3.14159265358979323846;
      const double gaussian_const = (0.5 * 0.35) * (1 + sqrt(pi * log(4)));

      const double numerator = absmax * 2 * gaussian_const;
      const double denominator = sqrt(2 * log(N));
      const double sigma = numerator / denominator;

      return (float)(alpha_gaussian[num_bits - 1] * sigma);
  }
  554. int QuantNet::quantize_ACIQ()
  555. {
  556. const int input_blob_count = (int)input_blobs.size();
  557. const int conv_layer_count = (int)conv_layers.size();
  558. const int conv_bottom_blob_count = (int)conv_bottom_blobs.size();
  559. const int image_count = (int)listspaths[0].size();
  560. std::vector<ncnn::UnlockedPoolAllocator> blob_allocators(quantize_num_threads);
  561. std::vector<ncnn::UnlockedPoolAllocator> workspace_allocators(quantize_num_threads);
  562. // initialize conv weight scales
  563. #pragma omp parallel for num_threads(quantize_num_threads)
  564. for (int i = 0; i < conv_layer_count; i++)
  565. {
  566. const ncnn::Layer* layer = layers[conv_layers[i]];
  567. if (layer->type == "Convolution")
  568. {
  569. const ncnn::Convolution* convolution = (const ncnn::Convolution*)layer;
  570. const int num_output = convolution->num_output;
  571. const int kernel_w = convolution->kernel_w;
  572. const int kernel_h = convolution->kernel_h;
  573. const int dilation_w = convolution->dilation_w;
  574. const int dilation_h = convolution->dilation_h;
  575. const int stride_w = convolution->stride_w;
  576. const int stride_h = convolution->stride_h;
  577. const int weight_data_size_output = convolution->weight_data_size / num_output;
  578. // int8 winograd F43 needs weight data to use 6bit quantization
  579. // TODO proper condition for winograd 3x3 int8
  580. bool quant_6bit = false;
  581. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  582. quant_6bit = true;
  583. weight_scales[i].create(num_output);
  584. for (int n = 0; n < num_output; n++)
  585. {
  586. const ncnn::Mat weight_data_n = convolution->weight_data.range(weight_data_size_output * n, weight_data_size_output);
  587. float absmax = 0.f;
  588. for (int k = 0; k < weight_data_size_output; k++)
  589. {
  590. absmax = std::max(absmax, (float)fabs(weight_data_n[k]));
  591. }
  592. if (quant_6bit)
  593. {
  594. const float threshold = compute_aciq_gaussian_clip(absmax, weight_data_size_output, 6);
  595. weight_scales[i][n] = 31 / threshold;
  596. }
  597. else
  598. {
  599. const float threshold = compute_aciq_gaussian_clip(absmax, weight_data_size_output);
  600. weight_scales[i][n] = 127 / threshold;
  601. }
  602. }
  603. }
  604. if (layer->type == "ConvolutionDepthWise")
  605. {
  606. const ncnn::ConvolutionDepthWise* convolutiondepthwise = (const ncnn::ConvolutionDepthWise*)layer;
  607. const int group = convolutiondepthwise->group;
  608. const int weight_data_size_output = convolutiondepthwise->weight_data_size / group;
  609. std::vector<float> scales;
  610. weight_scales[i].create(group);
  611. for (int n = 0; n < group; n++)
  612. {
  613. const ncnn::Mat weight_data_n = convolutiondepthwise->weight_data.range(weight_data_size_output * n, weight_data_size_output);
  614. float absmax = 0.f;
  615. for (int k = 0; k < weight_data_size_output; k++)
  616. {
  617. absmax = std::max(absmax, (float)fabs(weight_data_n[k]));
  618. }
  619. const float threshold = compute_aciq_gaussian_clip(absmax, weight_data_size_output);
  620. weight_scales[i][n] = 127 / threshold;
  621. }
  622. }
  623. if (layer->type == "InnerProduct")
  624. {
  625. const ncnn::InnerProduct* innerproduct = (const ncnn::InnerProduct*)layer;
  626. const int num_output = innerproduct->num_output;
  627. const int weight_data_size_output = innerproduct->weight_data_size / num_output;
  628. weight_scales[i].create(num_output);
  629. for (int n = 0; n < num_output; n++)
  630. {
  631. const ncnn::Mat weight_data_n = innerproduct->weight_data.range(weight_data_size_output * n, weight_data_size_output);
  632. float absmax = 0.f;
  633. for (int k = 0; k < weight_data_size_output; k++)
  634. {
  635. absmax = std::max(absmax, (float)fabs(weight_data_n[k]));
  636. }
  637. const float threshold = compute_aciq_gaussian_clip(absmax, weight_data_size_output);
  638. weight_scales[i][n] = 127 / threshold;
  639. }
  640. }
  641. }
  642. // count the absmax
  643. #pragma omp parallel for num_threads(quantize_num_threads) schedule(static, 1)
  644. for (int i = 0; i < image_count; i++)
  645. {
  646. if (i % 100 == 0)
  647. {
  648. fprintf(stderr, "count the absmax %.2f%% [ %d / %d ]\n", i * 100.f / image_count, i, image_count);
  649. }
  650. ncnn::Extractor ex = create_extractor();
  651. const int thread_num = ncnn::get_omp_thread_num();
  652. ex.set_blob_allocator(&blob_allocators[thread_num]);
  653. ex.set_workspace_allocator(&workspace_allocators[thread_num]);
  654. for (int j = 0; j < input_blob_count; j++)
  655. {
  656. const std::string& imagepath = listspaths[j][i];
  657. const std::vector<int>& shape = shapes[j];
  658. const int type_to_pixel = type_to_pixels[j];
  659. const std::vector<float>& mean_vals = means[j];
  660. const std::vector<float>& norm_vals = norms[j];
  661. int pixel_convert_type = ncnn::Mat::PIXEL_BGR;
  662. if (type_to_pixel != pixel_convert_type)
  663. {
  664. pixel_convert_type = pixel_convert_type | (type_to_pixel << ncnn::Mat::PIXEL_CONVERT_SHIFT);
  665. }
  666. const int target_w = shape[0];
  667. const int target_h = shape[1];
  668. cv::Mat bgr = cv::imread(imagepath, 1);
  669. ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, pixel_convert_type, bgr.cols, bgr.rows, target_w, target_h);
  670. in.substract_mean_normalize(mean_vals.data(), norm_vals.data());
  671. ex.input(input_blobs[j], in);
  672. }
  673. for (int j = 0; j < conv_bottom_blob_count; j++)
  674. {
  675. ncnn::Mat out;
  676. ex.extract(conv_bottom_blobs[j], out);
  677. // count absmax
  678. {
  679. float absmax = 0.f;
  680. const int outc = out.c;
  681. const int outsize = out.w * out.h;
  682. for (int p = 0; p < outc; p++)
  683. {
  684. const float* ptr = out.channel(p);
  685. for (int k = 0; k < outsize; k++)
  686. {
  687. absmax = std::max(absmax, (float)fabs(ptr[k]));
  688. }
  689. }
  690. #pragma omp critical
  691. {
  692. QuantBlobStat& stat = quant_blob_stats[j];
  693. stat.absmax = std::max(stat.absmax, absmax);
  694. stat.total = outc * outsize;
  695. }
  696. }
  697. }
  698. }
  699. // alpha gaussian
  700. #pragma omp parallel for num_threads(quantize_num_threads)
  701. for (int i = 0; i < conv_bottom_blob_count; i++)
  702. {
  703. QuantBlobStat& stat = quant_blob_stats[i];
  704. stat.threshold = compute_aciq_gaussian_clip(stat.absmax, stat.total);
  705. float scale = 127 / stat.threshold;
  706. bottom_blob_scales[i].create(1);
  707. bottom_blob_scales[i][0] = scale;
  708. }
  709. return 0;
  710. }
  711. static float cosine_similarity(const ncnn::Mat& a, const ncnn::Mat& b)
  712. {
  713. const int chanenls = a.c;
  714. const int size = a.w * a.h;
  715. float sa = 0;
  716. float sb = 0;
  717. float sum = 0;
  718. for (int p = 0; p < chanenls; p++)
  719. {
  720. const float* pa = a.channel(p);
  721. const float* pb = b.channel(p);
  722. for (int i = 0; i < size; i++)
  723. {
  724. sa += pa[i] * pa[i];
  725. sb += pb[i] * pb[i];
  726. sum += pa[i] * pb[i];
  727. }
  728. }
  729. float sim = (float)sum / sqrt(sa) / sqrt(sb);
  730. return sim;
  731. }
  732. static int get_layer_param(const ncnn::Layer* layer, ncnn::ParamDict& pd)
  733. {
  734. if (layer->type == "Convolution")
  735. {
  736. ncnn::Convolution* convolution = (ncnn::Convolution*)layer;
  737. pd.set(0, convolution->num_output);
  738. pd.set(1, convolution->kernel_w);
  739. pd.set(11, convolution->kernel_h);
  740. pd.set(2, convolution->dilation_w);
  741. pd.set(12, convolution->dilation_h);
  742. pd.set(3, convolution->stride_w);
  743. pd.set(13, convolution->stride_h);
  744. pd.set(4, convolution->pad_left);
  745. pd.set(15, convolution->pad_right);
  746. pd.set(14, convolution->pad_top);
  747. pd.set(16, convolution->pad_bottom);
  748. pd.set(18, convolution->pad_value);
  749. pd.set(5, convolution->bias_term);
  750. pd.set(6, convolution->weight_data_size);
  751. pd.set(8, convolution->int8_scale_term);
  752. pd.set(9, convolution->activation_type);
  753. pd.set(10, convolution->activation_params);
  754. }
  755. else if (layer->type == "ConvolutionDepthWise")
  756. {
  757. ncnn::ConvolutionDepthWise* convolutiondepthwise = (ncnn::ConvolutionDepthWise*)layer;
  758. pd.set(0, convolutiondepthwise->num_output);
  759. pd.set(1, convolutiondepthwise->kernel_w);
  760. pd.set(11, convolutiondepthwise->kernel_h);
  761. pd.set(2, convolutiondepthwise->dilation_w);
  762. pd.set(12, convolutiondepthwise->dilation_h);
  763. pd.set(3, convolutiondepthwise->stride_w);
  764. pd.set(13, convolutiondepthwise->stride_h);
  765. pd.set(4, convolutiondepthwise->pad_left);
  766. pd.set(15, convolutiondepthwise->pad_right);
  767. pd.set(14, convolutiondepthwise->pad_top);
  768. pd.set(16, convolutiondepthwise->pad_bottom);
  769. pd.set(18, convolutiondepthwise->pad_value);
  770. pd.set(5, convolutiondepthwise->bias_term);
  771. pd.set(6, convolutiondepthwise->weight_data_size);
  772. pd.set(7, convolutiondepthwise->group);
  773. pd.set(8, convolutiondepthwise->int8_scale_term);
  774. pd.set(9, convolutiondepthwise->activation_type);
  775. pd.set(10, convolutiondepthwise->activation_params);
  776. }
  777. else if (layer->type == "InnerProduct")
  778. {
  779. ncnn::InnerProduct* innerproduct = (ncnn::InnerProduct*)layer;
  780. pd.set(0, innerproduct->num_output);
  781. pd.set(1, innerproduct->bias_term);
  782. pd.set(2, innerproduct->weight_data_size);
  783. pd.set(8, innerproduct->int8_scale_term);
  784. pd.set(9, innerproduct->activation_type);
  785. pd.set(10, innerproduct->activation_params);
  786. }
  787. else
  788. {
  789. fprintf(stderr, "unexpected layer type %s in get_layer_param\n", layer->type.c_str());
  790. return -1;
  791. }
  792. return 0;
  793. }
  794. static int get_layer_weights(const ncnn::Layer* layer, std::vector<ncnn::Mat>& weights)
  795. {
  796. if (layer->type == "Convolution")
  797. {
  798. ncnn::Convolution* convolution = (ncnn::Convolution*)layer;
  799. weights.push_back(convolution->weight_data);
  800. if (convolution->bias_term)
  801. weights.push_back(convolution->bias_data);
  802. }
  803. else if (layer->type == "ConvolutionDepthWise")
  804. {
  805. ncnn::ConvolutionDepthWise* convolutiondepthwise = (ncnn::ConvolutionDepthWise*)layer;
  806. weights.push_back(convolutiondepthwise->weight_data);
  807. if (convolutiondepthwise->bias_term)
  808. weights.push_back(convolutiondepthwise->bias_data);
  809. }
  810. else if (layer->type == "InnerProduct")
  811. {
  812. ncnn::InnerProduct* innerproduct = (ncnn::InnerProduct*)layer;
  813. weights.push_back(innerproduct->weight_data);
  814. if (innerproduct->bias_term)
  815. weights.push_back(innerproduct->bias_data);
  816. }
  817. else
  818. {
  819. fprintf(stderr, "unexpected layer type %s in get_layer_weights\n", layer->type.c_str());
  820. return -1;
  821. }
  822. return 0;
  823. }
  824. int QuantNet::quantize_EQ()
  825. {
  826. // find the initial scale via KL
  827. quantize_KL();
  828. print_quant_info();
  829. const int input_blob_count = (int)input_blobs.size();
  830. const int conv_layer_count = (int)conv_layers.size();
  831. const int conv_bottom_blob_count = (int)conv_bottom_blobs.size();
  832. std::vector<ncnn::UnlockedPoolAllocator> blob_allocators(quantize_num_threads);
  833. std::vector<ncnn::UnlockedPoolAllocator> workspace_allocators(quantize_num_threads);
  834. // max 50 images for EQ
  835. const int image_count = std::min((int)listspaths[0].size(), 50);
  836. const float scale_range_lower = 0.5f;
  837. const float scale_range_upper = 2.0f;
  838. const int search_steps = 100;
  839. for (int i = 0; i < conv_layer_count; i++)
  840. {
  841. ncnn::Mat& weight_scale = weight_scales[i];
  842. ncnn::Mat& bottom_blob_scale = bottom_blob_scales[i];
  843. const ncnn::Layer* layer = layers[conv_layers[i]];
  844. // search weight scale
  845. for (int j = 0; j < weight_scale.w; j++)
  846. {
  847. const float scale = weight_scale[j];
  848. const float scale_lower = scale * scale_range_lower;
  849. const float scale_upper = scale * scale_range_upper;
  850. const float scale_step = (scale_upper - scale_lower) / search_steps;
  851. std::vector<double> avgsims(search_steps, 0.0);
  852. #pragma omp parallel for num_threads(quantize_num_threads) schedule(static, 1)
  853. for (int ii = 0; ii < image_count; ii++)
  854. {
  855. if (ii % 100 == 0)
  856. {
  857. fprintf(stderr, "search weight scale %.2f%% [ %d / %d ] for %d / %d of %d / %d\n", ii * 100.f / image_count, ii, image_count, j, weight_scale.w, i, conv_layer_count);
  858. }
  859. ncnn::Extractor ex = create_extractor();
  860. const int thread_num = ncnn::get_omp_thread_num();
  861. ex.set_blob_allocator(&blob_allocators[thread_num]);
  862. ex.set_workspace_allocator(&workspace_allocators[thread_num]);
  863. for (int jj = 0; jj < input_blob_count; jj++)
  864. {
  865. const std::string& imagepath = listspaths[jj][ii];
  866. const std::vector<int>& shape = shapes[jj];
  867. const int type_to_pixel = type_to_pixels[jj];
  868. const std::vector<float>& mean_vals = means[jj];
  869. const std::vector<float>& norm_vals = norms[jj];
  870. int pixel_convert_type = ncnn::Mat::PIXEL_BGR;
  871. if (type_to_pixel != pixel_convert_type)
  872. {
  873. pixel_convert_type = pixel_convert_type | (type_to_pixel << ncnn::Mat::PIXEL_CONVERT_SHIFT);
  874. }
  875. const int target_w = shape[0];
  876. const int target_h = shape[1];
  877. cv::Mat bgr = cv::imread(imagepath, 1);
  878. ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, pixel_convert_type, bgr.cols, bgr.rows, target_w, target_h);
  879. in.substract_mean_normalize(mean_vals.data(), norm_vals.data());
  880. ex.input(input_blobs[jj], in);
  881. }
  882. ncnn::Mat in;
  883. ex.extract(conv_bottom_blobs[i], in);
  884. ncnn::Mat out;
  885. ex.extract(conv_top_blobs[i], out);
  886. ncnn::Layer* layer_int8 = ncnn::create_layer(layer->typeindex);
  887. ncnn::ParamDict pd;
  888. get_layer_param(layer, pd);
  889. pd.set(8, 1); //int8_scale_term
  890. layer_int8->load_param(pd);
  891. std::vector<float> sims(search_steps);
  892. for (int k = 0; k < search_steps; k++)
  893. {
  894. ncnn::Mat new_weight_scale = weight_scale.clone();
  895. new_weight_scale[j] = scale_lower + k * scale_step;
  896. std::vector<ncnn::Mat> weights;
  897. get_layer_weights(layer, weights);
  898. weights.push_back(new_weight_scale);
  899. weights.push_back(bottom_blob_scale);
  900. layer_int8->load_model(ncnn::ModelBinFromMatArray(weights.data()));
  901. ncnn::Option opt_int8;
  902. opt_int8.use_packing_layout = false;
  903. layer_int8->create_pipeline(opt_int8);
  904. ncnn::Mat out_int8;
  905. layer_int8->forward(in, out_int8, opt_int8);
  906. layer_int8->destroy_pipeline(opt_int8);
  907. sims[k] = cosine_similarity(out, out_int8);
  908. }
  909. delete layer_int8;
  910. #pragma omp critical
  911. {
  912. for (int k = 0; k < search_steps; k++)
  913. {
  914. avgsims[k] += sims[k];
  915. }
  916. }
  917. }
  918. double max_avgsim = 0.0;
  919. float new_scale = scale;
  920. // find the scale with min cosine distance
  921. for (int k = 0; k < search_steps; k++)
  922. {
  923. if (max_avgsim < avgsims[k])
  924. {
  925. max_avgsim = avgsims[k];
  926. new_scale = scale_lower + k * scale_step;
  927. }
  928. }
  929. fprintf(stderr, "%s w %d = %f -> %f\n", layer->name.c_str(), j, scale, new_scale);
  930. weight_scale[j] = new_scale;
  931. }
  932. // search bottom blob scale
  933. for (int j = 0; j < bottom_blob_scale.w; j++)
  934. {
  935. const float scale = bottom_blob_scale[j];
  936. const float scale_lower = scale * scale_range_lower;
  937. const float scale_upper = scale * scale_range_upper;
  938. const float scale_step = (scale_upper - scale_lower) / search_steps;
  939. std::vector<double> avgsims(search_steps, 0.0);
  940. #pragma omp parallel for num_threads(quantize_num_threads) schedule(static, 1)
  941. for (int ii = 0; ii < image_count; ii++)
  942. {
  943. if (ii % 100 == 0)
  944. {
  945. fprintf(stderr, "search bottom blob scale %.2f%% [ %d / %d ] for %d / %d of %d / %d\n", ii * 100.f / image_count, ii, image_count, j, bottom_blob_scale.w, i, conv_layer_count);
  946. }
  947. ncnn::Extractor ex = create_extractor();
  948. const int thread_num = ncnn::get_omp_thread_num();
  949. ex.set_blob_allocator(&blob_allocators[thread_num]);
  950. ex.set_workspace_allocator(&workspace_allocators[thread_num]);
  951. for (int jj = 0; jj < input_blob_count; jj++)
  952. {
  953. const std::string& imagepath = listspaths[jj][ii];
  954. const std::vector<int>& shape = shapes[jj];
  955. const int type_to_pixel = type_to_pixels[jj];
  956. const std::vector<float>& mean_vals = means[jj];
  957. const std::vector<float>& norm_vals = norms[jj];
  958. int pixel_convert_type = ncnn::Mat::PIXEL_BGR;
  959. if (type_to_pixel != pixel_convert_type)
  960. {
  961. pixel_convert_type = pixel_convert_type | (type_to_pixel << ncnn::Mat::PIXEL_CONVERT_SHIFT);
  962. }
  963. const int target_w = shape[0];
  964. const int target_h = shape[1];
  965. cv::Mat bgr = cv::imread(imagepath, 1);
  966. ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, pixel_convert_type, bgr.cols, bgr.rows, target_w, target_h);
  967. in.substract_mean_normalize(mean_vals.data(), norm_vals.data());
  968. ex.input(input_blobs[jj], in);
  969. }
  970. ncnn::Mat in;
  971. ex.extract(conv_bottom_blobs[i], in);
  972. ncnn::Mat out;
  973. ex.extract(conv_top_blobs[i], out);
  974. ncnn::Layer* layer_int8 = ncnn::create_layer(layer->typeindex);
  975. ncnn::ParamDict pd;
  976. get_layer_param(layer, pd);
  977. pd.set(8, 1); //int8_scale_term
  978. layer_int8->load_param(pd);
  979. std::vector<float> sims(search_steps);
  980. for (int k = 0; k < search_steps; k++)
  981. {
  982. ncnn::Mat new_bottom_blob_scale = bottom_blob_scale.clone();
  983. new_bottom_blob_scale[j] = scale_lower + k * scale_step;
  984. std::vector<ncnn::Mat> weights;
  985. get_layer_weights(layer, weights);
  986. weights.push_back(weight_scale);
  987. weights.push_back(new_bottom_blob_scale);
  988. layer_int8->load_model(ncnn::ModelBinFromMatArray(weights.data()));
  989. ncnn::Option opt_int8;
  990. opt_int8.use_packing_layout = false;
  991. layer_int8->create_pipeline(opt_int8);
  992. ncnn::Mat out_int8;
  993. layer_int8->forward(in, out_int8, opt_int8);
  994. layer_int8->destroy_pipeline(opt_int8);
  995. sims[k] = cosine_similarity(out, out_int8);
  996. }
  997. delete layer_int8;
  998. #pragma omp critical
  999. {
  1000. for (int k = 0; k < search_steps; k++)
  1001. {
  1002. avgsims[k] += sims[k];
  1003. }
  1004. }
  1005. }
  1006. double max_avgsim = 0.0;
  1007. float new_scale = scale;
  1008. // find the scale with min cosine distance
  1009. for (int k = 0; k < search_steps; k++)
  1010. {
  1011. if (max_avgsim < avgsims[k])
  1012. {
  1013. max_avgsim = avgsims[k];
  1014. new_scale = scale_lower + k * scale_step;
  1015. }
  1016. }
  1017. fprintf(stderr, "%s b %d = %f -> %f\n", layer->name.c_str(), j, scale, new_scale);
  1018. bottom_blob_scale[j] = new_scale;
  1019. }
  1020. // update quant info
  1021. QuantBlobStat& stat = quant_blob_stats[i];
  1022. stat.threshold = 127 / bottom_blob_scale[0];
  1023. }
  1024. return 0;
  1025. }
  1026. static std::vector<std::vector<std::string> > parse_comma_path_list(char* s)
  1027. {
  1028. std::vector<std::vector<std::string> > aps;
  1029. char* pch = strtok(s, ",");
  1030. while (pch != NULL)
  1031. {
  1032. FILE* fp = fopen(pch, "rb");
  1033. if (!fp)
  1034. {
  1035. fprintf(stderr, "fopen %s failed\n", pch);
  1036. break;
  1037. }
  1038. std::vector<std::string> paths;
  1039. // one filepath per line
  1040. char line[1024];
  1041. while (!feof(fp))
  1042. {
  1043. char* ss = fgets(line, 1024, fp);
  1044. if (!ss)
  1045. break;
  1046. char filepath[256];
  1047. int nscan = sscanf(line, "%255s", filepath);
  1048. if (nscan != 1)
  1049. continue;
  1050. paths.push_back(std::string(filepath));
  1051. }
  1052. fclose(fp);
  1053. aps.push_back(paths);
  1054. pch = strtok(NULL, ",");
  1055. }
  1056. return aps;
  1057. }
  1058. static float vstr_to_float(const char vstr[20])
  1059. {
  1060. double v = 0.0;
  1061. const char* p = vstr;
  1062. // sign
  1063. bool sign = *p != '-';
  1064. if (*p == '+' || *p == '-')
  1065. {
  1066. p++;
  1067. }
  1068. // digits before decimal point or exponent
  1069. uint64_t v1 = 0;
  1070. while (isdigit(*p))
  1071. {
  1072. v1 = v1 * 10 + (*p - '0');
  1073. p++;
  1074. }
  1075. v = (double)v1;
  1076. // digits after decimal point
  1077. if (*p == '.')
  1078. {
  1079. p++;
  1080. uint64_t pow10 = 1;
  1081. uint64_t v2 = 0;
  1082. while (isdigit(*p))
  1083. {
  1084. v2 = v2 * 10 + (*p - '0');
  1085. pow10 *= 10;
  1086. p++;
  1087. }
  1088. v += v2 / (double)pow10;
  1089. }
  1090. // exponent
  1091. if (*p == 'e' || *p == 'E')
  1092. {
  1093. p++;
  1094. // sign of exponent
  1095. bool fact = *p != '-';
  1096. if (*p == '+' || *p == '-')
  1097. {
  1098. p++;
  1099. }
  1100. // digits of exponent
  1101. uint64_t expon = 0;
  1102. while (isdigit(*p))
  1103. {
  1104. expon = expon * 10 + (*p - '0');
  1105. p++;
  1106. }
  1107. double scale = 1.0;
  1108. while (expon >= 8)
  1109. {
  1110. scale *= 1e8;
  1111. expon -= 8;
  1112. }
  1113. while (expon > 0)
  1114. {
  1115. scale *= 10.0;
  1116. expon -= 1;
  1117. }
  1118. v = fact ? v * scale : v / scale;
  1119. }
  1120. // fprintf(stderr, "v = %f\n", v);
  1121. return sign ? (float)v : (float)-v;
  1122. }
  1123. static std::vector<std::vector<float> > parse_comma_float_array_list(char* s)
  1124. {
  1125. std::vector<std::vector<float> > aaf;
  1126. char* pch = strtok(s, "[]");
  1127. while (pch != NULL)
  1128. {
  1129. // parse a,b,c
  1130. char vstr[20];
  1131. int nconsumed = 0;
  1132. int nscan = sscanf(pch, "%19[^,]%n", vstr, &nconsumed);
  1133. if (nscan == 1)
  1134. {
  1135. // ok we get array
  1136. pch += nconsumed;
  1137. std::vector<float> af;
  1138. float v = vstr_to_float(vstr);
  1139. af.push_back(v);
  1140. nscan = sscanf(pch, ",%19[^,]%n", vstr, &nconsumed);
  1141. while (nscan == 1)
  1142. {
  1143. pch += nconsumed;
  1144. float v = vstr_to_float(vstr);
  1145. af.push_back(v);
  1146. nscan = sscanf(pch, ",%19[^,]%n", vstr, &nconsumed);
  1147. }
  1148. // array end
  1149. aaf.push_back(af);
  1150. }
  1151. pch = strtok(NULL, "[]");
  1152. }
  1153. return aaf;
  1154. }
  1155. static std::vector<std::vector<int> > parse_comma_int_array_list(char* s)
  1156. {
  1157. std::vector<std::vector<int> > aai;
  1158. char* pch = strtok(s, "[]");
  1159. while (pch != NULL)
  1160. {
  1161. // parse a,b,c
  1162. int v;
  1163. int nconsumed = 0;
  1164. int nscan = sscanf(pch, "%d%n", &v, &nconsumed);
  1165. if (nscan == 1)
  1166. {
  1167. // ok we get array
  1168. pch += nconsumed;
  1169. std::vector<int> ai;
  1170. ai.push_back(v);
  1171. nscan = sscanf(pch, ",%d%n", &v, &nconsumed);
  1172. while (nscan == 1)
  1173. {
  1174. pch += nconsumed;
  1175. ai.push_back(v);
  1176. nscan = sscanf(pch, ",%d%n", &v, &nconsumed);
  1177. }
  1178. // array end
  1179. aai.push_back(ai);
  1180. }
  1181. pch = strtok(NULL, "[]");
  1182. }
  1183. return aai;
  1184. }
  1185. static std::vector<int> parse_comma_pixel_type_list(char* s)
  1186. {
  1187. std::vector<int> aps;
  1188. char* pch = strtok(s, ",");
  1189. while (pch != NULL)
  1190. {
  1191. // RAW/RGB/BGR/GRAY/RGBA/BGRA
  1192. if (strcmp(pch, "RAW") == 0)
  1193. aps.push_back(-233);
  1194. if (strcmp(pch, "RGB") == 0)
  1195. aps.push_back(ncnn::Mat::PIXEL_RGB);
  1196. if (strcmp(pch, "BGR") == 0)
  1197. aps.push_back(ncnn::Mat::PIXEL_BGR);
  1198. if (strcmp(pch, "GRAY") == 0)
  1199. aps.push_back(ncnn::Mat::PIXEL_GRAY);
  1200. if (strcmp(pch, "RGBA") == 0)
  1201. aps.push_back(ncnn::Mat::PIXEL_RGBA);
  1202. if (strcmp(pch, "BGRA") == 0)
  1203. aps.push_back(ncnn::Mat::PIXEL_BGRA);
  1204. pch = strtok(NULL, ",");
  1205. }
  1206. return aps;
  1207. }
  1208. static void print_float_array_list(const std::vector<std::vector<float> >& list)
  1209. {
  1210. for (size_t i = 0; i < list.size(); i++)
  1211. {
  1212. const std::vector<float>& array = list[i];
  1213. fprintf(stderr, "[");
  1214. for (size_t j = 0; j < array.size(); j++)
  1215. {
  1216. fprintf(stderr, "%f", array[j]);
  1217. if (j != array.size() - 1)
  1218. fprintf(stderr, ",");
  1219. }
  1220. fprintf(stderr, "]");
  1221. if (i != list.size() - 1)
  1222. fprintf(stderr, ",");
  1223. }
  1224. }
  1225. static void print_int_array_list(const std::vector<std::vector<int> >& list)
  1226. {
  1227. for (size_t i = 0; i < list.size(); i++)
  1228. {
  1229. const std::vector<int>& array = list[i];
  1230. fprintf(stderr, "[");
  1231. for (size_t j = 0; j < array.size(); j++)
  1232. {
  1233. fprintf(stderr, "%d", array[j]);
  1234. if (j != array.size() - 1)
  1235. fprintf(stderr, ",");
  1236. }
  1237. fprintf(stderr, "]");
  1238. if (i != list.size() - 1)
  1239. fprintf(stderr, ",");
  1240. }
  1241. }
  1242. static void print_pixel_type_list(const std::vector<int>& list)
  1243. {
  1244. for (size_t i = 0; i < list.size(); i++)
  1245. {
  1246. const int type = list[i];
  1247. if (type == -233)
  1248. fprintf(stderr, "RAW");
  1249. if (type == ncnn::Mat::PIXEL_RGB)
  1250. fprintf(stderr, "RGB");
  1251. if (type == ncnn::Mat::PIXEL_BGR)
  1252. fprintf(stderr, "BGR");
  1253. if (type == ncnn::Mat::PIXEL_GRAY)
  1254. fprintf(stderr, "GRAY");
  1255. if (type == ncnn::Mat::PIXEL_RGBA)
  1256. fprintf(stderr, "RGBA");
  1257. if (type == ncnn::Mat::PIXEL_BGRA)
  1258. fprintf(stderr, "BGRA");
  1259. if (i != list.size() - 1)
  1260. fprintf(stderr, ",");
  1261. }
  1262. }
  1263. static void show_usage()
  1264. {
  1265. fprintf(stderr, "Usage: ncnn2table [ncnnparam] [ncnnbin] [list,...] [ncnntable] [(key=value)...]\n");
  1266. fprintf(stderr, " mean=[104.0,117.0,123.0],...\n");
  1267. fprintf(stderr, " norm=[1.0,1.0,1.0],...\n");
  1268. fprintf(stderr, " shape=[224,224,3],...[w,h,c] or [w,h]\n");
  1269. fprintf(stderr, " pixel=RAW/RGB/BGR/GRAY/RGBA/BGRA,...\n");
  1270. fprintf(stderr, " thread=8\n");
  1271. fprintf(stderr, " method=kl/aciq/eq\n");
  1272. fprintf(stderr, "Sample usage: ncnn2table squeezenet.param squeezenet.bin imagelist.txt squeezenet.table mean=[104.0,117.0,123.0] norm=[1.0,1.0,1.0] shape=[227,227,3] pixel=BGR method=kl\n");
  1273. }
  1274. int main(int argc, char** argv)
  1275. {
  1276. if (argc < 5)
  1277. {
  1278. show_usage();
  1279. return -1;
  1280. }
  1281. for (int i = 1; i < argc; i++)
  1282. {
  1283. if (argv[i][0] == '-')
  1284. {
  1285. show_usage();
  1286. return -1;
  1287. }
  1288. }
  1289. const char* inparam = argv[1];
  1290. const char* inbin = argv[2];
  1291. char* lists = argv[3];
  1292. const char* outtable = argv[4];
  1293. ncnn::Option opt;
  1294. opt.num_threads = 1;
  1295. opt.use_fp16_packed = false;
  1296. opt.use_fp16_storage = false;
  1297. opt.use_fp16_arithmetic = false;
  1298. QuantNet net;
  1299. net.opt = opt;
  1300. net.load_param(inparam);
  1301. net.load_model(inbin);
  1302. net.init();
  1303. // load lists
  1304. net.listspaths = parse_comma_path_list(lists);
  1305. std::string method = "kl";
  1306. for (int i = 5; i < argc; i++)
  1307. {
  1308. // key=value
  1309. char* kv = argv[i];
  1310. char* eqs = strchr(kv, '=');
  1311. if (eqs == NULL)
  1312. {
  1313. fprintf(stderr, "unrecognized arg %s\n", kv);
  1314. continue;
  1315. }
  1316. // split k v
  1317. eqs[0] = '\0';
  1318. const char* key = kv;
  1319. char* value = eqs + 1;
  1320. // load mean norm shape
  1321. if (memcmp(key, "mean", 4) == 0)
  1322. net.means = parse_comma_float_array_list(value);
  1323. if (memcmp(key, "norm", 4) == 0)
  1324. net.norms = parse_comma_float_array_list(value);
  1325. if (memcmp(key, "shape", 5) == 0)
  1326. net.shapes = parse_comma_int_array_list(value);
  1327. if (memcmp(key, "pixel", 5) == 0)
  1328. net.type_to_pixels = parse_comma_pixel_type_list(value);
  1329. if (memcmp(key, "thread", 6) == 0)
  1330. net.quantize_num_threads = atoi(value);
  1331. if (memcmp(key, "method", 6) == 0)
  1332. method = std::string(value);
  1333. }
  1334. // sanity check
  1335. const size_t input_blob_count = net.input_blobs.size();
  1336. if (net.listspaths.size() != input_blob_count)
  1337. {
  1338. fprintf(stderr, "expect %d lists, but got %d\n", (int)input_blob_count, (int)net.listspaths.size());
  1339. return -1;
  1340. }
  1341. if (net.means.size() != input_blob_count)
  1342. {
  1343. fprintf(stderr, "expect %d means, but got %d\n", (int)input_blob_count, (int)net.means.size());
  1344. return -1;
  1345. }
  1346. if (net.norms.size() != input_blob_count)
  1347. {
  1348. fprintf(stderr, "expect %d norms, but got %d\n", (int)input_blob_count, (int)net.norms.size());
  1349. return -1;
  1350. }
  1351. if (net.shapes.size() != input_blob_count)
  1352. {
  1353. fprintf(stderr, "expect %d shapes, but got %d\n", (int)input_blob_count, (int)net.shapes.size());
  1354. return -1;
  1355. }
  1356. if (net.type_to_pixels.size() != input_blob_count)
  1357. {
  1358. fprintf(stderr, "expect %d pixels, but got %d\n", (int)input_blob_count, (int)net.type_to_pixels.size());
  1359. return -1;
  1360. }
  1361. if (net.quantize_num_threads < 0)
  1362. {
  1363. fprintf(stderr, "malformed thread %d\n", net.quantize_num_threads);
  1364. return -1;
  1365. }
  1366. // print quantnet config
  1367. {
  1368. fprintf(stderr, "mean = ");
  1369. print_float_array_list(net.means);
  1370. fprintf(stderr, "\n");
  1371. fprintf(stderr, "norm = ");
  1372. print_float_array_list(net.norms);
  1373. fprintf(stderr, "\n");
  1374. fprintf(stderr, "shape = ");
  1375. print_int_array_list(net.shapes);
  1376. fprintf(stderr, "\n");
  1377. fprintf(stderr, "pixel = ");
  1378. print_pixel_type_list(net.type_to_pixels);
  1379. fprintf(stderr, "\n");
  1380. fprintf(stderr, "thread = %d\n", net.quantize_num_threads);
  1381. fprintf(stderr, "method = %s\n", method.c_str());
  1382. fprintf(stderr, "---------------------------------------\n");
  1383. }
  1384. if (method == "kl")
  1385. {
  1386. net.quantize_KL();
  1387. }
  1388. else if (method == "aciq")
  1389. {
  1390. net.quantize_ACIQ();
  1391. }
  1392. else if (method == "eq")
  1393. {
  1394. net.quantize_EQ();
  1395. }
  1396. else
  1397. {
  1398. fprintf(stderr, "not implemented yet !\n");
  1399. fprintf(stderr, "unknown method %s, expect kl / aciq / eq\n", method.c_str());
  1400. return -1;
  1401. }
  1402. net.print_quant_info();
  1403. net.save_table(outtable);
  1404. return 0;
  1405. }