You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

ncnn2table.cpp 56 kB

5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741
  1. // Tencent is pleased to support the open source community by making ncnn available.
  2. //
  3. // author:BUG1989 (https://github.com/BUG1989/) Long-term support.
  4. // author:JansonZhu (https://github.com/JansonZhu) Implemented the function of entropy calibration.
  5. //
  6. // Copyright (C) 2019 BUG1989. All rights reserved.
  7. // Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
  8. //
  9. // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  10. // in compliance with the License. You may obtain a copy of the License at
  11. //
  12. // https://opensource.org/licenses/BSD-3-Clause
  13. //
  14. // Unless required by applicable law or agreed to in writing, software distributed
  15. // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
  16. // CONDITIONS OF ANY KIND, either express or implied. See the License for the
  17. // specific language governing permissions and limitations under the License.
  18. #ifdef _MSC_VER
  19. #define _CRT_SECURE_NO_DEPRECATE
  20. #endif
  21. #include <float.h>
  22. #include <limits.h>
  23. #include <math.h>
  24. #include <stdio.h>
  25. #include <stdint.h>
  26. #include <stdlib.h>
  27. #include <string.h>
  28. #if defined(USE_NCNN_SIMPLEOCV)
  29. #include "simpleocv.h"
  30. #elif defined(USE_LOCAL_IMREADWRITE)
  31. #include "imreadwrite.h"
  32. #else
  33. #include <opencv2/core/core.hpp>
  34. #include <opencv2/highgui/highgui.hpp>
  35. #endif
  36. #include <string>
  37. #include <vector>
  38. // ncnn public header
  39. #include "benchmark.h"
  40. #include "cpu.h"
  41. #include "net.h"
  42. // ncnn private header
  43. #include "layer/convolution.h"
  44. #include "layer/convolutiondepthwise.h"
  45. #include "layer/innerproduct.h"
  46. class QuantBlobStat
  47. {
  48. public:
  49. QuantBlobStat()
  50. {
  51. threshold = 0.f;
  52. absmax = 0.f;
  53. total = 0;
  54. }
  55. public:
  56. float threshold;
  57. float absmax;
  58. // ACIQ
  59. int total;
  60. // KL
  61. std::vector<uint64_t> histogram;
  62. std::vector<float> histogram_normed;
  63. };
  64. class QuantNet : public ncnn::Net
  65. {
  66. public:
  67. QuantNet();
  68. std::vector<ncnn::Blob>& blobs;
  69. std::vector<ncnn::Layer*>& layers;
  70. public:
  71. std::vector<std::vector<std::string> > listspaths;
  72. std::vector<std::vector<float> > means;
  73. std::vector<std::vector<float> > norms;
  74. std::vector<std::vector<int> > shapes;
  75. std::vector<int> type_to_pixels;
  76. int quantize_num_threads;
  77. public:
  78. int init();
  79. void print_quant_info() const;
  80. int save_table(const char* tablepath);
  81. int quantize_KL();
  82. int quantize_ACIQ();
  83. int quantize_EQ();
  84. public:
  85. std::vector<int> input_blobs;
  86. std::vector<int> conv_layers;
  87. std::vector<int> conv_bottom_blobs;
  88. std::vector<int> conv_top_blobs;
  89. // result
  90. std::vector<QuantBlobStat> quant_blob_stats;
  91. std::vector<ncnn::Mat> weight_scales;
  92. std::vector<ncnn::Mat> bottom_blob_scales;
  93. };
  94. QuantNet::QuantNet()
  95. : blobs(mutable_blobs()), layers(mutable_layers())
  96. {
  97. quantize_num_threads = ncnn::get_cpu_count();
  98. }
  99. int QuantNet::init()
  100. {
  101. // find all input layers
  102. for (int i = 0; i < (int)layers.size(); i++)
  103. {
  104. const ncnn::Layer* layer = layers[i];
  105. if (layer->type == "Input")
  106. {
  107. input_blobs.push_back(layer->tops[0]);
  108. }
  109. }
  110. // find all conv layers
  111. for (int i = 0; i < (int)layers.size(); i++)
  112. {
  113. const ncnn::Layer* layer = layers[i];
  114. if (layer->type == "Convolution" || layer->type == "ConvolutionDepthWise" || layer->type == "InnerProduct")
  115. {
  116. conv_layers.push_back(i);
  117. conv_bottom_blobs.push_back(layer->bottoms[0]);
  118. conv_top_blobs.push_back(layer->tops[0]);
  119. }
  120. }
  121. const int conv_layer_count = (int)conv_layers.size();
  122. const int conv_bottom_blob_count = (int)conv_bottom_blobs.size();
  123. quant_blob_stats.resize(conv_bottom_blob_count);
  124. weight_scales.resize(conv_layer_count);
  125. bottom_blob_scales.resize(conv_bottom_blob_count);
  126. return 0;
  127. }
  128. int QuantNet::save_table(const char* tablepath)
  129. {
  130. FILE* fp = fopen(tablepath, "wb");
  131. if (!fp)
  132. {
  133. fprintf(stderr, "fopen %s failed\n", tablepath);
  134. return -1;
  135. }
  136. const int conv_layer_count = (int)conv_layers.size();
  137. const int conv_bottom_blob_count = (int)conv_bottom_blobs.size();
  138. for (int i = 0; i < conv_layer_count; i++)
  139. {
  140. const ncnn::Mat& weight_scale = weight_scales[i];
  141. fprintf(fp, "%s_param_0 ", layers[conv_layers[i]]->name.c_str());
  142. for (int j = 0; j < weight_scale.w; j++)
  143. {
  144. fprintf(fp, "%f ", weight_scale[j]);
  145. }
  146. fprintf(fp, "\n");
  147. }
  148. for (int i = 0; i < conv_bottom_blob_count; i++)
  149. {
  150. const ncnn::Mat& bottom_blob_scale = bottom_blob_scales[i];
  151. fprintf(fp, "%s ", layers[conv_layers[i]]->name.c_str());
  152. for (int j = 0; j < bottom_blob_scale.w; j++)
  153. {
  154. fprintf(fp, "%f ", bottom_blob_scale[j]);
  155. }
  156. fprintf(fp, "\n");
  157. }
  158. fclose(fp);
  159. fprintf(stderr, "ncnn int8 calibration table create success, best wish for your int8 inference has a low accuracy loss...\\(^0^)/...233...\n");
  160. return 0;
  161. }
  162. void QuantNet::print_quant_info() const
  163. {
  164. for (int i = 0; i < (int)conv_bottom_blobs.size(); i++)
  165. {
  166. const QuantBlobStat& stat = quant_blob_stats[i];
  167. float scale = 127 / stat.threshold;
  168. fprintf(stderr, "%-40s : max = %-15f threshold = %-15f scale = %-15f\n", layers[conv_layers[i]]->name.c_str(), stat.absmax, stat.threshold, scale);
  169. }
  170. }
  171. /**
  172. * Read and resize image
  173. * shape is input as [w,h,...]
  174. * if w and h both are given, image will be resized to exactly size.
  175. * if w and h both are zero or negative, image will not be resized.
  176. * if only h is zero or negative, image's width will scaled resize to w, keeping aspect ratio.
  177. * if only w is zero or negative, image's height will scaled resize to h
  178. * @return ncnn::Mat
  179. */
  180. inline ncnn::Mat read_and_resize_image(const std::vector<int>& shape, const std::string& imagepath, int pixel_convert_type)
  181. {
  182. int target_w = shape[0];
  183. int target_h = shape[1];
  184. cv::Mat bgr = cv::imread(imagepath, 1);
  185. if (target_h <= 0 && target_w <= 0)
  186. {
  187. return ncnn::Mat::from_pixels(bgr.data, pixel_convert_type, bgr.cols, bgr.rows);
  188. }
  189. if (target_h <= 0 || target_w <= 0)
  190. {
  191. float scale = 1.0;
  192. if (target_h <= 0)
  193. {
  194. scale = 1.0 * bgr.cols / target_w;
  195. target_h = int(1.0 * bgr.rows / scale);
  196. }
  197. if (target_w <= 0)
  198. {
  199. scale = 1.0 * bgr.rows / target_h;
  200. target_w = int(1.0 * bgr.cols / scale);
  201. }
  202. }
  203. return ncnn::Mat::from_pixels_resize(bgr.data, pixel_convert_type, bgr.cols, bgr.rows, target_w, target_h);
  204. }
  205. static float compute_kl_divergence(const std::vector<float>& a, const std::vector<float>& b)
  206. {
  207. const size_t length = a.size();
  208. float result = 0;
  209. for (size_t i = 0; i < length; i++)
  210. {
  211. result += a[i] * log(a[i] / b[i]);
  212. }
  213. return result;
  214. }
  215. int QuantNet::quantize_KL()
  216. {
  217. const int input_blob_count = (int)input_blobs.size();
  218. const int conv_layer_count = (int)conv_layers.size();
  219. const int conv_bottom_blob_count = (int)conv_bottom_blobs.size();
  220. const int image_count = (int)listspaths[0].size();
  221. const int num_histogram_bins = 2048;
  222. std::vector<ncnn::UnlockedPoolAllocator> blob_allocators(quantize_num_threads);
  223. std::vector<ncnn::UnlockedPoolAllocator> workspace_allocators(quantize_num_threads);
  224. // initialize conv weight scales
  225. #pragma omp parallel for num_threads(quantize_num_threads)
  226. for (int i = 0; i < conv_layer_count; i++)
  227. {
  228. const ncnn::Layer* layer = layers[conv_layers[i]];
  229. if (layer->type == "Convolution")
  230. {
  231. const ncnn::Convolution* convolution = (const ncnn::Convolution*)layer;
  232. const int num_output = convolution->num_output;
  233. const int kernel_w = convolution->kernel_w;
  234. const int kernel_h = convolution->kernel_h;
  235. const int dilation_w = convolution->dilation_w;
  236. const int dilation_h = convolution->dilation_h;
  237. const int stride_w = convolution->stride_w;
  238. const int stride_h = convolution->stride_h;
  239. const int weight_data_size_output = convolution->weight_data_size / num_output;
  240. // int8 winograd F43 needs weight data to use 6bit quantization
  241. // TODO proper condition for winograd 3x3 int8
  242. bool quant_6bit = false;
  243. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  244. quant_6bit = true;
  245. weight_scales[i].create(num_output);
  246. for (int n = 0; n < num_output; n++)
  247. {
  248. const ncnn::Mat weight_data_n = convolution->weight_data.range(weight_data_size_output * n, weight_data_size_output);
  249. float absmax = 0.f;
  250. for (int k = 0; k < weight_data_size_output; k++)
  251. {
  252. absmax = std::max(absmax, (float)fabs(weight_data_n[k]));
  253. }
  254. if (quant_6bit)
  255. {
  256. weight_scales[i][n] = 31 / absmax;
  257. }
  258. else
  259. {
  260. weight_scales[i][n] = 127 / absmax;
  261. }
  262. }
  263. }
  264. if (layer->type == "ConvolutionDepthWise")
  265. {
  266. const ncnn::ConvolutionDepthWise* convolutiondepthwise = (const ncnn::ConvolutionDepthWise*)layer;
  267. const int group = convolutiondepthwise->group;
  268. const int weight_data_size_output = convolutiondepthwise->weight_data_size / group;
  269. std::vector<float> scales;
  270. weight_scales[i].create(group);
  271. for (int n = 0; n < group; n++)
  272. {
  273. const ncnn::Mat weight_data_n = convolutiondepthwise->weight_data.range(weight_data_size_output * n, weight_data_size_output);
  274. float absmax = 0.f;
  275. for (int k = 0; k < weight_data_size_output; k++)
  276. {
  277. absmax = std::max(absmax, (float)fabs(weight_data_n[k]));
  278. }
  279. weight_scales[i][n] = 127 / absmax;
  280. }
  281. }
  282. if (layer->type == "InnerProduct")
  283. {
  284. const ncnn::InnerProduct* innerproduct = (const ncnn::InnerProduct*)layer;
  285. const int num_output = innerproduct->num_output;
  286. const int weight_data_size_output = innerproduct->weight_data_size / num_output;
  287. weight_scales[i].create(num_output);
  288. for (int n = 0; n < num_output; n++)
  289. {
  290. const ncnn::Mat weight_data_n = innerproduct->weight_data.range(weight_data_size_output * n, weight_data_size_output);
  291. float absmax = 0.f;
  292. for (int k = 0; k < weight_data_size_output; k++)
  293. {
  294. absmax = std::max(absmax, (float)fabs(weight_data_n[k]));
  295. }
  296. weight_scales[i][n] = 127 / absmax;
  297. }
  298. }
  299. }
  300. // count the absmax
  301. #pragma omp parallel for num_threads(quantize_num_threads) schedule(static, 1)
  302. for (int i = 0; i < image_count; i++)
  303. {
  304. if (i % 100 == 0)
  305. {
  306. fprintf(stderr, "count the absmax %.2f%% [ %d / %d ]\n", i * 100.f / image_count, i, image_count);
  307. }
  308. ncnn::Extractor ex = create_extractor();
  309. ex.set_light_mode(true);
  310. const int thread_num = ncnn::get_omp_thread_num();
  311. ex.set_blob_allocator(&blob_allocators[thread_num]);
  312. ex.set_workspace_allocator(&workspace_allocators[thread_num]);
  313. for (int j = 0; j < input_blob_count; j++)
  314. {
  315. const int type_to_pixel = type_to_pixels[j];
  316. const std::vector<float>& mean_vals = means[j];
  317. const std::vector<float>& norm_vals = norms[j];
  318. int pixel_convert_type = ncnn::Mat::PIXEL_BGR;
  319. if (type_to_pixel != pixel_convert_type)
  320. {
  321. pixel_convert_type = pixel_convert_type | (type_to_pixel << ncnn::Mat::PIXEL_CONVERT_SHIFT);
  322. }
  323. ncnn::Mat in = read_and_resize_image(shapes[j], listspaths[j][i], pixel_convert_type);
  324. in.substract_mean_normalize(mean_vals.data(), norm_vals.data());
  325. ex.input(input_blobs[j], in);
  326. }
  327. for (int j = 0; j < conv_bottom_blob_count; j++)
  328. {
  329. ncnn::Mat out;
  330. ex.extract(conv_bottom_blobs[j], out);
  331. // count absmax
  332. {
  333. float absmax = 0.f;
  334. const int outc = out.c;
  335. const int outsize = out.w * out.h;
  336. for (int p = 0; p < outc; p++)
  337. {
  338. const float* ptr = out.channel(p);
  339. for (int k = 0; k < outsize; k++)
  340. {
  341. absmax = std::max(absmax, (float)fabs(ptr[k]));
  342. }
  343. }
  344. #pragma omp critical
  345. {
  346. QuantBlobStat& stat = quant_blob_stats[j];
  347. stat.absmax = std::max(stat.absmax, absmax);
  348. }
  349. }
  350. }
  351. }
  352. // initialize histogram
  353. #pragma omp parallel for num_threads(quantize_num_threads)
  354. for (int i = 0; i < conv_bottom_blob_count; i++)
  355. {
  356. QuantBlobStat& stat = quant_blob_stats[i];
  357. stat.histogram.resize(num_histogram_bins, 0);
  358. stat.histogram_normed.resize(num_histogram_bins, 0);
  359. }
  360. // build histogram
  361. #pragma omp parallel for num_threads(quantize_num_threads) schedule(static, 1)
  362. for (int i = 0; i < image_count; i++)
  363. {
  364. if (i % 100 == 0)
  365. {
  366. fprintf(stderr, "build histogram %.2f%% [ %d / %d ]\n", i * 100.f / image_count, i, image_count);
  367. }
  368. ncnn::Extractor ex = create_extractor();
  369. ex.set_light_mode(true);
  370. const int thread_num = ncnn::get_omp_thread_num();
  371. ex.set_blob_allocator(&blob_allocators[thread_num]);
  372. ex.set_workspace_allocator(&workspace_allocators[thread_num]);
  373. for (int j = 0; j < input_blob_count; j++)
  374. {
  375. const int type_to_pixel = type_to_pixels[j];
  376. const std::vector<float>& mean_vals = means[j];
  377. const std::vector<float>& norm_vals = norms[j];
  378. int pixel_convert_type = ncnn::Mat::PIXEL_BGR;
  379. if (type_to_pixel != pixel_convert_type)
  380. {
  381. pixel_convert_type = pixel_convert_type | (type_to_pixel << ncnn::Mat::PIXEL_CONVERT_SHIFT);
  382. }
  383. ncnn::Mat in = read_and_resize_image(shapes[j], listspaths[j][i], pixel_convert_type);
  384. in.substract_mean_normalize(mean_vals.data(), norm_vals.data());
  385. ex.input(input_blobs[j], in);
  386. }
  387. for (int j = 0; j < conv_bottom_blob_count; j++)
  388. {
  389. ncnn::Mat out;
  390. ex.extract(conv_bottom_blobs[j], out);
  391. // count histogram bin
  392. {
  393. const float absmax = quant_blob_stats[j].absmax;
  394. std::vector<uint64_t> histogram(num_histogram_bins, 0);
  395. const int outc = out.c;
  396. const int outsize = out.w * out.h;
  397. for (int p = 0; p < outc; p++)
  398. {
  399. const float* ptr = out.channel(p);
  400. for (int k = 0; k < outsize; k++)
  401. {
  402. if (ptr[k] == 0.f)
  403. continue;
  404. const int index = std::min((int)(fabs(ptr[k]) / absmax * num_histogram_bins), (num_histogram_bins - 1));
  405. histogram[index] += 1;
  406. }
  407. }
  408. #pragma omp critical
  409. {
  410. QuantBlobStat& stat = quant_blob_stats[j];
  411. for (int k = 0; k < num_histogram_bins; k++)
  412. {
  413. stat.histogram[k] += histogram[k];
  414. }
  415. }
  416. }
  417. }
  418. }
  419. // using kld to find the best threshold value
  420. #pragma omp parallel for num_threads(quantize_num_threads)
  421. for (int i = 0; i < conv_bottom_blob_count; i++)
  422. {
  423. QuantBlobStat& stat = quant_blob_stats[i];
  424. // normalize histogram bin
  425. {
  426. uint64_t sum = 0;
  427. for (int j = 0; j < num_histogram_bins; j++)
  428. {
  429. sum += stat.histogram[j];
  430. }
  431. for (int j = 0; j < num_histogram_bins; j++)
  432. {
  433. stat.histogram_normed[j] = (float)(stat.histogram[j] / (double)sum);
  434. }
  435. }
  436. const int target_bin = 128;
  437. int target_threshold = target_bin;
  438. float min_kl_divergence = FLT_MAX;
  439. for (int threshold = target_bin; threshold < num_histogram_bins; threshold++)
  440. {
  441. const float kl_eps = 0.0001f;
  442. std::vector<float> clip_distribution(threshold, kl_eps);
  443. {
  444. for (int j = 0; j < threshold; j++)
  445. {
  446. clip_distribution[j] += stat.histogram_normed[j];
  447. }
  448. for (int j = threshold; j < num_histogram_bins; j++)
  449. {
  450. clip_distribution[threshold - 1] += stat.histogram_normed[j];
  451. }
  452. }
  453. const float num_per_bin = (float)threshold / target_bin;
  454. std::vector<float> quantize_distribution(target_bin, 0.f);
  455. {
  456. {
  457. const float end = num_per_bin;
  458. const int right_lower = (int)floor(end);
  459. const float right_scale = end - right_lower;
  460. if (right_scale > 0)
  461. {
  462. quantize_distribution[0] += right_scale * stat.histogram_normed[right_lower];
  463. }
  464. for (int k = 0; k < right_lower; k++)
  465. {
  466. quantize_distribution[0] += stat.histogram_normed[k];
  467. }
  468. quantize_distribution[0] /= right_lower + right_scale;
  469. }
  470. for (int j = 1; j < target_bin - 1; j++)
  471. {
  472. const float start = j * num_per_bin;
  473. const float end = (j + 1) * num_per_bin;
  474. const int left_upper = (int)ceil(start);
  475. const float left_scale = left_upper - start;
  476. const int right_lower = (int)floor(end);
  477. const float right_scale = end - right_lower;
  478. if (left_scale > 0)
  479. {
  480. quantize_distribution[j] += left_scale * stat.histogram_normed[left_upper - 1];
  481. }
  482. if (right_scale > 0)
  483. {
  484. quantize_distribution[j] += right_scale * stat.histogram_normed[right_lower];
  485. }
  486. for (int k = left_upper; k < right_lower; k++)
  487. {
  488. quantize_distribution[j] += stat.histogram_normed[k];
  489. }
  490. quantize_distribution[j] /= right_lower - left_upper + left_scale + right_scale;
  491. }
  492. {
  493. const float start = threshold - num_per_bin;
  494. const int left_upper = (int)ceil(start);
  495. const float left_scale = left_upper - start;
  496. if (left_scale > 0)
  497. {
  498. quantize_distribution[target_bin - 1] += left_scale * stat.histogram_normed[left_upper - 1];
  499. }
  500. for (int k = left_upper; k < threshold; k++)
  501. {
  502. quantize_distribution[target_bin - 1] += stat.histogram_normed[k];
  503. }
  504. quantize_distribution[target_bin - 1] /= threshold - left_upper + left_scale;
  505. }
  506. }
  507. std::vector<float> expand_distribution(threshold, kl_eps);
  508. {
  509. {
  510. const float end = num_per_bin;
  511. const int right_lower = (int)floor(end);
  512. const float right_scale = end - right_lower;
  513. if (right_scale > 0)
  514. {
  515. expand_distribution[right_lower] += right_scale * quantize_distribution[0];
  516. }
  517. for (int k = 0; k < right_lower; k++)
  518. {
  519. expand_distribution[k] += quantize_distribution[0];
  520. }
  521. }
  522. for (int j = 1; j < target_bin - 1; j++)
  523. {
  524. const float start = j * num_per_bin;
  525. const float end = (j + 1) * num_per_bin;
  526. const int left_upper = (int)ceil(start);
  527. const float left_scale = left_upper - start;
  528. const int right_lower = (int)floor(end);
  529. const float right_scale = end - right_lower;
  530. if (left_scale > 0)
  531. {
  532. expand_distribution[left_upper - 1] += left_scale * quantize_distribution[j];
  533. }
  534. if (right_scale > 0)
  535. {
  536. expand_distribution[right_lower] += right_scale * quantize_distribution[j];
  537. }
  538. for (int k = left_upper; k < right_lower; k++)
  539. {
  540. expand_distribution[k] += quantize_distribution[j];
  541. }
  542. }
  543. {
  544. const float start = threshold - num_per_bin;
  545. const int left_upper = (int)ceil(start);
  546. const float left_scale = left_upper - start;
  547. if (left_scale > 0)
  548. {
  549. expand_distribution[left_upper - 1] += left_scale * quantize_distribution[target_bin - 1];
  550. }
  551. for (int k = left_upper; k < threshold; k++)
  552. {
  553. expand_distribution[k] += quantize_distribution[target_bin - 1];
  554. }
  555. }
  556. }
  557. // kl
  558. const float kl_divergence = compute_kl_divergence(clip_distribution, expand_distribution);
  559. // the best num of bin
  560. if (kl_divergence < min_kl_divergence)
  561. {
  562. min_kl_divergence = kl_divergence;
  563. target_threshold = threshold;
  564. }
  565. }
  566. stat.threshold = (target_threshold + 0.5f) * stat.absmax / num_histogram_bins;
  567. float scale = 127 / stat.threshold;
  568. bottom_blob_scales[i].create(1);
  569. bottom_blob_scales[i][0] = scale;
  570. }
  571. return 0;
  572. }
  573. static float compute_aciq_gaussian_clip(float absmax, int N, int num_bits = 8)
  574. {
  575. const float alpha_gaussian[8] = {0, 1.71063519, 2.15159277, 2.55913646, 2.93620062, 3.28691474, 3.6151146, 3.92403714};
  576. const double gaussian_const = (0.5 * 0.35) * (1 + sqrt(3.14159265358979323846 * log(4)));
  577. double std = (absmax * 2 * gaussian_const) / sqrt(2 * log(N));
  578. return (float)(alpha_gaussian[num_bits - 1] * std);
  579. }
  580. int QuantNet::quantize_ACIQ()
  581. {
  582. const int input_blob_count = (int)input_blobs.size();
  583. const int conv_layer_count = (int)conv_layers.size();
  584. const int conv_bottom_blob_count = (int)conv_bottom_blobs.size();
  585. const int image_count = (int)listspaths[0].size();
  586. std::vector<ncnn::UnlockedPoolAllocator> blob_allocators(quantize_num_threads);
  587. std::vector<ncnn::UnlockedPoolAllocator> workspace_allocators(quantize_num_threads);
  588. // initialize conv weight scales
  589. #pragma omp parallel for num_threads(quantize_num_threads)
  590. for (int i = 0; i < conv_layer_count; i++)
  591. {
  592. const ncnn::Layer* layer = layers[conv_layers[i]];
  593. if (layer->type == "Convolution")
  594. {
  595. const ncnn::Convolution* convolution = (const ncnn::Convolution*)layer;
  596. const int num_output = convolution->num_output;
  597. const int kernel_w = convolution->kernel_w;
  598. const int kernel_h = convolution->kernel_h;
  599. const int dilation_w = convolution->dilation_w;
  600. const int dilation_h = convolution->dilation_h;
  601. const int stride_w = convolution->stride_w;
  602. const int stride_h = convolution->stride_h;
  603. const int weight_data_size_output = convolution->weight_data_size / num_output;
  604. // int8 winograd F43 needs weight data to use 6bit quantization
  605. // TODO proper condition for winograd 3x3 int8
  606. bool quant_6bit = false;
  607. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  608. quant_6bit = true;
  609. weight_scales[i].create(num_output);
  610. for (int n = 0; n < num_output; n++)
  611. {
  612. const ncnn::Mat weight_data_n = convolution->weight_data.range(weight_data_size_output * n, weight_data_size_output);
  613. float absmax = 0.f;
  614. for (int k = 0; k < weight_data_size_output; k++)
  615. {
  616. absmax = std::max(absmax, (float)fabs(weight_data_n[k]));
  617. }
  618. if (quant_6bit)
  619. {
  620. const float threshold = compute_aciq_gaussian_clip(absmax, weight_data_size_output, 6);
  621. weight_scales[i][n] = 31 / threshold;
  622. }
  623. else
  624. {
  625. const float threshold = compute_aciq_gaussian_clip(absmax, weight_data_size_output);
  626. weight_scales[i][n] = 127 / threshold;
  627. }
  628. }
  629. }
  630. if (layer->type == "ConvolutionDepthWise")
  631. {
  632. const ncnn::ConvolutionDepthWise* convolutiondepthwise = (const ncnn::ConvolutionDepthWise*)layer;
  633. const int group = convolutiondepthwise->group;
  634. const int weight_data_size_output = convolutiondepthwise->weight_data_size / group;
  635. std::vector<float> scales;
  636. weight_scales[i].create(group);
  637. for (int n = 0; n < group; n++)
  638. {
  639. const ncnn::Mat weight_data_n = convolutiondepthwise->weight_data.range(weight_data_size_output * n, weight_data_size_output);
  640. float absmax = 0.f;
  641. for (int k = 0; k < weight_data_size_output; k++)
  642. {
  643. absmax = std::max(absmax, (float)fabs(weight_data_n[k]));
  644. }
  645. const float threshold = compute_aciq_gaussian_clip(absmax, weight_data_size_output);
  646. weight_scales[i][n] = 127 / threshold;
  647. }
  648. }
  649. if (layer->type == "InnerProduct")
  650. {
  651. const ncnn::InnerProduct* innerproduct = (const ncnn::InnerProduct*)layer;
  652. const int num_output = innerproduct->num_output;
  653. const int weight_data_size_output = innerproduct->weight_data_size / num_output;
  654. weight_scales[i].create(num_output);
  655. for (int n = 0; n < num_output; n++)
  656. {
  657. const ncnn::Mat weight_data_n = innerproduct->weight_data.range(weight_data_size_output * n, weight_data_size_output);
  658. float absmax = 0.f;
  659. for (int k = 0; k < weight_data_size_output; k++)
  660. {
  661. absmax = std::max(absmax, (float)fabs(weight_data_n[k]));
  662. }
  663. const float threshold = compute_aciq_gaussian_clip(absmax, weight_data_size_output);
  664. weight_scales[i][n] = 127 / threshold;
  665. }
  666. }
  667. }
  668. // count the absmax
  669. #pragma omp parallel for num_threads(quantize_num_threads) schedule(static, 1)
  670. for (int i = 0; i < image_count; i++)
  671. {
  672. if (i % 100 == 0)
  673. {
  674. fprintf(stderr, "count the absmax %.2f%% [ %d / %d ]\n", i * 100.f / image_count, i, image_count);
  675. }
  676. ncnn::Extractor ex = create_extractor();
  677. ex.set_light_mode(true);
  678. const int thread_num = ncnn::get_omp_thread_num();
  679. ex.set_blob_allocator(&blob_allocators[thread_num]);
  680. ex.set_workspace_allocator(&workspace_allocators[thread_num]);
  681. for (int j = 0; j < input_blob_count; j++)
  682. {
  683. const int type_to_pixel = type_to_pixels[j];
  684. const std::vector<float>& mean_vals = means[j];
  685. const std::vector<float>& norm_vals = norms[j];
  686. int pixel_convert_type = ncnn::Mat::PIXEL_BGR;
  687. if (type_to_pixel != pixel_convert_type)
  688. {
  689. pixel_convert_type = pixel_convert_type | (type_to_pixel << ncnn::Mat::PIXEL_CONVERT_SHIFT);
  690. }
  691. ncnn::Mat in = read_and_resize_image(shapes[j], listspaths[j][i], pixel_convert_type);
  692. in.substract_mean_normalize(mean_vals.data(), norm_vals.data());
  693. ex.input(input_blobs[j], in);
  694. }
  695. for (int j = 0; j < conv_bottom_blob_count; j++)
  696. {
  697. ncnn::Mat out;
  698. ex.extract(conv_bottom_blobs[j], out);
  699. // count absmax
  700. {
  701. float absmax = 0.f;
  702. const int outc = out.c;
  703. const int outsize = out.w * out.h;
  704. for (int p = 0; p < outc; p++)
  705. {
  706. const float* ptr = out.channel(p);
  707. for (int k = 0; k < outsize; k++)
  708. {
  709. absmax = std::max(absmax, (float)fabs(ptr[k]));
  710. }
  711. }
  712. #pragma omp critical
  713. {
  714. QuantBlobStat& stat = quant_blob_stats[j];
  715. stat.absmax = std::max(stat.absmax, absmax);
  716. stat.total = outc * outsize;
  717. }
  718. }
  719. }
  720. }
  721. // alpha gaussian
  722. #pragma omp parallel for num_threads(quantize_num_threads)
  723. for (int i = 0; i < conv_bottom_blob_count; i++)
  724. {
  725. QuantBlobStat& stat = quant_blob_stats[i];
  726. stat.threshold = compute_aciq_gaussian_clip(stat.absmax, stat.total);
  727. float scale = 127 / stat.threshold;
  728. bottom_blob_scales[i].create(1);
  729. bottom_blob_scales[i][0] = scale;
  730. }
  731. return 0;
  732. }
  733. static float cosine_similarity(const ncnn::Mat& a, const ncnn::Mat& b)
  734. {
  735. const int chanenls = a.c;
  736. const int size = a.w * a.h;
  737. float sa = 0;
  738. float sb = 0;
  739. float sum = 0;
  740. for (int p = 0; p < chanenls; p++)
  741. {
  742. const float* pa = a.channel(p);
  743. const float* pb = b.channel(p);
  744. for (int i = 0; i < size; i++)
  745. {
  746. sa += pa[i] * pa[i];
  747. sb += pb[i] * pb[i];
  748. sum += pa[i] * pb[i];
  749. }
  750. }
  751. float sim = (float)sum / sqrt(sa) / sqrt(sb);
  752. return sim;
  753. }
  754. static int get_layer_param(const ncnn::Layer* layer, ncnn::ParamDict& pd)
  755. {
  756. if (layer->type == "Convolution")
  757. {
  758. ncnn::Convolution* convolution = (ncnn::Convolution*)layer;
  759. pd.set(0, convolution->num_output);
  760. pd.set(1, convolution->kernel_w);
  761. pd.set(11, convolution->kernel_h);
  762. pd.set(2, convolution->dilation_w);
  763. pd.set(12, convolution->dilation_h);
  764. pd.set(3, convolution->stride_w);
  765. pd.set(13, convolution->stride_h);
  766. pd.set(4, convolution->pad_left);
  767. pd.set(15, convolution->pad_right);
  768. pd.set(14, convolution->pad_top);
  769. pd.set(16, convolution->pad_bottom);
  770. pd.set(18, convolution->pad_value);
  771. pd.set(5, convolution->bias_term);
  772. pd.set(6, convolution->weight_data_size);
  773. pd.set(8, convolution->int8_scale_term);
  774. pd.set(9, convolution->activation_type);
  775. pd.set(10, convolution->activation_params);
  776. }
  777. else if (layer->type == "ConvolutionDepthWise")
  778. {
  779. ncnn::ConvolutionDepthWise* convolutiondepthwise = (ncnn::ConvolutionDepthWise*)layer;
  780. pd.set(0, convolutiondepthwise->num_output);
  781. pd.set(1, convolutiondepthwise->kernel_w);
  782. pd.set(11, convolutiondepthwise->kernel_h);
  783. pd.set(2, convolutiondepthwise->dilation_w);
  784. pd.set(12, convolutiondepthwise->dilation_h);
  785. pd.set(3, convolutiondepthwise->stride_w);
  786. pd.set(13, convolutiondepthwise->stride_h);
  787. pd.set(4, convolutiondepthwise->pad_left);
  788. pd.set(15, convolutiondepthwise->pad_right);
  789. pd.set(14, convolutiondepthwise->pad_top);
  790. pd.set(16, convolutiondepthwise->pad_bottom);
  791. pd.set(18, convolutiondepthwise->pad_value);
  792. pd.set(5, convolutiondepthwise->bias_term);
  793. pd.set(6, convolutiondepthwise->weight_data_size);
  794. pd.set(7, convolutiondepthwise->group);
  795. pd.set(8, convolutiondepthwise->int8_scale_term);
  796. pd.set(9, convolutiondepthwise->activation_type);
  797. pd.set(10, convolutiondepthwise->activation_params);
  798. }
  799. else if (layer->type == "InnerProduct")
  800. {
  801. ncnn::InnerProduct* innerproduct = (ncnn::InnerProduct*)layer;
  802. pd.set(0, innerproduct->num_output);
  803. pd.set(1, innerproduct->bias_term);
  804. pd.set(2, innerproduct->weight_data_size);
  805. pd.set(8, innerproduct->int8_scale_term);
  806. pd.set(9, innerproduct->activation_type);
  807. pd.set(10, innerproduct->activation_params);
  808. }
  809. else
  810. {
  811. fprintf(stderr, "unexpected layer type %s in get_layer_param\n", layer->type.c_str());
  812. return -1;
  813. }
  814. return 0;
  815. }
  816. static int get_layer_weights(const ncnn::Layer* layer, std::vector<ncnn::Mat>& weights)
  817. {
  818. if (layer->type == "Convolution")
  819. {
  820. ncnn::Convolution* convolution = (ncnn::Convolution*)layer;
  821. weights.push_back(convolution->weight_data);
  822. if (convolution->bias_term)
  823. weights.push_back(convolution->bias_data);
  824. }
  825. else if (layer->type == "ConvolutionDepthWise")
  826. {
  827. ncnn::ConvolutionDepthWise* convolutiondepthwise = (ncnn::ConvolutionDepthWise*)layer;
  828. weights.push_back(convolutiondepthwise->weight_data);
  829. if (convolutiondepthwise->bias_term)
  830. weights.push_back(convolutiondepthwise->bias_data);
  831. }
  832. else if (layer->type == "InnerProduct")
  833. {
  834. ncnn::InnerProduct* innerproduct = (ncnn::InnerProduct*)layer;
  835. weights.push_back(innerproduct->weight_data);
  836. if (innerproduct->bias_term)
  837. weights.push_back(innerproduct->bias_data);
  838. }
  839. else
  840. {
  841. fprintf(stderr, "unexpected layer type %s in get_layer_weights\n", layer->type.c_str());
  842. return -1;
  843. }
  844. return 0;
  845. }
  846. int QuantNet::quantize_EQ()
  847. {
  848. // find the initial scale via KL
  849. quantize_KL();
  850. print_quant_info();
  851. const int input_blob_count = (int)input_blobs.size();
  852. const int conv_layer_count = (int)conv_layers.size();
  853. const int conv_bottom_blob_count = (int)conv_bottom_blobs.size();
  854. std::vector<ncnn::UnlockedPoolAllocator> blob_allocators(quantize_num_threads);
  855. std::vector<ncnn::UnlockedPoolAllocator> workspace_allocators(quantize_num_threads);
  856. // max 50 images for EQ
  857. const int image_count = std::min((int)listspaths[0].size(), 50);
  858. const float scale_range_lower = 0.5f;
  859. const float scale_range_upper = 2.0f;
  860. const int search_steps = 100;
  861. for (int i = 0; i < conv_layer_count; i++)
  862. {
  863. ncnn::Mat& weight_scale = weight_scales[i];
  864. ncnn::Mat& bottom_blob_scale = bottom_blob_scales[i];
  865. const ncnn::Layer* layer = layers[conv_layers[i]];
  866. // search weight scale
  867. for (int j = 0; j < weight_scale.w; j++)
  868. {
  869. const float scale = weight_scale[j];
  870. const float scale_lower = scale * scale_range_lower;
  871. const float scale_upper = scale * scale_range_upper;
  872. const float scale_step = (scale_upper - scale_lower) / search_steps;
  873. std::vector<double> avgsims(search_steps, 0.0);
  874. #pragma omp parallel for num_threads(quantize_num_threads) schedule(static, 1)
  875. for (int ii = 0; ii < image_count; ii++)
  876. {
  877. if (ii % 100 == 0)
  878. {
  879. fprintf(stderr, "search weight scale %.2f%% [ %d / %d ] for %d / %d of %d / %d\n", ii * 100.f / image_count, ii, image_count, j, weight_scale.w, i, conv_layer_count);
  880. }
  881. ncnn::Extractor ex = create_extractor();
  882. ex.set_light_mode(true);
  883. const int thread_num = ncnn::get_omp_thread_num();
  884. ex.set_blob_allocator(&blob_allocators[thread_num]);
  885. ex.set_workspace_allocator(&workspace_allocators[thread_num]);
  886. for (int jj = 0; jj < input_blob_count; jj++)
  887. {
  888. const int type_to_pixel = type_to_pixels[jj];
  889. const std::vector<float>& mean_vals = means[jj];
  890. const std::vector<float>& norm_vals = norms[jj];
  891. int pixel_convert_type = ncnn::Mat::PIXEL_BGR;
  892. if (type_to_pixel != pixel_convert_type)
  893. {
  894. pixel_convert_type = pixel_convert_type | (type_to_pixel << ncnn::Mat::PIXEL_CONVERT_SHIFT);
  895. }
  896. ncnn::Mat in = read_and_resize_image(shapes[jj], listspaths[jj][ii], pixel_convert_type);
  897. in.substract_mean_normalize(mean_vals.data(), norm_vals.data());
  898. ex.input(input_blobs[jj], in);
  899. }
  900. ncnn::Mat in;
  901. ex.extract(conv_bottom_blobs[i], in);
  902. ncnn::Mat out;
  903. ex.extract(conv_top_blobs[i], out);
  904. ncnn::Layer* layer_int8 = ncnn::create_layer(layer->typeindex);
  905. ncnn::ParamDict pd;
  906. get_layer_param(layer, pd);
  907. pd.set(8, 1); //int8_scale_term
  908. layer_int8->load_param(pd);
  909. std::vector<float> sims(search_steps);
  910. for (int k = 0; k < search_steps; k++)
  911. {
  912. ncnn::Mat new_weight_scale = weight_scale.clone();
  913. new_weight_scale[j] = scale_lower + k * scale_step;
  914. std::vector<ncnn::Mat> weights;
  915. get_layer_weights(layer, weights);
  916. weights.push_back(new_weight_scale);
  917. weights.push_back(bottom_blob_scale);
  918. layer_int8->load_model(ncnn::ModelBinFromMatArray(weights.data()));
  919. ncnn::Option opt_int8;
  920. opt_int8.use_packing_layout = false;
  921. layer_int8->create_pipeline(opt_int8);
  922. ncnn::Mat out_int8;
  923. layer_int8->forward(in, out_int8, opt_int8);
  924. layer_int8->destroy_pipeline(opt_int8);
  925. sims[k] = cosine_similarity(out, out_int8);
  926. }
  927. delete layer_int8;
  928. #pragma omp critical
  929. {
  930. for (int k = 0; k < search_steps; k++)
  931. {
  932. avgsims[k] += sims[k];
  933. }
  934. }
  935. }
  936. double max_avgsim = 0.0;
  937. float new_scale = scale;
  938. // find the scale with min cosine distance
  939. for (int k = 0; k < search_steps; k++)
  940. {
  941. if (max_avgsim < avgsims[k])
  942. {
  943. max_avgsim = avgsims[k];
  944. new_scale = scale_lower + k * scale_step;
  945. }
  946. }
  947. fprintf(stderr, "%s w %d = %f -> %f\n", layer->name.c_str(), j, scale, new_scale);
  948. weight_scale[j] = new_scale;
  949. }
  950. // search bottom blob scale
  951. for (int j = 0; j < bottom_blob_scale.w; j++)
  952. {
  953. const float scale = bottom_blob_scale[j];
  954. const float scale_lower = scale * scale_range_lower;
  955. const float scale_upper = scale * scale_range_upper;
  956. const float scale_step = (scale_upper - scale_lower) / search_steps;
  957. std::vector<double> avgsims(search_steps, 0.0);
  958. #pragma omp parallel for num_threads(quantize_num_threads) schedule(static, 1)
  959. for (int ii = 0; ii < image_count; ii++)
  960. {
  961. if (ii % 100 == 0)
  962. {
  963. fprintf(stderr, "search bottom blob scale %.2f%% [ %d / %d ] for %d / %d of %d / %d\n", ii * 100.f / image_count, ii, image_count, j, bottom_blob_scale.w, i, conv_layer_count);
  964. }
  965. ncnn::Extractor ex = create_extractor();
  966. ex.set_light_mode(true);
  967. const int thread_num = ncnn::get_omp_thread_num();
  968. ex.set_blob_allocator(&blob_allocators[thread_num]);
  969. ex.set_workspace_allocator(&workspace_allocators[thread_num]);
  970. for (int jj = 0; jj < input_blob_count; jj++)
  971. {
  972. const int type_to_pixel = type_to_pixels[jj];
  973. const std::vector<float>& mean_vals = means[jj];
  974. const std::vector<float>& norm_vals = norms[jj];
  975. int pixel_convert_type = ncnn::Mat::PIXEL_BGR;
  976. if (type_to_pixel != pixel_convert_type)
  977. {
  978. pixel_convert_type = pixel_convert_type | (type_to_pixel << ncnn::Mat::PIXEL_CONVERT_SHIFT);
  979. }
  980. ncnn::Mat in = read_and_resize_image(shapes[jj], listspaths[jj][ii], pixel_convert_type);
  981. in.substract_mean_normalize(mean_vals.data(), norm_vals.data());
  982. ex.input(input_blobs[jj], in);
  983. }
  984. ncnn::Mat in;
  985. ex.extract(conv_bottom_blobs[i], in);
  986. ncnn::Mat out;
  987. ex.extract(conv_top_blobs[i], out);
  988. ncnn::Layer* layer_int8 = ncnn::create_layer(layer->typeindex);
  989. ncnn::ParamDict pd;
  990. get_layer_param(layer, pd);
  991. pd.set(8, 1); //int8_scale_term
  992. layer_int8->load_param(pd);
  993. std::vector<float> sims(search_steps);
  994. for (int k = 0; k < search_steps; k++)
  995. {
  996. ncnn::Mat new_bottom_blob_scale = bottom_blob_scale.clone();
  997. new_bottom_blob_scale[j] = scale_lower + k * scale_step;
  998. std::vector<ncnn::Mat> weights;
  999. get_layer_weights(layer, weights);
  1000. weights.push_back(weight_scale);
  1001. weights.push_back(new_bottom_blob_scale);
  1002. layer_int8->load_model(ncnn::ModelBinFromMatArray(weights.data()));
  1003. ncnn::Option opt_int8;
  1004. opt_int8.use_packing_layout = false;
  1005. layer_int8->create_pipeline(opt_int8);
  1006. ncnn::Mat out_int8;
  1007. layer_int8->forward(in, out_int8, opt_int8);
  1008. layer_int8->destroy_pipeline(opt_int8);
  1009. sims[k] = cosine_similarity(out, out_int8);
  1010. }
  1011. delete layer_int8;
  1012. #pragma omp critical
  1013. {
  1014. for (int k = 0; k < search_steps; k++)
  1015. {
  1016. avgsims[k] += sims[k];
  1017. }
  1018. }
  1019. }
  1020. double max_avgsim = 0.0;
  1021. float new_scale = scale;
  1022. // find the scale with min cosine distance
  1023. for (int k = 0; k < search_steps; k++)
  1024. {
  1025. if (max_avgsim < avgsims[k])
  1026. {
  1027. max_avgsim = avgsims[k];
  1028. new_scale = scale_lower + k * scale_step;
  1029. }
  1030. }
  1031. fprintf(stderr, "%s b %d = %f -> %f\n", layer->name.c_str(), j, scale, new_scale);
  1032. bottom_blob_scale[j] = new_scale;
  1033. }
  1034. // update quant info
  1035. QuantBlobStat& stat = quant_blob_stats[i];
  1036. stat.threshold = 127 / bottom_blob_scale[0];
  1037. }
  1038. return 0;
  1039. }
  // Loads the image lists named by a comma-separated string of list files.
  //
  // s is split in place with strtok (it is modified). Each named file is read
  // line by line; the first whitespace-delimited word of every line is taken
  // as one path, and lines without such a word are skipped. Lines longer than
  // the 1024-byte read buffer are consumed in pieces by fgets.
  //
  // Returns one vector of paths per list file, in order. If a file cannot be
  // opened, an error is printed and parsing stops there, so the result then
  // holds only the lists read so far.
  static std::vector<std::vector<std::string> > parse_comma_path_list(char* s)
  {
      std::vector<std::vector<std::string> > aps;

      char* pch = strtok(s, ",");
      while (pch != NULL)
      {
          FILE* fp = fopen(pch, "rb");
          if (!fp)
          {
              fprintf(stderr, "fopen %s failed\n", pch);
              break;
          }

          std::vector<std::string> paths;

          // one filepath per line
          char line[1024];
          while (fgets(line, sizeof(line), fp))
          {
              // same capacity as the line buffer, so a word that fits in the
              // line is never cut short (the width below is sizeof(line) - 1)
              char filepath[sizeof(line)];
              int nscan = sscanf(line, "%1023s", filepath);
              if (nscan != 1)
                  continue;

              paths.push_back(std::string(filepath));
          }

          fclose(fp);

          aps.push_back(paths);

          pch = strtok(NULL, ",");
      }

      return aps;
  }
  // Converts a decimal string to float without relying on the C locale.
  //
  // Accepted form: [+|-] digits [. digits] [(e|E) [+|-] digits]
  // Parsing stops at the first character that does not fit; whatever was
  // read up to that point determines the result (an empty number gives 0).
  static float vstr_to_float(const char vstr[20])
  {
      double v = 0.0;

      const char* p = vstr;

      // sign
      bool sign = *p != '-';
      if (*p == '+' || *p == '-')
      {
          p++;
      }

      // digits before decimal point or exponent
      // (isdigit needs an unsigned char value; a plain char may be negative)
      uint64_t v1 = 0;
      while (isdigit((unsigned char)*p))
      {
          v1 = v1 * 10 + (*p - '0');
          p++;
      }

      v = (double)v1;

      // digits after decimal point
      if (*p == '.')
      {
          p++;

          uint64_t pow10 = 1;
          uint64_t v2 = 0;

          while (isdigit((unsigned char)*p))
          {
              v2 = v2 * 10 + (*p - '0');
              pow10 *= 10;
              p++;
          }

          v += v2 / (double)pow10;
      }

      // exponent
      if (*p == 'e' || *p == 'E')
      {
          p++;

          // sign of exponent
          bool fact = *p != '-';
          if (*p == '+' || *p == '-')
          {
              p++;
          }

          // digits of exponent
          uint64_t expon = 0;
          while (isdigit((unsigned char)*p))
          {
              expon = expon * 10 + (*p - '0');
              p++;
          }

          // A double overflows to infinity beyond 1e308, so any exponent above
          // that yields the same scale. Clamping keeps the loops below short
          // instead of spinning once per unit of a huge exponent.
          if (expon > 400)
          {
              expon = 400;
          }

          double scale = 1.0;
          while (expon >= 8)
          {
              scale *= 1e8;
              expon -= 8;
          }
          while (expon > 0)
          {
              scale *= 10.0;
              expon -= 1;
          }

          v = fact ? v * scale : v / scale;
      }

      return sign ? (float)v : (float)-v;
  }
  1137. static std::vector<std::vector<float> > parse_comma_float_array_list(char* s)
  1138. {
  1139. std::vector<std::vector<float> > aaf;
  1140. char* pch = strtok(s, "[]");
  1141. while (pch != NULL)
  1142. {
  1143. // parse a,b,c
  1144. char vstr[20];
  1145. int nconsumed = 0;
  1146. int nscan = sscanf(pch, "%19[^,]%n", vstr, &nconsumed);
  1147. if (nscan == 1)
  1148. {
  1149. // ok we get array
  1150. pch += nconsumed;
  1151. std::vector<float> af;
  1152. float v = vstr_to_float(vstr);
  1153. af.push_back(v);
  1154. nscan = sscanf(pch, ",%19[^,]%n", vstr, &nconsumed);
  1155. while (nscan == 1)
  1156. {
  1157. pch += nconsumed;
  1158. float v = vstr_to_float(vstr);
  1159. af.push_back(v);
  1160. nscan = sscanf(pch, ",%19[^,]%n", vstr, &nconsumed);
  1161. }
  1162. // array end
  1163. aaf.push_back(af);
  1164. }
  1165. pch = strtok(NULL, "[]");
  1166. }
  1167. return aaf;
  1168. }
  // Parses a list of bracketed int arrays such as "[224,224,3],[0,0]".
  //
  // s is split in place with strtok on '[' and ']' (it is modified). Every
  // piece that starts with an integer becomes one array of its
  // comma-separated integers; other pieces (for example the "," between two
  // arrays) are skipped.
  static std::vector<std::vector<int> > parse_comma_int_array_list(char* s)
  {
      std::vector<std::vector<int> > aai;

      for (char* cursor = strtok(s, "[]"); cursor != NULL; cursor = strtok(NULL, "[]"))
      {
          int value;
          int used = 0;

          // the first value has no leading comma
          if (sscanf(cursor, "%d%n", &value, &used) != 1)
          {
              continue;
          }

          std::vector<int> values;
          do
          {
              cursor += used;
              values.push_back(value);
              // every further value must be introduced by a comma
          } while (sscanf(cursor, ",%d%n", &value, &used) == 1);

          aai.push_back(values);
      }

      return aai;
  }
  1199. static std::vector<int> parse_comma_pixel_type_list(char* s)
  1200. {
  1201. std::vector<int> aps;
  1202. char* pch = strtok(s, ",");
  1203. while (pch != NULL)
  1204. {
  1205. // RAW/RGB/BGR/GRAY/RGBA/BGRA
  1206. if (strcmp(pch, "RAW") == 0)
  1207. aps.push_back(-233);
  1208. if (strcmp(pch, "RGB") == 0)
  1209. aps.push_back(ncnn::Mat::PIXEL_RGB);
  1210. if (strcmp(pch, "BGR") == 0)
  1211. aps.push_back(ncnn::Mat::PIXEL_BGR);
  1212. if (strcmp(pch, "GRAY") == 0)
  1213. aps.push_back(ncnn::Mat::PIXEL_GRAY);
  1214. if (strcmp(pch, "RGBA") == 0)
  1215. aps.push_back(ncnn::Mat::PIXEL_RGBA);
  1216. if (strcmp(pch, "BGRA") == 0)
  1217. aps.push_back(ncnn::Mat::PIXEL_BGRA);
  1218. pch = strtok(NULL, ",");
  1219. }
  1220. return aps;
  1221. }
  // Writes a list of float arrays to stderr as "[a,b,c],[d,e]" using "%f"
  // for every value; no trailing newline is written.
  static void print_float_array_list(const std::vector<std::vector<float> >& list)
  {
      for (size_t a = 0; a < list.size(); a++)
      {
          // separators are written in front of every item but the first
          if (a > 0)
              fprintf(stderr, ",");

          const std::vector<float>& values = list[a];

          fprintf(stderr, "[");
          for (size_t e = 0; e < values.size(); e++)
          {
              if (e > 0)
                  fprintf(stderr, ",");
              fprintf(stderr, "%f", values[e]);
          }
          fprintf(stderr, "]");
      }
  }
  // Writes a list of int arrays to stderr as "[a,b,c],[d,e]"; no trailing
  // newline is written.
  static void print_int_array_list(const std::vector<std::vector<int> >& list)
  {
      for (size_t a = 0; a < list.size(); a++)
      {
          // separators are written in front of every item but the first
          if (a > 0)
              fprintf(stderr, ",");

          const std::vector<int>& values = list[a];

          fprintf(stderr, "[");
          for (size_t e = 0; e < values.size(); e++)
          {
              if (e > 0)
                  fprintf(stderr, ",");
              fprintf(stderr, "%d", values[e]);
          }
          fprintf(stderr, "]");
      }
  }
  1256. static void print_pixel_type_list(const std::vector<int>& list)
  1257. {
  1258. for (size_t i = 0; i < list.size(); i++)
  1259. {
  1260. const int type = list[i];
  1261. if (type == -233)
  1262. fprintf(stderr, "RAW");
  1263. if (type == ncnn::Mat::PIXEL_RGB)
  1264. fprintf(stderr, "RGB");
  1265. if (type == ncnn::Mat::PIXEL_BGR)
  1266. fprintf(stderr, "BGR");
  1267. if (type == ncnn::Mat::PIXEL_GRAY)
  1268. fprintf(stderr, "GRAY");
  1269. if (type == ncnn::Mat::PIXEL_RGBA)
  1270. fprintf(stderr, "RGBA");
  1271. if (type == ncnn::Mat::PIXEL_BGRA)
  1272. fprintf(stderr, "BGRA");
  1273. if (i != list.size() - 1)
  1274. fprintf(stderr, ",");
  1275. }
  1276. }
  // Prints the command-line synopsis, the supported key=value options and a
  // sample invocation to stderr.
  static void show_usage()
  {
      static const char* const usage_lines[] = {
          "Usage: ncnn2table [ncnnparam] [ncnnbin] [list,...] [ncnntable] [(key=value)...]\n",
          " mean=[104.0,117.0,123.0],...\n",
          " norm=[1.0,1.0,1.0],...\n",
          " shape=[224,224,3],...[w,h,c] or [w,h] **[0,0] will not resize\n",
          " pixel=RAW/RGB/BGR/GRAY/RGBA/BGRA,...\n",
          " thread=8\n",
          " method=kl/aciq/eq\n",
          "Sample usage: ncnn2table squeezenet.param squeezenet.bin imagelist.txt squeezenet.table mean=[104.0,117.0,123.0] norm=[1.0,1.0,1.0] shape=[227,227,3] pixel=BGR method=kl\n",
      };

      const size_t line_count = sizeof(usage_lines) / sizeof(usage_lines[0]);
      for (size_t n = 0; n < line_count; n++)
      {
          // the lines hold no conversion specifiers, so they are written verbatim
          fputs(usage_lines[n], stderr);
      }
  }
  1288. int main(int argc, char** argv)
  1289. {
  1290. if (argc < 5)
  1291. {
  1292. show_usage();
  1293. return -1;
  1294. }
  1295. for (int i = 1; i < argc; i++)
  1296. {
  1297. if (argv[i][0] == '-')
  1298. {
  1299. show_usage();
  1300. return -1;
  1301. }
  1302. }
  1303. const char* inparam = argv[1];
  1304. const char* inbin = argv[2];
  1305. char* lists = argv[3];
  1306. const char* outtable = argv[4];
  1307. ncnn::Option opt;
  1308. opt.num_threads = 1;
  1309. opt.lightmode = false;
  1310. opt.use_fp16_packed = false;
  1311. opt.use_fp16_storage = false;
  1312. opt.use_fp16_arithmetic = false;
  1313. QuantNet net;
  1314. net.opt = opt;
  1315. net.load_param(inparam);
  1316. net.load_model(inbin);
  1317. net.init();
  1318. // load lists
  1319. net.listspaths = parse_comma_path_list(lists);
  1320. std::string method = "kl";
  1321. for (int i = 5; i < argc; i++)
  1322. {
  1323. // key=value
  1324. char* kv = argv[i];
  1325. char* eqs = strchr(kv, '=');
  1326. if (eqs == NULL)
  1327. {
  1328. fprintf(stderr, "unrecognized arg %s\n", kv);
  1329. continue;
  1330. }
  1331. // split k v
  1332. eqs[0] = '\0';
  1333. const char* key = kv;
  1334. char* value = eqs + 1;
  1335. // load mean norm shape
  1336. if (memcmp(key, "mean", 4) == 0)
  1337. net.means = parse_comma_float_array_list(value);
  1338. if (memcmp(key, "norm", 4) == 0)
  1339. net.norms = parse_comma_float_array_list(value);
  1340. if (memcmp(key, "shape", 5) == 0)
  1341. net.shapes = parse_comma_int_array_list(value);
  1342. if (memcmp(key, "pixel", 5) == 0)
  1343. net.type_to_pixels = parse_comma_pixel_type_list(value);
  1344. if (memcmp(key, "thread", 6) == 0)
  1345. net.quantize_num_threads = atoi(value);
  1346. if (memcmp(key, "method", 6) == 0)
  1347. method = std::string(value);
  1348. }
  1349. // sanity check
  1350. const size_t input_blob_count = net.input_blobs.size();
  1351. if (net.listspaths.size() != input_blob_count)
  1352. {
  1353. fprintf(stderr, "expect %d lists, but got %d\n", (int)input_blob_count, (int)net.listspaths.size());
  1354. return -1;
  1355. }
  1356. if (net.means.size() != input_blob_count)
  1357. {
  1358. fprintf(stderr, "expect %d means, but got %d\n", (int)input_blob_count, (int)net.means.size());
  1359. return -1;
  1360. }
  1361. if (net.norms.size() != input_blob_count)
  1362. {
  1363. fprintf(stderr, "expect %d norms, but got %d\n", (int)input_blob_count, (int)net.norms.size());
  1364. return -1;
  1365. }
  1366. if (net.shapes.size() != input_blob_count)
  1367. {
  1368. fprintf(stderr, "expect %d shapes, but got %d\n", (int)input_blob_count, (int)net.shapes.size());
  1369. return -1;
  1370. }
  1371. if (net.type_to_pixels.size() != input_blob_count)
  1372. {
  1373. fprintf(stderr, "expect %d pixels, but got %d\n", (int)input_blob_count, (int)net.type_to_pixels.size());
  1374. return -1;
  1375. }
  1376. if (net.quantize_num_threads < 0)
  1377. {
  1378. fprintf(stderr, "malformed thread %d\n", net.quantize_num_threads);
  1379. return -1;
  1380. }
  1381. // print quantnet config
  1382. {
  1383. fprintf(stderr, "mean = ");
  1384. print_float_array_list(net.means);
  1385. fprintf(stderr, "\n");
  1386. fprintf(stderr, "norm = ");
  1387. print_float_array_list(net.norms);
  1388. fprintf(stderr, "\n");
  1389. fprintf(stderr, "shape = ");
  1390. print_int_array_list(net.shapes);
  1391. fprintf(stderr, "\n");
  1392. fprintf(stderr, "pixel = ");
  1393. print_pixel_type_list(net.type_to_pixels);
  1394. fprintf(stderr, "\n");
  1395. fprintf(stderr, "thread = %d\n", net.quantize_num_threads);
  1396. fprintf(stderr, "method = %s\n", method.c_str());
  1397. fprintf(stderr, "---------------------------------------\n");
  1398. }
  1399. if (method == "kl")
  1400. {
  1401. net.quantize_KL();
  1402. }
  1403. else if (method == "aciq")
  1404. {
  1405. net.quantize_ACIQ();
  1406. }
  1407. else if (method == "eq")
  1408. {
  1409. net.quantize_EQ();
  1410. }
  1411. else
  1412. {
  1413. fprintf(stderr, "not implemented yet !\n");
  1414. fprintf(stderr, "unknown method %s, expect kl / aciq / eq\n", method.c_str());
  1415. return -1;
  1416. }
  1417. net.print_quant_info();
  1418. net.save_table(outtable);
  1419. return 0;
  1420. }