You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

ncnn2table.cpp 59 kB

5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841
  1. // Tencent is pleased to support the open source community by making ncnn available.
  2. //
  3. // author:BUG1989 (https://github.com/BUG1989/) Long-term support.
  4. // author:JansonZhu (https://github.com/JansonZhu) Implemented the function of entropy calibration.
  5. //
  6. // Copyright (C) 2019 BUG1989. All rights reserved.
  7. // Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
  8. //
  9. // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  10. // in compliance with the License. You may obtain a copy of the License at
  11. //
  12. // https://opensource.org/licenses/BSD-3-Clause
  13. //
  14. // Unless required by applicable law or agreed to in writing, software distributed
  15. // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
  16. // CONDITIONS OF ANY KIND, either express or implied. See the License for the
  17. // specific language governing permissions and limitations under the License.
  18. #ifdef _MSC_VER
  19. #define _CRT_SECURE_NO_DEPRECATE
  20. #endif
  21. #include <float.h>
  22. #include <limits.h>
  23. #include <math.h>
  24. #include <stdio.h>
  25. #include <stdint.h>
  26. #include <stdlib.h>
  27. #include <string.h>
  28. #if defined(USE_NCNN_SIMPLEOCV)
  29. #include "simpleocv.h"
  30. #elif defined(USE_LOCAL_IMREADWRITE)
  31. #include "imreadwrite.h"
  32. #else
  33. #include <opencv2/core/core.hpp>
  34. #include <opencv2/highgui/highgui.hpp>
  35. #endif
  36. #include <string>
  37. #include <vector>
  38. // npy format header
  39. #include "npy.hpp"
  40. // ncnn public header
  41. #include "benchmark.h"
  42. #include "cpu.h"
  43. #include "net.h"
  44. // ncnn private header
  45. #include "layer/convolution.h"
  46. #include "layer/convolutiondepthwise.h"
  47. #include "layer/innerproduct.h"
  48. class QuantBlobStat
  49. {
  50. public:
  51. QuantBlobStat()
  52. {
  53. threshold = 0.f;
  54. absmax = 0.f;
  55. total = 0;
  56. }
  57. public:
  58. float threshold;
  59. float absmax;
  60. // ACIQ
  61. int total;
  62. // KL
  63. std::vector<uint64_t> histogram;
  64. std::vector<float> histogram_normed;
  65. };
  66. class QuantNet : public ncnn::Net
  67. {
  68. public:
  69. QuantNet();
  70. std::vector<ncnn::Blob>& blobs;
  71. std::vector<ncnn::Layer*>& layers;
  72. public:
  73. std::vector<std::vector<std::string> > listspaths;
  74. std::vector<std::vector<float> > means;
  75. std::vector<std::vector<float> > norms;
  76. std::vector<std::vector<int> > shapes;
  77. std::vector<int> type_to_pixels;
  78. int quantize_num_threads;
  79. int file_type;
  80. public:
  81. int init();
  82. void print_quant_info() const;
  83. int save_table(const char* tablepath);
  84. int quantize_KL();
  85. int quantize_ACIQ();
  86. int quantize_EQ();
  87. public:
  88. std::vector<int> input_blobs;
  89. std::vector<int> conv_layers;
  90. std::vector<int> conv_bottom_blobs;
  91. std::vector<int> conv_top_blobs;
  92. // result
  93. std::vector<QuantBlobStat> quant_blob_stats;
  94. std::vector<ncnn::Mat> weight_scales;
  95. std::vector<ncnn::Mat> bottom_blob_scales;
  96. };
  97. QuantNet::QuantNet()
  98. : blobs(mutable_blobs()), layers(mutable_layers())
  99. {
  100. quantize_num_threads = ncnn::get_cpu_count();
  101. }
  102. int QuantNet::init()
  103. {
  104. // find all input layers
  105. for (int i = 0; i < (int)layers.size(); i++)
  106. {
  107. const ncnn::Layer* layer = layers[i];
  108. if (layer->type == "Input")
  109. {
  110. input_blobs.push_back(layer->tops[0]);
  111. }
  112. }
  113. // find all conv layers
  114. for (int i = 0; i < (int)layers.size(); i++)
  115. {
  116. const ncnn::Layer* layer = layers[i];
  117. if (layer->type == "Convolution" || layer->type == "ConvolutionDepthWise" || layer->type == "InnerProduct")
  118. {
  119. conv_layers.push_back(i);
  120. conv_bottom_blobs.push_back(layer->bottoms[0]);
  121. conv_top_blobs.push_back(layer->tops[0]);
  122. }
  123. }
  124. const int conv_layer_count = (int)conv_layers.size();
  125. const int conv_bottom_blob_count = (int)conv_bottom_blobs.size();
  126. quant_blob_stats.resize(conv_bottom_blob_count);
  127. weight_scales.resize(conv_layer_count);
  128. bottom_blob_scales.resize(conv_bottom_blob_count);
  129. return 0;
  130. }
  131. int QuantNet::save_table(const char* tablepath)
  132. {
  133. FILE* fp = fopen(tablepath, "wb");
  134. if (!fp)
  135. {
  136. fprintf(stderr, "fopen %s failed\n", tablepath);
  137. return -1;
  138. }
  139. const int conv_layer_count = (int)conv_layers.size();
  140. const int conv_bottom_blob_count = (int)conv_bottom_blobs.size();
  141. fprintf(stdout, "param:%d\n", conv_layer_count);
  142. for (int i = 0; i < conv_layer_count; i++)
  143. {
  144. const ncnn::Mat& weight_scale = weight_scales[i];
  145. fprintf(fp, "%s_param_0 ", layers[conv_layers[i]]->name.c_str());
  146. for (int j = 0; j < weight_scale.w; j++)
  147. {
  148. fprintf(fp, "%f ", weight_scale[j]);
  149. }
  150. fprintf(fp, "\n");
  151. }
  152. for (int i = 0; i < conv_bottom_blob_count; i++)
  153. {
  154. const ncnn::Mat& bottom_blob_scale = bottom_blob_scales[i];
  155. fprintf(fp, "%s ", layers[conv_layers[i]]->name.c_str());
  156. for (int j = 0; j < bottom_blob_scale.w; j++)
  157. {
  158. fprintf(fp, "%f ", bottom_blob_scale[j]);
  159. }
  160. fprintf(fp, "\n");
  161. }
  162. fclose(fp);
  163. fprintf(stderr, "ncnn int8 calibration table create success, best wish for your int8 inference has a low accuracy loss...\\(^0^)/...233...\n");
  164. return 0;
  165. }
  166. void QuantNet::print_quant_info() const
  167. {
  168. for (int i = 0; i < (int)conv_bottom_blobs.size(); i++)
  169. {
  170. const QuantBlobStat& stat = quant_blob_stats[i];
  171. float scale = 127 / stat.threshold;
  172. fprintf(stderr, "%-40s : max = %-15f threshold = %-15f scale = %-15f\n", layers[conv_layers[i]]->name.c_str(), stat.absmax, stat.threshold, scale);
  173. }
  174. }
  175. /**
  176. * Read npy file
  177. * shape is input as [w,h,...]
  178. * @return ncnn::Mat
  179. */
  180. inline ncnn::Mat read_npy(const std::vector<int>& shape, const std::string& npypath)
  181. {
  182. npy::npy_data<float> d;
  183. try
  184. {
  185. d = npy::read_npy<float>(npypath);
  186. }
  187. catch (const std::exception& e)
  188. {
  189. fprintf(stderr, "npy::read_npy exception: %s\n", e.what());
  190. std::exit(EXIT_FAILURE);
  191. }
  192. std::vector<unsigned long> npy_shape = d.shape;
  193. size_t dims = shape.size();
  194. if (dims != npy_shape.size())
  195. {
  196. fprintf(stderr, "expect %d dims, but got: %d\n", (int)dims, (int)npy_shape.size());
  197. std::exit(EXIT_FAILURE);
  198. }
  199. for (size_t i = 0; i < dims; ++i)
  200. {
  201. if (static_cast<unsigned long>(shape[i]) != npy_shape[dims - 1 - i])
  202. {
  203. fprintf(stderr, "shape mismatch!\n");
  204. std::exit(EXIT_FAILURE);
  205. }
  206. }
  207. switch (dims)
  208. {
  209. case 1:
  210. return ncnn::Mat(shape[0], (void*)(d.data.data())).reshape(shape[0]).clone();
  211. case 2:
  212. return ncnn::Mat(shape[0] * shape[1], (void*)(d.data.data())).reshape(shape[0], shape[1]).clone();
  213. case 3:
  214. return ncnn::Mat(shape[0] * shape[1] * shape[2], (void*)(d.data.data())).reshape(shape[0], shape[1], shape[2]).clone();
  215. case 4:
  216. return ncnn::Mat(shape[0] * shape[1] * shape[2] * shape[3], (void*)(d.data.data())).reshape(shape[0], shape[1], shape[2], shape[3]).clone();
  217. default:
  218. fprintf(stderr, "dims:%d illegal!", (int)dims);
  219. return ncnn::Mat();
  220. }
  221. }
  222. /**
  223. * Read and resize image
  224. * shape is input as [w,h,...]
  225. * if w and h both are given, image will be resized to exactly size.
  226. * if w and h both are zero or negative, image will not be resized.
  227. * if only h is zero or negative, image's width will scaled resize to w, keeping aspect ratio.
  228. * if only w is zero or negative, image's height will scaled resize to h
  229. * @return ncnn::Mat
  230. */
  231. inline ncnn::Mat read_and_resize_image(const std::vector<int>& shape, const std::string& imagepath, int pixel_convert_type)
  232. {
  233. int target_w = shape[0];
  234. int target_h = shape[1];
  235. cv::Mat bgr = cv::imread(imagepath, 1);
  236. if (target_h <= 0 && target_w <= 0)
  237. {
  238. return ncnn::Mat::from_pixels(bgr.data, pixel_convert_type, bgr.cols, bgr.rows);
  239. }
  240. if (target_h <= 0 || target_w <= 0)
  241. {
  242. float scale = 1.0;
  243. if (target_h <= 0)
  244. {
  245. scale = 1.0 * bgr.cols / target_w;
  246. target_h = int(1.0 * bgr.rows / scale);
  247. }
  248. if (target_w <= 0)
  249. {
  250. scale = 1.0 * bgr.rows / target_h;
  251. target_w = int(1.0 * bgr.cols / scale);
  252. }
  253. }
  254. return ncnn::Mat::from_pixels_resize(bgr.data, pixel_convert_type, bgr.cols, bgr.rows, target_w, target_h);
  255. }
  256. static float compute_kl_divergence(const std::vector<float>& a, const std::vector<float>& b)
  257. {
  258. const size_t length = a.size();
  259. float result = 0;
  260. for (size_t i = 0; i < length; i++)
  261. {
  262. result += a[i] * log(a[i] / b[i]);
  263. }
  264. return result;
  265. }
  // KL-divergence calibration.
  //
  // 1. weight scales: per output channel (per group for depthwise), from the
  //    absolute maximum of the weights.
  // 2. pass over the calibration files recording the absmax of every conv bottom blob.
  // 3. second pass building a 2048-bin histogram of |activation| / absmax per blob.
  // 4. for every candidate threshold bin in [128, 2048), compare the clipped
  //    histogram with its 128-level quantized and re-expanded version and keep
  //    the threshold with the smallest KL divergence.
  //
  // Results are stored in weight_scales, quant_blob_stats and bottom_blob_scales.
  // Always returns 0.
  266. int QuantNet::quantize_KL()
  267. {
  268. const int input_blob_count = (int)input_blobs.size();
  269. const int conv_layer_count = (int)conv_layers.size();
  270. const int conv_bottom_blob_count = (int)conv_bottom_blobs.size();
  // the file list of the first input defines the number of calibration samples
  271. const int file_count = (int)listspaths[0].size();
  272. const int num_histogram_bins = 2048;
  // one allocator pair per thread, indexed by the omp thread number below
  273. std::vector<ncnn::UnlockedPoolAllocator> blob_allocators(quantize_num_threads);
  274. std::vector<ncnn::UnlockedPoolAllocator> workspace_allocators(quantize_num_threads);
  275. // initialize conv weight scales
  276. #pragma omp parallel for num_threads(quantize_num_threads)
  277. for (int i = 0; i < conv_layer_count; i++)
  278. {
  279. const ncnn::Layer* layer = layers[conv_layers[i]];
  280. if (layer->type == "Convolution")
  281. {
  282. const ncnn::Convolution* convolution = (const ncnn::Convolution*)layer;
  283. const int num_output = convolution->num_output;
  284. const int kernel_w = convolution->kernel_w;
  285. const int kernel_h = convolution->kernel_h;
  286. const int dilation_w = convolution->dilation_w;
  287. const int dilation_h = convolution->dilation_h;
  288. const int stride_w = convolution->stride_w;
  289. const int stride_h = convolution->stride_h;
  // number of weights belonging to one output channel
  290. const int weight_data_size_output = convolution->weight_data_size / num_output;
  291. // int8 winograd F43 needs weight data to use 6bit quantization
  292. // TODO proper condition for winograd 3x3 int8
  293. bool quant_6bit = false;
  294. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  295. quant_6bit = true;
  296. weight_scales[i].create(num_output);
  297. for (int n = 0; n < num_output; n++)
  298. {
  299. const ncnn::Mat weight_data_n = convolution->weight_data.range(weight_data_size_output * n, weight_data_size_output);
  300. float absmax = 0.f;
  301. for (int k = 0; k < weight_data_size_output; k++)
  302. {
  303. absmax = std::max(absmax, (float)fabs(weight_data_n[k]));
  304. }
  // NOTE(review): absmax stays 0 for an all-zero channel, which makes the division below yield inf
  305. if (quant_6bit)
  306. {
  307. weight_scales[i][n] = 31 / absmax;
  308. }
  309. else
  310. {
  311. weight_scales[i][n] = 127 / absmax;
  312. }
  313. }
  314. }
  315. if (layer->type == "ConvolutionDepthWise")
  316. {
  317. const ncnn::ConvolutionDepthWise* convolutiondepthwise = (const ncnn::ConvolutionDepthWise*)layer;
  318. const int group = convolutiondepthwise->group;
  // depthwise weights get one scale per group
  319. const int weight_data_size_output = convolutiondepthwise->weight_data_size / group;
  // NOTE(review): scales is never used in this block
  320. std::vector<float> scales;
  321. weight_scales[i].create(group);
  322. for (int n = 0; n < group; n++)
  323. {
  324. const ncnn::Mat weight_data_n = convolutiondepthwise->weight_data.range(weight_data_size_output * n, weight_data_size_output);
  325. float absmax = 0.f;
  326. for (int k = 0; k < weight_data_size_output; k++)
  327. {
  328. absmax = std::max(absmax, (float)fabs(weight_data_n[k]));
  329. }
  330. weight_scales[i][n] = 127 / absmax;
  331. }
  332. }
  333. if (layer->type == "InnerProduct")
  334. {
  335. const ncnn::InnerProduct* innerproduct = (const ncnn::InnerProduct*)layer;
  336. const int num_output = innerproduct->num_output;
  337. const int weight_data_size_output = innerproduct->weight_data_size / num_output;
  338. weight_scales[i].create(num_output);
  339. for (int n = 0; n < num_output; n++)
  340. {
  341. const ncnn::Mat weight_data_n = innerproduct->weight_data.range(weight_data_size_output * n, weight_data_size_output);
  342. float absmax = 0.f;
  343. for (int k = 0; k < weight_data_size_output; k++)
  344. {
  345. absmax = std::max(absmax, (float)fabs(weight_data_n[k]));
  346. }
  347. weight_scales[i][n] = 127 / absmax;
  348. }
  349. }
  350. }
  351. // count the absmax
  352. #pragma omp parallel for num_threads(quantize_num_threads) schedule(static, 1)
  353. for (int i = 0; i < file_count; i++)
  354. {
  // progress report every 100 files
  355. if (i % 100 == 0)
  356. {
  357. fprintf(stderr, "count the absmax %.2f%% [ %d / %d ]\n", i * 100.f / file_count, i, file_count);
  358. }
  359. ncnn::Extractor ex = create_extractor();
  360. ex.set_light_mode(true);
  361. const int thread_num = ncnn::get_omp_thread_num();
  362. ex.set_blob_allocator(&blob_allocators[thread_num]);
  363. ex.set_workspace_allocator(&workspace_allocators[thread_num]);
  // feed sample i to every input blob
  364. for (int j = 0; j < input_blob_count; j++)
  365. {
  366. ncnn::Mat in;
  // file_type 0: image input, otherwise npy input
  367. if (0 == file_type)
  368. {
  369. const int type_to_pixel = type_to_pixels[j];
  370. const std::vector<float>& mean_vals = means[j];
  371. const std::vector<float>& norm_vals = norms[j];
  // images are decoded as BGR; request a conversion when another pixel type is wanted
  372. int pixel_convert_type = ncnn::Mat::PIXEL_BGR;
  373. if (type_to_pixel != pixel_convert_type)
  374. {
  375. pixel_convert_type = pixel_convert_type | (type_to_pixel << ncnn::Mat::PIXEL_CONVERT_SHIFT);
  376. }
  377. in = read_and_resize_image(shapes[j], listspaths[j][i], pixel_convert_type);
  378. in.substract_mean_normalize(mean_vals.data(), norm_vals.data());
  379. }
  380. else
  381. {
  382. in = read_npy(shapes[j], listspaths[j][i]);
  383. }
  384. ex.input(input_blobs[j], in);
  385. }
  386. for (int j = 0; j < conv_bottom_blob_count; j++)
  387. {
  388. ncnn::Mat out;
  // NOTE(review): the return value of extract is not checked
  389. ex.extract(conv_bottom_blobs[j], out);
  390. // count absmax
  391. {
  392. float absmax = 0.f;
  393. const int outc = out.c;
  // visits out.w * out.h values per channel
  394. const int outsize = out.w * out.h;
  395. for (int p = 0; p < outc; p++)
  396. {
  397. const float* ptr = out.channel(p);
  398. for (int k = 0; k < outsize; k++)
  399. {
  400. absmax = std::max(absmax, (float)fabs(ptr[k]));
  401. }
  402. }
  // the per-blob maximum is shared between threads, so update it under a critical section
  403. #pragma omp critical
  404. {
  405. QuantBlobStat& stat = quant_blob_stats[j];
  406. stat.absmax = std::max(stat.absmax, absmax);
  407. }
  408. }
  409. }
  410. }
  411. // initialize histogram
  412. #pragma omp parallel for num_threads(quantize_num_threads)
  413. for (int i = 0; i < conv_bottom_blob_count; i++)
  414. {
  415. QuantBlobStat& stat = quant_blob_stats[i];
  416. stat.histogram.resize(num_histogram_bins, 0);
  417. stat.histogram_normed.resize(num_histogram_bins, 0);
  418. }
  419. // build histogram
  420. #pragma omp parallel for num_threads(quantize_num_threads) schedule(static, 1)
  421. for (int i = 0; i < file_count; i++)
  422. {
  423. if (i % 100 == 0)
  424. {
  425. fprintf(stderr, "build histogram %.2f%% [ %d / %d ]\n", i * 100.f / file_count, i, file_count);
  426. }
  427. ncnn::Extractor ex = create_extractor();
  428. ex.set_light_mode(true);
  429. const int thread_num = ncnn::get_omp_thread_num();
  430. ex.set_blob_allocator(&blob_allocators[thread_num]);
  431. ex.set_workspace_allocator(&workspace_allocators[thread_num]);
  // same input loading as in the absmax pass
  432. for (int j = 0; j < input_blob_count; j++)
  433. {
  434. ncnn::Mat in;
  435. if (0 == file_type)
  436. {
  437. const int type_to_pixel = type_to_pixels[j];
  438. const std::vector<float>& mean_vals = means[j];
  439. const std::vector<float>& norm_vals = norms[j];
  440. int pixel_convert_type = ncnn::Mat::PIXEL_BGR;
  441. if (type_to_pixel != pixel_convert_type)
  442. {
  443. pixel_convert_type = pixel_convert_type | (type_to_pixel << ncnn::Mat::PIXEL_CONVERT_SHIFT);
  444. }
  445. in = read_and_resize_image(shapes[j], listspaths[j][i], pixel_convert_type);
  446. in.substract_mean_normalize(mean_vals.data(), norm_vals.data());
  447. }
  448. else
  449. {
  450. in = read_npy(shapes[j], listspaths[j][i]);
  451. }
  452. ex.input(input_blobs[j], in);
  453. }
  454. for (int j = 0; j < conv_bottom_blob_count; j++)
  455. {
  456. ncnn::Mat out;
  457. ex.extract(conv_bottom_blobs[j], out);
  458. // count histogram bin
  459. {
  460. const float absmax = quant_blob_stats[j].absmax;
  // thread-local histogram for this sample, merged into the shared one below
  461. std::vector<uint64_t> histogram(num_histogram_bins, 0);
  462. const int outc = out.c;
  463. const int outsize = out.w * out.h;
  464. for (int p = 0; p < outc; p++)
  465. {
  466. const float* ptr = out.channel(p);
  467. for (int k = 0; k < outsize; k++)
  468. {
  // exact zeros are left out of the histogram
  469. if (ptr[k] == 0.f)
  470. continue;
  // bin index is |value| / absmax scaled to the bin count, clamped to the last bin
  471. const int index = std::min((int)(fabs(ptr[k]) / absmax * num_histogram_bins), (num_histogram_bins - 1));
  472. histogram[index] += 1;
  473. }
  474. }
  475. #pragma omp critical
  476. {
  477. QuantBlobStat& stat = quant_blob_stats[j];
  478. for (int k = 0; k < num_histogram_bins; k++)
  479. {
  480. stat.histogram[k] += histogram[k];
  481. }
  482. }
  483. }
  484. }
  485. }
  486. // using kld to find the best threshold value
  487. #pragma omp parallel for num_threads(quantize_num_threads)
  488. for (int i = 0; i < conv_bottom_blob_count; i++)
  489. {
  490. QuantBlobStat& stat = quant_blob_stats[i];
  491. // normalize histogram bin
  492. {
  493. uint64_t sum = 0;
  494. for (int j = 0; j < num_histogram_bins; j++)
  495. {
  496. sum += stat.histogram[j];
  497. }
  // NOTE(review): sum is 0 when no non-zero value was recorded, which makes the division below yield NaN
  498. for (int j = 0; j < num_histogram_bins; j++)
  499. {
  500. stat.histogram_normed[j] = (float)(stat.histogram[j] / (double)sum);
  501. }
  502. }
  // number of quantized levels the histogram is squeezed into
  503. const int target_bin = 128;
  504. int target_threshold = target_bin;
  505. float min_kl_divergence = FLT_MAX;
  // try every candidate clipping bin and keep the one with the lowest divergence
  506. for (int threshold = target_bin; threshold < num_histogram_bins; threshold++)
  507. {
  // added to every bin of both distributions so the log ratio never sees a zero
  508. const float kl_eps = 0.0001f;
  // reference distribution: the first threshold bins, with everything beyond
  // the threshold folded into the last kept bin
  509. std::vector<float> clip_distribution(threshold, kl_eps);
  510. {
  511. for (int j = 0; j < threshold; j++)
  512. {
  513. clip_distribution[j] += stat.histogram_normed[j];
  514. }
  515. for (int j = threshold; j < num_histogram_bins; j++)
  516. {
  517. clip_distribution[threshold - 1] += stat.histogram_normed[j];
  518. }
  519. }
  // how many source bins (possibly fractional) map onto one quantized level
  520. const float num_per_bin = (float)threshold / target_bin;
  // quantized distribution: each level holds the average of the source bins it
  // covers; bins cut by a level boundary contribute proportionally
  521. std::vector<float> quantize_distribution(target_bin, 0.f);
  522. {
  // first level: starts exactly at bin 0, so only the right edge can be fractional
  523. {
  524. const float end = num_per_bin;
  525. const int right_lower = (int)floor(end);
  526. const float right_scale = end - right_lower;
  527. if (right_scale > 0)
  528. {
  529. quantize_distribution[0] += right_scale * stat.histogram_normed[right_lower];
  530. }
  531. for (int k = 0; k < right_lower; k++)
  532. {
  533. quantize_distribution[0] += stat.histogram_normed[k];
  534. }
  535. quantize_distribution[0] /= right_lower + right_scale;
  536. }
  // middle levels: both edges can be fractional
  537. for (int j = 1; j < target_bin - 1; j++)
  538. {
  539. const float start = j * num_per_bin;
  540. const float end = (j + 1) * num_per_bin;
  541. const int left_upper = (int)ceil(start);
  542. const float left_scale = left_upper - start;
  543. const int right_lower = (int)floor(end);
  544. const float right_scale = end - right_lower;
  545. if (left_scale > 0)
  546. {
  547. quantize_distribution[j] += left_scale * stat.histogram_normed[left_upper - 1];
  548. }
  549. if (right_scale > 0)
  550. {
  551. quantize_distribution[j] += right_scale * stat.histogram_normed[right_lower];
  552. }
  553. for (int k = left_upper; k < right_lower; k++)
  554. {
  555. quantize_distribution[j] += stat.histogram_normed[k];
  556. }
  557. quantize_distribution[j] /= right_lower - left_upper + left_scale + right_scale;
  558. }
  // last level: ends exactly at threshold, so only the left edge can be fractional
  559. {
  560. const float start = threshold - num_per_bin;
  561. const int left_upper = (int)ceil(start);
  562. const float left_scale = left_upper - start;
  563. if (left_scale > 0)
  564. {
  565. quantize_distribution[target_bin - 1] += left_scale * stat.histogram_normed[left_upper - 1];
  566. }
  567. for (int k = left_upper; k < threshold; k++)
  568. {
  569. quantize_distribution[target_bin - 1] += stat.histogram_normed[k];
  570. }
  571. quantize_distribution[target_bin - 1] /= threshold - left_upper + left_scale;
  572. }
  573. }
  // expanded distribution: spread every quantized level back over the source
  // bins it was built from, using the same edge weights
  574. std::vector<float> expand_distribution(threshold, kl_eps);
  575. {
  576. {
  577. const float end = num_per_bin;
  578. const int right_lower = (int)floor(end);
  579. const float right_scale = end - right_lower;
  580. if (right_scale > 0)
  581. {
  582. expand_distribution[right_lower] += right_scale * quantize_distribution[0];
  583. }
  584. for (int k = 0; k < right_lower; k++)
  585. {
  586. expand_distribution[k] += quantize_distribution[0];
  587. }
  588. }
  589. for (int j = 1; j < target_bin - 1; j++)
  590. {
  591. const float start = j * num_per_bin;
  592. const float end = (j + 1) * num_per_bin;
  593. const int left_upper = (int)ceil(start);
  594. const float left_scale = left_upper - start;
  595. const int right_lower = (int)floor(end);
  596. const float right_scale = end - right_lower;
  597. if (left_scale > 0)
  598. {
  599. expand_distribution[left_upper - 1] += left_scale * quantize_distribution[j];
  600. }
  601. if (right_scale > 0)
  602. {
  603. expand_distribution[right_lower] += right_scale * quantize_distribution[j];
  604. }
  605. for (int k = left_upper; k < right_lower; k++)
  606. {
  607. expand_distribution[k] += quantize_distribution[j];
  608. }
  609. }
  610. {
  611. const float start = threshold - num_per_bin;
  612. const int left_upper = (int)ceil(start);
  613. const float left_scale = left_upper - start;
  614. if (left_scale > 0)
  615. {
  616. expand_distribution[left_upper - 1] += left_scale * quantize_distribution[target_bin - 1];
  617. }
  618. for (int k = left_upper; k < threshold; k++)
  619. {
  620. expand_distribution[k] += quantize_distribution[target_bin - 1];
  621. }
  622. }
  623. }
  624. // kl
  625. const float kl_divergence = compute_kl_divergence(clip_distribution, expand_distribution);
  626. // the best num of bin
  627. if (kl_divergence < min_kl_divergence)
  628. {
  629. min_kl_divergence = kl_divergence;
  630. target_threshold = threshold;
  631. }
  632. }
  // map the chosen bin back to value space using the bin center
  633. stat.threshold = (target_threshold + 0.5f) * stat.absmax / num_histogram_bins;
  634. float scale = 127 / stat.threshold;
  // a single scale is stored per bottom blob
  635. bottom_blob_scales[i].create(1);
  636. bottom_blob_scales[i][0] = scale;
  637. }
  638. return 0;
  639. }
  640. static float compute_aciq_gaussian_clip(float absmax, int N, int num_bits = 8)
  641. {
  642. const float alpha_gaussian[8] = {0, 1.71063519, 2.15159277, 2.55913646, 2.93620062, 3.28691474, 3.6151146, 3.92403714};
  643. const double gaussian_const = (0.5 * 0.35) * (1 + sqrt(3.14159265358979323846 * log(4)));
  644. double std = (absmax * 2 * gaussian_const) / sqrt(2 * log(N));
  645. return (float)(alpha_gaussian[num_bits - 1] * std);
  646. }
  647. int QuantNet::quantize_ACIQ()
  648. {
  649. const int input_blob_count = (int)input_blobs.size();
  650. const int conv_layer_count = (int)conv_layers.size();
  651. const int conv_bottom_blob_count = (int)conv_bottom_blobs.size();
  652. const int file_count = (int)listspaths[0].size();
  653. std::vector<ncnn::UnlockedPoolAllocator> blob_allocators(quantize_num_threads);
  654. std::vector<ncnn::UnlockedPoolAllocator> workspace_allocators(quantize_num_threads);
  655. // initialize conv weight scales
  656. #pragma omp parallel for num_threads(quantize_num_threads)
  657. for (int i = 0; i < conv_layer_count; i++)
  658. {
  659. const ncnn::Layer* layer = layers[conv_layers[i]];
  660. if (layer->type == "Convolution")
  661. {
  662. const ncnn::Convolution* convolution = (const ncnn::Convolution*)layer;
  663. const int num_output = convolution->num_output;
  664. const int kernel_w = convolution->kernel_w;
  665. const int kernel_h = convolution->kernel_h;
  666. const int dilation_w = convolution->dilation_w;
  667. const int dilation_h = convolution->dilation_h;
  668. const int stride_w = convolution->stride_w;
  669. const int stride_h = convolution->stride_h;
  670. const int weight_data_size_output = convolution->weight_data_size / num_output;
  671. // int8 winograd F43 needs weight data to use 6bit quantization
  672. // TODO proper condition for winograd 3x3 int8
  673. bool quant_6bit = false;
  674. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  675. quant_6bit = true;
  676. weight_scales[i].create(num_output);
  677. for (int n = 0; n < num_output; n++)
  678. {
  679. const ncnn::Mat weight_data_n = convolution->weight_data.range(weight_data_size_output * n, weight_data_size_output);
  680. float absmax = 0.f;
  681. for (int k = 0; k < weight_data_size_output; k++)
  682. {
  683. absmax = std::max(absmax, (float)fabs(weight_data_n[k]));
  684. }
  685. if (quant_6bit)
  686. {
  687. const float threshold = compute_aciq_gaussian_clip(absmax, weight_data_size_output, 6);
  688. weight_scales[i][n] = 31 / threshold;
  689. }
  690. else
  691. {
  692. const float threshold = compute_aciq_gaussian_clip(absmax, weight_data_size_output);
  693. weight_scales[i][n] = 127 / threshold;
  694. }
  695. }
  696. }
  697. if (layer->type == "ConvolutionDepthWise")
  698. {
  699. const ncnn::ConvolutionDepthWise* convolutiondepthwise = (const ncnn::ConvolutionDepthWise*)layer;
  700. const int group = convolutiondepthwise->group;
  701. const int weight_data_size_output = convolutiondepthwise->weight_data_size / group;
  702. std::vector<float> scales;
  703. weight_scales[i].create(group);
  704. for (int n = 0; n < group; n++)
  705. {
  706. const ncnn::Mat weight_data_n = convolutiondepthwise->weight_data.range(weight_data_size_output * n, weight_data_size_output);
  707. float absmax = 0.f;
  708. for (int k = 0; k < weight_data_size_output; k++)
  709. {
  710. absmax = std::max(absmax, (float)fabs(weight_data_n[k]));
  711. }
  712. const float threshold = compute_aciq_gaussian_clip(absmax, weight_data_size_output);
  713. weight_scales[i][n] = 127 / threshold;
  714. }
  715. }
  716. if (layer->type == "InnerProduct")
  717. {
  718. const ncnn::InnerProduct* innerproduct = (const ncnn::InnerProduct*)layer;
  719. const int num_output = innerproduct->num_output;
  720. const int weight_data_size_output = innerproduct->weight_data_size / num_output;
  721. weight_scales[i].create(num_output);
  722. for (int n = 0; n < num_output; n++)
  723. {
  724. const ncnn::Mat weight_data_n = innerproduct->weight_data.range(weight_data_size_output * n, weight_data_size_output);
  725. float absmax = 0.f;
  726. for (int k = 0; k < weight_data_size_output; k++)
  727. {
  728. absmax = std::max(absmax, (float)fabs(weight_data_n[k]));
  729. }
  730. const float threshold = compute_aciq_gaussian_clip(absmax, weight_data_size_output);
  731. weight_scales[i][n] = 127 / threshold;
  732. }
  733. }
  734. }
  735. // count the absmax
  736. #pragma omp parallel for num_threads(quantize_num_threads) schedule(static, 1)
  737. for (int i = 0; i < file_count; i++)
  738. {
  739. if (i % 100 == 0)
  740. {
  741. fprintf(stderr, "count the absmax %.2f%% [ %d / %d ]\n", i * 100.f / file_count, i, file_count);
  742. }
  743. ncnn::Extractor ex = create_extractor();
  744. ex.set_light_mode(true);
  745. const int thread_num = ncnn::get_omp_thread_num();
  746. ex.set_blob_allocator(&blob_allocators[thread_num]);
  747. ex.set_workspace_allocator(&workspace_allocators[thread_num]);
  748. for (int j = 0; j < input_blob_count; j++)
  749. {
  750. ncnn::Mat in;
  751. if (0 == file_type)
  752. {
  753. const int type_to_pixel = type_to_pixels[j];
  754. const std::vector<float>& mean_vals = means[j];
  755. const std::vector<float>& norm_vals = norms[j];
  756. int pixel_convert_type = ncnn::Mat::PIXEL_BGR;
  757. if (type_to_pixel != pixel_convert_type)
  758. {
  759. pixel_convert_type = pixel_convert_type | (type_to_pixel << ncnn::Mat::PIXEL_CONVERT_SHIFT);
  760. }
  761. in = read_and_resize_image(shapes[j], listspaths[j][i], pixel_convert_type);
  762. in.substract_mean_normalize(mean_vals.data(), norm_vals.data());
  763. }
  764. else
  765. {
  766. in = read_npy(shapes[j], listspaths[j][i]);
  767. }
  768. ex.input(input_blobs[j], in);
  769. }
  770. for (int j = 0; j < conv_bottom_blob_count; j++)
  771. {
  772. ncnn::Mat out;
  773. ex.extract(conv_bottom_blobs[j], out);
  774. // count absmax
  775. {
  776. float absmax = 0.f;
  777. const int outc = out.c;
  778. const int outsize = out.w * out.h;
  779. for (int p = 0; p < outc; p++)
  780. {
  781. const float* ptr = out.channel(p);
  782. for (int k = 0; k < outsize; k++)
  783. {
  784. absmax = std::max(absmax, (float)fabs(ptr[k]));
  785. }
  786. }
  787. #pragma omp critical
  788. {
  789. QuantBlobStat& stat = quant_blob_stats[j];
  790. stat.absmax = std::max(stat.absmax, absmax);
  791. stat.total = outc * outsize;
  792. }
  793. }
  794. }
  795. }
  796. // alpha gaussian
  797. #pragma omp parallel for num_threads(quantize_num_threads)
  798. for (int i = 0; i < conv_bottom_blob_count; i++)
  799. {
  800. QuantBlobStat& stat = quant_blob_stats[i];
  801. stat.threshold = compute_aciq_gaussian_clip(stat.absmax, stat.total);
  802. float scale = 127 / stat.threshold;
  803. bottom_blob_scales[i].create(1);
  804. bottom_blob_scales[i][0] = scale;
  805. }
  806. return 0;
  807. }
  808. static float cosine_similarity(const ncnn::Mat& a, const ncnn::Mat& b)
  809. {
  810. const int chanenls = a.c;
  811. const int size = a.w * a.h;
  812. float sa = 0;
  813. float sb = 0;
  814. float sum = 0;
  815. for (int p = 0; p < chanenls; p++)
  816. {
  817. const float* pa = a.channel(p);
  818. const float* pb = b.channel(p);
  819. for (int i = 0; i < size; i++)
  820. {
  821. sa += pa[i] * pa[i];
  822. sb += pb[i] * pb[i];
  823. sum += pa[i] * pb[i];
  824. }
  825. }
  826. float sim = (float)sum / sqrt(sa) / sqrt(sb);
  827. return sim;
  828. }
  829. static int get_layer_param(const ncnn::Layer* layer, ncnn::ParamDict& pd)
  830. {
  831. if (layer->type == "Convolution")
  832. {
  833. ncnn::Convolution* convolution = (ncnn::Convolution*)layer;
  834. pd.set(0, convolution->num_output);
  835. pd.set(1, convolution->kernel_w);
  836. pd.set(11, convolution->kernel_h);
  837. pd.set(2, convolution->dilation_w);
  838. pd.set(12, convolution->dilation_h);
  839. pd.set(3, convolution->stride_w);
  840. pd.set(13, convolution->stride_h);
  841. pd.set(4, convolution->pad_left);
  842. pd.set(15, convolution->pad_right);
  843. pd.set(14, convolution->pad_top);
  844. pd.set(16, convolution->pad_bottom);
  845. pd.set(18, convolution->pad_value);
  846. pd.set(5, convolution->bias_term);
  847. pd.set(6, convolution->weight_data_size);
  848. pd.set(8, convolution->int8_scale_term);
  849. pd.set(9, convolution->activation_type);
  850. pd.set(10, convolution->activation_params);
  851. }
  852. else if (layer->type == "ConvolutionDepthWise")
  853. {
  854. ncnn::ConvolutionDepthWise* convolutiondepthwise = (ncnn::ConvolutionDepthWise*)layer;
  855. pd.set(0, convolutiondepthwise->num_output);
  856. pd.set(1, convolutiondepthwise->kernel_w);
  857. pd.set(11, convolutiondepthwise->kernel_h);
  858. pd.set(2, convolutiondepthwise->dilation_w);
  859. pd.set(12, convolutiondepthwise->dilation_h);
  860. pd.set(3, convolutiondepthwise->stride_w);
  861. pd.set(13, convolutiondepthwise->stride_h);
  862. pd.set(4, convolutiondepthwise->pad_left);
  863. pd.set(15, convolutiondepthwise->pad_right);
  864. pd.set(14, convolutiondepthwise->pad_top);
  865. pd.set(16, convolutiondepthwise->pad_bottom);
  866. pd.set(18, convolutiondepthwise->pad_value);
  867. pd.set(5, convolutiondepthwise->bias_term);
  868. pd.set(6, convolutiondepthwise->weight_data_size);
  869. pd.set(7, convolutiondepthwise->group);
  870. pd.set(8, convolutiondepthwise->int8_scale_term);
  871. pd.set(9, convolutiondepthwise->activation_type);
  872. pd.set(10, convolutiondepthwise->activation_params);
  873. }
  874. else if (layer->type == "InnerProduct")
  875. {
  876. ncnn::InnerProduct* innerproduct = (ncnn::InnerProduct*)layer;
  877. pd.set(0, innerproduct->num_output);
  878. pd.set(1, innerproduct->bias_term);
  879. pd.set(2, innerproduct->weight_data_size);
  880. pd.set(8, innerproduct->int8_scale_term);
  881. pd.set(9, innerproduct->activation_type);
  882. pd.set(10, innerproduct->activation_params);
  883. }
  884. else
  885. {
  886. fprintf(stderr, "unexpected layer type %s in get_layer_param\n", layer->type.c_str());
  887. return -1;
  888. }
  889. return 0;
  890. }
  891. static int get_layer_weights(const ncnn::Layer* layer, std::vector<ncnn::Mat>& weights)
  892. {
  893. if (layer->type == "Convolution")
  894. {
  895. ncnn::Convolution* convolution = (ncnn::Convolution*)layer;
  896. weights.push_back(convolution->weight_data);
  897. if (convolution->bias_term)
  898. weights.push_back(convolution->bias_data);
  899. }
  900. else if (layer->type == "ConvolutionDepthWise")
  901. {
  902. ncnn::ConvolutionDepthWise* convolutiondepthwise = (ncnn::ConvolutionDepthWise*)layer;
  903. weights.push_back(convolutiondepthwise->weight_data);
  904. if (convolutiondepthwise->bias_term)
  905. weights.push_back(convolutiondepthwise->bias_data);
  906. }
  907. else if (layer->type == "InnerProduct")
  908. {
  909. ncnn::InnerProduct* innerproduct = (ncnn::InnerProduct*)layer;
  910. weights.push_back(innerproduct->weight_data);
  911. if (innerproduct->bias_term)
  912. weights.push_back(innerproduct->bias_data);
  913. }
  914. else
  915. {
  916. fprintf(stderr, "unexpected layer type %s in get_layer_weights\n", layer->type.c_str());
  917. return -1;
  918. }
  919. return 0;
  920. }
  921. int QuantNet::quantize_EQ()
  922. {
  923. // find the initial scale via KL
  924. quantize_KL();
  925. print_quant_info();
  926. const int input_blob_count = (int)input_blobs.size();
  927. const int conv_layer_count = (int)conv_layers.size();
  928. const int conv_bottom_blob_count = (int)conv_bottom_blobs.size();
  929. std::vector<ncnn::UnlockedPoolAllocator> blob_allocators(quantize_num_threads);
  930. std::vector<ncnn::UnlockedPoolAllocator> workspace_allocators(quantize_num_threads);
  931. // max 50 images for EQ
  932. const int file_count = std::min((int)listspaths[0].size(), 50);
  933. const float scale_range_lower = 0.5f;
  934. const float scale_range_upper = 2.0f;
  935. const int search_steps = 100;
  936. for (int i = 0; i < conv_layer_count; i++)
  937. {
  938. ncnn::Mat& weight_scale = weight_scales[i];
  939. ncnn::Mat& bottom_blob_scale = bottom_blob_scales[i];
  940. const ncnn::Layer* layer = layers[conv_layers[i]];
  941. // search weight scale
  942. for (int j = 0; j < weight_scale.w; j++)
  943. {
  944. const float scale = weight_scale[j];
  945. const float scale_lower = scale * scale_range_lower;
  946. const float scale_upper = scale * scale_range_upper;
  947. const float scale_step = (scale_upper - scale_lower) / search_steps;
  948. std::vector<double> avgsims(search_steps, 0.0);
  949. #pragma omp parallel for num_threads(quantize_num_threads) schedule(static, 1)
  950. for (int ii = 0; ii < file_count; ii++)
  951. {
  952. if (ii % 100 == 0)
  953. {
  954. fprintf(stderr, "search weight scale %.2f%% [ %d / %d ] for %d / %d of %d / %d\n", ii * 100.f / file_count, ii, file_count, j, weight_scale.w, i, conv_layer_count);
  955. }
  956. ncnn::Extractor ex = create_extractor();
  957. ex.set_light_mode(true);
  958. const int thread_num = ncnn::get_omp_thread_num();
  959. ex.set_blob_allocator(&blob_allocators[thread_num]);
  960. ex.set_workspace_allocator(&workspace_allocators[thread_num]);
  961. for (int jj = 0; jj < input_blob_count; jj++)
  962. {
  963. ncnn::Mat in;
  964. if (0 == file_type)
  965. {
  966. const int type_to_pixel = type_to_pixels[j];
  967. const std::vector<float>& mean_vals = means[j];
  968. const std::vector<float>& norm_vals = norms[j];
  969. int pixel_convert_type = ncnn::Mat::PIXEL_BGR;
  970. if (type_to_pixel != pixel_convert_type)
  971. {
  972. pixel_convert_type = pixel_convert_type | (type_to_pixel << ncnn::Mat::PIXEL_CONVERT_SHIFT);
  973. }
  974. in = read_and_resize_image(shapes[j], listspaths[j][i], pixel_convert_type);
  975. in.substract_mean_normalize(mean_vals.data(), norm_vals.data());
  976. }
  977. else
  978. {
  979. in = read_npy(shapes[j], listspaths[j][i]);
  980. }
  981. ex.input(input_blobs[j], in);
  982. }
  983. ncnn::Mat in;
  984. ex.extract(conv_bottom_blobs[i], in);
  985. ncnn::Mat out;
  986. ex.extract(conv_top_blobs[i], out);
  987. ncnn::Layer* layer_int8 = ncnn::create_layer_cpu(layer->typeindex);
  988. ncnn::ParamDict pd;
  989. get_layer_param(layer, pd);
  990. pd.set(8, 1); //int8_scale_term
  991. layer_int8->load_param(pd);
  992. std::vector<float> sims(search_steps);
  993. for (int k = 0; k < search_steps; k++)
  994. {
  995. ncnn::Mat new_weight_scale = weight_scale.clone();
  996. new_weight_scale[j] = scale_lower + k * scale_step;
  997. std::vector<ncnn::Mat> weights;
  998. get_layer_weights(layer, weights);
  999. weights.push_back(new_weight_scale);
  1000. weights.push_back(bottom_blob_scale);
  1001. layer_int8->load_model(ncnn::ModelBinFromMatArray(weights.data()));
  1002. ncnn::Option opt_int8;
  1003. opt_int8.use_packing_layout = false;
  1004. layer_int8->create_pipeline(opt_int8);
  1005. ncnn::Mat out_int8;
  1006. layer_int8->forward(in, out_int8, opt_int8);
  1007. layer_int8->destroy_pipeline(opt_int8);
  1008. sims[k] = cosine_similarity(out, out_int8);
  1009. }
  1010. delete layer_int8;
  1011. #pragma omp critical
  1012. {
  1013. for (int k = 0; k < search_steps; k++)
  1014. {
  1015. avgsims[k] += sims[k];
  1016. }
  1017. }
  1018. }
  1019. double max_avgsim = 0.0;
  1020. float new_scale = scale;
  1021. // find the scale with min cosine distance
  1022. for (int k = 0; k < search_steps; k++)
  1023. {
  1024. if (max_avgsim < avgsims[k])
  1025. {
  1026. max_avgsim = avgsims[k];
  1027. new_scale = scale_lower + k * scale_step;
  1028. }
  1029. }
  1030. fprintf(stderr, "%s w %d = %f -> %f\n", layer->name.c_str(), j, scale, new_scale);
  1031. weight_scale[j] = new_scale;
  1032. }
  1033. // search bottom blob scale
  1034. for (int j = 0; j < bottom_blob_scale.w; j++)
  1035. {
  1036. const float scale = bottom_blob_scale[j];
  1037. const float scale_lower = scale * scale_range_lower;
  1038. const float scale_upper = scale * scale_range_upper;
  1039. const float scale_step = (scale_upper - scale_lower) / search_steps;
  1040. std::vector<double> avgsims(search_steps, 0.0);
  1041. #pragma omp parallel for num_threads(quantize_num_threads) schedule(static, 1)
  1042. for (int ii = 0; ii < file_count; ii++)
  1043. {
  1044. if (ii % 100 == 0)
  1045. {
  1046. fprintf(stderr, "search bottom blob scale %.2f%% [ %d / %d ] for %d / %d of %d / %d\n", ii * 100.f / file_count, ii, file_count, j, bottom_blob_scale.w, i, conv_layer_count);
  1047. }
  1048. ncnn::Extractor ex = create_extractor();
  1049. ex.set_light_mode(true);
  1050. const int thread_num = ncnn::get_omp_thread_num();
  1051. ex.set_blob_allocator(&blob_allocators[thread_num]);
  1052. ex.set_workspace_allocator(&workspace_allocators[thread_num]);
  1053. for (int jj = 0; jj < input_blob_count; jj++)
  1054. {
  1055. ncnn::Mat in;
  1056. if (0 == file_type)
  1057. {
  1058. const int type_to_pixel = type_to_pixels[j];
  1059. const std::vector<float>& mean_vals = means[j];
  1060. const std::vector<float>& norm_vals = norms[j];
  1061. int pixel_convert_type = ncnn::Mat::PIXEL_BGR;
  1062. if (type_to_pixel != pixel_convert_type)
  1063. {
  1064. pixel_convert_type = pixel_convert_type | (type_to_pixel << ncnn::Mat::PIXEL_CONVERT_SHIFT);
  1065. }
  1066. in = read_and_resize_image(shapes[j], listspaths[j][i], pixel_convert_type);
  1067. in.substract_mean_normalize(mean_vals.data(), norm_vals.data());
  1068. }
  1069. else
  1070. {
  1071. in = read_npy(shapes[j], listspaths[j][i]);
  1072. }
  1073. ex.input(input_blobs[j], in);
  1074. }
  1075. ncnn::Mat in;
  1076. ex.extract(conv_bottom_blobs[i], in);
  1077. ncnn::Mat out;
  1078. ex.extract(conv_top_blobs[i], out);
  1079. ncnn::Layer* layer_int8 = ncnn::create_layer_cpu(layer->typeindex);
  1080. ncnn::ParamDict pd;
  1081. get_layer_param(layer, pd);
  1082. pd.set(8, 1); //int8_scale_term
  1083. layer_int8->load_param(pd);
  1084. std::vector<float> sims(search_steps);
  1085. for (int k = 0; k < search_steps; k++)
  1086. {
  1087. ncnn::Mat new_bottom_blob_scale = bottom_blob_scale.clone();
  1088. new_bottom_blob_scale[j] = scale_lower + k * scale_step;
  1089. std::vector<ncnn::Mat> weights;
  1090. get_layer_weights(layer, weights);
  1091. weights.push_back(weight_scale);
  1092. weights.push_back(new_bottom_blob_scale);
  1093. layer_int8->load_model(ncnn::ModelBinFromMatArray(weights.data()));
  1094. ncnn::Option opt_int8;
  1095. opt_int8.use_packing_layout = false;
  1096. layer_int8->create_pipeline(opt_int8);
  1097. ncnn::Mat out_int8;
  1098. layer_int8->forward(in, out_int8, opt_int8);
  1099. layer_int8->destroy_pipeline(opt_int8);
  1100. sims[k] = cosine_similarity(out, out_int8);
  1101. }
  1102. delete layer_int8;
  1103. #pragma omp critical
  1104. {
  1105. for (int k = 0; k < search_steps; k++)
  1106. {
  1107. avgsims[k] += sims[k];
  1108. }
  1109. }
  1110. }
  1111. double max_avgsim = 0.0;
  1112. float new_scale = scale;
  1113. // find the scale with min cosine distance
  1114. for (int k = 0; k < search_steps; k++)
  1115. {
  1116. if (max_avgsim < avgsims[k])
  1117. {
  1118. max_avgsim = avgsims[k];
  1119. new_scale = scale_lower + k * scale_step;
  1120. }
  1121. }
  1122. fprintf(stderr, "%s b %d = %f -> %f\n", layer->name.c_str(), j, scale, new_scale);
  1123. bottom_blob_scale[j] = new_scale;
  1124. }
  1125. // update quant info
  1126. QuantBlobStat& stat = quant_blob_stats[i];
  1127. stat.threshold = 127 / bottom_blob_scale[0];
  1128. }
  1129. return 0;
  1130. }
  // Read a comma separated list of list-file names and return, per list file,
  // the paths it contains.
  //
  // `s` is split in place with strtok. Each list file is read line by line
  // (up to 1023 chars per read); the first whitespace-delimited word of a line,
  // at most 255 chars, is taken as a path and lines without one are skipped.
  // When a list file cannot be opened a message is printed and parsing stops,
  // returning the lists read so far.
  static std::vector<std::vector<std::string> > parse_comma_path_list(char* s)
  {
      std::vector<std::vector<std::string> > lists;

      for (char* list_file = strtok(s, ","); list_file != NULL; list_file = strtok(NULL, ","))
      {
          FILE* fp = fopen(list_file, "rb");
          if (fp == NULL)
          {
              fprintf(stderr, "fopen %s failed\n", list_file);
              break;
          }

          std::vector<std::string> entries;

          // one filepath per line
          char line[1024];
          while (fgets(line, 1024, fp) != NULL)
          {
              char filepath[256];
              if (sscanf(line, "%255s", filepath) == 1)
              {
                  entries.push_back(std::string(filepath));
              }
          }

          fclose(fp);

          lists.push_back(entries);
      }

      return lists;
  }
  // Convert a short decimal string to float without going through the C locale.
  //
  // Accepted form: [+|-] digits [. digits] [(e|E) [+|-] digits]
  // Parsing stops at the first character that does not fit; whatever was
  // parsed up to that point is returned (an empty string gives 0).
  //
  // vstr  NUL terminated text, callers pass at most 19 characters
  static float vstr_to_float(const char vstr[20])
  {
      double v = 0.0;

      const char* p = vstr;

      // sign
      bool sign = *p != '-';
      if (*p == '+' || *p == '-')
      {
          p++;
      }

      // digits before decimal point or exponent
      // isdigit is only defined for unsigned char values and EOF, hence the cast
      uint64_t v1 = 0;
      while (isdigit((unsigned char)*p))
      {
          v1 = v1 * 10 + (*p - '0');
          p++;
      }

      v = (double)v1;

      // digits after decimal point
      if (*p == '.')
      {
          p++;

          uint64_t pow10 = 1;
          uint64_t v2 = 0;

          while (isdigit((unsigned char)*p))
          {
              v2 = v2 * 10 + (*p - '0');
              pow10 *= 10;
              p++;
          }

          v += v2 / (double)pow10;
      }

      // exponent
      if (*p == 'e' || *p == 'E')
      {
          p++;

          // sign of exponent
          bool fact = *p != '-';
          if (*p == '+' || *p == '-')
          {
              p++;
          }

          // digits of exponent
          // The value saturates once it reaches 1000: 10^1000 already overflows
          // a double to infinity, so the result is the same as for the exact
          // exponent while the scaling loops below stay short for inputs such
          // as "1e9999999999999999".
          uint64_t expon = 0;
          while (isdigit((unsigned char)*p))
          {
              if (expon < 1000)
              {
                  expon = expon * 10 + (*p - '0');
              }
              p++;
          }

          double scale = 1.0;
          while (expon >= 8)
          {
              scale *= 1e8;
              expon -= 8;
          }
          while (expon > 0)
          {
              scale *= 10.0;
              expon -= 1;
          }

          v = fact ? v * scale : v / scale;
      }

      // fprintf(stderr, "v = %f\n", v);

      return sign ? (float)v : (float)-v;
  }
  1228. static std::vector<std::vector<float> > parse_comma_float_array_list(char* s)
  1229. {
  1230. std::vector<std::vector<float> > aaf;
  1231. char* pch = strtok(s, "[]");
  1232. while (pch != NULL)
  1233. {
  1234. // parse a,b,c
  1235. char vstr[20];
  1236. int nconsumed = 0;
  1237. int nscan = sscanf(pch, "%19[^,]%n", vstr, &nconsumed);
  1238. if (nscan == 1)
  1239. {
  1240. // ok we get array
  1241. pch += nconsumed;
  1242. std::vector<float> af;
  1243. float v = vstr_to_float(vstr);
  1244. af.push_back(v);
  1245. nscan = sscanf(pch, ",%19[^,]%n", vstr, &nconsumed);
  1246. while (nscan == 1)
  1247. {
  1248. pch += nconsumed;
  1249. float v = vstr_to_float(vstr);
  1250. af.push_back(v);
  1251. nscan = sscanf(pch, ",%19[^,]%n", vstr, &nconsumed);
  1252. }
  1253. // array end
  1254. aaf.push_back(af);
  1255. }
  1256. pch = strtok(NULL, "[]");
  1257. }
  1258. return aaf;
  1259. }
  // Parse text of the form "[a,b,c],[d,e],..." into a list of int arrays.
  //
  // `s` is split in place on '[' and ']' with strtok. Inside each piece,
  // comma separated integers are read with sscanf("%d"). Pieces that do not
  // start with an integer (for example the "," between two brackets) are
  // skipped.
  static std::vector<std::vector<int> > parse_comma_int_array_list(char* s)
  {
      std::vector<std::vector<int> > arrays;

      for (char* piece = strtok(s, "[]"); piece != NULL; piece = strtok(NULL, "[]"))
      {
          int value = 0;
          int used = 0;

          // first number has no leading comma
          if (sscanf(piece, "%d%n", &value, &used) != 1)
          {
              continue;
          }

          std::vector<int> values(1, value);

          // every following number is introduced by a comma
          const char* cursor = piece + used;
          while (sscanf(cursor, ",%d%n", &value, &used) == 1)
          {
              values.push_back(value);
              cursor += used;
          }

          arrays.push_back(values);
      }

      return arrays;
  }
  1290. static std::vector<int> parse_comma_pixel_type_list(char* s)
  1291. {
  1292. std::vector<int> aps;
  1293. char* pch = strtok(s, ",");
  1294. while (pch != NULL)
  1295. {
  1296. // RAW/RGB/BGR/GRAY/RGBA/BGRA
  1297. if (strcmp(pch, "RAW") == 0)
  1298. aps.push_back(-233);
  1299. if (strcmp(pch, "RGB") == 0)
  1300. aps.push_back(ncnn::Mat::PIXEL_RGB);
  1301. if (strcmp(pch, "BGR") == 0)
  1302. aps.push_back(ncnn::Mat::PIXEL_BGR);
  1303. if (strcmp(pch, "GRAY") == 0)
  1304. aps.push_back(ncnn::Mat::PIXEL_GRAY);
  1305. if (strcmp(pch, "RGBA") == 0)
  1306. aps.push_back(ncnn::Mat::PIXEL_RGBA);
  1307. if (strcmp(pch, "BGRA") == 0)
  1308. aps.push_back(ncnn::Mat::PIXEL_BGRA);
  1309. pch = strtok(NULL, ",");
  1310. }
  1311. return aps;
  1312. }
  // Print a list of float arrays to stderr as "[a,b,c],[d,e]" using "%f" for
  // each value. No trailing newline is written.
  static void print_float_array_list(const std::vector<std::vector<float> >& list)
  {
      const size_t array_count = list.size();

      for (size_t a = 0; a < array_count; a++)
      {
          // separator between arrays
          if (a > 0)
              fprintf(stderr, ",");

          const std::vector<float>& values = list[a];
          const size_t value_count = values.size();

          fprintf(stderr, "[");
          for (size_t e = 0; e < value_count; e++)
          {
              // separator between values
              if (e > 0)
                  fprintf(stderr, ",");
              fprintf(stderr, "%f", values[e]);
          }
          fprintf(stderr, "]");
      }
  }
  // Print a list of int arrays to stderr as "[a,b,c],[d,e]". No trailing
  // newline is written.
  static void print_int_array_list(const std::vector<std::vector<int> >& list)
  {
      const size_t array_count = list.size();

      for (size_t a = 0; a < array_count; a++)
      {
          // separator between arrays
          if (a > 0)
              fprintf(stderr, ",");

          const std::vector<int>& values = list[a];
          const size_t value_count = values.size();

          fprintf(stderr, "[");
          for (size_t e = 0; e < value_count; e++)
          {
              // separator between values
              if (e > 0)
                  fprintf(stderr, ",");
              fprintf(stderr, "%d", values[e]);
          }
          fprintf(stderr, "]");
      }
  }
  1347. static void print_pixel_type_list(const std::vector<int>& list)
  1348. {
  1349. for (size_t i = 0; i < list.size(); i++)
  1350. {
  1351. const int type = list[i];
  1352. if (type == -233)
  1353. fprintf(stderr, "RAW");
  1354. if (type == ncnn::Mat::PIXEL_RGB)
  1355. fprintf(stderr, "RGB");
  1356. if (type == ncnn::Mat::PIXEL_BGR)
  1357. fprintf(stderr, "BGR");
  1358. if (type == ncnn::Mat::PIXEL_GRAY)
  1359. fprintf(stderr, "GRAY");
  1360. if (type == ncnn::Mat::PIXEL_RGBA)
  1361. fprintf(stderr, "RGBA");
  1362. if (type == ncnn::Mat::PIXEL_BGRA)
  1363. fprintf(stderr, "BGRA");
  1364. if (i != list.size() - 1)
  1365. fprintf(stderr, ",");
  1366. }
  1367. }
  // Print the command line synopsis, the supported key=value options and two
  // sample invocations to stderr.
  static void show_usage()
  {
      static const char* const usage_lines[] = {
          "Usage: ncnn2table [ncnnparam] [ncnnbin] [list,...] [ncnntable] [(key=value)...]\n",
          " mean=[104.0,117.0,123.0],...\n",
          " norm=[1.0,1.0,1.0],...\n",
          " shape=[224,224,3],...[w,h,c] or [w,h] **[0,0] will not resize\n",
          " pixel=RAW/RGB/BGR/GRAY/RGBA/BGRA,...\n",
          " thread=8\n",
          " method=kl/aciq/eq\n",
          " type=0/1, 0:image,1:npy\n",
          "Sample usage:\n",
          " ncnn2table squeezenet.param squeezenet.bin filelist.txt squeezenet.table mean=[104.0,117.0,123.0] norm=[1.0,1.0,1.0] shape=[227,227,3] pixel=BGR method=kl\n",
          " ncnn2table test.param test.bin filelist.txt squeezenet.table shape=[227,227,3] method=kl type=1\n",
      };
      const size_t line_count = sizeof(usage_lines) / sizeof(usage_lines[0]);

      for (size_t i = 0; i < line_count; i++)
      {
          fprintf(stderr, "%s", usage_lines[i]);
      }
  }
  1382. int main(int argc, char** argv)
  1383. {
  1384. if (argc < 5)
  1385. {
  1386. show_usage();
  1387. return -1;
  1388. }
  1389. for (int i = 1; i < argc; i++)
  1390. {
  1391. if (argv[i][0] == '-')
  1392. {
  1393. show_usage();
  1394. return -1;
  1395. }
  1396. }
  1397. const char* inparam = argv[1];
  1398. const char* inbin = argv[2];
  1399. char* lists = argv[3];
  1400. const char* outtable = argv[4];
  1401. ncnn::Option opt;
  1402. opt.num_threads = 1;
  1403. opt.lightmode = false;
  1404. opt.use_fp16_packed = false;
  1405. opt.use_fp16_storage = false;
  1406. opt.use_fp16_arithmetic = false;
  1407. QuantNet net;
  1408. net.opt = opt;
  1409. net.load_param(inparam);
  1410. net.load_model(inbin);
  1411. net.init();
  1412. // load lists
  1413. net.listspaths = parse_comma_path_list(lists);
  1414. std::string method = "kl";
  1415. net.file_type = 0;
  1416. for (int i = 5; i < argc; i++)
  1417. {
  1418. // key=value
  1419. char* kv = argv[i];
  1420. char* eqs = strchr(kv, '=');
  1421. if (eqs == NULL)
  1422. {
  1423. fprintf(stderr, "unrecognized arg %s\n", kv);
  1424. continue;
  1425. }
  1426. // split k v
  1427. eqs[0] = '\0';
  1428. const char* key = kv;
  1429. char* value = eqs + 1;
  1430. // load mean norm shape
  1431. if (memcmp(key, "mean", 4) == 0)
  1432. net.means = parse_comma_float_array_list(value);
  1433. if (memcmp(key, "norm", 4) == 0)
  1434. net.norms = parse_comma_float_array_list(value);
  1435. if (memcmp(key, "shape", 5) == 0)
  1436. net.shapes = parse_comma_int_array_list(value);
  1437. if (memcmp(key, "pixel", 5) == 0)
  1438. net.type_to_pixels = parse_comma_pixel_type_list(value);
  1439. if (memcmp(key, "thread", 6) == 0)
  1440. net.quantize_num_threads = atoi(value);
  1441. if (memcmp(key, "method", 6) == 0)
  1442. method = std::string(value);
  1443. if (memcmp(key, "type", 4) == 0)
  1444. net.file_type = atoi(value);
  1445. }
  1446. // sanity check
  1447. const size_t input_blob_count = net.input_blobs.size();
  1448. if (net.listspaths.size() != input_blob_count)
  1449. {
  1450. fprintf(stderr, "expect %d lists, but got %d\n", (int)input_blob_count, (int)net.listspaths.size());
  1451. return -1;
  1452. }
  1453. if ((0 == net.file_type) && (net.means.size() != input_blob_count))
  1454. {
  1455. fprintf(stderr, "expect %d means, but got %d\n", (int)input_blob_count, (int)net.means.size());
  1456. return -1;
  1457. }
  1458. if ((0 == net.file_type) && (net.norms.size() != input_blob_count))
  1459. {
  1460. fprintf(stderr, "expect %d norms, but got %d\n", (int)input_blob_count, (int)net.norms.size());
  1461. return -1;
  1462. }
  1463. if (net.shapes.size() != input_blob_count)
  1464. {
  1465. fprintf(stderr, "expect %d shapes, but got %d\n", (int)input_blob_count, (int)net.shapes.size());
  1466. return -1;
  1467. }
  1468. if ((0 == net.file_type) && (net.type_to_pixels.size() != input_blob_count))
  1469. {
  1470. fprintf(stderr, "expect %d pixels, but got %d\n", (int)input_blob_count, (int)net.type_to_pixels.size());
  1471. return -1;
  1472. }
  1473. if (net.quantize_num_threads < 0)
  1474. {
  1475. fprintf(stderr, "malformed thread %d\n", net.quantize_num_threads);
  1476. return -1;
  1477. }
  1478. // print quantnet config
  1479. {
  1480. fprintf(stderr, "mean = ");
  1481. print_float_array_list(net.means);
  1482. fprintf(stderr, "\n");
  1483. fprintf(stderr, "norm = ");
  1484. print_float_array_list(net.norms);
  1485. fprintf(stderr, "\n");
  1486. fprintf(stderr, "shape = ");
  1487. print_int_array_list(net.shapes);
  1488. fprintf(stderr, "\n");
  1489. fprintf(stderr, "pixel = ");
  1490. print_pixel_type_list(net.type_to_pixels);
  1491. fprintf(stderr, "\n");
  1492. fprintf(stderr, "thread = %d\n", net.quantize_num_threads);
  1493. fprintf(stderr, "method = %s\n", method.c_str());
  1494. fprintf(stderr, "---------------------------------------\n");
  1495. }
  1496. if (method == "kl")
  1497. {
  1498. net.quantize_KL();
  1499. }
  1500. else if (method == "aciq")
  1501. {
  1502. net.quantize_ACIQ();
  1503. }
  1504. else if (method == "eq")
  1505. {
  1506. net.quantize_EQ();
  1507. }
  1508. else
  1509. {
  1510. fprintf(stderr, "not implemented yet !\n");
  1511. fprintf(stderr, "unknown method %s, expect kl / aciq / eq\n", method.c_str());
  1512. return -1;
  1513. }
  1514. net.print_quant_info();
  1515. net.save_table(outtable);
  1516. return 0;
  1517. }