You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

convolutiondepthwise.cpp 16 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495
  1. // Tencent is pleased to support the open source community by making ncnn available.
  2. //
  3. // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
  4. //
  5. // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6. // in compliance with the License. You may obtain a copy of the License at
  7. //
  8. // https://opensource.org/licenses/BSD-3-Clause
  9. //
  10. // Unless required by applicable law or agreed to in writing, software distributed
  11. // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
  12. // CONDITIONS OF ANY KIND, either express or implied. See the License for the
  13. // specific language governing permissions and limitations under the License.
  14. #include "convolutiondepthwise.h"
  15. #include "layer_type.h"
  16. namespace ncnn {
  17. DEFINE_LAYER_CREATOR(ConvolutionDepthWise)
  18. ConvolutionDepthWise::ConvolutionDepthWise()
  19. {
  20. one_blob_only = true;
  21. support_inplace = false;
  22. }
  23. ConvolutionDepthWise::~ConvolutionDepthWise()
  24. {
  25. for (int i=0; i<(int)quantize_ops.size(); i++)
  26. delete quantize_ops[i];
  27. quantize_ops.clear();
  28. for (int i=0; i<(int)dequantize_ops.size(); i++)
  29. delete dequantize_ops[i];
  30. dequantize_ops.clear();
  31. }
  32. int ConvolutionDepthWise::load_param(const ParamDict& pd)
  33. {
  34. num_output = pd.get(0, 0);
  35. kernel_w = pd.get(1, 0);
  36. kernel_h = pd.get(11, kernel_w);
  37. dilation_w = pd.get(2, 1);
  38. dilation_h = pd.get(12, dilation_w);
  39. stride_w = pd.get(3, 1);
  40. stride_h = pd.get(13, stride_w);
  41. pad_w = pd.get(4, 0);
  42. pad_h = pd.get(14, pad_w);
  43. bias_term = pd.get(5, 0);
  44. weight_data_size = pd.get(6, 0);
  45. group = pd.get(7, 1);
  46. weight_data_int8_scales = pd.get(8, Mat());
  47. bottom_blob_int8_scales = pd.get(9, Mat());
  48. use_int8_inference = pd.use_int8_inference;
  49. if (num_output % group != 0)
  50. {
  51. // reject invalid group
  52. return -100;
  53. }
  54. if (weight_data_int8_scales.empty() || bottom_blob_int8_scales.empty())
  55. use_int8_inference = false;
  56. // extend group if only one provided
  57. if (weight_data_int8_scales.w == 1)
  58. {
  59. float scale = weight_data_int8_scales[0];
  60. weight_data_int8_scales = Mat(group);
  61. weight_data_int8_scales.fill(scale);
  62. }
  63. if (bottom_blob_int8_scales.w == 1)
  64. {
  65. float scale = bottom_blob_int8_scales[0];
  66. bottom_blob_int8_scales = Mat(group);
  67. bottom_blob_int8_scales.fill(scale);
  68. }
  69. return 0;
  70. }
  71. int ConvolutionDepthWise::load_model(const ModelBin& mb)
  72. {
  73. weight_data = mb.load(weight_data_size, 0);
  74. if (weight_data.empty())
  75. return -100;
  76. if (bias_term)
  77. {
  78. bias_data = mb.load(num_output, 1);
  79. if (bias_data.empty())
  80. return -100;
  81. }
  82. for (int i=0; i<(int)quantize_ops.size(); i++)
  83. delete quantize_ops[i];
  84. quantize_ops.clear();
  85. for (int i=0; i<(int)dequantize_ops.size(); i++)
  86. delete dequantize_ops[i];
  87. dequantize_ops.clear();
  88. bool weight_data_is_int8 = (weight_data.elemsize == (size_t)1u);
  89. bool weight_data_is_float32 = (weight_data.elemsize == (size_t)4u);
  90. if (weight_data_is_int8 && !use_int8_inference)
  91. {
  92. fprintf(stderr, "quantized int8 weight loaded but use_int8_inference disabled\n");
  93. return -1;
  94. }
  95. if (use_int8_inference)
  96. {
  97. quantize_ops.resize(group);
  98. dequantize_ops.resize(group);
  99. for (int g=0; g<group; g++)
  100. {
  101. quantize_ops[g] = ncnn::create_layer(ncnn::LayerType::Quantize);
  102. ncnn::ParamDict pd;
  103. pd.set(0, bottom_blob_int8_scales[g]);// scale
  104. quantize_ops[g]->load_param(pd);
  105. }
  106. for (int g=0; g<group; g++)
  107. {
  108. dequantize_ops[g] = ncnn::create_layer(ncnn::LayerType::Dequantize);
  109. float top_rescale = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]);
  110. ncnn::ParamDict pd;
  111. pd.set(0, top_rescale);// scale
  112. pd.set(1, bias_term);// bias_term
  113. pd.set(2, 1);// bias_data_size
  114. dequantize_ops[g]->load_param(pd);
  115. ncnn::Mat weights[1];
  116. weights[0] = Mat(1, (void*)((const float*)bias_data + g));
  117. dequantize_ops[g]->load_model(ModelBinFromMatArray(weights));
  118. }
  119. }
  120. if (weight_data_is_float32 && use_int8_inference)
  121. {
  122. if (!weight_data_int8_scales.empty() && !bottom_blob_int8_scales.empty())
  123. {
  124. // quantize weight to int8
  125. Mat int8_weight_data(weight_data_size, (size_t)1u);
  126. if (int8_weight_data.empty())
  127. return -100;
  128. const int weight_data_size_g = weight_data_size / group;
  129. for (int g=0; g<group; g++)
  130. {
  131. ncnn::ParamDict pd;
  132. pd.set(0, weight_data_int8_scales[g]);// scale
  133. quantize_ops[g]->load_param(pd);
  134. ncnn::Option opt = ncnn::get_default_option();
  135. opt.blob_allocator = int8_weight_data.allocator;
  136. const Mat weight_data_g(weight_data_size_g, (void*)((float*)weight_data + weight_data_size_g * g), (size_t)4u, weight_data.allocator);
  137. Mat int8_weight_data_g(weight_data_size_g, (void*)((signed char*)int8_weight_data + weight_data_size_g * g), (size_t)1u, int8_weight_data.allocator);
  138. quantize_ops[g]->forward(weight_data_g, int8_weight_data_g, opt);
  139. }
  140. weight_data = int8_weight_data;
  141. }
  142. else
  143. {
  144. // plain float32 weight, fallback to float32 inference
  145. use_int8_inference = false;
  146. }
  147. }
  148. return 0;
  149. }
  150. int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
  151. {
  152. // convolv with NxN kernel
  153. // value = value + bias
  154. int w = bottom_blob.w;
  155. int h = bottom_blob.h;
  156. int channels = bottom_blob.c;
  157. size_t elemsize = bottom_blob.elemsize;
  158. if (channels % group != 0 || num_output % group != 0)
  159. {
  160. // reject invalid group
  161. return -100;
  162. }
  163. // fprintf(stderr, "ConvolutionDepthWise input %d x %d pad = %d %d ksize=%d %d stride=%d %d\n", w, h, pad_w, pad_h, kernel_w, kernel_h, stride_w, stride_h);
  164. const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
  165. const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
  166. Mat bottom_blob_bordered = bottom_blob;
  167. if (pad_w > 0 || pad_h > 0)
  168. {
  169. copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads);
  170. if (bottom_blob_bordered.empty())
  171. return -100;
  172. w = bottom_blob_bordered.w;
  173. h = bottom_blob_bordered.h;
  174. }
  175. else if (pad_w == -233 && pad_h == -233)
  176. {
  177. int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w;
  178. int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h;
  179. if (wpad > 0 || hpad > 0)
  180. {
  181. copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads);
  182. if (bottom_blob_bordered.empty())
  183. return -100;
  184. }
  185. w = bottom_blob_bordered.w;
  186. h = bottom_blob_bordered.h;
  187. }
  188. int outw = (w - kernel_extent_w) / stride_w + 1;
  189. int outh = (h - kernel_extent_h) / stride_h + 1;
  190. top_blob.create(outw, outh, num_output, elemsize, opt.blob_allocator);
  191. if (top_blob.empty())
  192. return -100;
  193. const int maxk = kernel_w * kernel_h;
  194. // kernel offsets
  195. std::vector<int> _space_ofs(maxk);
  196. int* space_ofs = &_space_ofs[0];
  197. {
  198. int p1 = 0;
  199. int p2 = 0;
  200. int gap = w * dilation_h - kernel_w * dilation_w;
  201. for (int i = 0; i < kernel_h; i++)
  202. {
  203. for (int j = 0; j < kernel_w; j++)
  204. {
  205. space_ofs[p1] = p2;
  206. p1++;
  207. p2 += dilation_w;
  208. }
  209. p2 += gap;
  210. }
  211. }
  212. if (use_int8_inference)
  213. {
  214. Mat bottom_blob_bordered_int8;
  215. bottom_blob_bordered_int8.create(w, h, channels, (size_t)1u, opt.workspace_allocator);
  216. if (bottom_blob_bordered_int8.empty())
  217. return -100;
  218. // depth-wise
  219. if (channels == group && group == num_output)
  220. {
  221. #pragma omp parallel for num_threads(opt.num_threads)
  222. for (int g=0; g<group; g++)
  223. {
  224. // quantize, scale and round to nearest
  225. {
  226. ncnn::Option opt_g = opt;
  227. opt_g.num_threads = 1;
  228. opt_g.blob_allocator = bottom_blob_bordered_int8.allocator;
  229. const Mat bottom_blob_bordered_g = bottom_blob_bordered.channel(g);
  230. Mat bottom_blob_bordered_int8_g = bottom_blob_bordered_int8.channel(g);
  231. quantize_ops[g]->forward(bottom_blob_bordered_g, bottom_blob_bordered_int8_g, opt_g);
  232. }
  233. int* outptr = top_blob.channel(g);
  234. const signed char* kptr = (const signed char*)weight_data + maxk * g;
  235. const Mat m = bottom_blob_bordered_int8.channel(g);
  236. for (int i = 0; i < outh; i++)
  237. {
  238. for (int j = 0; j < outw; j++)
  239. {
  240. int sum = 0;
  241. const signed char* sptr = m.row<signed char>(i*stride_h) + j*stride_w;
  242. for (int k = 0; k < maxk; k++)
  243. {
  244. signed char val = sptr[ space_ofs[k] ];
  245. signed char w = kptr[k];
  246. sum += val * w;
  247. }
  248. outptr[j] = sum;
  249. }
  250. outptr += outw;
  251. }
  252. // dequantize, reverse scale inplace
  253. {
  254. ncnn::Option opt_g = opt;
  255. opt_g.num_threads = 1;
  256. opt_g.blob_allocator = top_blob.allocator;
  257. Mat top_blob_g = top_blob.channel(g);
  258. dequantize_ops[g]->forward_inplace(top_blob_g, opt_g);
  259. }
  260. }
  261. }
  262. else
  263. {
  264. const int channels_g = channels / group;
  265. const int num_output_g = num_output / group;
  266. // quantize, scale and round to nearest
  267. #pragma omp parallel for num_threads(opt.num_threads)
  268. for (int g=0; g<group; g++)
  269. {
  270. ncnn::Option opt_g = opt;
  271. opt_g.num_threads = 1;
  272. opt_g.blob_allocator = bottom_blob_bordered_int8.allocator;
  273. const Mat bottom_blob_bordered_g(w, h, channels_g, (void*)((float*)bottom_blob_bordered.channel(channels_g * g)));
  274. Mat bottom_blob_bordered_int8_g(w, h, channels_g, (void*)((signed char*)bottom_blob_bordered_int8.channel(channels_g * g)));
  275. quantize_ops[g]->forward(bottom_blob_bordered_g, bottom_blob_bordered_int8_g, opt_g);
  276. }
  277. #ifdef _WIN32
  278. #pragma omp parallel for num_threads(opt.num_threads)
  279. #else // _WIN32
  280. #pragma omp parallel for collapse(2) num_threads(opt.num_threads)
  281. #endif // _WIN32
  282. for (int g=0; g<group; g++)
  283. {
  284. for (int p=0; p<num_output_g; p++)
  285. {
  286. int* outptr = top_blob.channel(g * num_output_g + p);
  287. const signed char* weight_data_ptr = (const signed char*)weight_data + maxk * channels_g * num_output_g * g;
  288. for (int i = 0; i < outh; i++)
  289. {
  290. for (int j = 0; j < outw; j++)
  291. {
  292. int sum = 0;
  293. const signed char* kptr = weight_data_ptr + maxk * channels_g * p;
  294. // channels_g
  295. for (int q=0; q<channels_g; q++)
  296. {
  297. const Mat m = bottom_blob_bordered_int8.channel(channels_g * g + q);
  298. const signed char* sptr = m.row<signed char>(i*stride_h) + j*stride_w;
  299. for (int k = 0; k < maxk; k++)
  300. {
  301. signed char val = sptr[ space_ofs[k] ];
  302. signed char w = kptr[k];
  303. sum += val * w;
  304. }
  305. kptr += maxk;
  306. }
  307. outptr[j] = sum;
  308. }
  309. outptr += outw;
  310. }
  311. }
  312. }
  313. // dequantize, reverse scale inplace
  314. #pragma omp parallel for num_threads(opt.num_threads)
  315. for (int g=0; g<group; g++)
  316. {
  317. ncnn::Option opt_g = opt;
  318. opt_g.num_threads = 1;
  319. opt_g.blob_allocator = top_blob.allocator;
  320. Mat top_blob_g(outw, outh, num_output_g, (void*)((signed int*)top_blob.channel(g * num_output_g)));
  321. dequantize_ops[g]->forward_inplace(top_blob_g, opt_g);
  322. }
  323. }
  324. return 0;
  325. }
  326. // depth-wise
  327. if (channels == group && group == num_output)
  328. {
  329. #pragma omp parallel for num_threads(opt.num_threads)
  330. for (int g=0; g<group; g++)
  331. {
  332. float* outptr = top_blob.channel(g);
  333. const float* kptr = (const float*)weight_data + maxk * g;
  334. const Mat m = bottom_blob_bordered.channel(g);
  335. for (int i = 0; i < outh; i++)
  336. {
  337. for (int j = 0; j < outw; j++)
  338. {
  339. float sum = 0.f;
  340. if (bias_term)
  341. sum = bias_data[g];
  342. const float* sptr = m.row(i*stride_h) + j*stride_w;
  343. for (int k = 0; k < maxk; k++)
  344. {
  345. float val = sptr[ space_ofs[k] ];
  346. float w = kptr[k];
  347. sum += val * w;
  348. }
  349. outptr[j] = sum;
  350. }
  351. outptr += outw;
  352. }
  353. }
  354. return 0;
  355. }
  356. const int channels_g = channels / group;
  357. const int num_output_g = num_output / group;
  358. #ifdef _WIN32
  359. #pragma omp parallel for num_threads(opt.num_threads)
  360. #else // _WIN32
  361. #pragma omp parallel for collapse(2) num_threads(opt.num_threads)
  362. #endif // _WIN32
  363. for (int g=0; g<group; g++)
  364. {
  365. for (int p=0; p<num_output_g; p++)
  366. {
  367. float* outptr = top_blob.channel(g * num_output_g + p);
  368. const float* weight_data_ptr = (const float*)weight_data + maxk * channels_g * num_output_g * g;
  369. for (int i = 0; i < outh; i++)
  370. {
  371. for (int j = 0; j < outw; j++)
  372. {
  373. float sum = 0.f;
  374. if (bias_term)
  375. sum = bias_data[num_output_g * g + p];
  376. const float* kptr = weight_data_ptr + maxk * channels_g * p;
  377. // channels_g
  378. for (int q=0; q<channels_g; q++)
  379. {
  380. const Mat m = bottom_blob_bordered.channel(channels_g * g + q);
  381. const float* sptr = m.row(i*stride_h) + j*stride_w;
  382. for (int k = 0; k < maxk; k++)
  383. {
  384. float val = sptr[ space_ofs[k] ];
  385. float w = kptr[k];
  386. sum += val * w;
  387. }
  388. kptr += maxk;
  389. }
  390. outptr[j] = sum;
  391. }
  392. outptr += outw;
  393. }
  394. }
  395. }
  396. return 0;
  397. }
  398. } // namespace ncnn