You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

convolutiondepthwise.cpp 15 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493
  1. // Tencent is pleased to support the open source community by making ncnn available.
  2. //
  3. // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
  4. //
  5. // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6. // in compliance with the License. You may obtain a copy of the License at
  7. //
  8. // https://opensource.org/licenses/BSD-3-Clause
  9. //
  10. // Unless required by applicable law or agreed to in writing, software distributed
  11. // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
  12. // CONDITIONS OF ANY KIND, either express or implied. See the License for the
  13. // specific language governing permissions and limitations under the License.
  14. #include "convolutiondepthwise.h"
  15. #include "layer_type.h"
  16. namespace ncnn {
  17. DEFINE_LAYER_CREATOR(ConvolutionDepthWise)
  18. ConvolutionDepthWise::ConvolutionDepthWise()
  19. {
  20. one_blob_only = true;
  21. support_inplace = false;
  22. }
  23. ConvolutionDepthWise::~ConvolutionDepthWise()
  24. {
  25. for (int i=0; i<(int)quantize_ops.size(); i++)
  26. delete quantize_ops[i];
  27. quantize_ops.clear();
  28. for (int i=0; i<(int)dequantize_ops.size(); i++)
  29. delete dequantize_ops[i];
  30. dequantize_ops.clear();
  31. }
  32. int ConvolutionDepthWise::load_param(const ParamDict& pd)
  33. {
  34. num_output = pd.get(0, 0);
  35. kernel_w = pd.get(1, 0);
  36. kernel_h = pd.get(11, kernel_w);
  37. dilation_w = pd.get(2, 1);
  38. dilation_h = pd.get(12, dilation_w);
  39. stride_w = pd.get(3, 1);
  40. stride_h = pd.get(13, stride_w);
  41. pad_w = pd.get(4, 0);
  42. pad_h = pd.get(14, pad_w);
  43. bias_term = pd.get(5, 0);
  44. weight_data_size = pd.get(6, 0);
  45. group = pd.get(7, 1);
  46. int8_scale_term = pd.get(8, 0);
  47. use_int8_inference = pd.use_int8_inference;
  48. if (num_output % group != 0)
  49. {
  50. // reject invalid group
  51. return -100;
  52. }
  53. if (int8_scale_term == 0)
  54. use_int8_inference = false;
  55. return 0;
  56. }
  57. int ConvolutionDepthWise::load_model(const ModelBin& mb)
  58. {
  59. weight_data = mb.load(weight_data_size, 0);
  60. if (weight_data.empty())
  61. return -100;
  62. if (bias_term)
  63. {
  64. bias_data = mb.load(num_output, 1);
  65. if (bias_data.empty())
  66. return -100;
  67. }
  68. if (int8_scale_term == 1)
  69. {
  70. weight_data_int8_scales = mb.load(group, 1);
  71. bottom_blob_int8_scales = mb.load(group, 1);
  72. }
  73. else if (int8_scale_term == 2)
  74. {
  75. weight_data_int8_scales = mb.load(1, 1);
  76. bottom_blob_int8_scales = mb.load(1, 1);
  77. // extend group if only one provided
  78. float weight_data_int8_scale = weight_data_int8_scales[0];
  79. weight_data_int8_scales = Mat(group);
  80. weight_data_int8_scales.fill(weight_data_int8_scale);
  81. float bottom_blob_int8_scale = bottom_blob_int8_scales[0];
  82. bottom_blob_int8_scales = Mat(group);
  83. bottom_blob_int8_scales.fill(bottom_blob_int8_scale);
  84. }
  85. for (int i=0; i<(int)quantize_ops.size(); i++)
  86. delete quantize_ops[i];
  87. quantize_ops.clear();
  88. for (int i=0; i<(int)dequantize_ops.size(); i++)
  89. delete dequantize_ops[i];
  90. dequantize_ops.clear();
  91. bool weight_data_is_int8 = (weight_data.elemsize == (size_t)1u);
  92. bool weight_data_is_float32 = (weight_data.elemsize == (size_t)4u);
  93. if (weight_data_is_int8 && !use_int8_inference)
  94. {
  95. fprintf(stderr, "quantized int8 weight loaded but use_int8_inference disabled\n");
  96. return -1;
  97. }
  98. if (weight_data_is_float32 && use_int8_inference)
  99. {
  100. // quantize weight to int8
  101. Mat int8_weight_data(weight_data_size, (size_t)1u);
  102. if (int8_weight_data.empty())
  103. return -100;
  104. const int weight_data_size_g = weight_data_size / group;
  105. for (int g=0; g<group; g++)
  106. {
  107. Layer* op = ncnn::create_layer(ncnn::LayerType::Quantize);
  108. ncnn::ParamDict pd;
  109. pd.set(0, weight_data_int8_scales[g]);// scale
  110. op->load_param(pd);
  111. ncnn::Option opt = ncnn::get_default_option();
  112. opt.blob_allocator = int8_weight_data.allocator;
  113. const Mat weight_data_g = weight_data.range(weight_data_size_g * g, weight_data_size_g);
  114. Mat int8_weight_data_g = int8_weight_data.range(weight_data_size_g * g, weight_data_size_g);
  115. op->forward(weight_data_g, int8_weight_data_g, opt);
  116. delete op;
  117. }
  118. weight_data = int8_weight_data;
  119. }
  120. if (use_int8_inference)
  121. {
  122. quantize_ops.resize(group);
  123. dequantize_ops.resize(group);
  124. for (int g=0; g<group; g++)
  125. {
  126. quantize_ops[g] = ncnn::create_layer(ncnn::LayerType::Quantize);
  127. ncnn::ParamDict pd;
  128. pd.set(0, bottom_blob_int8_scales[g]);// scale
  129. quantize_ops[g]->load_param(pd);
  130. }
  131. for (int g=0; g<group; g++)
  132. {
  133. dequantize_ops[g] = ncnn::create_layer(ncnn::LayerType::Dequantize);
  134. float top_rescale = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]);
  135. ncnn::ParamDict pd;
  136. pd.set(0, top_rescale);// scale
  137. pd.set(1, bias_term);// bias_term
  138. pd.set(2, 1);// bias_data_size
  139. dequantize_ops[g]->load_param(pd);
  140. ncnn::Mat weights[1];
  141. weights[0] = bias_data.range(g, 1);
  142. dequantize_ops[g]->load_model(ModelBinFromMatArray(weights));
  143. }
  144. }
  145. return 0;
  146. }
  147. int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
  148. {
  149. // convolv with NxN kernel
  150. // value = value + bias
  151. int w = bottom_blob.w;
  152. int h = bottom_blob.h;
  153. int channels = bottom_blob.c;
  154. size_t elemsize = bottom_blob.elemsize;
  155. if (channels % group != 0 || num_output % group != 0)
  156. {
  157. // reject invalid group
  158. return -100;
  159. }
  160. // fprintf(stderr, "ConvolutionDepthWise input %d x %d pad = %d %d ksize=%d %d stride=%d %d\n", w, h, pad_w, pad_h, kernel_w, kernel_h, stride_w, stride_h);
  161. const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
  162. const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
  163. Mat bottom_blob_unbordered = bottom_blob;
  164. if (use_int8_inference && elemsize != 1)
  165. {
  166. Mat bottom_blob_int8;
  167. bottom_blob_int8.create(w, h, channels, (size_t)1u, opt.workspace_allocator);
  168. if (bottom_blob_int8.empty())
  169. return -100;
  170. const int channels_g = channels / group;
  171. // quantize, scale and round to nearest
  172. #pragma omp parallel for num_threads(opt.num_threads)
  173. for (int g=0; g<group; g++)
  174. {
  175. ncnn::Option opt_g = opt;
  176. opt_g.num_threads = 1;
  177. opt_g.blob_allocator = bottom_blob_int8.allocator;
  178. const Mat bottom_blob_g = bottom_blob.channel_range(channels_g * g, channels_g);
  179. Mat bottom_blob_int8_g = bottom_blob_int8.channel_range(channels_g * g, channels_g);
  180. quantize_ops[g]->forward(bottom_blob_g, bottom_blob_int8_g, opt_g);
  181. }
  182. bottom_blob_unbordered = bottom_blob_int8;
  183. }
  184. Mat bottom_blob_bordered = bottom_blob_unbordered;
  185. if (pad_w > 0 || pad_h > 0)
  186. {
  187. copy_make_border(bottom_blob_unbordered, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads);
  188. if (bottom_blob_bordered.empty())
  189. return -100;
  190. w = bottom_blob_bordered.w;
  191. h = bottom_blob_bordered.h;
  192. }
  193. else if (pad_w == -233 && pad_h == -233)
  194. {
  195. int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w;
  196. int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h;
  197. if (wpad > 0 || hpad > 0)
  198. {
  199. copy_make_border(bottom_blob_unbordered, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads);
  200. if (bottom_blob_bordered.empty())
  201. return -100;
  202. }
  203. w = bottom_blob_bordered.w;
  204. h = bottom_blob_bordered.h;
  205. }
  206. int outw = (w - kernel_extent_w) / stride_w + 1;
  207. int outh = (h - kernel_extent_h) / stride_h + 1;
  208. top_blob.create(outw, outh, num_output, elemsize, opt.blob_allocator);
  209. if (top_blob.empty())
  210. return -100;
  211. const int maxk = kernel_w * kernel_h;
  212. // kernel offsets
  213. std::vector<int> _space_ofs(maxk);
  214. int* space_ofs = &_space_ofs[0];
  215. {
  216. int p1 = 0;
  217. int p2 = 0;
  218. int gap = w * dilation_h - kernel_w * dilation_w;
  219. for (int i = 0; i < kernel_h; i++)
  220. {
  221. for (int j = 0; j < kernel_w; j++)
  222. {
  223. space_ofs[p1] = p2;
  224. p1++;
  225. p2 += dilation_w;
  226. }
  227. p2 += gap;
  228. }
  229. }
  230. if (use_int8_inference)
  231. {
  232. // depth-wise
  233. if (channels == group && group == num_output)
  234. {
  235. #pragma omp parallel for num_threads(opt.num_threads)
  236. for (int g=0; g<group; g++)
  237. {
  238. int* outptr = top_blob.channel(g);
  239. const signed char* kptr = (const signed char*)weight_data + maxk * g;
  240. const Mat m = bottom_blob_bordered.channel(g);
  241. for (int i = 0; i < outh; i++)
  242. {
  243. for (int j = 0; j < outw; j++)
  244. {
  245. int sum = 0;
  246. const signed char* sptr = m.row<signed char>(i*stride_h) + j*stride_w;
  247. for (int k = 0; k < maxk; k++)
  248. {
  249. signed char val = sptr[ space_ofs[k] ];
  250. signed char w = kptr[k];
  251. sum += val * w;
  252. }
  253. outptr[j] = sum;
  254. }
  255. outptr += outw;
  256. }
  257. // dequantize, reverse scale inplace
  258. {
  259. ncnn::Option opt_g = opt;
  260. opt_g.num_threads = 1;
  261. opt_g.blob_allocator = top_blob.allocator;
  262. Mat top_blob_g = top_blob.channel_range(g, 1);
  263. dequantize_ops[g]->forward_inplace(top_blob_g, opt_g);
  264. }
  265. }
  266. }
  267. else
  268. {
  269. const int channels_g = channels / group;
  270. const int num_output_g = num_output / group;
  271. #ifdef _WIN32
  272. #pragma omp parallel for num_threads(opt.num_threads)
  273. #else // _WIN32
  274. #pragma omp parallel for collapse(2) num_threads(opt.num_threads)
  275. #endif // _WIN32
  276. for (int g=0; g<group; g++)
  277. {
  278. for (int p=0; p<num_output_g; p++)
  279. {
  280. int* outptr = top_blob.channel(g * num_output_g + p);
  281. const signed char* weight_data_ptr = (const signed char*)weight_data + maxk * channels_g * num_output_g * g;
  282. for (int i = 0; i < outh; i++)
  283. {
  284. for (int j = 0; j < outw; j++)
  285. {
  286. int sum = 0;
  287. const signed char* kptr = weight_data_ptr + maxk * channels_g * p;
  288. // channels_g
  289. for (int q=0; q<channels_g; q++)
  290. {
  291. const Mat m = bottom_blob_bordered.channel(channels_g * g + q);
  292. const signed char* sptr = m.row<signed char>(i*stride_h) + j*stride_w;
  293. for (int k = 0; k < maxk; k++)
  294. {
  295. signed char val = sptr[ space_ofs[k] ];
  296. signed char w = kptr[k];
  297. sum += val * w;
  298. }
  299. kptr += maxk;
  300. }
  301. outptr[j] = sum;
  302. }
  303. outptr += outw;
  304. }
  305. }
  306. }
  307. // dequantize, reverse scale inplace
  308. #pragma omp parallel for num_threads(opt.num_threads)
  309. for (int g=0; g<group; g++)
  310. {
  311. ncnn::Option opt_g = opt;
  312. opt_g.num_threads = 1;
  313. opt_g.blob_allocator = top_blob.allocator;
  314. Mat top_blob_g = top_blob.channel_range(num_output_g * g, num_output_g);
  315. dequantize_ops[g]->forward_inplace(top_blob_g, opt_g);
  316. }
  317. }
  318. return 0;
  319. }
  320. // depth-wise
  321. if (channels == group && group == num_output)
  322. {
  323. #pragma omp parallel for num_threads(opt.num_threads)
  324. for (int g=0; g<group; g++)
  325. {
  326. float* outptr = top_blob.channel(g);
  327. const float* kptr = (const float*)weight_data + maxk * g;
  328. const Mat m = bottom_blob_bordered.channel(g);
  329. for (int i = 0; i < outh; i++)
  330. {
  331. for (int j = 0; j < outw; j++)
  332. {
  333. float sum = 0.f;
  334. if (bias_term)
  335. sum = bias_data[g];
  336. const float* sptr = m.row(i*stride_h) + j*stride_w;
  337. for (int k = 0; k < maxk; k++)
  338. {
  339. float val = sptr[ space_ofs[k] ];
  340. float w = kptr[k];
  341. sum += val * w;
  342. }
  343. outptr[j] = sum;
  344. }
  345. outptr += outw;
  346. }
  347. }
  348. return 0;
  349. }
  350. const int channels_g = channels / group;
  351. const int num_output_g = num_output / group;
  352. #ifdef _WIN32
  353. #pragma omp parallel for num_threads(opt.num_threads)
  354. #else // _WIN32
  355. #pragma omp parallel for collapse(2) num_threads(opt.num_threads)
  356. #endif // _WIN32
  357. for (int g=0; g<group; g++)
  358. {
  359. for (int p=0; p<num_output_g; p++)
  360. {
  361. float* outptr = top_blob.channel(g * num_output_g + p);
  362. const float* weight_data_ptr = (const float*)weight_data + maxk * channels_g * num_output_g * g;
  363. for (int i = 0; i < outh; i++)
  364. {
  365. for (int j = 0; j < outw; j++)
  366. {
  367. float sum = 0.f;
  368. if (bias_term)
  369. sum = bias_data[num_output_g * g + p];
  370. const float* kptr = weight_data_ptr + maxk * channels_g * p;
  371. // channels_g
  372. for (int q=0; q<channels_g; q++)
  373. {
  374. const Mat m = bottom_blob_bordered.channel(channels_g * g + q);
  375. const float* sptr = m.row(i*stride_h) + j*stride_w;
  376. for (int k = 0; k < maxk; k++)
  377. {
  378. float val = sptr[ space_ofs[k] ];
  379. float w = kptr[k];
  380. sum += val * w;
  381. }
  382. kptr += maxk;
  383. }
  384. outptr[j] = sum;
  385. }
  386. outptr += outw;
  387. }
  388. }
  389. }
  390. return 0;
  391. }
  392. } // namespace ncnn