You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

convolutiondepthwise_x86.cpp 15 kB

7 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476
  1. // Tencent is pleased to support the open source community by making ncnn available.
  2. //
  3. // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
  4. //
  5. // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6. // in compliance with the License. You may obtain a copy of the License at
  7. //
  8. // https://opensource.org/licenses/BSD-3-Clause
  9. //
  10. // Unless required by applicable law or agreed to in writing, software distributed
  11. // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
  12. // CONDITIONS OF ANY KIND, either express or implied. See the License for the
  13. // specific language governing permissions and limitations under the License.
  14. #include "convolutiondepthwise_x86.h"
  15. #ifdef _OPENMP
  16. #include <omp.h>
  17. #endif
  18. #include "layer_type.h"
  19. namespace ncnn {
  20. #include "convolutiondepthwise_3x3.h"
  21. #include "convolutiondepthwise_3x3_int8.h"
  22. DEFINE_LAYER_CREATOR(ConvolutionDepthWise_x86)
  23. ConvolutionDepthWise_x86::ConvolutionDepthWise_x86()
  24. {
  25. activation = 0;
  26. }
  27. int ConvolutionDepthWise_x86::create_pipeline(const Option& opt)
  28. {
  29. Option opt_cpu = opt;
  30. opt_cpu.vulkan_compute = false;
  31. if (activation_type == 1)
  32. {
  33. activation = ncnn::create_layer(ncnn::LayerType::ReLU);
  34. ncnn::ParamDict pd;
  35. activation->load_param(pd);
  36. }
  37. else if (activation_type == 2)
  38. {
  39. activation = ncnn::create_layer(ncnn::LayerType::ReLU);
  40. ncnn::ParamDict pd;
  41. pd.set(0, activation_params[0]);// slope
  42. activation->load_param(pd);
  43. }
  44. else if (activation_type == 3)
  45. {
  46. activation = ncnn::create_layer(ncnn::LayerType::Clip);
  47. ncnn::ParamDict pd;
  48. pd.set(0, activation_params[0]);// min
  49. pd.set(1, activation_params[1]);// max
  50. activation->load_param(pd);
  51. }
  52. else if (activation_type == 4)
  53. {
  54. activation = ncnn::create_layer(ncnn::LayerType::Sigmoid);
  55. ncnn::ParamDict pd;
  56. activation->load_param(pd);
  57. }
  58. if (activation)
  59. {
  60. activation->create_pipeline(opt_cpu);
  61. }
  62. // create Convolution op for each group
  63. const int maxk = kernel_w * kernel_h;
  64. int channels = (weight_data_size / group) / maxk / (num_output / group) * group;
  65. for (int i=0; i<(int)group_ops.size(); i++)
  66. delete group_ops[i];
  67. group_ops.clear();
  68. if (channels == group && group == num_output)
  69. {
  70. // depth-wise specific
  71. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1)
  72. {
  73. if ((stride_w == 1 && stride_h == 1) || (stride_w == 2 && stride_h == 2))
  74. {
  75. return 0;
  76. }
  77. }
  78. }
  79. const int channels_g = channels / group;
  80. const int num_output_g = num_output / group;
  81. group_ops.resize(group);
  82. for (int g=0; g<group; g++)
  83. {
  84. Mat weight_data_g = weight_data.range(maxk * channels_g * num_output_g * g, maxk * channels_g * num_output_g);
  85. Mat bias_data_g;
  86. if (bias_term)
  87. bias_data_g = bias_data.range(num_output_g * g, num_output_g);
  88. ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution);
  89. // set param
  90. ncnn::ParamDict pd;
  91. pd.set(0, num_output_g);// num_output
  92. pd.set(1, kernel_w);
  93. pd.set(11, kernel_h);
  94. pd.set(2, dilation_w);
  95. pd.set(12, dilation_h);
  96. pd.set(3, stride_w);
  97. pd.set(13, stride_h);
  98. pd.set(4, 0);// pad_w
  99. pd.set(14, 0);// pad_h
  100. pd.set(5, bias_term);
  101. pd.set(6, maxk * channels_g * num_output_g);// weight_data_size
  102. pd.set(8, int8_scale_term);
  103. op->load_param(pd);
  104. // set weights
  105. if (bias_term)
  106. {
  107. ncnn::Mat weights[4];
  108. weights[0] = weight_data_g;
  109. weights[1] = bias_data_g;
  110. if (int8_scale_term)
  111. {
  112. weights[2] = weight_data_int8_scales.range(g, 1);
  113. weights[3] = bottom_blob_int8_scales.range(g, 1);
  114. }
  115. op->load_model(ModelBinFromMatArray(weights));
  116. }
  117. else
  118. {
  119. ncnn::Mat weights[3];
  120. weights[0] = weight_data_g;
  121. if (int8_scale_term)
  122. {
  123. weights[1] = weight_data_int8_scales.range(g, 1);
  124. weights[2] = bottom_blob_int8_scales.range(g, 1);
  125. }
  126. op->load_model(ModelBinFromMatArray(weights));
  127. }
  128. op->create_pipeline(opt_cpu);
  129. group_ops[g] = op;
  130. }
  131. return 0;
  132. }
  133. int ConvolutionDepthWise_x86::destroy_pipeline(const Option& opt)
  134. {
  135. Option opt_cpu = opt;
  136. opt_cpu.vulkan_compute = false;
  137. if (activation)
  138. {
  139. activation->destroy_pipeline(opt_cpu);
  140. delete activation;
  141. activation = 0;
  142. }
  143. for (int i=0; i<(int)group_ops.size(); i++)
  144. {
  145. group_ops[i]->destroy_pipeline(opt_cpu);
  146. delete group_ops[i];
  147. }
  148. group_ops.clear();
  149. return 0;
  150. }
  151. int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
  152. {
  153. // convolv with NxN kernel
  154. // value = value + bias
  155. int w = bottom_blob.w;
  156. int h = bottom_blob.h;
  157. int channels = bottom_blob.c;
  158. size_t elemsize = bottom_blob.elemsize;
  159. if (channels % group != 0 || num_output % group != 0)
  160. {
  161. // reject invalid group
  162. return -100;
  163. }
  164. const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
  165. const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
  166. Mat bottom_blob_unbordered = bottom_blob;
  167. if (use_int8_inference && elemsize != 1)
  168. {
  169. Mat bottom_blob_int8;
  170. bottom_blob_int8.create(w, h, channels, (size_t)1u, opt.workspace_allocator);
  171. if (bottom_blob_int8.empty())
  172. return -100;
  173. const int channels_g = channels / group;
  174. // quantize, scale and round to nearest
  175. #pragma omp parallel for num_threads(opt.num_threads)
  176. for (int g=0; g<group; g++)
  177. {
  178. ncnn::Option opt_g = opt;
  179. opt_g.num_threads = 1;
  180. opt_g.blob_allocator = bottom_blob_int8.allocator;
  181. const Mat bottom_blob_g = bottom_blob.channel_range(channels_g * g, channels_g);
  182. Mat bottom_blob_int8_g = bottom_blob_int8.channel_range(channels_g * g, channels_g);
  183. quantize_ops[g]->forward(bottom_blob_g, bottom_blob_int8_g, opt_g);
  184. }
  185. bottom_blob_unbordered = bottom_blob_int8;
  186. }
  187. Mat bottom_blob_bordered = bottom_blob_unbordered;
  188. if (pad_w > 0 || pad_h > 0)
  189. {
  190. copy_make_border(bottom_blob_unbordered, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads);
  191. if (bottom_blob_bordered.empty())
  192. return -100;
  193. w = bottom_blob_bordered.w;
  194. h = bottom_blob_bordered.h;
  195. }
  196. else if (pad_w == -233 && pad_h == -233)
  197. {
  198. int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w;
  199. int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h;
  200. if (wpad > 0 || hpad > 0)
  201. {
  202. copy_make_border(bottom_blob_unbordered, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads);
  203. if (bottom_blob_bordered.empty())
  204. return -100;
  205. }
  206. w = bottom_blob_bordered.w;
  207. h = bottom_blob_bordered.h;
  208. }
  209. int outw = (w - kernel_extent_w) / stride_w + 1;
  210. int outh = (h - kernel_extent_h) / stride_h + 1;
  211. // int8
  212. if (use_int8_inference)
  213. {
  214. if (use_int8_requantize)
  215. {
  216. Mat top_blob_tm;
  217. top_blob_tm.create(outw, outh, num_output, (size_t)4u, opt.workspace_allocator);
  218. if (top_blob_tm.empty())
  219. return -100;
  220. top_blob.create(outw, outh, num_output, (size_t)1u, opt.blob_allocator);
  221. if (top_blob.empty())
  222. return -100;
  223. // depth-wise
  224. if (channels == group && group == num_output)
  225. {
  226. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1)
  227. {
  228. if ((stride_w == 1 && stride_h == 1) || (stride_w == 2 && stride_h == 2))
  229. {
  230. if (stride_w == 1 && stride_h == 1)
  231. {
  232. convdw3x3s1_int8_requant_sse(bottom_blob_bordered, top_blob, weight_data, bias_data, requantize_scales, opt);
  233. }
  234. else if (stride_w == 2 && stride_h == 2)
  235. {
  236. convdw3x3s2_int8_requant_sse(bottom_blob_bordered, top_blob, weight_data, bias_data, requantize_scales, opt);
  237. }
  238. return 0;
  239. }
  240. }
  241. #pragma omp parallel for num_threads(opt.num_threads)
  242. for (int g=0; g<group; g++)
  243. {
  244. const Mat bottom_blob_bordered_g = bottom_blob_bordered.channel_range(g, 1);
  245. Mat top_blob_tm_g = top_blob_tm.channel_range(g, 1);
  246. const ncnn::Layer* op = group_ops[g];
  247. ncnn::Option opt_g = opt;
  248. opt_g.num_threads = 1;
  249. opt_g.blob_allocator = top_blob.allocator;
  250. // forward
  251. op->forward(bottom_blob_bordered_g, top_blob_tm_g, opt_g);
  252. }
  253. return 0;
  254. }
  255. const int channels_g = channels / group;
  256. const int num_output_g = num_output / group;
  257. #pragma omp parallel for num_threads(opt.num_threads)
  258. for (int g=0; g<group; g++)
  259. {
  260. const Mat bottom_blob_bordered_g = bottom_blob_bordered.channel_range(channels_g * g, channels_g);
  261. Mat top_blob_tm_g = top_blob_tm.channel_range(num_output_g * g, num_output_g);
  262. const ncnn::Layer* op = group_ops[g];
  263. ncnn::Option opt_g = opt;
  264. opt_g.blob_allocator = top_blob.allocator;
  265. // forward
  266. op->forward(bottom_blob_bordered_g, top_blob_tm_g, opt_g);
  267. }
  268. }
  269. else
  270. {
  271. top_blob.create(outw, outh, num_output, (size_t)4u, opt.blob_allocator);
  272. if (top_blob.empty())
  273. return -100;
  274. // depth-wise
  275. if (channels == group && group == num_output)
  276. {
  277. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1)
  278. {
  279. if ((stride_w == 1 && stride_h == 1) || (stride_w == 2 && stride_h == 2))
  280. {
  281. if (stride_w == 1 && stride_h == 1)
  282. {
  283. convdw3x3s1_int8_dequant_sse(bottom_blob_bordered, top_blob, weight_data, bias_data, dequantize_scales, opt);
  284. }
  285. else if (stride_w == 2 && stride_h == 2)
  286. {
  287. convdw3x3s2_int8_dequant_sse(bottom_blob_bordered, top_blob, weight_data, bias_data, dequantize_scales, opt);
  288. }
  289. return 0;
  290. }
  291. }
  292. #pragma omp parallel for num_threads(opt.num_threads)
  293. for (int g=0; g<group; g++)
  294. {
  295. const Mat bottom_blob_bordered_g = bottom_blob_bordered.channel_range(g, 1);
  296. Mat top_blob_g = top_blob.channel_range(g, 1);
  297. const ncnn::Layer* op = group_ops[g];
  298. ncnn::Option opt_g = opt;
  299. opt_g.num_threads = 1;
  300. opt_g.blob_allocator = top_blob.allocator;
  301. // forward
  302. op->forward(bottom_blob_bordered_g, top_blob_g, opt_g);
  303. }
  304. return 0;
  305. }
  306. const int channels_g = channels / group;
  307. const int num_output_g = num_output / group;
  308. #pragma omp parallel for num_threads(opt.num_threads)
  309. for (int g=0; g<group; g++)
  310. {
  311. const Mat bottom_blob_bordered_g = bottom_blob_bordered.channel_range(channels_g * g, channels_g);
  312. Mat top_blob_g = top_blob.channel_range(num_output_g * g, num_output_g);
  313. const ncnn::Layer* op = group_ops[g];
  314. ncnn::Option opt_g = opt;
  315. opt_g.blob_allocator = top_blob.allocator;
  316. // forward
  317. op->forward(bottom_blob_bordered_g, top_blob_g, opt_g);
  318. }
  319. }
  320. return 0;
  321. }
  322. // float32
  323. top_blob.create(outw, outh, num_output, elemsize, opt.blob_allocator);
  324. if (top_blob.empty())
  325. return -100;
  326. // depth-wise
  327. if (channels == group && group == num_output)
  328. {
  329. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1)
  330. {
  331. if (stride_w == 1 && stride_h == 1)
  332. {
  333. convdw3x3s1_sse(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
  334. }
  335. else if (stride_w == 2 && stride_h == 2)
  336. {
  337. convdw3x3s2_sse(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
  338. }
  339. if (activation)
  340. {
  341. activation->forward_inplace(top_blob, opt);
  342. }
  343. return 0;
  344. }
  345. #pragma omp parallel for num_threads(opt.num_threads)
  346. for (int g=0; g<group; g++)
  347. {
  348. const Mat bottom_blob_bordered_g = bottom_blob_bordered.channel_range(g, 1);
  349. Mat top_blob_g = top_blob.channel_range(g, 1);
  350. const ncnn::Layer* op = group_ops[g];
  351. ncnn::Option opt_g = opt;
  352. opt_g.num_threads = 1;
  353. opt_g.blob_allocator = top_blob.allocator;
  354. // forward
  355. op->forward(bottom_blob_bordered_g, top_blob_g, opt_g);
  356. }
  357. if (activation)
  358. {
  359. activation->forward_inplace(top_blob, opt);
  360. }
  361. return 0;
  362. }
  363. const int channels_g = channels / group;
  364. const int num_output_g = num_output / group;
  365. for (int g=0; g<group; g++)
  366. {
  367. const Mat bottom_blob_bordered_g = bottom_blob_bordered.channel_range(channels_g * g, channels_g);
  368. Mat top_blob_g = top_blob.channel_range(num_output_g * g, num_output_g);
  369. const ncnn::Layer* op = group_ops[g];
  370. ncnn::Option opt_g = opt;
  371. opt_g.blob_allocator = top_blob.allocator;
  372. // forward
  373. op->forward(bottom_blob_bordered_g, top_blob_g, opt_g);
  374. }
  375. if (activation)
  376. {
  377. activation->forward_inplace(top_blob, opt);
  378. }
  379. return 0;
  380. }
  381. } // namespace ncnn