You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or number, can include dashes ('-') and can be up to 35 characters long.

convolutiondepthwise_arm.cpp 17 kB

7 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514
  1. // Tencent is pleased to support the open source community by making ncnn available.
  2. //
  3. // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
  4. //
  5. // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6. // in compliance with the License. You may obtain a copy of the License at
  7. //
  8. // https://opensource.org/licenses/BSD-3-Clause
  9. //
  10. // Unless required by applicable law or agreed to in writing, software distributed
  11. // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
  12. // CONDITIONS OF ANY KIND, either express or implied. See the License for the
  13. // specific language governing permissions and limitations under the License.
  14. #include "convolutiondepthwise_arm.h"
  15. #include "layer_type.h"
  16. #if __ARM_NEON
  17. #include <arm_neon.h>
  18. #endif // __ARM_NEON
  19. namespace ncnn {
  20. #include "convolutiondepthwise_3x3.h"
  21. #include "convolutiondepthwise_5x5.h"
  22. #include "convolutiondepthwise_3x3_int8.h"
  // Expands the DEFINE_LAYER_CREATOR macro for ConvolutionDepthWise_arm.
  // NOTE(review): the macro is defined outside this file; presumably it emits
  // the factory function that create_layer() uses for this class - confirm.
  23. DEFINE_LAYER_CREATOR(ConvolutionDepthWise_arm)
  24. ConvolutionDepthWise_arm::ConvolutionDepthWise_arm()
  25. {
  26. activation = 0;
  27. }
  28. int ConvolutionDepthWise_arm::create_pipeline(const Option& opt)
  29. {
  30. Option opt_cpu = opt;
  31. opt_cpu.use_vulkan_compute = false;
  32. if (activation_type == 1)
  33. {
  34. activation = ncnn::create_layer(ncnn::LayerType::ReLU);
  35. ncnn::ParamDict pd;
  36. activation->load_param(pd);
  37. }
  38. else if (activation_type == 2)
  39. {
  40. activation = ncnn::create_layer(ncnn::LayerType::ReLU);
  41. ncnn::ParamDict pd;
  42. pd.set(0, activation_params[0]);// slope
  43. activation->load_param(pd);
  44. }
  45. else if (activation_type == 3)
  46. {
  47. activation = ncnn::create_layer(ncnn::LayerType::Clip);
  48. ncnn::ParamDict pd;
  49. pd.set(0, activation_params[0]);// min
  50. pd.set(1, activation_params[1]);// max
  51. activation->load_param(pd);
  52. }
  53. else if (activation_type == 4)
  54. {
  55. activation = ncnn::create_layer(ncnn::LayerType::Sigmoid);
  56. ncnn::ParamDict pd;
  57. activation->load_param(pd);
  58. }
  59. if (activation)
  60. {
  61. activation->create_pipeline(opt_cpu);
  62. }
  63. // create Convolution op for each group
  64. const int maxk = kernel_w * kernel_h;
  65. int channels = (weight_data_size / group) / maxk / (num_output / group) * group;
  66. for (int i=0; i<(int)group_ops.size(); i++)
  67. delete group_ops[i];
  68. group_ops.clear();
  69. if (channels == group && group == num_output)
  70. {
  71. // depth-wise specific
  72. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1)
  73. {
  74. if ((stride_w == 1 && stride_h == 1) || (stride_w == 2 && stride_h == 2))
  75. {
  76. return 0;
  77. }
  78. }
  79. if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && use_int8_inference == false)
  80. {
  81. if ((stride_w == 1 && stride_h == 1) || (stride_w == 2 && stride_h == 2))
  82. {
  83. return 0;
  84. }
  85. }
  86. }
  87. const int channels_g = channels / group;
  88. const int num_output_g = num_output / group;
  89. group_ops.resize(group);
  90. for (int g=0; g<group; g++)
  91. {
  92. Mat weight_data_g = weight_data.range(maxk * channels_g * num_output_g * g, maxk * channels_g * num_output_g);
  93. Mat bias_data_g;
  94. if (bias_term)
  95. bias_data_g = bias_data.range(num_output_g * g, num_output_g);
  96. ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution);
  97. // set param
  98. ncnn::ParamDict pd;
  99. pd.set(0, num_output_g);// num_output
  100. pd.set(1, kernel_w);
  101. pd.set(11, kernel_h);
  102. pd.set(2, dilation_w);
  103. pd.set(12, dilation_h);
  104. pd.set(3, stride_w);
  105. pd.set(13, stride_h);
  106. pd.set(4, 0);// pad_w
  107. pd.set(14, 0);// pad_h
  108. pd.set(5, bias_term);
  109. pd.set(6, maxk * channels_g * num_output_g);// weight_data_size
  110. pd.set(8, int8_scale_term);
  111. op->load_param(pd);
  112. // set weights
  113. if (bias_term)
  114. {
  115. ncnn::Mat weights[4];
  116. weights[0] = weight_data_g;
  117. weights[1] = bias_data_g;
  118. if (int8_scale_term)
  119. {
  120. weights[2] = weight_data_int8_scales.range(g, 1);
  121. weights[3] = bottom_blob_int8_scales.range(g, 1);
  122. }
  123. op->load_model(ModelBinFromMatArray(weights));
  124. }
  125. else
  126. {
  127. ncnn::Mat weights[3];
  128. weights[0] = weight_data_g;
  129. if (int8_scale_term)
  130. {
  131. weights[1] = weight_data_int8_scales.range(g, 1);
  132. weights[2] = bottom_blob_int8_scales.range(g, 1);
  133. }
  134. op->load_model(ModelBinFromMatArray(weights));
  135. }
  136. op->create_pipeline(opt_cpu);
  137. group_ops[g] = op;
  138. }
  139. return 0;
  140. }
  141. int ConvolutionDepthWise_arm::destroy_pipeline(const Option& opt)
  142. {
  143. Option opt_cpu = opt;
  144. opt_cpu.use_vulkan_compute = false;
  145. if (activation)
  146. {
  147. activation->destroy_pipeline(opt_cpu);
  148. delete activation;
  149. activation = 0;
  150. }
  151. for (int i=0; i<(int)group_ops.size(); i++)
  152. {
  153. group_ops[i]->destroy_pipeline(opt_cpu);
  154. delete group_ops[i];
  155. }
  156. group_ops.clear();
  157. return 0;
  158. }
  159. int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
  160. {
  161. // convolv with NxN kernel
  162. // value = value + bias
  163. int w = bottom_blob.w;
  164. int h = bottom_blob.h;
  165. int channels = bottom_blob.c;
  166. size_t elemsize = bottom_blob.elemsize;
  167. if (channels % group != 0 || num_output % group != 0)
  168. {
  169. // reject invalid group
  170. return -100;
  171. }
  172. const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
  173. const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
  174. Mat bottom_blob_unbordered = bottom_blob;
  175. if (use_int8_inference && elemsize != 1)
  176. {
  177. Mat bottom_blob_int8;
  178. bottom_blob_int8.create(w, h, channels, (size_t)1u, opt.workspace_allocator);
  179. if (bottom_blob_int8.empty())
  180. return -100;
  181. const int channels_g = channels / group;
  182. // quantize, scale and round to nearest
  183. #pragma omp parallel for num_threads(opt.num_threads)
  184. for (int g=0; g<group; g++)
  185. {
  186. ncnn::Option opt_g = opt;
  187. opt_g.num_threads = 1;
  188. opt_g.blob_allocator = bottom_blob_int8.allocator;
  189. const Mat bottom_blob_g = bottom_blob.channel_range(channels_g * g, channels_g);
  190. Mat bottom_blob_int8_g = bottom_blob_int8.channel_range(channels_g * g, channels_g);
  191. quantize_ops[g]->forward(bottom_blob_g, bottom_blob_int8_g, opt_g);
  192. }
  193. bottom_blob_unbordered = bottom_blob_int8;
  194. }
  195. Mat bottom_blob_bordered = bottom_blob_unbordered;
  196. if (pad_w > 0 || pad_h > 0)
  197. {
  198. copy_make_border(bottom_blob_unbordered, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads);
  199. if (bottom_blob_bordered.empty())
  200. return -100;
  201. w = bottom_blob_bordered.w;
  202. h = bottom_blob_bordered.h;
  203. }
  204. else if (pad_w == -233 && pad_h == -233)
  205. {
  206. int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w;
  207. int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h;
  208. if (wpad > 0 || hpad > 0)
  209. {
  210. copy_make_border(bottom_blob_unbordered, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads);
  211. if (bottom_blob_bordered.empty())
  212. return -100;
  213. }
  214. w = bottom_blob_bordered.w;
  215. h = bottom_blob_bordered.h;
  216. }
  217. int outw = (w - kernel_extent_w) / stride_w + 1;
  218. int outh = (h - kernel_extent_h) / stride_h + 1;
  219. // int8
  220. if (use_int8_inference)
  221. {
  222. if (use_int8_requantize)
  223. {
  224. Mat top_blob_tm;
  225. top_blob_tm.create(outw, outh, num_output, (size_t)4u, opt.workspace_allocator);
  226. if (top_blob_tm.empty())
  227. return -100;
  228. top_blob.create(outw, outh, num_output, (size_t)1u, opt.blob_allocator);
  229. if (top_blob.empty())
  230. return -100;
  231. // depth-wise
  232. if (channels == group && group == num_output)
  233. {
  234. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1)
  235. {
  236. if ((stride_w == 1 && stride_h == 1) || (stride_w == 2 && stride_h == 2))
  237. {
  238. if (stride_w == 1 && stride_h == 1)
  239. {
  240. convdw3x3s1_int8_requant_neon(bottom_blob_bordered, top_blob, weight_data, bias_data, requantize_scales, opt);
  241. }
  242. else if (stride_w == 2 && stride_h == 2)
  243. {
  244. convdw3x3s2_int8_requant_neon(bottom_blob_bordered, top_blob, weight_data, bias_data, requantize_scales, opt);
  245. }
  246. return 0;
  247. }
  248. }
  249. #pragma omp parallel for num_threads(opt.num_threads)
  250. for (int g=0; g<group; g++)
  251. {
  252. const Mat bottom_blob_bordered_g = bottom_blob_bordered.channel_range(g, 1);
  253. Mat top_blob_tm_g = top_blob_tm.channel_range(g, 1);
  254. const ncnn::Layer* op = group_ops[g];
  255. ncnn::Option opt_g = opt;
  256. opt_g.num_threads = 1;
  257. opt_g.blob_allocator = top_blob.allocator;
  258. // forward
  259. op->forward(bottom_blob_bordered_g, top_blob_tm_g, opt_g);
  260. }
  261. return 0;
  262. }
  263. const int channels_g = channels / group;
  264. const int num_output_g = num_output / group;
  265. #pragma omp parallel for num_threads(opt.num_threads)
  266. for (int g=0; g<group; g++)
  267. {
  268. const Mat bottom_blob_bordered_g = bottom_blob_bordered.channel_range(channels_g * g, channels_g);
  269. Mat top_blob_tm_g = top_blob_tm.channel_range(num_output_g * g, num_output_g);
  270. const ncnn::Layer* op = group_ops[g];
  271. ncnn::Option opt_g = opt;
  272. opt_g.blob_allocator = top_blob.allocator;
  273. // forward
  274. op->forward(bottom_blob_bordered_g, top_blob_tm_g, opt_g);
  275. }
  276. }
  277. else
  278. {
  279. top_blob.create(outw, outh, num_output, (size_t)4u, opt.blob_allocator);
  280. if (top_blob.empty())
  281. return -100;
  282. // depth-wise
  283. if (channels == group && group == num_output)
  284. {
  285. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1)
  286. {
  287. if ((stride_w == 1 && stride_h == 1) || (stride_w == 2 && stride_h == 2))
  288. {
  289. if (stride_w == 1 && stride_h == 1)
  290. {
  291. convdw3x3s1_int8_neon(bottom_blob_bordered, top_blob, weight_data, opt);
  292. }
  293. else if (stride_w == 2 && stride_h == 2)
  294. {
  295. convdw3x3s2_int8_neon(bottom_blob_bordered, top_blob, weight_data, opt);
  296. }
  297. // dequantize, reverse scale inplace
  298. #pragma omp parallel for num_threads(opt.num_threads)
  299. for (int g=0; g<group; g++)
  300. {
  301. ncnn::Option opt_g = opt;
  302. opt_g.num_threads = 1;
  303. opt_g.blob_allocator = top_blob.allocator;
  304. Mat top_blob_g = top_blob.channel(g);
  305. dequantize_ops[g]->forward_inplace(top_blob_g, opt_g);
  306. }
  307. return 0;
  308. }
  309. }
  310. #pragma omp parallel for num_threads(opt.num_threads)
  311. for (int g=0; g<group; g++)
  312. {
  313. const Mat bottom_blob_bordered_g = bottom_blob_bordered.channel_range(g, 1);
  314. Mat top_blob_g = top_blob.channel_range(g, 1);
  315. const ncnn::Layer* op = group_ops[g];
  316. ncnn::Option opt_g = opt;
  317. opt_g.num_threads = 1;
  318. opt_g.blob_allocator = top_blob.allocator;
  319. // forward
  320. op->forward(bottom_blob_bordered_g, top_blob_g, opt_g);
  321. }
  322. return 0;
  323. }
  324. const int channels_g = channels / group;
  325. const int num_output_g = num_output / group;
  326. #pragma omp parallel for num_threads(opt.num_threads)
  327. for (int g=0; g<group; g++)
  328. {
  329. const Mat bottom_blob_bordered_g = bottom_blob_bordered.channel_range(channels_g * g, channels_g);
  330. Mat top_blob_g = top_blob.channel_range(num_output_g * g, num_output_g);
  331. const ncnn::Layer* op = group_ops[g];
  332. ncnn::Option opt_g = opt;
  333. opt_g.blob_allocator = top_blob.allocator;
  334. // forward
  335. op->forward(bottom_blob_bordered_g, top_blob_g, opt_g);
  336. }
  337. }
  338. return 0;
  339. }
  340. // float32
  341. top_blob.create(outw, outh, num_output, elemsize, opt.blob_allocator);
  342. if (top_blob.empty())
  343. return -100;
  344. // depth-wise
  345. if (channels == group && group == num_output)
  346. {
  347. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1)
  348. {
  349. if (stride_w == 1 && stride_h == 1)
  350. {
  351. convdw3x3s1_neon(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
  352. }
  353. else if (stride_w == 2 && stride_h == 2)
  354. {
  355. convdw3x3s2_neon(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
  356. }
  357. if (activation)
  358. {
  359. activation->forward_inplace(top_blob, opt);
  360. }
  361. return 0;
  362. }
  363. if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1)
  364. {
  365. if (stride_w == 1 && stride_h == 1)
  366. {
  367. convdw5x5s1_neon(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
  368. }
  369. else if (stride_w == 2 && stride_h == 2)
  370. {
  371. convdw5x5s2_neon(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
  372. }
  373. if (activation)
  374. {
  375. activation->forward_inplace(top_blob, opt);
  376. }
  377. return 0;
  378. }
  379. #pragma omp parallel for num_threads(opt.num_threads)
  380. for (int g=0; g<group; g++)
  381. {
  382. const Mat bottom_blob_bordered_g = bottom_blob_bordered.channel_range(g, 1);
  383. Mat top_blob_g = top_blob.channel_range(g, 1);
  384. const ncnn::Layer* op = group_ops[g];
  385. ncnn::Option opt_g = opt;
  386. opt_g.num_threads = 1;
  387. opt_g.blob_allocator = top_blob.allocator;
  388. // forward
  389. op->forward(bottom_blob_bordered_g, top_blob_g, opt_g);
  390. }
  391. if (activation)
  392. {
  393. activation->forward_inplace(top_blob, opt);
  394. }
  395. return 0;
  396. }
  397. const int channels_g = channels / group;
  398. const int num_output_g = num_output / group;
  399. for (int g=0; g<group; g++)
  400. {
  401. const Mat bottom_blob_bordered_g = bottom_blob_bordered.channel_range(channels_g * g, channels_g);
  402. Mat top_blob_g = top_blob.channel_range(num_output_g * g, num_output_g);
  403. const ncnn::Layer* op = group_ops[g];
  404. ncnn::Option opt_g = opt;
  405. opt_g.blob_allocator = top_blob.allocator;
  406. // forward
  407. op->forward(bottom_blob_bordered_g, top_blob_g, opt_g);
  408. }
  409. if (activation)
  410. {
  411. activation->forward_inplace(top_blob, opt);
  412. }
  413. return 0;
  414. }
  415. } // namespace ncnn