You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

convolution_x86.cpp 18 kB

7 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578
  1. // Tencent is pleased to support the open source community by making ncnn available.
  2. //
  3. // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
  4. //
  5. // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6. // in compliance with the License. You may obtain a copy of the License at
  7. //
  8. // https://opensource.org/licenses/BSD-3-Clause
  9. //
  10. // Unless required by applicable law or agreed to in writing, software distributed
  11. // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
  12. // CONDITIONS OF ANY KIND, either express or implied. See the License for the
  13. // specific language governing permissions and limitations under the License.
  14. #include "convolution_x86.h"
  15. #include "platform.h"
  16. #if NCNN_AVX2
  17. #include <immintrin.h>
  18. #endif
  19. #include "layer_type.h"
  20. #include "benchmark.h"
  21. namespace ncnn {
  22. #include "convolution_sgemm.h"
  23. #include "convolution_1x1.h"
  24. #include "convolution_3x3.h"
  25. #include "convolution_5x5.h"
  26. #include "convolution_7x7.h"
  27. #include "convolution_sgemm_int8.h"
  28. #include "convolution_1x1_int8.h"
  29. #include "convolution_3x3_int8.h"
  30. #include "convolution_5x5_int8.h"
  31. #include "convolution_7x7_int8.h"
  32. DEFINE_LAYER_CREATOR(Convolution_x86)
  33. Convolution_x86::Convolution_x86()
  34. {
  35. activation = 0;
  36. }
  37. int Convolution_x86::create_pipeline(const Option& opt)
  38. {
  39. Option opt_cpu = opt;
  40. opt_cpu.use_vulkan_compute = false;
  41. if (activation_type == 1)
  42. {
  43. activation = ncnn::create_layer(ncnn::LayerType::ReLU);
  44. ncnn::ParamDict pd;
  45. activation->load_param(pd);
  46. }
  47. else if (activation_type == 2)
  48. {
  49. activation = ncnn::create_layer(ncnn::LayerType::ReLU);
  50. ncnn::ParamDict pd;
  51. pd.set(0, activation_params[0]);// slope
  52. activation->load_param(pd);
  53. }
  54. else if (activation_type == 3)
  55. {
  56. activation = ncnn::create_layer(ncnn::LayerType::Clip);
  57. ncnn::ParamDict pd;
  58. pd.set(0, activation_params[0]);// min
  59. pd.set(1, activation_params[1]);// max
  60. activation->load_param(pd);
  61. }
  62. else if (activation_type == 4)
  63. {
  64. activation = ncnn::create_layer(ncnn::LayerType::Sigmoid);
  65. ncnn::ParamDict pd;
  66. activation->load_param(pd);
  67. }
  68. if (activation)
  69. {
  70. activation->create_pipeline(opt_cpu);
  71. }
  72. use_winograd3x3 = false;
  73. if (opt.use_winograd_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  74. {
  75. int num_input = weight_data_size / 9 / num_output;
  76. // winograd is slow on small channel count
  77. if(num_input >= 16 && num_output >= 16)
  78. use_winograd3x3 = true;
  79. }
  80. if (use_winograd3x3)
  81. {
  82. int num_input = weight_data_size / 9 / num_output;
  83. if (use_int8_inference)
  84. // conv3x3s1_winograd23_transform_kernel_int8_sse(weight_data, weight_3x3_winograd23_data, num_input, num_output);
  85. conv3x3s1_winograd43_transform_kernel_int8_sse(weight_data, weight_3x3_winograd23_data, num_input, num_output);
  86. else
  87. // conv3x3s1_winograd23_transform_kernel_sse(weight_data, weight_3x3_winograd23_data, num_input, num_output);
  88. conv3x3s1_winograd43_transform_kernel_sse(weight_data, weight_3x3_winograd43_data, num_input, num_output);
  89. }
  90. if (use_int8_inference == false)
  91. {
  92. int kernel_size = kernel_w * kernel_h;
  93. int num_input = weight_data_size / kernel_size / num_output;
  94. conv_im2col_sgemm_transform_kernel_sse(weight_data, weight_sgemm_data, num_input, num_output, kernel_size);
  95. }
  96. return 0;
  97. }
  98. int Convolution_x86::destroy_pipeline(const Option& opt)
  99. {
  100. Option opt_cpu = opt;
  101. opt_cpu.use_vulkan_compute = false;
  102. if (activation)
  103. {
  104. activation->destroy_pipeline(opt_cpu);
  105. delete activation;
  106. activation = 0;
  107. }
  108. return 0;
  109. }
  110. int Convolution_x86::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv_func conv, const Option& opt) const
  111. {
  112. int w = bottom_blob.w;
  113. int h = bottom_blob.h;
  114. size_t elemsize = bottom_blob.elemsize;
  115. const int kernel_size = kernel_w;
  116. const int stride = stride_w;
  117. const int dilation = dilation_w;
  118. const int kernel_extent = dilation * (kernel_size - 1) + 1;
  119. Mat bottom_blob_bordered = bottom_blob;
  120. if (pad_w > 0 || pad_h > 0)
  121. {
  122. copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads);
  123. if (bottom_blob_bordered.empty())
  124. return -100;
  125. w = bottom_blob_bordered.w;
  126. h = bottom_blob_bordered.h;
  127. }
  128. else if (pad_w == -233 && pad_h == -233)
  129. {
  130. int wpad = kernel_extent + (w - 1) / stride * stride - w;
  131. int hpad = kernel_extent + (h - 1) / stride * stride - h;
  132. if (wpad > 0 || hpad > 0)
  133. {
  134. copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads);
  135. if (bottom_blob_bordered.empty())
  136. return -100;
  137. }
  138. w = bottom_blob_bordered.w;
  139. h = bottom_blob_bordered.h;
  140. }
  141. int outw = (w - kernel_extent) / stride + 1;
  142. int outh = (h - kernel_extent) / stride + 1;
  143. top_blob.create(outw, outh, num_output, elemsize, opt.blob_allocator);
  144. if (top_blob.empty())
  145. return -100;
  146. // Make (dilation * dilation) batches
  147. Mat inner_bottom_blob;
  148. Mat inner_top_blob;
  149. for (int x = 0; x < dilation; x ++)
  150. {
  151. for (int y = 0; y < dilation; y ++)
  152. {
  153. int inner_w = (w - y + dilation - 1) / dilation;
  154. int inner_h = (h - x + dilation - 1) / dilation;
  155. int inner_outw = (inner_w - kernel_size) / stride + 1;
  156. int inner_outh = (inner_h - kernel_size) / stride + 1;
  157. inner_bottom_blob.create(inner_w, inner_h, bottom_blob.c, elemsize, opt.workspace_allocator);
  158. if (inner_bottom_blob.empty())
  159. return -100;
  160. inner_top_blob.create(inner_outw, inner_outh, num_output, elemsize, opt.workspace_allocator);
  161. if (inner_top_blob.empty())
  162. return -100;
  163. #pragma omp parallel for num_threads(opt.num_threads)
  164. for (int c = 0; c < bottom_blob.c; c ++)
  165. {
  166. float *outptr = inner_bottom_blob.channel(c);
  167. for (int i = 0; i < inner_h; i ++)
  168. {
  169. const float* ptr = (const float *)bottom_blob_bordered.channel(c) + dilation * i * w + x * w + y;
  170. for (int j = 0; j < inner_w; j ++)
  171. {
  172. outptr[j] = ptr[j*dilation];
  173. }
  174. outptr += inner_w;
  175. }
  176. }
  177. ncnn::Option opt_g = opt;
  178. opt_g.blob_allocator = inner_top_blob.allocator;
  179. conv(inner_bottom_blob, inner_top_blob, weight_data, bias_data, opt_g);
  180. #pragma omp parallel for num_threads(opt.num_threads)
  181. for (int c = 0; c < num_output; c ++)
  182. {
  183. float *outptr = (float *)top_blob.channel(c) + x * outw + y;
  184. for (int i = 0; i < inner_outh; i ++)
  185. {
  186. const float* ptr = (const float *)inner_top_blob.channel(c) + i * inner_outw;
  187. for (int j = 0; j < inner_outw; j ++)
  188. {
  189. outptr[j*dilation] = ptr[j];
  190. }
  191. outptr += dilation * outw;
  192. }
  193. }
  194. }
  195. }
  196. return 0;
  197. }
  198. int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
  199. {
  200. // convolv with NxN kernel
  201. // value = value + bias
  202. if (bottom_blob.dims != 3)
  203. {
  204. return Convolution::forward(bottom_blob, top_blob, opt);
  205. }
  206. if (kernel_w != kernel_h || stride_w != stride_h)
  207. {
  208. return Convolution::forward(bottom_blob, top_blob, opt);
  209. }
  210. const int kernel_size = kernel_w;
  211. const int stride = stride_w;
  212. if (kernel_size > 7 || stride > 7 || dilation_w != dilation_h)
  213. {
  214. return Convolution::forward(bottom_blob, top_blob, opt);
  215. }
  216. typedef void (*conv_func)(const Mat&, Mat&, const Mat&, const Mat&, const Option&);
  217. // kernel_size x stride
  218. conv_func conv_func_table[7][4] =
  219. {
  220. {
  221. conv1x1s1_sse,
  222. conv1x1s2_sse,
  223. 0,
  224. 0
  225. }, // kernel_size = 1
  226. {
  227. 0,
  228. 0,
  229. 0,
  230. 0
  231. }, // kernel_size = 2
  232. {
  233. conv3x3s1_sse,
  234. conv3x3s2_sse,
  235. 0,
  236. 0
  237. }, // kernel_size = 3
  238. {
  239. 0,
  240. 0,
  241. 0,
  242. 0
  243. }, // kernel_size = 4
  244. {
  245. conv5x5s1_sse,
  246. conv5x5s2_sse,
  247. 0,
  248. 0
  249. }, // kernel_size = 5
  250. {
  251. 0,
  252. 0,
  253. 0,
  254. 0
  255. }, // kernel_size = 6
  256. {
  257. conv7x7s1_sse,
  258. conv7x7s2_sse,
  259. 0,
  260. 0
  261. } // kernel_size = 7
  262. };
  263. typedef void (*conv_int8_dequant_func)(const Mat&, Mat&, const Mat&, const Mat&, std::vector<float>, const Option&);
  264. typedef void (*conv_int8_requant_func)(const Mat&, Mat&, const Mat&, const Mat&, std::vector<float>, const Option&);
  265. // kernel_size x stride
  266. conv_int8_dequant_func conv_int8_dequant_func_table[7][4] =
  267. {
  268. {
  269. conv1x1s1_int8_dequant_sse,
  270. conv1x1s2_int8_dequant_sse,
  271. 0,
  272. 0
  273. }, // kernel_size = 1
  274. {
  275. 0,
  276. 0,
  277. 0,
  278. 0
  279. }, // kernel_size = 2
  280. {
  281. conv3x3s1_int8_dequant_sse,
  282. conv3x3s2_int8_dequant_sse,
  283. 0,
  284. 0,
  285. }, // kernel_size = 3
  286. {
  287. 0,
  288. 0,
  289. 0,
  290. 0
  291. }, // kernel_size = 4
  292. {
  293. conv5x5s1_int8_dequant_sse,
  294. conv5x5s2_int8_dequant_sse,
  295. 0,
  296. 0
  297. }, // kernel_size = 5
  298. {
  299. 0,
  300. 0,
  301. 0,
  302. 0
  303. }, // kernel_size = 6
  304. {
  305. conv7x7s1_int8_dequant_sse,
  306. conv7x7s2_int8_dequant_sse,
  307. 0,
  308. 0
  309. } // kernel_size = 7
  310. };
  311. conv_int8_requant_func conv_int8_requant_func_table[7][4] =
  312. {
  313. {
  314. conv1x1s1_int8_requant_sse,
  315. conv1x1s2_int8_requant_sse,
  316. 0,
  317. 0
  318. }, // kernel_size = 1
  319. {
  320. 0,
  321. 0,
  322. 0,
  323. 0
  324. }, // kernel_size = 2
  325. {
  326. conv3x3s1_int8_requant_sse,
  327. conv3x3s2_int8_requant_sse,
  328. 0,
  329. 0,
  330. }, // kernel_size = 3
  331. {
  332. 0,
  333. 0,
  334. 0,
  335. 0
  336. }, // kernel_size = 4
  337. {
  338. conv5x5s1_int8_requant_sse,
  339. conv5x5s2_int8_requant_sse,
  340. 0,
  341. 0
  342. }, // kernel_size = 5
  343. {
  344. 0,
  345. 0,
  346. 0,
  347. 0
  348. }, // kernel_size = 6
  349. {
  350. conv7x7s1_int8_requant_sse,
  351. conv7x7s2_int8_requant_sse,
  352. 0,
  353. 0
  354. } // kernel_size = 7
  355. };
  356. conv_func conv = 0;
  357. conv_int8_dequant_func conv_int8_dequant = 0;
  358. conv_int8_requant_func conv_int8_requant = 0;
  359. if (use_int8_inference)
  360. {
  361. if (use_int8_requantize)
  362. conv_int8_requant = conv_int8_requant_func_table[kernel_size-1][stride-1];
  363. else
  364. conv_int8_dequant = conv_int8_dequant_func_table[kernel_size-1][stride-1];
  365. if ((!conv_int8_requant) && (!conv_int8_dequant))
  366. {
  367. return Convolution::forward(bottom_blob, top_blob, opt);
  368. }
  369. }
  370. else
  371. {
  372. conv = conv_func_table[kernel_size-1][stride-1];
  373. if (!conv)
  374. {
  375. return Convolution::forward(bottom_blob, top_blob, opt);
  376. }
  377. if (dilation_w != 1)
  378. {
  379. if (stride != 1)
  380. return Convolution::forward(bottom_blob, top_blob, opt);
  381. return forwardDilation(bottom_blob, top_blob, conv, opt);
  382. }
  383. }
  384. int w = bottom_blob.w;
  385. int h = bottom_blob.h;
  386. int channels = bottom_blob.c;
  387. size_t elemsize = bottom_blob.elemsize;
  388. Mat bottom_blob_unbordered = bottom_blob;
  389. if (use_int8_inference && elemsize != 1)
  390. {
  391. Mat bottom_blob_int8;
  392. bottom_blob_int8.create(w, h, channels, (size_t)1u, opt.workspace_allocator);
  393. if (bottom_blob_int8.empty())
  394. return -100;
  395. // quantize, scale and round to nearest
  396. {
  397. ncnn::Option opt_g = opt;
  398. opt_g.blob_allocator = bottom_blob_int8.allocator;
  399. quantize->forward(bottom_blob, bottom_blob_int8, opt_g);
  400. }
  401. bottom_blob_unbordered = bottom_blob_int8;
  402. }
  403. Mat bottom_blob_bordered = bottom_blob_unbordered;
  404. if (pad_w > 0 || pad_h > 0)
  405. {
  406. copy_make_border(bottom_blob_unbordered, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads);
  407. if (bottom_blob_bordered.empty())
  408. return -100;
  409. w = bottom_blob_bordered.w;
  410. h = bottom_blob_bordered.h;
  411. }
  412. else if (pad_w == -233 && pad_h == -233)
  413. {
  414. int wpad = kernel_size + (w - 1) / stride * stride - w;
  415. int hpad = kernel_size + (h - 1) / stride * stride - h;
  416. if (wpad > 0 || hpad > 0)
  417. {
  418. copy_make_border(bottom_blob_unbordered, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads);
  419. if (bottom_blob_bordered.empty())
  420. return -100;
  421. }
  422. w = bottom_blob_bordered.w;
  423. h = bottom_blob_bordered.h;
  424. }
  425. int outw = (w - kernel_size) / stride + 1;
  426. int outh = (h - kernel_size) / stride + 1;
  427. // int8
  428. if (use_int8_inference)
  429. {
  430. if (use_int8_requantize == true)
  431. {
  432. Mat top_blob_tm;
  433. top_blob_tm.create(outw, outh, num_output, (size_t)4u, opt.workspace_allocator);
  434. if (top_blob_tm.empty())
  435. return -100;
  436. top_blob.create(outw, outh, num_output, (size_t)1u, opt.blob_allocator);
  437. if (top_blob.empty())
  438. return -100;
  439. if (use_winograd3x3)
  440. {
  441. // conv3x3s1_winograd23_int8_sse(bottom_blob_bordered, top_blob_tm, weight_3x3_winograd23_data, opt);
  442. conv3x3s1_winograd43_int8_sse(bottom_blob_bordered, top_blob_tm, weight_3x3_winograd23_data, opt);
  443. // requantize, reverse scale inplace
  444. #pragma omp parallel for num_threads(opt.num_threads)
  445. for (int p=0; p<num_output; p++)
  446. {
  447. ncnn::Option opt_g = opt;
  448. opt_g.num_threads = 1;
  449. opt_g.blob_allocator = top_blob.allocator;
  450. Mat top_blob_tm_g = top_blob_tm.channel_range(p, 1);
  451. Mat top_blob_g = top_blob.channel_range(p, 1);
  452. requantize_ops[p]->forward(top_blob_tm_g, top_blob_g, opt_g);
  453. }
  454. }
  455. else
  456. conv_int8_requant(bottom_blob_bordered, top_blob, weight_data, bias_data, requantize_scales, opt);
  457. }
  458. else
  459. {
  460. top_blob.create(outw, outh, num_output, (size_t)4u, opt.blob_allocator);
  461. if (top_blob.empty())
  462. return -100;
  463. if (use_winograd3x3)
  464. {
  465. // conv3x3s1_winograd23_int8_sse(bottom_blob_bordered, top_blob, weight_3x3_winograd23_data, opt);
  466. conv3x3s1_winograd43_int8_sse(bottom_blob_bordered, top_blob, weight_3x3_winograd23_data, opt);
  467. // dequantize, reverse scale inplace
  468. #pragma omp parallel for num_threads(opt.num_threads)
  469. for (int p=0; p<num_output; p++)
  470. {
  471. ncnn::Option opt_g = opt;
  472. opt_g.num_threads = 1;
  473. opt_g.blob_allocator = top_blob.allocator;
  474. Mat top_blob_g = top_blob.channel_range(p, 1);
  475. dequantize_ops[p]->forward_inplace(top_blob_g, opt_g);
  476. }
  477. }
  478. else
  479. conv_int8_dequant(bottom_blob_bordered, top_blob, weight_data, bias_data, dequantize_scales, opt);
  480. }
  481. return 0;
  482. }
  483. // float32
  484. top_blob.create(outw, outh, num_output, elemsize, opt.blob_allocator);
  485. if (top_blob.empty())
  486. return -100;
  487. if (use_winograd3x3 && outw >= 8 && outh >=8)
  488. {
  489. // conv3x3s1_winograd23_sse(bottom_blob_bordered, top_blob, weight_3x3_winograd23_data, bias_data, opt);
  490. conv3x3s1_winograd43_sse(bottom_blob_bordered, top_blob, weight_3x3_winograd43_data, bias_data, opt);
  491. }
  492. else
  493. //conv(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
  494. conv_im2col_sgemm_sse(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, stride_w, stride_h, opt);
  495. if (activation)
  496. {
  497. activation->forward_inplace(top_blob, opt);
  498. }
  499. return 0;
  500. }
  501. } // namespace ncnn