You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

convolutiondepthwise_vulkan.cpp 23 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668
  1. // Tencent is pleased to support the open source community by making ncnn available.
  2. //
  3. // Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
  4. //
  5. // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6. // in compliance with the License. You may obtain a copy of the License at
  7. //
  8. // https://opensource.org/licenses/BSD-3-Clause
  9. //
  10. // Unless required by applicable law or agreed to in writing, software distributed
  11. // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
  12. // CONDITIONS OF ANY KIND, either express or implied. See the License for the
  13. // specific language governing permissions and limitations under the License.
  14. #include "convolutiondepthwise_vulkan.h"
  15. #include <algorithm>
  16. #include "layer_type.h"
  17. namespace ncnn {
  18. DEFINE_LAYER_CREATOR(ConvolutionDepthWise_vulkan)
  19. ConvolutionDepthWise_vulkan::ConvolutionDepthWise_vulkan()
  20. {
  21. support_vulkan = true;
  22. padding = 0;
  23. packing_pack1 = 0;
  24. packing_pack4 = 0;
  25. pipeline_convolutiondepthwise = 0;
  26. pipeline_convolutiondepthwise_pack4 = 0;
  27. pipeline_convolutiondepthwise_group = 0;
  28. pipeline_convolutiondepthwise_group_pack4 = 0;
  29. pipeline_convolutiondepthwise_group_pack1to4 = 0;
  30. pipeline_convolutiondepthwise_group_pack4to1 = 0;
  31. }
  32. int ConvolutionDepthWise_vulkan::create_pipeline(const Option& opt)
  33. {
  34. {
  35. padding = ncnn::create_layer(ncnn::LayerType::Padding);
  36. padding->vkdev = vkdev;
  37. ncnn::ParamDict pd;
  38. pd.set(0, pad_top);
  39. pd.set(1, pad_bottom);
  40. pd.set(2, pad_left);
  41. pd.set(3, pad_right);
  42. pd.set(4, 0);
  43. pd.set(5, 0.f);
  44. padding->load_param(pd);
  45. padding->create_pipeline(opt);
  46. }
  47. std::vector<vk_specialization_type> specializations(11);
  48. specializations[0].i = kernel_w;
  49. specializations[1].i = kernel_h;
  50. specializations[2].i = dilation_w;
  51. specializations[3].i = dilation_h;
  52. specializations[4].i = stride_w;
  53. specializations[5].i = stride_h;
  54. specializations[6].i = bias_term;
  55. specializations[7].i = group;
  56. specializations[8].i = activation_type;
  57. specializations[9].f = activation_params.w == 1 ? activation_params[0] : 0.f;
  58. specializations[10].f = activation_params.w == 2 ? activation_params[1] : 0.f;
  59. const int maxk = kernel_w * kernel_h;
  60. int channels = (weight_data_size / group) / maxk / (num_output / group) * group;
  61. // depth-wise
  62. if (channels == group && group == num_output)
  63. {
  64. // pack1
  65. if (num_output % 4 != 0)
  66. {
  67. pipeline_convolutiondepthwise = new Pipeline(vkdev);
  68. pipeline_convolutiondepthwise->set_optimal_local_size_xyz(32, 32, num_output);
  69. pipeline_convolutiondepthwise->create("convolutiondepthwise", opt, specializations, 4, 10);
  70. }
  71. // pack4
  72. if (num_output % 4 == 0)
  73. {
  74. pipeline_convolutiondepthwise_pack4 = new Pipeline(vkdev);
  75. pipeline_convolutiondepthwise_pack4->set_optimal_local_size_xyz(32, 32, std::max(1, num_output / 4));
  76. pipeline_convolutiondepthwise_pack4->create("convolutiondepthwise_pack4", opt, specializations, 4, 10);
  77. }
  78. return 0;
  79. }
  80. // group convolution
  81. const int channels_g = channels / group;
  82. const int num_output_g = num_output / group;
  83. // pack1
  84. if (channels_g % 4 != 0 && num_output_g % 4 != 0)
  85. {
  86. pipeline_convolutiondepthwise_group = new Pipeline(vkdev);
  87. pipeline_convolutiondepthwise_group->set_optimal_local_size_xyz(32, 32, std::max(1, num_output / 8));
  88. pipeline_convolutiondepthwise_group->create("convolutiondepthwise_group", opt, specializations, 4, 10);
  89. }
  90. // pack4
  91. if (channels_g % 4 == 0 && num_output_g % 4 == 0)
  92. {
  93. pipeline_convolutiondepthwise_group_pack4 = new Pipeline(vkdev);
  94. pipeline_convolutiondepthwise_group_pack4->set_optimal_local_size_xyz(32, 32, std::max(1, num_output / 8));
  95. pipeline_convolutiondepthwise_group_pack4->create("convolutiondepthwise_group_pack4", opt, specializations, 4, 10);
  96. }
  97. // pack1to4
  98. if (channels_g % 4 != 0 && num_output_g % 4 == 0)
  99. {
  100. pipeline_convolutiondepthwise_group_pack1to4 = new Pipeline(vkdev);
  101. pipeline_convolutiondepthwise_group_pack1to4->set_optimal_local_size_xyz(32, 32, std::max(1, num_output / 8));
  102. pipeline_convolutiondepthwise_group_pack1to4->create("convolutiondepthwise_group_pack1to4", opt, specializations, 4, 10);
  103. }
  104. // pack4to1
  105. if (channels_g % 4 == 0 && num_output_g % 4 != 0)
  106. {
  107. pipeline_convolutiondepthwise_group_pack4to1 = new Pipeline(vkdev);
  108. pipeline_convolutiondepthwise_group_pack4to1->set_optimal_local_size_xyz(32, 32, std::max(1, num_output / 8));
  109. pipeline_convolutiondepthwise_group_pack4to1->create("convolutiondepthwise_group_pack4to1", opt, specializations, 4, 10);
  110. }
  111. if (channels % 4 == 0 && channels_g % 4 != 0)
  112. {
  113. packing_pack1 = ncnn::create_layer(ncnn::LayerType::Packing);
  114. packing_pack1->vkdev = vkdev;
  115. ncnn::ParamDict pd;
  116. pd.set(0, 1);
  117. packing_pack1->load_param(pd);
  118. packing_pack1->create_pipeline(opt);
  119. }
  120. if (num_output_g % 4 != 0 && num_output % 4 == 0)
  121. {
  122. packing_pack4 = ncnn::create_layer(ncnn::LayerType::Packing);
  123. packing_pack4->vkdev = vkdev;
  124. ncnn::ParamDict pd;
  125. pd.set(0, 4);
  126. packing_pack4->load_param(pd);
  127. packing_pack4->create_pipeline(opt);
  128. }
  129. return 0;
  130. }
  131. int ConvolutionDepthWise_vulkan::destroy_pipeline(const Option& opt)
  132. {
  133. if (padding)
  134. {
  135. padding->destroy_pipeline(opt);
  136. delete padding;
  137. padding = 0;
  138. }
  139. if (packing_pack1)
  140. {
  141. packing_pack1->destroy_pipeline(opt);
  142. delete packing_pack1;
  143. packing_pack1 = 0;
  144. }
  145. if (packing_pack4)
  146. {
  147. packing_pack4->destroy_pipeline(opt);
  148. delete packing_pack4;
  149. packing_pack4 = 0;
  150. }
  151. delete pipeline_convolutiondepthwise;
  152. pipeline_convolutiondepthwise = 0;
  153. delete pipeline_convolutiondepthwise_pack4;
  154. pipeline_convolutiondepthwise_pack4 = 0;
  155. delete pipeline_convolutiondepthwise_group;
  156. pipeline_convolutiondepthwise_group = 0;
  157. delete pipeline_convolutiondepthwise_group_pack4;
  158. pipeline_convolutiondepthwise_group_pack4 = 0;
  159. delete pipeline_convolutiondepthwise_group_pack1to4;
  160. pipeline_convolutiondepthwise_group_pack1to4 = 0;
  161. delete pipeline_convolutiondepthwise_group_pack4to1;
  162. pipeline_convolutiondepthwise_group_pack4to1 = 0;
  163. return 0;
  164. }
  165. int ConvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
  166. {
  167. const int maxk = kernel_w * kernel_h;
  168. int channels = (weight_data_size / group) / maxk / (num_output / group) * group;
  169. // depth-wise
  170. if (channels == group && group == num_output)
  171. {
  172. // pack1
  173. if (num_output % 4 != 0)
  174. {
  175. cmd.record_upload(weight_data, weight_data_gpu, opt);
  176. }
  177. // pack4
  178. if (num_output % 4 == 0)
  179. {
  180. Mat weight_data_pack4;
  181. Mat weight_data_r2 = weight_data.reshape(maxk, group);
  182. convert_packing(weight_data_r2, weight_data_pack4, 4);
  183. cmd.record_upload(weight_data_pack4, weight_data_gpu_pack4, opt);
  184. }
  185. if (bias_term)
  186. {
  187. if (num_output % 4 != 0)
  188. {
  189. cmd.record_upload(bias_data, bias_data_gpu, opt);
  190. }
  191. if (num_output % 4 == 0)
  192. {
  193. Mat bias_data_pack4;
  194. convert_packing(bias_data, bias_data_pack4, 4);
  195. cmd.record_upload(bias_data_pack4, bias_data_gpu_pack4, opt);
  196. }
  197. }
  198. return 0;
  199. }
  200. // group convolution
  201. const int channels_g = channels / group;
  202. const int num_output_g = num_output / group;
  203. // pack1
  204. if (channels_g % 4 != 0 && num_output_g % 4 != 0)
  205. {
  206. cmd.record_upload(weight_data, weight_data_gpu, opt);
  207. }
  208. // pack4
  209. if (channels_g % 4 == 0 && num_output_g % 4 == 0)
  210. {
  211. // src = kw-kh-inch-outch
  212. // dst = 4a-4b-kw-kh-inch/4a-outch/4b
  213. Mat weight_data_pack4_groups;
  214. {
  215. Mat weight_data_r2_groups = weight_data.reshape(maxk, channels_g, num_output_g * group);
  216. weight_data_pack4_groups.create(maxk, channels_g/4, num_output_g/4 * group, (size_t)4*16, 16);
  217. for (int g=0; g<group; g++)
  218. {
  219. const Mat weight_data_r2 = weight_data_r2_groups.channel_range(num_output_g * g, num_output_g);
  220. Mat weight_data_pack4 = weight_data_pack4_groups.channel_range(num_output_g/4 * g, num_output_g/4);
  221. for (int q=0; q+3<num_output_g; q+=4)
  222. {
  223. const Mat k0 = weight_data_r2.channel(q);
  224. const Mat k1 = weight_data_r2.channel(q+1);
  225. const Mat k2 = weight_data_r2.channel(q+2);
  226. const Mat k3 = weight_data_r2.channel(q+3);
  227. Mat g0 = weight_data_pack4.channel(q/4);
  228. for (int p=0; p+3<channels_g; p+=4)
  229. {
  230. const float* k00 = k0.row(p);
  231. const float* k01 = k0.row(p+1);
  232. const float* k02 = k0.row(p+2);
  233. const float* k03 = k0.row(p+3);
  234. const float* k10 = k1.row(p);
  235. const float* k11 = k1.row(p+1);
  236. const float* k12 = k1.row(p+2);
  237. const float* k13 = k1.row(p+3);
  238. const float* k20 = k2.row(p);
  239. const float* k21 = k2.row(p+1);
  240. const float* k22 = k2.row(p+2);
  241. const float* k23 = k2.row(p+3);
  242. const float* k30 = k3.row(p);
  243. const float* k31 = k3.row(p+1);
  244. const float* k32 = k3.row(p+2);
  245. const float* k33 = k3.row(p+3);
  246. float* g00 = g0.row(p/4);
  247. for (int k=0; k<maxk; k++)
  248. {
  249. g00[0] = k00[k];
  250. g00[1] = k01[k];
  251. g00[2] = k02[k];
  252. g00[3] = k03[k];
  253. g00[4] = k10[k];
  254. g00[5] = k11[k];
  255. g00[6] = k12[k];
  256. g00[7] = k13[k];
  257. g00[8] = k20[k];
  258. g00[9] = k21[k];
  259. g00[10] = k22[k];
  260. g00[11] = k23[k];
  261. g00[12] = k30[k];
  262. g00[13] = k31[k];
  263. g00[14] = k32[k];
  264. g00[15] = k33[k];
  265. g00 += 16;
  266. }
  267. }
  268. }
  269. }
  270. }
  271. cmd.record_upload(weight_data_pack4_groups, weight_data_gpu_pack4, opt);
  272. }
  273. // pack1to4
  274. if (channels_g % 4 != 0 && num_output_g % 4 == 0)
  275. {
  276. // src = kw-kh-inch-outch
  277. // dst = 4b-kw-kh-inch-outch/4b
  278. Mat weight_data_pack1to4_groups;
  279. {
  280. Mat weight_data_r2_groups = weight_data.reshape(maxk, channels_g, num_output_g * group);
  281. weight_data_pack1to4_groups.create(maxk, channels_g, num_output_g/4 * group, (size_t)4*4, 4);
  282. for (int g=0; g<group; g++)
  283. {
  284. const Mat weight_data_r2 = weight_data_r2_groups.channel_range(num_output_g * g, num_output_g);
  285. Mat weight_data_pack1to4 = weight_data_pack1to4_groups.channel_range(num_output_g/4 * g, num_output_g/4);
  286. for (int q=0; q+3<num_output_g; q+=4)
  287. {
  288. const Mat k0 = weight_data_r2.channel(q);
  289. const Mat k1 = weight_data_r2.channel(q+1);
  290. const Mat k2 = weight_data_r2.channel(q+2);
  291. const Mat k3 = weight_data_r2.channel(q+3);
  292. Mat g0 = weight_data_pack1to4.channel(q/4);
  293. for (int p=0; p<channels_g; p++)
  294. {
  295. const float* k00 = k0.row(p);
  296. const float* k10 = k1.row(p);
  297. const float* k20 = k2.row(p);
  298. const float* k30 = k3.row(p);
  299. float* g00 = g0.row(p);
  300. for (int k=0; k<maxk; k++)
  301. {
  302. g00[0] = k00[k];
  303. g00[1] = k10[k];
  304. g00[2] = k20[k];
  305. g00[3] = k30[k];
  306. g00 += 4;
  307. }
  308. }
  309. }
  310. }
  311. }
  312. cmd.record_upload(weight_data_pack1to4_groups, weight_data_gpu_pack1to4, opt);
  313. }
  314. // pack4to1
  315. if (channels_g % 4 == 0 && num_output_g % 4 != 0)
  316. {
  317. // src = kw-kh-inch-outch
  318. // dst = 4a-kw-kh-inch/4a-outch
  319. Mat weight_data_pack4to1_groups;
  320. {
  321. Mat weight_data_r2_groups = weight_data.reshape(maxk, channels_g, num_output_g * group);
  322. weight_data_pack4to1_groups.create(maxk, channels_g/4, num_output_g * group, (size_t)4*4, 4);
  323. for (int g=0; g<group; g++)
  324. {
  325. const Mat weight_data_r2 = weight_data_r2_groups.channel_range(num_output_g * g, num_output_g);
  326. Mat weight_data_pack4to1 = weight_data_pack4to1_groups.channel_range(num_output_g * g, num_output_g);
  327. for (int q=0; q<num_output_g; q++)
  328. {
  329. const Mat k0 = weight_data_r2.channel(q);
  330. Mat g0 = weight_data_pack4to1.channel(q);
  331. for (int p=0; p+3<channels_g; p+=4)
  332. {
  333. const float* k00 = k0.row(p);
  334. const float* k01 = k0.row(p+1);
  335. const float* k02 = k0.row(p+2);
  336. const float* k03 = k0.row(p+3);
  337. float* g00 = g0.row(p/4);
  338. for (int k=0; k<maxk; k++)
  339. {
  340. g00[0] = k00[k];
  341. g00[1] = k01[k];
  342. g00[2] = k02[k];
  343. g00[3] = k03[k];
  344. g00 += 4;
  345. }
  346. }
  347. }
  348. }
  349. }
  350. cmd.record_upload(weight_data_pack4to1_groups, weight_data_gpu_pack4to1, opt);
  351. }
  352. if (bias_term)
  353. {
  354. if (num_output_g % 4 != 0)
  355. {
  356. cmd.record_upload(bias_data, bias_data_gpu, opt);
  357. }
  358. if (num_output_g % 4 == 0)
  359. {
  360. Mat bias_data_pack4;
  361. convert_packing(bias_data, bias_data_pack4, 4);
  362. cmd.record_upload(bias_data_pack4, bias_data_gpu_pack4, opt);
  363. }
  364. }
  365. return 0;
  366. }
  367. int ConvolutionDepthWise_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const
  368. {
  369. int w = bottom_blob.w;
  370. int h = bottom_blob.h;
  371. int channels = bottom_blob.c;
  372. size_t elemsize = bottom_blob.elemsize;
  373. int elempack = bottom_blob.elempack;
  374. const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
  375. const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
  376. VkMat bottom_blob_bordered = bottom_blob;
  377. if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0)
  378. {
  379. ncnn::Option opt_pad = opt;
  380. opt_pad.blob_vkallocator = opt.workspace_vkallocator;
  381. padding->forward(bottom_blob, bottom_blob_bordered, cmd, opt_pad);
  382. }
  383. else if (pad_left == -233 && pad_right == -233 && pad_top == -233 && pad_bottom == -233)
  384. {
  385. int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w;
  386. int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h;
  387. if (wpad > 0 || hpad > 0)
  388. {
  389. ncnn::Option opt_pad = opt;
  390. opt_pad.blob_vkallocator = opt.workspace_vkallocator;
  391. VkMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator, opt.staging_vkallocator);
  392. padding_param_blob.prepare_staging_buffer();
  393. int* padding_params = padding_param_blob.mapped();
  394. padding_params[0] = hpad / 2;
  395. padding_params[1] = hpad - hpad / 2;
  396. padding_params[2] = wpad / 2;
  397. padding_params[3] = wpad - wpad / 2;
  398. std::vector<VkMat> padding_inputs(2);
  399. padding_inputs[0] = bottom_blob;
  400. padding_inputs[1] = padding_param_blob;
  401. std::vector<VkMat> padding_outputs(1);
  402. padding->forward(padding_inputs, padding_outputs, cmd, opt_pad);
  403. bottom_blob_bordered = padding_outputs[0];
  404. }
  405. }
  406. else if (pad_left == -234 && pad_right == -234 && pad_top == -234 && pad_bottom == -234)
  407. {
  408. int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w;
  409. int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h;
  410. if (wpad > 0 || hpad > 0)
  411. {
  412. ncnn::Option opt_pad = opt;
  413. opt_pad.blob_vkallocator = opt.workspace_vkallocator;
  414. VkMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator, opt.staging_vkallocator);
  415. padding_param_blob.prepare_staging_buffer();
  416. int* padding_params = padding_param_blob.mapped();
  417. padding_params[0] = hpad - hpad / 2;
  418. padding_params[1] = hpad / 2;
  419. padding_params[2] = wpad - wpad / 2;
  420. padding_params[3] = wpad / 2;
  421. std::vector<VkMat> padding_inputs(2);
  422. padding_inputs[0] = bottom_blob;
  423. padding_inputs[1] = padding_param_blob;
  424. std::vector<VkMat> padding_outputs(1);
  425. padding->forward(padding_inputs, padding_outputs, cmd, opt_pad);
  426. bottom_blob_bordered = padding_outputs[0];
  427. }
  428. }
  429. w = bottom_blob_bordered.w;
  430. h = bottom_blob_bordered.h;
  431. int outw = (w - kernel_extent_w) / stride_w + 1;
  432. int outh = (h - kernel_extent_h) / stride_h + 1;
  433. int out_elempack = num_output % 4 == 0 ? 4 : 1;
  434. size_t out_elemsize = elemsize / elempack * out_elempack;
  435. if (opt.use_fp16_packed && !opt.use_fp16_storage)
  436. {
  437. if (out_elempack == 4) out_elemsize = 4*2u;
  438. if (out_elempack == 1) out_elemsize = 4u;
  439. }
  440. top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator);
  441. if (top_blob.empty())
  442. return -100;
  443. // depth-wise
  444. if (channels == group / elempack && group / elempack == num_output / elempack)
  445. {
  446. std::vector<VkMat> bindings(4);
  447. bindings[0] = bottom_blob_bordered;
  448. bindings[1] = top_blob;
  449. bindings[2] = elempack == 4 ? weight_data_gpu_pack4 : weight_data_gpu;
  450. bindings[3] = bias_term ? (elempack == 4 ? bias_data_gpu_pack4 : bias_data_gpu) : bindings[2];// TODO use dummy buffer
  451. std::vector<vk_constant_type> constants(10);
  452. constants[0].i = bottom_blob_bordered.dims;
  453. constants[1].i = bottom_blob_bordered.w;
  454. constants[2].i = bottom_blob_bordered.h;
  455. constants[3].i = bottom_blob_bordered.c;
  456. constants[4].i = bottom_blob_bordered.cstep;
  457. constants[5].i = top_blob.dims;
  458. constants[6].i = top_blob.w;
  459. constants[7].i = top_blob.h;
  460. constants[8].i = top_blob.c;
  461. constants[9].i = top_blob.cstep;
  462. const Pipeline* pipeline = elempack == 4 ? pipeline_convolutiondepthwise_pack4 : pipeline_convolutiondepthwise;
  463. cmd.record_pipeline(pipeline, bindings, constants, top_blob);
  464. return 0;
  465. }
  466. const int channels_g = channels * elempack / group;
  467. const int num_output_g = num_output / group;
  468. // unpacking
  469. VkMat bottom_blob_bordered_unpacked = bottom_blob_bordered;
  470. if (elempack == 4 && channels_g % 4 != 0)
  471. {
  472. ncnn::Option opt_pack1 = opt;
  473. opt_pack1.blob_vkallocator = opt.workspace_vkallocator;
  474. packing_pack1->forward(bottom_blob_bordered, bottom_blob_bordered_unpacked, cmd, opt_pack1);
  475. }
  476. VkMat top_blob_unpacked = top_blob;
  477. if (num_output_g % 4 != 0 && out_elempack == 4)
  478. {
  479. top_blob_unpacked.create(outw, outh, num_output, out_elemsize / out_elempack, 1, opt.workspace_vkallocator, opt.staging_vkallocator);
  480. if (top_blob_unpacked.empty())
  481. return -100;
  482. }
  483. std::vector<VkMat> bindings(4);
  484. bindings[0] = bottom_blob_bordered_unpacked;
  485. bindings[1] = top_blob_unpacked;
  486. if (channels_g % 4 != 0 && num_output_g % 4 != 0)
  487. {
  488. bindings[2] = weight_data_gpu;
  489. bindings[3] = bias_term ? bias_data_gpu : bindings[2];// TODO use dummy buffer
  490. }
  491. else if (channels_g % 4 == 0 && num_output_g % 4 == 0)
  492. {
  493. bindings[2] = weight_data_gpu_pack4;
  494. bindings[3] = bias_term ? bias_data_gpu_pack4 : bindings[2];// TODO use dummy buffer
  495. }
  496. else if (channels_g % 4 != 0 && num_output_g % 4 == 0)
  497. {
  498. bindings[2] = weight_data_gpu_pack1to4;
  499. bindings[3] = bias_term ? bias_data_gpu_pack4 : bindings[2];// TODO use dummy buffer
  500. }
  501. else if (channels_g % 4 == 0 && num_output_g % 4 != 0)
  502. {
  503. bindings[2] = weight_data_gpu_pack4to1;
  504. bindings[3] = bias_term ? bias_data_gpu : bindings[2];// TODO use dummy buffer
  505. }
  506. std::vector<vk_constant_type> constants(10);
  507. constants[0].i = bottom_blob_bordered_unpacked.dims;
  508. constants[1].i = bottom_blob_bordered_unpacked.w;
  509. constants[2].i = bottom_blob_bordered_unpacked.h;
  510. constants[3].i = bottom_blob_bordered_unpacked.c;
  511. constants[4].i = bottom_blob_bordered_unpacked.cstep;
  512. constants[5].i = top_blob_unpacked.dims;
  513. constants[6].i = top_blob_unpacked.w;
  514. constants[7].i = top_blob_unpacked.h;
  515. constants[8].i = top_blob_unpacked.c;
  516. constants[9].i = top_blob_unpacked.cstep;
  517. const Pipeline* pipeline = 0;
  518. if (channels_g % 4 != 0 && num_output_g % 4 != 0)
  519. {
  520. pipeline = pipeline_convolutiondepthwise_group;
  521. }
  522. else if (channels_g % 4 == 0 && num_output_g % 4 == 0)
  523. {
  524. pipeline = pipeline_convolutiondepthwise_group_pack4;
  525. }
  526. else if (channels_g % 4 != 0 && num_output_g % 4 == 0)
  527. {
  528. pipeline = pipeline_convolutiondepthwise_group_pack1to4;
  529. }
  530. else if (channels_g % 4 == 0 && num_output_g % 4 != 0)
  531. {
  532. pipeline = pipeline_convolutiondepthwise_group_pack4to1;
  533. }
  534. cmd.record_pipeline(pipeline, bindings, constants, top_blob_unpacked);
  535. // packing
  536. if (num_output_g % 4 != 0 && out_elempack == 4)
  537. {
  538. packing_pack4->forward(top_blob_unpacked, top_blob, cmd, opt);
  539. }
  540. else
  541. {
  542. top_blob = top_blob_unpacked;
  543. }
  544. return 0;
  545. }
  546. } // namespace ncnn