You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

slice_arm.cpp 22 kB

6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785
  1. // Tencent is pleased to support the open source community by making ncnn available.
  2. //
  3. // Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
  4. //
  5. // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6. // in compliance with the License. You may obtain a copy of the License at
  7. //
  8. // https://opensource.org/licenses/BSD-3-Clause
  9. //
  10. // Unless required by applicable law or agreed to in writing, software distributed
  11. // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
  12. // CONDITIONS OF ANY KIND, either express or implied. See the License for the
  13. // specific language governing permissions and limitations under the License.
  14. #include "slice_arm.h"
  15. #include "layer_type.h"
  16. #if __ARM_NEON
  17. #include <arm_neon.h>
  18. #endif // __ARM_NEON
  19. namespace ncnn {
  20. DEFINE_LAYER_CREATOR(Slice_arm)
  21. Slice_arm::Slice_arm()
  22. {
  23. #if __ARM_NEON
  24. support_packing = true;
  25. packing_pack1 = 0;
  26. #endif // __ARM_NEON
  27. support_bf16_storage = true;
  28. }
  29. int Slice_arm::create_pipeline(const Option& opt)
  30. {
  31. #if __ARM_NEON
  32. if (opt.use_packing_layout)
  33. {
  34. {
  35. packing_pack1 = ncnn::create_layer(ncnn::LayerType::Packing);
  36. ncnn::ParamDict pd;
  37. pd.set(0, 1);
  38. packing_pack1->load_param(pd);
  39. packing_pack1->create_pipeline(opt);
  40. }
  41. }
  42. #endif // __ARM_NEON
  43. return 0;
  44. }
  45. int Slice_arm::destroy_pipeline(const Option& opt)
  46. {
  47. #if __ARM_NEON
  48. if (opt.use_packing_layout)
  49. {
  50. if (packing_pack1)
  51. {
  52. packing_pack1->destroy_pipeline(opt);
  53. delete packing_pack1;
  54. packing_pack1 = 0;
  55. }
  56. }
  57. #endif // __ARM_NEON
  58. return 0;
  59. }
  60. int Slice_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
  61. {
  62. if (opt.use_bf16_storage)
  63. return forward_bf16s(bottom_blobs, top_blobs, opt);
  64. const Mat& bottom_blob = bottom_blobs[0];
  65. int dims = bottom_blob.dims;
  66. size_t elemsize = bottom_blob.elemsize;
  67. int elempack = bottom_blob.elempack;
  68. const int* slices_ptr = slices;
  69. #if __ARM_NEON
  70. if (opt.use_packing_layout)
  71. {
  72. if (dims == 1) // axis == 0
  73. {
  74. // slice vector
  75. int w = bottom_blob.w * elempack;
  76. int q = 0;
  77. for (size_t i=0; i<top_blobs.size(); i++)
  78. {
  79. int slice = slices_ptr[i];
  80. if (slice == -233)
  81. {
  82. slice = (w - q) / (top_blobs.size() - i);
  83. }
  84. int out_elempack = slice % 4 == 0 ? 4 : 1;
  85. size_t out_elemsize = elemsize / elempack * out_elempack;
  86. Mat& top_blob = top_blobs[i];
  87. top_blob.create(slice / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
  88. if (top_blob.empty())
  89. return -100;
  90. const float* ptr = (const float*)bottom_blob + q;
  91. float* outptr = top_blob;
  92. memcpy(outptr, ptr, top_blob.w * top_blob.elemsize);
  93. q += slice;
  94. }
  95. return 0;
  96. }
  97. if (dims == 2 && axis == 0)
  98. {
  99. // slice image height
  100. int w = bottom_blob.w;
  101. int h = bottom_blob.h * elempack;
  102. int q = 0;
  103. for (size_t i=0; i<top_blobs.size(); i++)
  104. {
  105. int slice = slices_ptr[i];
  106. if (slice == -233)
  107. {
  108. slice = (h - q) / (top_blobs.size() - i);
  109. }
  110. int out_elempack = slice % 4 == 0 ? 4 : 1;
  111. size_t out_elemsize = elemsize / elempack * out_elempack;
  112. Mat& top_blob = top_blobs[i];
  113. top_blob.create(w, slice / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
  114. if (top_blob.empty())
  115. return -100;
  116. q += slice;
  117. }
  118. size_t out_elemsize = top_blobs[0].elemsize;
  119. int out_elempack = top_blobs[0].elempack;
  120. for (size_t i=0; i<top_blobs.size(); i++)
  121. {
  122. out_elemsize = std::min(out_elemsize, top_blobs[i].elemsize);
  123. out_elempack = std::min(out_elempack, top_blobs[i].elempack);
  124. }
  125. Mat bottom_blob_unpacked = bottom_blob;
  126. if (elempack == 4 && out_elempack == 1)
  127. {
  128. packing_pack1->forward(bottom_blob, bottom_blob_unpacked, opt);
  129. }
  130. const float* ptr = bottom_blob_unpacked;
  131. for (size_t i=0; i<top_blobs.size(); i++)
  132. {
  133. Mat& top_blob = top_blobs[i];
  134. if (out_elempack == 1 && top_blob.elempack == 4)
  135. {
  136. for (int j=0; j<top_blob.h; j++)
  137. {
  138. const float* r0 = ptr;
  139. const float* r1 = ptr + w;
  140. const float* r2 = ptr + w*2;
  141. const float* r3 = ptr + w*3;
  142. float* outptr0 = top_blob.row(j);
  143. for (int j=0; j<w; j++)
  144. {
  145. outptr0[0] = *r0++;
  146. outptr0[1] = *r1++;
  147. outptr0[2] = *r2++;
  148. outptr0[3] = *r3++;
  149. outptr0 += 4;
  150. }
  151. ptr += w * 4;
  152. }
  153. }
  154. else // if (out_elempack == 1 && top_blob.elempack == 1) if (out_elempack == 4 && top_blob.elempack == 4)
  155. {
  156. int size = w * top_blob.h;
  157. float* outptr = top_blob;
  158. memcpy(outptr, ptr, size * top_blob.elemsize);
  159. ptr += size * top_blob.elempack;
  160. }
  161. }
  162. return 0;
  163. }
  164. if (dims == 2 && axis == 1)
  165. {
  166. // slice image width
  167. int w = bottom_blob.w;
  168. int h = bottom_blob.h;
  169. int q = 0;
  170. for (size_t i=0; i<top_blobs.size(); i++)
  171. {
  172. int slice = slices_ptr[i];
  173. if (slice == -233)
  174. {
  175. slice = (w - q) / (top_blobs.size() - i);
  176. }
  177. Mat& top_blob = top_blobs[i];
  178. top_blob.create(slice, h, elemsize, elempack, opt.blob_allocator);
  179. if (top_blob.empty())
  180. return -100;
  181. q += slice;
  182. }
  183. #pragma omp parallel for num_threads(opt.num_threads)
  184. for (int j=0; j<h; j++)
  185. {
  186. const float* ptr = bottom_blob.row(j);
  187. for (size_t i=0; i<top_blobs.size(); i++)
  188. {
  189. Mat& top_blob = top_blobs[i];
  190. float* outptr = top_blob.row(j);
  191. memcpy(outptr, ptr, top_blob.w * elemsize);
  192. ptr += top_blob.w * elempack;
  193. }
  194. }
  195. return 0;
  196. }
  197. if (dims == 3 && axis == 0)
  198. {
  199. // slice dim channel
  200. int w = bottom_blob.w;
  201. int h = bottom_blob.h;
  202. int channels = bottom_blob.c * elempack;
  203. int q = 0;
  204. for (size_t i=0; i<top_blobs.size(); i++)
  205. {
  206. int slice = slices_ptr[i];
  207. if (slice == -233)
  208. {
  209. slice = (channels - q) / (top_blobs.size() - i);
  210. }
  211. int out_elempack = slice % 4 == 0 ? 4 : 1;
  212. size_t out_elemsize = elemsize / elempack * out_elempack;
  213. Mat& top_blob = top_blobs[i];
  214. top_blob.create(w, h, slice / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
  215. if (top_blob.empty())
  216. return -100;
  217. q += slice;
  218. }
  219. size_t out_elemsize = top_blobs[0].elemsize;
  220. int out_elempack = top_blobs[0].elempack;
  221. for (size_t i=0; i<top_blobs.size(); i++)
  222. {
  223. out_elemsize = std::min(out_elemsize, top_blobs[i].elemsize);
  224. out_elempack = std::min(out_elempack, top_blobs[i].elempack);
  225. }
  226. Mat bottom_blob_unpacked = bottom_blob;
  227. if (elempack == 4 && out_elempack == 1)
  228. {
  229. packing_pack1->forward(bottom_blob, bottom_blob_unpacked, opt);
  230. }
  231. int p = 0;
  232. for (size_t i=0; i<top_blobs.size(); i++)
  233. {
  234. Mat& top_blob = top_blobs[i];
  235. if (out_elempack == 1 && top_blob.elempack == 4)
  236. {
  237. int size = top_blob.w * top_blob.h;
  238. for (int q=0; q<top_blob.c; q++)
  239. {
  240. const float* r0 = bottom_blob_unpacked.channel(p);
  241. const float* r1 = bottom_blob_unpacked.channel(p+1);
  242. const float* r2 = bottom_blob_unpacked.channel(p+2);
  243. const float* r3 = bottom_blob_unpacked.channel(p+3);
  244. float* outptr0 = top_blob.channel(q);
  245. for (int j=0; j<size; j++)
  246. {
  247. outptr0[0] = *r0++;
  248. outptr0[1] = *r1++;
  249. outptr0[2] = *r2++;
  250. outptr0[3] = *r3++;
  251. outptr0 += 4;
  252. }
  253. p += 4;
  254. }
  255. }
  256. else // if (out_elempack == 1 && top_blob.elempack == 1) if (out_elempack == 4 && top_blob.elempack == 4)
  257. {
  258. int size = top_blob.total();
  259. const float* ptr = bottom_blob_unpacked.channel(p);
  260. float* outptr = top_blob;
  261. memcpy(outptr, ptr, size * top_blob.elemsize);
  262. p += top_blob.c;
  263. }
  264. }
  265. return 0;
  266. }
  267. if (dims == 3 && axis == 1)
  268. {
  269. // slice dim height
  270. int w = bottom_blob.w;
  271. int h = bottom_blob.h;
  272. int channels = bottom_blob.c;
  273. int q = 0;
  274. for (size_t i=0; i<top_blobs.size(); i++)
  275. {
  276. int slice = slices_ptr[i];
  277. if (slice == -233)
  278. {
  279. slice = (h - q) / (top_blobs.size() - i);
  280. }
  281. Mat& top_blob = top_blobs[i];
  282. top_blob.create(w, slice, channels, elemsize, elempack, opt.blob_allocator);
  283. if (top_blob.empty())
  284. return -100;
  285. q += slice;
  286. }
  287. #pragma omp parallel for num_threads(opt.num_threads)
  288. for (int p=0; p<channels; p++)
  289. {
  290. const float* ptr = bottom_blob.channel(p);
  291. for (size_t i=0; i<top_blobs.size(); i++)
  292. {
  293. Mat& top_blob = top_blobs[i];
  294. int size = top_blob.w * top_blob.h;
  295. float* outptr = top_blob.channel(p);
  296. memcpy(outptr, ptr, size * elemsize);
  297. ptr += size * elempack;
  298. }
  299. }
  300. return 0;
  301. }
  302. if (dims == 3 && axis == 2)
  303. {
  304. // slice dim width
  305. int w = bottom_blob.w;
  306. int h = bottom_blob.h;
  307. int channels = bottom_blob.c;
  308. int q = 0;
  309. for (size_t i=0; i<top_blobs.size(); i++)
  310. {
  311. int slice = slices_ptr[i];
  312. if (slice == -233)
  313. {
  314. slice = (w - q) / (top_blobs.size() - i);
  315. }
  316. Mat& top_blob = top_blobs[i];
  317. top_blob.create(slice, h, channels, elemsize, elempack, opt.blob_allocator);
  318. if (top_blob.empty())
  319. return -100;
  320. q += slice;
  321. }
  322. #pragma omp parallel for num_threads(opt.num_threads)
  323. for (int p=0; p<channels; p++)
  324. {
  325. const float* ptr = bottom_blob.channel(p);
  326. for (int j=0; j<h; j++)
  327. {
  328. for (size_t i=0; i<top_blobs.size(); i++)
  329. {
  330. Mat& top_blob = top_blobs[i];
  331. float* outptr = top_blob.channel(p).row(j);
  332. memcpy(outptr, ptr, top_blob.w * elemsize);
  333. ptr += top_blob.w * elempack;
  334. }
  335. }
  336. }
  337. return 0;
  338. }
  339. } // opt.use_packing_layout
  340. #endif // __ARM_NEON
  341. return Slice::forward(bottom_blobs, top_blobs, opt);
  342. }
  343. int Slice_arm::forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
  344. {
  345. const Mat& bottom_blob = bottom_blobs[0];
  346. int dims = bottom_blob.dims;
  347. size_t elemsize = bottom_blob.elemsize;
  348. int elempack = bottom_blob.elempack;
  349. const int* slices_ptr = slices;
  350. #if __ARM_NEON
  351. if (opt.use_packing_layout)
  352. {
  353. if (dims == 1) // axis == 0
  354. {
  355. // slice vector
  356. int w = bottom_blob.w * elempack;
  357. int q = 0;
  358. for (size_t i=0; i<top_blobs.size(); i++)
  359. {
  360. int slice = slices_ptr[i];
  361. if (slice == -233)
  362. {
  363. slice = (w - q) / (top_blobs.size() - i);
  364. }
  365. int out_elempack = slice % 4 == 0 ? 4 : 1;
  366. size_t out_elemsize = elemsize / elempack * out_elempack;
  367. Mat& top_blob = top_blobs[i];
  368. top_blob.create(slice / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
  369. if (top_blob.empty())
  370. return -100;
  371. const unsigned short* ptr = (const unsigned short*)bottom_blob + q;
  372. unsigned short* outptr = top_blob;
  373. memcpy(outptr, ptr, top_blob.w * top_blob.elemsize);
  374. q += slice;
  375. }
  376. return 0;
  377. }
  378. if (dims == 2 && axis == 0)
  379. {
  380. // slice image height
  381. int w = bottom_blob.w;
  382. int h = bottom_blob.h * elempack;
  383. int q = 0;
  384. for (size_t i=0; i<top_blobs.size(); i++)
  385. {
  386. int slice = slices_ptr[i];
  387. if (slice == -233)
  388. {
  389. slice = (h - q) / (top_blobs.size() - i);
  390. }
  391. int out_elempack = slice % 4 == 0 ? 4 : 1;
  392. size_t out_elemsize = elemsize / elempack * out_elempack;
  393. Mat& top_blob = top_blobs[i];
  394. top_blob.create(w, slice / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
  395. if (top_blob.empty())
  396. return -100;
  397. q += slice;
  398. }
  399. size_t out_elemsize = top_blobs[0].elemsize;
  400. int out_elempack = top_blobs[0].elempack;
  401. for (size_t i=0; i<top_blobs.size(); i++)
  402. {
  403. out_elemsize = std::min(out_elemsize, top_blobs[i].elemsize);
  404. out_elempack = std::min(out_elempack, top_blobs[i].elempack);
  405. }
  406. Mat bottom_blob_unpacked = bottom_blob;
  407. if (elempack == 4 && out_elempack == 1)
  408. {
  409. packing_pack1->forward(bottom_blob, bottom_blob_unpacked, opt);
  410. }
  411. const unsigned short* ptr = bottom_blob_unpacked;
  412. for (size_t i=0; i<top_blobs.size(); i++)
  413. {
  414. Mat& top_blob = top_blobs[i];
  415. if (out_elempack == 1 && top_blob.elempack == 4)
  416. {
  417. for (int j=0; j<top_blob.h; j++)
  418. {
  419. const unsigned short* r0 = ptr;
  420. const unsigned short* r1 = ptr + w;
  421. const unsigned short* r2 = ptr + w*2;
  422. const unsigned short* r3 = ptr + w*3;
  423. unsigned short* outptr0 = top_blob.row<unsigned short>(j);
  424. for (int j=0; j<w; j++)
  425. {
  426. outptr0[0] = *r0++;
  427. outptr0[1] = *r1++;
  428. outptr0[2] = *r2++;
  429. outptr0[3] = *r3++;
  430. outptr0 += 4;
  431. }
  432. ptr += w * 4;
  433. }
  434. }
  435. else // if (out_elempack == 1 && top_blob.elempack == 1) if (out_elempack == 4 && top_blob.elempack == 4)
  436. {
  437. int size = w * top_blob.h;
  438. unsigned short* outptr = top_blob;
  439. memcpy(outptr, ptr, size * top_blob.elemsize);
  440. ptr += size * top_blob.elempack;
  441. }
  442. }
  443. return 0;
  444. }
  445. if (dims == 2 && axis == 1)
  446. {
  447. // slice image width
  448. int w = bottom_blob.w;
  449. int h = bottom_blob.h;
  450. int q = 0;
  451. for (size_t i=0; i<top_blobs.size(); i++)
  452. {
  453. int slice = slices_ptr[i];
  454. if (slice == -233)
  455. {
  456. slice = (w - q) / (top_blobs.size() - i);
  457. }
  458. Mat& top_blob = top_blobs[i];
  459. top_blob.create(slice, h, elemsize, elempack, opt.blob_allocator);
  460. if (top_blob.empty())
  461. return -100;
  462. q += slice;
  463. }
  464. #pragma omp parallel for num_threads(opt.num_threads)
  465. for (int j=0; j<h; j++)
  466. {
  467. const unsigned short* ptr = bottom_blob.row<const unsigned short>(j);
  468. for (size_t i=0; i<top_blobs.size(); i++)
  469. {
  470. Mat& top_blob = top_blobs[i];
  471. unsigned short* outptr = top_blob.row<unsigned short>(j);
  472. memcpy(outptr, ptr, top_blob.w * elemsize);
  473. ptr += top_blob.w * elempack;
  474. }
  475. }
  476. return 0;
  477. }
  478. if (dims == 3 && axis == 0)
  479. {
  480. // slice dim channel
  481. int w = bottom_blob.w;
  482. int h = bottom_blob.h;
  483. int channels = bottom_blob.c * elempack;
  484. int q = 0;
  485. for (size_t i=0; i<top_blobs.size(); i++)
  486. {
  487. int slice = slices_ptr[i];
  488. if (slice == -233)
  489. {
  490. slice = (channels - q) / (top_blobs.size() - i);
  491. }
  492. int out_elempack = slice % 4 == 0 ? 4 : 1;
  493. size_t out_elemsize = elemsize / elempack * out_elempack;
  494. Mat& top_blob = top_blobs[i];
  495. top_blob.create(w, h, slice / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
  496. if (top_blob.empty())
  497. return -100;
  498. q += slice;
  499. }
  500. size_t out_elemsize = top_blobs[0].elemsize;
  501. int out_elempack = top_blobs[0].elempack;
  502. for (size_t i=0; i<top_blobs.size(); i++)
  503. {
  504. out_elemsize = std::min(out_elemsize, top_blobs[i].elemsize);
  505. out_elempack = std::min(out_elempack, top_blobs[i].elempack);
  506. }
  507. Mat bottom_blob_unpacked = bottom_blob;
  508. if (elempack == 4 && out_elempack == 1)
  509. {
  510. packing_pack1->forward(bottom_blob, bottom_blob_unpacked, opt);
  511. }
  512. int p = 0;
  513. for (size_t i=0; i<top_blobs.size(); i++)
  514. {
  515. Mat& top_blob = top_blobs[i];
  516. if (out_elempack == 1 && top_blob.elempack == 4)
  517. {
  518. int size = top_blob.w * top_blob.h;
  519. for (int q=0; q<top_blob.c; q++)
  520. {
  521. const unsigned short* r0 = bottom_blob_unpacked.channel(p);
  522. const unsigned short* r1 = bottom_blob_unpacked.channel(p+1);
  523. const unsigned short* r2 = bottom_blob_unpacked.channel(p+2);
  524. const unsigned short* r3 = bottom_blob_unpacked.channel(p+3);
  525. unsigned short* outptr0 = top_blob.channel(q);
  526. for (int j=0; j<size; j++)
  527. {
  528. outptr0[0] = *r0++;
  529. outptr0[1] = *r1++;
  530. outptr0[2] = *r2++;
  531. outptr0[3] = *r3++;
  532. outptr0 += 4;
  533. }
  534. p += 4;
  535. }
  536. }
  537. else // if (out_elempack == 1 && top_blob.elempack == 1) if (out_elempack == 4 && top_blob.elempack == 4)
  538. {
  539. int size = top_blob.total();
  540. const unsigned short* ptr = bottom_blob_unpacked.channel(p);
  541. unsigned short* outptr = top_blob;
  542. memcpy(outptr, ptr, size * top_blob.elemsize);
  543. p += top_blob.c;
  544. }
  545. }
  546. return 0;
  547. }
  548. if (dims == 3 && axis == 1)
  549. {
  550. // slice dim height
  551. int w = bottom_blob.w;
  552. int h = bottom_blob.h;
  553. int channels = bottom_blob.c;
  554. int q = 0;
  555. for (size_t i=0; i<top_blobs.size(); i++)
  556. {
  557. int slice = slices_ptr[i];
  558. if (slice == -233)
  559. {
  560. slice = (h - q) / (top_blobs.size() - i);
  561. }
  562. Mat& top_blob = top_blobs[i];
  563. top_blob.create(w, slice, channels, elemsize, elempack, opt.blob_allocator);
  564. if (top_blob.empty())
  565. return -100;
  566. q += slice;
  567. }
  568. #pragma omp parallel for num_threads(opt.num_threads)
  569. for (int p=0; p<channels; p++)
  570. {
  571. const unsigned short* ptr = bottom_blob.channel(p);
  572. for (size_t i=0; i<top_blobs.size(); i++)
  573. {
  574. Mat& top_blob = top_blobs[i];
  575. int size = top_blob.w * top_blob.h;
  576. unsigned short* outptr = top_blob.channel(p);
  577. memcpy(outptr, ptr, size * elemsize);
  578. ptr += size * elempack;
  579. }
  580. }
  581. return 0;
  582. }
  583. if (dims == 3 && axis == 2)
  584. {
  585. // slice dim width
  586. int w = bottom_blob.w;
  587. int h = bottom_blob.h;
  588. int channels = bottom_blob.c;
  589. int q = 0;
  590. for (size_t i=0; i<top_blobs.size(); i++)
  591. {
  592. int slice = slices_ptr[i];
  593. if (slice == -233)
  594. {
  595. slice = (w - q) / (top_blobs.size() - i);
  596. }
  597. Mat& top_blob = top_blobs[i];
  598. top_blob.create(slice, h, channels, elemsize, elempack, opt.blob_allocator);
  599. if (top_blob.empty())
  600. return -100;
  601. q += slice;
  602. }
  603. #pragma omp parallel for num_threads(opt.num_threads)
  604. for (int p=0; p<channels; p++)
  605. {
  606. const unsigned short* ptr = bottom_blob.channel(p);
  607. for (int j=0; j<h; j++)
  608. {
  609. for (size_t i=0; i<top_blobs.size(); i++)
  610. {
  611. Mat& top_blob = top_blobs[i];
  612. unsigned short* outptr = top_blob.channel(p).row<unsigned short>(j);
  613. memcpy(outptr, ptr, top_blob.w * elemsize);
  614. ptr += top_blob.w * elempack;
  615. }
  616. }
  617. }
  618. return 0;
  619. }
  620. } // opt.use_packing_layout
  621. #endif // __ARM_NEON
  622. return Slice::forward(bottom_blobs, top_blobs, opt);
  623. }
  624. } // namespace ncnn