You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

mat.cpp 20 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734
  1. // Tencent is pleased to support the open source community by making ncnn available.
  2. //
  3. // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
  4. //
  5. // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6. // in compliance with the License. You may obtain a copy of the License at
  7. //
  8. // https://opensource.org/licenses/BSD-3-Clause
  9. //
  10. // Unless required by applicable law or agreed to in writing, software distributed
  11. // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
  12. // CONDITIONS OF ANY KIND, either express or implied. See the License for the
  13. // specific language governing permissions and limitations under the License.
  #include "mat.h"
  #if __ARM_NEON
  #include <arm_neon.h>
  #endif // __ARM_NEON
  #include <math.h>
  #include <string.h>
  #include "cpu.h"
  #include "layer_type.h"
  #include "layer.h"
  22. namespace ncnn {
  23. void Mat::substract_mean_normalize(const float* mean_vals, const float* norm_vals)
  24. {
  25. int size = w * h;
  26. if (mean_vals && !norm_vals)
  27. {
  28. // substract mean only
  29. #pragma omp parallel for
  30. for (int q=0; q<c; q++)
  31. {
  32. float* ptr = channel(q);//data + cstep * q;
  33. const float mean = mean_vals[q];
  34. #if __ARM_NEON
  35. int nn = size >> 2;
  36. int remain = size - (nn << 2);
  37. #else
  38. int remain = size;
  39. #endif // __ARM_NEON
  40. #if __ARM_NEON
  41. #if __aarch64__
  42. if (nn > 0)
  43. {
  44. asm volatile(
  45. "dup v1.4s, %w4 \n"
  46. "0: \n"
  47. "prfm pldl1keep, [%1, #128] \n"
  48. "ld1 {v0.4s}, [%1] \n"
  49. "fsub v0.4s, v0.4s, v1.4s \n"
  50. "subs %w0, %w0, #1 \n"
  51. "st1 {v0.4s}, [%1], #16 \n"
  52. "bne 0b \n"
  53. : "=r"(nn), // %0
  54. "=r"(ptr) // %1
  55. : "0"(nn),
  56. "1"(ptr),
  57. "r"(mean) // %4
  58. : "cc", "memory", "v0", "v1"
  59. );
  60. }
  61. #else
  62. if (nn > 0)
  63. {
  64. asm volatile(
  65. "vdup.f32 q1, %4 \n"
  66. "0: \n"
  67. "pld [%1, #128] \n"
  68. "vld1.f32 {d0-d1}, [%1 :128] \n"
  69. "vsub.f32 q0, q0, q1 \n"
  70. "subs %0, #1 \n"
  71. "vst1.f32 {d0-d1}, [%1 :128]! \n"
  72. "bne 0b \n"
  73. : "=r"(nn), // %0
  74. "=r"(ptr) // %1
  75. : "0"(nn),
  76. "1"(ptr),
  77. "r"(mean) // %4
  78. : "cc", "memory", "q0", "q1"
  79. );
  80. }
  81. #endif // __aarch64__
  82. #endif // __ARM_NEON
  83. for (; remain>0; remain--)
  84. {
  85. *ptr -= mean;
  86. ptr++;
  87. }
  88. }
  89. }
  90. else if (!mean_vals && norm_vals)
  91. {
  92. // normalize only
  93. #pragma omp parallel for
  94. for (int q=0; q<c; q++)
  95. {
  96. float* ptr = channel(q);//data + cstep * q;
  97. const float norm = norm_vals[q];
  98. #if __ARM_NEON
  99. int nn = size >> 2;
  100. int remain = size - (nn << 2);
  101. #else
  102. int remain = size;
  103. #endif // __ARM_NEON
  104. #if __ARM_NEON
  105. #if __aarch64__
  106. if (nn > 0)
  107. {
  108. asm volatile(
  109. "dup v1.4s, %w4 \n"
  110. "0: \n"
  111. "prfm pldl1keep, [%1, #128] \n"
  112. "ld1 {v0.4s}, [%1] \n"
  113. "fmul v0.4s, v0.4s, v1.4s \n"
  114. "subs %w0, %w0, #1 \n"
  115. "st1 {v0.4s}, [%1], #16 \n"
  116. "bne 0b \n"
  117. : "=r"(nn), // %0
  118. "=r"(ptr) // %1
  119. : "0"(nn),
  120. "1"(ptr),
  121. "r"(norm) // %4
  122. : "cc", "memory", "v0", "v1"
  123. );
  124. }
  125. #else
  126. if (nn > 0)
  127. {
  128. asm volatile(
  129. "vdup.f32 q1, %4 \n"
  130. "0: \n"
  131. "pld [%1, #128] \n"
  132. "vld1.f32 {d0-d1}, [%1 :128] \n"
  133. "vmul.f32 q0, q0, q1 \n"
  134. "subs %0, #1 \n"
  135. "vst1.f32 {d0-d1}, [%1 :128]! \n"
  136. "bne 0b \n"
  137. : "=r"(nn), // %0
  138. "=r"(ptr) // %1
  139. : "0"(nn),
  140. "1"(ptr),
  141. "r"(norm) // %4
  142. : "cc", "memory", "q0", "q1"
  143. );
  144. }
  145. #endif // __aarch64__
  146. #endif // __ARM_NEON
  147. for (; remain>0; remain--)
  148. {
  149. *ptr *= norm;
  150. ptr++;
  151. }
  152. }
  153. }
  154. else if (mean_vals && norm_vals)
  155. {
  156. // substract mean and normalize
  157. #pragma omp parallel for
  158. for (int q=0; q<c; q++)
  159. {
  160. float* ptr = channel(q);//data + cstep * q;
  161. const float mean = mean_vals[q];
  162. const float norm = norm_vals[q];
  163. #if __ARM_NEON
  164. int nn = size >> 2;
  165. int remain = size - (nn << 2);
  166. #else
  167. int remain = size;
  168. #endif // __ARM_NEON
  169. #if __ARM_NEON
  170. #if __aarch64__
  171. if (nn > 0)
  172. {
  173. asm volatile(
  174. "dup v1.4s, %w4 \n"
  175. "dup v2.4s, %w5 \n"
  176. "0: \n"
  177. "prfm pldl1keep, [%1, #128] \n"
  178. "ld1 {v0.4s}, [%1] \n"
  179. "fsub v0.4s, v0.4s, v1.4s \n"
  180. "fmul v0.4s, v0.4s, v2.4s \n"
  181. "subs %w0, %w0, #1 \n"
  182. "st1 {v0.4s}, [%1], #16 \n"
  183. "bne 0b \n"
  184. : "=r"(nn), // %0
  185. "=r"(ptr) // %1
  186. : "0"(nn),
  187. "1"(ptr),
  188. "r"(mean), // %4
  189. "r"(norm) // %5
  190. : "cc", "memory", "v0", "v1", "v2"
  191. );
  192. }
  193. #else
  194. if (nn > 0)
  195. {
  196. asm volatile(
  197. "vdup.f32 q1, %4 \n"
  198. "vdup.f32 q2, %5 \n"
  199. "0: \n"
  200. "pld [%1, #128] \n"
  201. "vld1.f32 {d0-d1}, [%1 :128] \n"
  202. "vsub.f32 q0, q0, q1 \n"
  203. "vmul.f32 q0, q0, q2 \n"
  204. "subs %0, #1 \n"
  205. "vst1.f32 {d0-d1}, [%1 :128]! \n"
  206. "bne 0b \n"
  207. : "=r"(nn), // %0
  208. "=r"(ptr) // %1
  209. : "0"(nn),
  210. "1"(ptr),
  211. "r"(mean), // %4
  212. "r"(norm) // %5
  213. : "cc", "memory", "q0", "q1", "q2"
  214. );
  215. }
  216. #endif // __aarch64__
  217. #endif // __ARM_NEON
  218. for (; remain>0; remain--)
  219. {
  220. *ptr = (*ptr - mean) * norm;
  221. ptr++;
  222. }
  223. }
  224. }
  225. }
  // Convert a half-precision floating point value to float.
  //
  // value: the raw 16-bit pattern, laid out as 1 sign : 5 exponent : 10 significand.
  // Returns the float whose 1 : 8 : 23 bit pattern encodes the same number.
  // Signed zeros keep their sign, subnormal halves are renormalized, and
  // infinities / NaNs keep their significand bits (shifted into the wider field).
  static float half2float(unsigned short value)
  {
      // All field arithmetic is done in unsigned int so that shifting the sign
      // into bit 31 never goes through a (promoted) signed int.
      const unsigned int sign = (value & 0x8000u) >> 15;
      const unsigned int exponent = (value & 0x7c00u) >> 10;
      unsigned int significand = value & 0x03FFu;

      // exponent re-bias from half (15) to float (127)
      const unsigned int rebias = 127u - 15u;

      unsigned int bits;
      if (exponent == 0)
      {
          if (significand == 0)
          {
              // zero
              bits = sign << 31;
          }
          else
          {
              // denormal: shift left until the leading one sits in bit 9,
              // counting how far it had to move
              unsigned int shift = 0;
              while ((significand & 0x200u) == 0)
              {
                  significand <<= 1;
                  shift++;
              }

              // one more shift turns that leading one into the implicit bit
              significand <<= 1;
              significand &= 0x3FFu;

              bits = (sign << 31) | ((rebias - shift) << 23) | (significand << 13);
          }
      }
      else if (exponent == 0x1F)
      {
          // infinity or NaN
          bits = (sign << 31) | (0xFFu << 23) | (significand << 13);
      }
      else
      {
          // normalized
          bits = (sign << 31) | ((exponent + rebias) << 23) | (significand << 13);
      }

      // Reinterpret the bit pattern with memcpy; reading the inactive member
      // of a union is not defined behaviour in C++.
      float result;
      memcpy(&result, &bits, sizeof(result));
      return result;
  }
  274. Mat Mat::from_float16(const unsigned short* data, int size)
  275. {
  276. Mat m(size);
  277. if (m.empty())
  278. return m;
  279. float* ptr = m;//.data;
  280. #if __ARM_NEON && (__ARM_FP & 2)
  281. int nn = cpu_support_arm_vfpv4() ? size >> 2 : 0;
  282. int remain = size - (nn << 2);
  283. #else
  284. int remain = size;
  285. #endif // __ARM_NEON
  286. #if __ARM_NEON && (__ARM_FP & 2)
  287. #if __aarch64__
  288. if (nn > 0)
  289. {
  290. asm volatile(
  291. "0: \n"
  292. "ld1 {v0.4h}, [%1], #8 \n"
  293. "fcvtl v1.4s, v0.4h \n"
  294. "subs %w0, %w0, #1 \n"
  295. "st1 {v1.4s}, [%2], #16 \n"
  296. "bne 0b \n"
  297. : "=r"(nn), // %0
  298. "=r"(data), // %1
  299. "=r"(ptr) // %2
  300. : "0"(nn),
  301. "1"(data),
  302. "2"(ptr)
  303. : "cc", "memory", "v0", "v1"
  304. );
  305. }
  306. #else
  307. if (nn > 0)
  308. {
  309. asm volatile(
  310. "0: \n"
  311. "pld [%1, #64] \n"
  312. "vld1.s16 {d0}, [%1 :64]! \n"
  313. "vcvt.f32.f16 q1, d0 \n"
  314. "subs %0, #1 \n"
  315. "vst1.f32 {d2-d3}, [%2 :128]! \n"
  316. "bne 0b \n"
  317. : "=r"(nn), // %0
  318. "=r"(data), // %1
  319. "=r"(ptr) // %2
  320. : "0"(nn),
  321. "1"(data),
  322. "2"(ptr)
  323. : "cc", "memory", "q0", "q1"
  324. );
  325. }
  326. #endif // __aarch64__
  327. #endif // __ARM_NEON
  328. for (; remain>0; remain--)
  329. {
  330. *ptr = half2float(*data);
  331. data++;
  332. ptr++;
  333. }
  334. return m;
  335. }
  336. void copy_make_border(const Mat& src, Mat& dst, int top, int bottom, int left, int right, int type, float v, Allocator* allocator, int num_threads)
  337. {
  338. ncnn::Layer* padding = ncnn::create_layer(ncnn::LayerType::Padding);
  339. ncnn::ParamDict pd;
  340. pd.set(0, top);
  341. pd.set(1, bottom);
  342. pd.set(2, left);
  343. pd.set(3, right);
  344. pd.set(4, type);
  345. pd.set(5, v);
  346. padding->load_param(pd);
  347. ncnn::Option opt = ncnn::get_default_option();
  348. opt.num_threads = num_threads;
  349. opt.blob_allocator = allocator;
  350. padding->forward(src, dst, opt);
  351. delete padding;
  352. }
  353. static void copy_cut_border_image(const Mat& src, Mat& dst, int top, int left)
  354. {
  355. int w = dst.w;
  356. int h = dst.h;
  357. const float* ptr = src.row(top) + left;//.data + src.w * top + left;
  358. float* outptr = dst;//.data;
  359. for (int y = 0; y < h; y++)
  360. {
  361. if(w < 12)
  362. {
  363. for (int x = 0; x < w; x++)
  364. {
  365. outptr[x] = ptr[x];
  366. }
  367. }
  368. else
  369. {
  370. memcpy(outptr, ptr, w*sizeof(float));
  371. }
  372. outptr += w;
  373. ptr += src.w;
  374. }
  375. }
  376. void copy_cut_border(const Mat& src, Mat& dst, int top, int bottom, int left, int right, Allocator* allocator, int num_threads)
  377. {
  378. int w = src.w - left - right;
  379. int h = src.h - top - bottom;
  380. size_t elemsize = src.elemsize;
  381. if (w == src.w && h == src.h)
  382. {
  383. dst = src;
  384. return;
  385. }
  386. if (src.dims == 2)
  387. {
  388. dst.create(w, h, elemsize, allocator);
  389. if (dst.empty())
  390. return;
  391. copy_cut_border_image(src, dst, top, left);
  392. }
  393. else if (src.dims == 3)
  394. {
  395. int channels = src.c;
  396. dst.create(w, h, channels, elemsize, allocator);
  397. if (dst.empty())
  398. return;
  399. // unroll image channel
  400. #pragma omp parallel for num_threads(num_threads)
  401. for (int q=0; q<channels; q++)
  402. {
  403. const Mat m = src.channel(q);
  404. Mat cutm = dst.channel(q);
  405. copy_cut_border_image(m, cutm, top, left);
  406. }
  407. }
  408. }
  409. static void resize_bilinear_image(const Mat& src, Mat& dst, int w, int h)
  410. {
  411. double scale_x = (double)src.w / w;
  412. double scale_y = (double)src.h / h;
  413. int* buf = new int[w + h + w*2 + h*2];
  414. int* xofs = buf;//new int[w];
  415. int* yofs = buf + w;//new int[h];
  416. float* alpha = (float*)(buf + w + h);//new float[w * 2];
  417. float* beta = (float*)(buf + w + h + w*2);//new float[h * 2];
  418. float fx;
  419. float fy;
  420. int sx;
  421. int sy;
  422. for (int dx = 0; dx < w; dx++)
  423. {
  424. fx = (float)((dx + 0.5) * scale_x - 0.5);
  425. sx = floor(fx);
  426. fx -= sx;
  427. if (sx < 0)
  428. {
  429. sx = 0;
  430. fx = 0.f;
  431. }
  432. if (sx >= src.w - 1)
  433. {
  434. sx = src.w - 2;
  435. fx = 1.f;
  436. }
  437. xofs[dx] = sx;
  438. alpha[dx*2 ] = 1.f - fx;
  439. alpha[dx*2 + 1] = fx;
  440. }
  441. for (int dy = 0; dy < h; dy++)
  442. {
  443. fy = (float)((dy + 0.5) * scale_y - 0.5);
  444. sy = floor(fy);
  445. fy -= sy;
  446. if (sy < 0)
  447. {
  448. sy = 0;
  449. fy = 0.f;
  450. }
  451. if (sy >= src.h - 1)
  452. {
  453. sy = src.h - 2;
  454. fy = 1.f;
  455. }
  456. yofs[dy] = sy;
  457. beta[dy*2 ] = 1.f - fy;
  458. beta[dy*2 + 1] = fy;
  459. }
  460. // loop body
  461. Mat rowsbuf0(w + 1);
  462. Mat rowsbuf1(w + 1);
  463. float* rows0 = rowsbuf0;
  464. float* rows1 = rowsbuf1;
  465. int prev_sy1 = -1;
  466. for (int dy = 0; dy < h; dy++ )
  467. {
  468. int sy = yofs[dy];
  469. if (sy == prev_sy1)
  470. {
  471. // hresize one row
  472. float* rows0_old = rows0;
  473. rows0 = rows1;
  474. rows1 = rows0_old;
  475. const float* S1 = src.row(sy+1);
  476. const float* alphap = alpha;
  477. float* rows1p = rows1;
  478. int dx = 0;
  479. #if __ARM_NEON
  480. for ( ; dx+1 < w; dx += 2 )
  481. {
  482. int sx = xofs[dx];
  483. int sxn = xofs[dx+1];
  484. const float* S1p = S1 + sx;
  485. const float* S1np = S1 + sxn;
  486. float32x4_t _a = vld1q_f32(alphap);
  487. float32x2_t _S1 = vld1_f32(S1p);
  488. float32x2_t _S1n = vld1_f32(S1np);
  489. float32x4_t _S1S1n = vcombine_f32(_S1, _S1n);
  490. float32x4_t _ms1 = vmulq_f32(_S1S1n, _a);
  491. float32x2_t _rows1 = vpadd_f32(vget_low_f32(_ms1), vget_high_f32(_ms1));
  492. vst1_f32(rows1p + dx, _rows1);
  493. alphap += 4;
  494. }
  495. #endif // __ARM_NEON
  496. for ( ; dx < w; dx++ )
  497. {
  498. int sx = xofs[dx];
  499. const float* S1p = S1 + sx;
  500. float a0 = alphap[0];
  501. float a1 = alphap[1];
  502. rows1p[dx] = S1p[0]*a0 + S1p[1]*a1;
  503. alphap += 2;
  504. }
  505. }
  506. else
  507. {
  508. // hresize two rows
  509. const float* S0 = src.row(sy);
  510. const float* S1 = src.row(sy+1);
  511. const float* alphap = alpha;
  512. float* rows0p = rows0;
  513. float* rows1p = rows1;
  514. int dx = 0;
  515. #if __ARM_NEON
  516. for ( ; dx+1 < w; dx += 2 )
  517. {
  518. int sx = xofs[dx];
  519. int sxn = xofs[dx+1];
  520. const float* S0p = S0 + sx;
  521. const float* S1p = S1 + sx;
  522. const float* S0np = S0 + sxn;
  523. const float* S1np = S1 + sxn;
  524. float32x4_t _a = vld1q_f32(alphap);
  525. float32x2_t _S0 = vld1_f32(S0p);
  526. float32x2_t _S1 = vld1_f32(S1p);
  527. float32x2_t _S0n = vld1_f32(S0np);
  528. float32x2_t _S1n = vld1_f32(S1np);
  529. float32x4_t _S0S0n = vcombine_f32(_S0, _S0n);
  530. float32x4_t _S1S1n = vcombine_f32(_S1, _S1n);
  531. float32x4_t _ms0 = vmulq_f32(_S0S0n, _a);
  532. float32x4_t _ms1 = vmulq_f32(_S1S1n, _a);
  533. float32x2_t _rows0 = vpadd_f32(vget_low_f32(_ms0), vget_high_f32(_ms0));
  534. float32x2_t _rows1 = vpadd_f32(vget_low_f32(_ms1), vget_high_f32(_ms1));
  535. vst1_f32(rows0p + dx, _rows0);
  536. vst1_f32(rows1p + dx, _rows1);
  537. alphap += 4;
  538. }
  539. #endif // __ARM_NEON
  540. for ( ; dx < w; dx++ )
  541. {
  542. int sx = xofs[dx];
  543. const float* S0p = S0 + sx;
  544. const float* S1p = S1 + sx;
  545. float a0 = alphap[0];
  546. float a1 = alphap[1];
  547. rows0p[dx] = S0p[0]*a0 + S0p[1]*a1;
  548. rows1p[dx] = S1p[0]*a0 + S1p[1]*a1;
  549. alphap += 2;
  550. }
  551. }
  552. prev_sy1 = sy + 1;
  553. // vresize
  554. float b0 = beta[0];
  555. float b1 = beta[1];
  556. float* rows0p = rows0;
  557. float* rows1p = rows1;
  558. float* Dp = dst.row(dy);
  559. #if __ARM_NEON
  560. int nn = w >> 3;
  561. #else
  562. int nn = 0;
  563. #endif
  564. int remain = w - (nn << 3);
  565. #if __ARM_NEON
  566. float32x4_t _b0 = vdupq_n_f32(b0);
  567. float32x4_t _b1 = vdupq_n_f32(b1);
  568. for (; nn>0; nn--)
  569. {
  570. float32x4_t _rows0 = vld1q_f32(rows0p);
  571. float32x4_t _rows1 = vld1q_f32(rows1p);
  572. float32x4_t _D = vmulq_f32(_rows0, _b0);
  573. _D = vmlaq_f32(_D, _rows1, _b1);
  574. vst1q_f32(Dp, _D);
  575. float32x4_t _rows0n = vld1q_f32(rows0p+4);
  576. float32x4_t _rows1n = vld1q_f32(rows1p+4);
  577. float32x4_t _Dn = vmulq_f32(_rows0n, _b0);
  578. _Dn = vmlaq_f32(_Dn, _rows1n, _b1);
  579. vst1q_f32(Dp+4, _Dn);
  580. Dp += 8;
  581. rows0p += 8;
  582. rows1p += 8;
  583. }
  584. #endif // __ARM_NEON
  585. for ( ; remain; --remain )
  586. {
  587. // D[x] = rows0[x]*b0 + rows1[x]*b1;
  588. *Dp++ = *rows0p++ * b0 + *rows1p++ * b1;
  589. }
  590. beta += 2;
  591. }
  592. delete[] buf;
  593. }
  594. void resize_bilinear(const Mat& src, Mat& dst, int w, int h, Allocator* allocator, int num_threads)
  595. {
  596. if (w == src.w && h == src.h)
  597. {
  598. dst = src;
  599. return;
  600. }
  601. size_t elemsize = src.elemsize;
  602. if (src.dims == 2)
  603. {
  604. dst.create(w, h, elemsize, allocator);
  605. if (dst.empty())
  606. return;
  607. resize_bilinear_image(src, dst, w, h);
  608. }
  609. else if (src.dims == 3)
  610. {
  611. int channels = src.c;
  612. dst.create(w, h, channels, elemsize, allocator);
  613. if (dst.empty())
  614. return;
  615. // unroll image channel
  616. #pragma omp parallel for num_threads(num_threads)
  617. for (int q=0; q<channels; q++)
  618. {
  619. const Mat m = src.channel(q);
  620. Mat resizem = dst.channel(q);
  621. resize_bilinear_image(m, resizem, w, h);
  622. }
  623. }
  624. }
  625. void convert_packing(const Mat& src, Mat& dst, int _packing, Allocator* allocator, int num_threads)
  626. {
  627. ncnn::Layer* packing = ncnn::create_layer(ncnn::LayerType::Packing);
  628. ncnn::ParamDict pd;
  629. pd.set(0, _packing);
  630. packing->load_param(pd);
  631. ncnn::Option opt = ncnn::get_default_option();
  632. opt.num_threads = num_threads;
  633. opt.blob_allocator = allocator;
  634. packing->forward(src, dst, opt);
  635. delete packing;
  636. }
  637. } // namespace ncnn