You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

mat.cpp 13 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520
  1. // Tencent is pleased to support the open source community by making ncnn available.
  2. //
  3. // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
  4. //
  5. // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6. // in compliance with the License. You may obtain a copy of the License at
  7. //
  8. // https://opensource.org/licenses/BSD-3-Clause
  9. //
  10. // Unless required by applicable law or agreed to in writing, software distributed
  11. // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
  12. // CONDITIONS OF ANY KIND, either express or implied. See the License for the
  13. // specific language governing permissions and limitations under the License.
  14. #include "mat.h"
  15. #if __ARM_NEON
  16. #include <arm_neon.h>
  17. #endif // __ARM_NEON
  18. #include "cpu.h"
  19. namespace ncnn {
  20. void Mat::substract_mean_normalize(const float* mean_vals, const float* norm_vals)
  21. {
  22. int size = w * h;
  23. if (mean_vals && !norm_vals)
  24. {
  25. // substract mean only
  26. #pragma omp parallel for
  27. for (int q=0; q<c; q++)
  28. {
  29. float* ptr = data + cstep * q;
  30. const float mean = mean_vals[q];
  31. #if __ARM_NEON
  32. int nn = size >> 2;
  33. int remain = size - (nn << 2);
  34. #else
  35. int remain = size;
  36. #endif // __ARM_NEON
  37. #if __ARM_NEON
  38. #if __aarch64__
  39. float32x4_t _mean = vdupq_n_f32(mean);
  40. for (; nn>0; nn--)
  41. {
  42. float32x4_t _ptr = vld1q_f32(ptr);
  43. _ptr = vsubq_f32(_ptr, _mean);
  44. vst1q_f32(ptr, _ptr);
  45. ptr += 4;
  46. }
  47. #else
  48. if (nn > 0)
  49. {
  50. asm volatile(
  51. "vdup.f32 q1, %4 \n"
  52. "0: \n"
  53. "pld [%1, #128] \n"
  54. "vld1.f32 {d0-d1}, [%1 :128] \n"
  55. "vsub.f32 q0, q0, q1 \n"
  56. "subs %0, #1 \n"
  57. "vst1.f32 {d0-d1}, [%1 :128]! \n"
  58. "bne 0b \n"
  59. : "=r"(nn), // %0
  60. "=r"(ptr) // %1
  61. : "0"(nn),
  62. "1"(ptr),
  63. "r"(mean) // %4
  64. : "cc", "memory", "q0", "q1"
  65. );
  66. }
  67. #endif // __aarch64__
  68. #endif // __ARM_NEON
  69. for (; remain>0; remain--)
  70. {
  71. *ptr -= mean;
  72. ptr++;
  73. }
  74. }
  75. }
  76. else if (!mean_vals && norm_vals)
  77. {
  78. // normalize only
  79. #pragma omp parallel for
  80. for (int q=0; q<c; q++)
  81. {
  82. float* ptr = data + cstep * q;
  83. const float norm = norm_vals[q];
  84. #if __ARM_NEON
  85. int nn = size >> 2;
  86. int remain = size - (nn << 2);
  87. #else
  88. int remain = size;
  89. #endif // __ARM_NEON
  90. #if __ARM_NEON
  91. #if __aarch64__
  92. float32x4_t _norm = vdupq_n_f32(norm);
  93. for (; nn>0; nn--)
  94. {
  95. float32x4_t _ptr = vld1q_f32(ptr);
  96. _ptr = vmulq_f32(_ptr, _norm);
  97. vst1q_f32(ptr, _ptr);
  98. ptr += 4;
  99. }
  100. #else
  101. if (nn > 0)
  102. {
  103. asm volatile(
  104. "vdup.f32 q1, %4 \n"
  105. "0: \n"
  106. "pld [%1, #128] \n"
  107. "vld1.f32 {d0-d1}, [%1 :128] \n"
  108. "vmul.f32 q0, q0, q1 \n"
  109. "subs %0, #1 \n"
  110. "vst1.f32 {d0-d1}, [%1 :128]! \n"
  111. "bne 0b \n"
  112. : "=r"(nn), // %0
  113. "=r"(ptr) // %1
  114. : "0"(nn),
  115. "1"(ptr),
  116. "r"(norm) // %4
  117. : "cc", "memory", "q0", "q1"
  118. );
  119. }
  120. #endif // __aarch64__
  121. #endif // __ARM_NEON
  122. for (; remain>0; remain--)
  123. {
  124. *ptr *= norm;
  125. ptr++;
  126. }
  127. }
  128. }
  129. else if (mean_vals && norm_vals)
  130. {
  131. // substract mean and normalize
  132. #pragma omp parallel for
  133. for (int q=0; q<c; q++)
  134. {
  135. float* ptr = data + cstep * q;
  136. const float mean = mean_vals[q];
  137. const float norm = norm_vals[q];
  138. #if __ARM_NEON
  139. int nn = size >> 2;
  140. int remain = size - (nn << 2);
  141. #else
  142. int remain = size;
  143. #endif // __ARM_NEON
  144. #if __ARM_NEON
  145. #if __aarch64__
  146. float32x4_t _mean = vdupq_n_f32(mean);
  147. float32x4_t _norm = vdupq_n_f32(norm);
  148. for (; nn>0; nn--)
  149. {
  150. float32x4_t _ptr = vld1q_f32(ptr);
  151. _ptr = vsubq_f32(_ptr, _mean);
  152. _ptr = vmulq_f32(_ptr, _norm);
  153. vst1q_f32(ptr, _ptr);
  154. ptr += 4;
  155. }
  156. #else
  157. if (nn > 0)
  158. {
  159. asm volatile(
  160. "vdup.f32 q1, %4 \n"
  161. "vdup.f32 q2, %5 \n"
  162. "0: \n"
  163. "pld [%1, #128] \n"
  164. "vld1.f32 {d0-d1}, [%1 :128] \n"
  165. "vsub.f32 q0, q0, q1 \n"
  166. "vmul.f32 q0, q0, q2 \n"
  167. "subs %0, #1 \n"
  168. "vst1.f32 {d0-d1}, [%1 :128]! \n"
  169. "bne 0b \n"
  170. : "=r"(nn), // %0
  171. "=r"(ptr) // %1
  172. : "0"(nn),
  173. "1"(ptr),
  174. "r"(mean), // %4
  175. "r"(norm) // %5
  176. : "cc", "memory", "q0", "q1", "q2"
  177. );
  178. }
  179. #endif // __aarch64__
  180. #endif // __ARM_NEON
  181. for (; remain>0; remain--)
  182. {
  183. *ptr = (*ptr - mean) * norm;
  184. ptr++;
  185. }
  186. }
  187. }
  188. }
  189. // convert half precision floating point to float
  // Converts an IEEE 754 half precision (binary16) bit pattern to a float.
  //
  // value: raw 16-bit pattern laid out as 1 sign bit : 5 exponent bits :
  //        10 significand bits.
  // Returns the float (1 : 8 : 23 layout) holding the same value. Zeros keep
  // their sign, denormal halves are renormalized, and infinities / NaNs map to
  // the all-ones float exponent with the significand bits shifted into place.
  static float half2float(unsigned short value)
  {
      // Every field is held as unsigned int so that none of the shifts below
      // is evaluated on a promoted signed int; in particular `sign << 31`
      // never moves a bit into the sign position of a signed type.
      const unsigned int sign = (value & 0x8000u) >> 15;
      const unsigned int exponent = (value & 0x7c00u) >> 10;
      unsigned int significand = value & 0x03FFu;

      // exponent bias difference between float (127) and half (15)
      const unsigned int rebias = 127u - 15u;

      union
      {
          unsigned int u;
          float f;
      } tmp;

      if (exponent == 0x1Fu)
      {
          // infinity or NaN
          tmp.u = (sign << 31) | (0xFFu << 23) | (significand << 13);
      }
      else if (exponent != 0)
      {
          // normalized
          tmp.u = (sign << 31) | ((exponent + rebias) << 23) | (significand << 13);
      }
      else if (significand == 0)
      {
          // zero
          tmp.u = (sign << 31);
      }
      else
      {
          // denormal: shift until the leading one reaches bit 9, counting
          // how many positions were needed
          unsigned int shift = 0;
          while ((significand & 0x200u) == 0)
          {
              significand <<= 1;
              shift++;
          }

          // drop the leading one, it becomes the implicit bit of the float
          significand <<= 1;
          significand &= 0x3FFu;

          tmp.u = (sign << 31) | ((rebias - shift) << 23) | (significand << 13);
      }

      return tmp.f;
  }
  237. Mat Mat::from_float16(const unsigned short* data, int size)
  238. {
  239. Mat m(size);
  240. if (m.empty())
  241. return m;
  242. float* ptr = m.data;
  243. #if __ARM_NEON && (__ARM_FP & 2)
  244. int nn = cpu_support_arm_vfpv4() ? size >> 2 : 0;
  245. int remain = size - (nn << 2);
  246. #else
  247. int remain = size;
  248. #endif // __ARM_NEON
  249. #if __ARM_NEON && (__ARM_FP & 2)
  250. #if __aarch64__
  251. if (nn > 0)
  252. {
  253. asm volatile(
  254. "0: \n"
  255. "ldr d0, [%1], #8 \n"
  256. "fcvtl v1.4s, v0.4h \n"
  257. "subs %w0, %w0, #1 \n"
  258. "str q1, [%2], #16 \n"
  259. "bne 0b \n"
  260. : "=r"(nn), // %0
  261. "=r"(data), // %1
  262. "=r"(ptr) // %2
  263. : "0"(nn),
  264. "1"(data),
  265. "2"(ptr)
  266. : "cc", "memory", "v0", "v1"
  267. );
  268. }
  269. #else
  270. if (nn > 0)
  271. {
  272. asm volatile(
  273. "0: \n"
  274. "pld [%1, #64] \n"
  275. "vld1.s16 {d0}, [%1 :64]! \n"
  276. "vcvt.f32.f16 q1, d0 \n"
  277. "subs %0, #1 \n"
  278. "vst1.f32 {d2-d3}, [%2 :128]! \n"
  279. "bne 0b \n"
  280. : "=r"(nn), // %0
  281. "=r"(data), // %1
  282. "=r"(ptr) // %2
  283. : "0"(nn),
  284. "1"(data),
  285. "2"(ptr)
  286. : "cc", "memory", "q0", "q1"
  287. );
  288. }
  289. #endif // __aarch64__
  290. #endif // __ARM_NEON
  291. for (; remain>0; remain--)
  292. {
  293. *ptr = half2float(*data);
  294. data++;
  295. ptr++;
  296. }
  297. return m;
  298. }
  299. static void copy_make_border_image(const Mat& src, Mat& dst, int top, int left, int type, float v)
  300. {
  301. int w = dst.w;
  302. int h = dst.h;
  303. const float* ptr = src.data;
  304. float* outptr = dst.data;
  305. if (type == BORDER_CONSTANT)
  306. {
  307. int y = 0;
  308. // fill top
  309. for (; y < top; y++)
  310. {
  311. int x = 0;
  312. for (; x < w; x++)
  313. {
  314. outptr[x] = v;
  315. }
  316. outptr += w;
  317. }
  318. // fill center
  319. for (; y < (top + src.h); y++)
  320. {
  321. int x = 0;
  322. for (; x < left; x++)
  323. {
  324. outptr[x] = v;
  325. }
  326. for (; x < (left + src.w); x++)
  327. {
  328. outptr[x] = ptr[x - left];
  329. }
  330. for (; x < w; x++)
  331. {
  332. outptr[x] = v;
  333. }
  334. ptr += src.w;
  335. outptr += w;
  336. }
  337. // fill bottom
  338. for (; y < h; y++)
  339. {
  340. int x = 0;
  341. for (; x < w; x++)
  342. {
  343. outptr[x] = v;
  344. }
  345. outptr += w;
  346. }
  347. }
  348. else if (type == BORDER_REPLICATE)
  349. {
  350. int y = 0;
  351. // fill top
  352. for (; y < top; y++)
  353. {
  354. int x = 0;
  355. for (; x < left; x++)
  356. {
  357. outptr[x] = ptr[0];
  358. }
  359. for (; x < (left + src.w); x++)
  360. {
  361. outptr[x] = ptr[x - left];
  362. }
  363. for (; x < w; x++)
  364. {
  365. outptr[x] = ptr[src.w - 1];
  366. }
  367. outptr += w;
  368. }
  369. // fill center
  370. for (; y < (top + src.h); y++)
  371. {
  372. int x = 0;
  373. for (; x < left; x++)
  374. {
  375. outptr[x] = ptr[0];
  376. }
  377. for (; x < (left + src.w); x++)
  378. {
  379. outptr[x] = ptr[x - left];
  380. }
  381. for (; x < w; x++)
  382. {
  383. outptr[x] = ptr[src.w - 1];
  384. }
  385. ptr += src.w;
  386. outptr += w;
  387. }
  388. // fill bottom
  389. ptr -= src.w;
  390. for (; y < h; y++)
  391. {
  392. int x = 0;
  393. for (; x < left; x++)
  394. {
  395. outptr[x] = ptr[0];
  396. }
  397. for (; x < (left + src.w); x++)
  398. {
  399. outptr[x] = ptr[x - left];
  400. }
  401. for (; x < w; x++)
  402. {
  403. outptr[x] = ptr[src.w - 1];
  404. }
  405. outptr += w;
  406. }
  407. }
  408. }
  409. void copy_make_border(const Mat& src, Mat& dst, int top, int bottom, int left, int right, int type, float v)
  410. {
  411. int w = src.w + left + right;
  412. int h = src.h + top + bottom;
  413. if (src.dims == 2)
  414. {
  415. dst.create(w, h);
  416. if (dst.empty())
  417. return;
  418. copy_make_border_image(src, dst, top, left, type, v);
  419. }
  420. else if (src.dims == 3)
  421. {
  422. int channels = src.c;
  423. dst.create(w, h, channels);
  424. if (dst.empty())
  425. return;
  426. // unroll image channel
  427. #pragma omp parallel for
  428. for (int q=0; q<channels; q++)
  429. {
  430. const Mat m = src.channel(q);
  431. Mat borderm = dst.channel(q);
  432. copy_make_border_image(m, borderm, top, left, type, v);
  433. }
  434. }
  435. }
  436. static void copy_cut_border_image(const Mat& src, Mat& dst, int top, int left)
  437. {
  438. int w = dst.w;
  439. int h = dst.h;
  440. const float* ptr = src.data + src.w * top + left;
  441. float* outptr = dst.data;
  442. for (int y = 0; y < h; y++)
  443. {
  444. for (int x = 0; x < w; x++)
  445. {
  446. outptr[x] = ptr[x];
  447. }
  448. outptr += w;
  449. ptr += src.w;
  450. }
  451. }
  452. void copy_cut_border(const Mat& src, Mat& dst, int top, int bottom, int left, int right)
  453. {
  454. int w = src.w - left - right;
  455. int h = src.h - top - bottom;
  456. if (src.dims == 2)
  457. {
  458. dst.create(w, h);
  459. if (dst.empty())
  460. return;
  461. copy_cut_border_image(src, dst, top, left);
  462. }
  463. else if (src.dims == 3)
  464. {
  465. int channels = src.c;
  466. dst.create(w, h, channels);
  467. if (dst.empty())
  468. return;
  469. // unroll image channel
  470. #pragma omp parallel for
  471. for (int q=0; q<channels; q++)
  472. {
  473. const Mat m = src.channel(q);
  474. Mat cutm = dst.channel(q);
  475. copy_cut_border_image(m, cutm, top, left);
  476. }
  477. }
  478. }
  479. } // namespace ncnn