You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

interp_arm.cpp 14 kB

6 years ago
6 years ago
6 years ago
6 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447
  1. // Tencent is pleased to support the open source community by making ncnn available.
  2. //
  3. // Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
  4. //
  5. // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6. // in compliance with the License. You may obtain a copy of the License at
  7. //
  8. // https://opensource.org/licenses/BSD-3-Clause
  9. //
  10. // Unless required by applicable law or agreed to in writing, software distributed
  11. // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
  12. // CONDITIONS OF ANY KIND, either express or implied. See the License for the
  13. // specific language governing permissions and limitations under the License.
  14. #include "interp_arm.h"
  15. #include <math.h>
  16. #if __ARM_NEON
  17. #include <arm_neon.h>
  18. #endif // __ARM_NEON
  19. namespace ncnn {
  20. #include "interp_bicubic.h"
  21. #include "interp_bicubic_bf16s.h"
  22. #include "interp_bilinear.h"
  23. #include "interp_bilinear_bf16s.h"
  24. #if __ARM_NEON
  25. #include "interp_bicubic_pack4.h"
  26. #include "interp_bicubic_pack4_bf16s.h"
  27. #include "interp_bilinear_pack4.h"
  28. #include "interp_bilinear_pack4_bf16s.h"
  29. #endif
  30. Interp_arm::Interp_arm()
  31. {
  32. #if __ARM_NEON
  33. support_packing = true;
  34. #endif // __ARM_NEON
  35. support_bf16_storage = true;
  36. }
  37. int Interp_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
  38. {
  39. if (opt.use_bf16_storage)
  40. return forward_bf16s(bottom_blob, top_blob, opt);
  41. int h = bottom_blob.h;
  42. int w = bottom_blob.w;
  43. int channels = bottom_blob.c;
  44. int dims = bottom_blob.dims;
  45. size_t elemsize = bottom_blob.elemsize;
  46. int elempack = bottom_blob.elempack;
  47. if (dims == 1)
  48. {
  49. return Interp::forward(bottom_blob, top_blob, opt);
  50. }
  51. int outh = output_height;
  52. int outw = output_width;
  53. if (outh == 0 || outw == 0)
  54. {
  55. outh = h * height_scale;
  56. outw = w * width_scale;
  57. }
  58. if (outh == h && outw == w)
  59. {
  60. top_blob = bottom_blob;
  61. return 0;
  62. }
  63. top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator);
  64. if (top_blob.empty())
  65. return -100;
  66. #if __ARM_NEON
  67. if (elempack == 4)
  68. {
  69. if (resize_type == 1) // nearest
  70. {
  71. const float hs = outh ? h / (float)outh : 1.f / height_scale;
  72. const float ws = outw ? w / (float)outw : 1.f / width_scale;
  73. #pragma omp parallel for num_threads(opt.num_threads)
  74. for (int q = 0; q < channels; q++)
  75. {
  76. const Mat src = bottom_blob.channel(q);
  77. Mat dst = top_blob.channel(q);
  78. for (int y = 0; y < outh; y++)
  79. {
  80. int in_y = std::min((int)(y * hs), (h - 1));
  81. const float* ptr = src.row(in_y);
  82. float* outptr = dst.row(y);
  83. for (int x = 0; x < outw; x++)
  84. {
  85. int in_x = std::min((int)(x * ws), (w - 1));
  86. float32x4_t _p = vld1q_f32(ptr + in_x * 4);
  87. vst1q_f32(outptr, _p);
  88. outptr += 4;
  89. }
  90. }
  91. }
  92. }
  93. if (resize_type == 2) // bilinear
  94. {
  95. int* buf = new int[outw + outh + outw * 2 + outh * 2];
  96. int* xofs = buf; //new int[outw];
  97. int* yofs = buf + outw; //new int[outh];
  98. float* alpha = (float*)(buf + outw + outh); //new float[outw * 2];
  99. float* beta = (float*)(buf + outw + outh + outw * 2); //new float[outh * 2];
  100. linear_coeffs(w, outw, xofs, alpha);
  101. linear_coeffs(h, outh, yofs, beta);
  102. #pragma omp parallel for num_threads(opt.num_threads)
  103. for (int q = 0; q < channels; q++)
  104. {
  105. const Mat src = bottom_blob.channel(q);
  106. Mat dst = top_blob.channel(q);
  107. resize_bilinear_image_pack4(src, dst, alpha, xofs, beta, yofs);
  108. }
  109. delete[] buf;
  110. }
  111. if (resize_type == 3) // bicubic
  112. {
  113. int* buf = new int[outw + outh + outw * 4 + outh * 4];
  114. int* xofs = buf; //new int[outw];
  115. int* yofs = buf + outw; //new int[outh];
  116. float* alpha = (float*)(buf + outw + outh); //new float[outw * 4];
  117. float* beta = (float*)(buf + outw + outh + outw * 4); //new float[outh * 4];
  118. cubic_coeffs(w, outw, xofs, alpha);
  119. cubic_coeffs(h, outh, yofs, beta);
  120. #pragma omp parallel for num_threads(opt.num_threads)
  121. for (int q = 0; q < channels; q++)
  122. {
  123. const Mat src = bottom_blob.channel(q);
  124. Mat dst = top_blob.channel(q);
  125. resize_bicubic_image_pack4(src, dst, alpha, xofs, beta, yofs);
  126. }
  127. delete[] buf;
  128. }
  129. return 0;
  130. }
  131. #endif // __ARM_NEON
  132. if (resize_type == 1) // nearest
  133. {
  134. const float hs = outh ? h / (float)outh : 1.f / height_scale;
  135. const float ws = outw ? w / (float)outw : 1.f / width_scale;
  136. #pragma omp parallel for num_threads(opt.num_threads)
  137. for (int q = 0; q < channels; q++)
  138. {
  139. const Mat src = bottom_blob.channel(q);
  140. Mat dst = top_blob.channel(q);
  141. for (int y = 0; y < outh; y++)
  142. {
  143. int in_y = std::min((int)(y * hs), (h - 1));
  144. const float* ptr = src.row(in_y);
  145. float* outptr = dst.row(y);
  146. for (int x = 0; x < outw; x++)
  147. {
  148. int in_x = std::min((int)(x * ws), (w - 1));
  149. *outptr++ = ptr[in_x];
  150. }
  151. }
  152. }
  153. }
  154. if (resize_type == 2) // bilinear
  155. {
  156. int* buf = new int[outw + outh + outw * 2 + outh * 2];
  157. int* xofs = buf; //new int[outw];
  158. int* yofs = buf + outw; //new int[outh];
  159. float* alpha = (float*)(buf + outw + outh); //new float[outw * 2];
  160. float* beta = (float*)(buf + outw + outh + outw * 2); //new float[outh * 2];
  161. linear_coeffs(w, outw, xofs, alpha);
  162. linear_coeffs(h, outh, yofs, beta);
  163. #pragma omp parallel for num_threads(opt.num_threads)
  164. for (int q = 0; q < channels; q++)
  165. {
  166. const Mat src = bottom_blob.channel(q);
  167. Mat dst = top_blob.channel(q);
  168. resize_bilinear_image(src, dst, alpha, xofs, beta, yofs);
  169. }
  170. delete[] buf;
  171. }
  172. if (resize_type == 3) // bicubic
  173. {
  174. int* buf = new int[outw + outh + outw * 4 + outh * 4];
  175. int* xofs = buf; //new int[outw];
  176. int* yofs = buf + outw; //new int[outh];
  177. float* alpha = (float*)(buf + outw + outh); //new float[outw * 4];
  178. float* beta = (float*)(buf + outw + outh + outw * 4); //new float[outh * 4];
  179. cubic_coeffs(w, outw, xofs, alpha);
  180. cubic_coeffs(h, outh, yofs, beta);
  181. #pragma omp parallel for num_threads(opt.num_threads)
  182. for (int q = 0; q < channels; q++)
  183. {
  184. const Mat src = bottom_blob.channel(q);
  185. Mat dst = top_blob.channel(q);
  186. resize_bicubic_image(src, dst, alpha, xofs, beta, yofs);
  187. }
  188. delete[] buf;
  189. }
  190. return 0;
  191. }
  192. int Interp_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
  193. {
  194. int h = bottom_blob.h;
  195. int w = bottom_blob.w;
  196. int channels = bottom_blob.c;
  197. int dims = bottom_blob.dims;
  198. size_t elemsize = bottom_blob.elemsize;
  199. int elempack = bottom_blob.elempack;
  200. if (dims == 1)
  201. {
  202. return Interp::forward(bottom_blob, top_blob, opt);
  203. }
  204. int outh = output_height;
  205. int outw = output_width;
  206. if (outh == 0 || outw == 0)
  207. {
  208. outh = h * height_scale;
  209. outw = w * width_scale;
  210. }
  211. if (outh == h && outw == w)
  212. {
  213. top_blob = bottom_blob;
  214. return 0;
  215. }
  216. top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator);
  217. if (top_blob.empty())
  218. return -100;
  219. #if __ARM_NEON
  220. if (elempack == 4)
  221. {
  222. if (resize_type == 1) // nearest
  223. {
  224. const float hs = outh ? h / (float)outh : 1.f / height_scale;
  225. const float ws = outw ? w / (float)outw : 1.f / width_scale;
  226. #pragma omp parallel for num_threads(opt.num_threads)
  227. for (int q = 0; q < channels; q++)
  228. {
  229. const Mat src = bottom_blob.channel(q);
  230. Mat dst = top_blob.channel(q);
  231. for (int y = 0; y < outh; y++)
  232. {
  233. int in_y = std::min((int)(y * hs), (h - 1));
  234. const unsigned short* ptr = src.row<const unsigned short>(in_y);
  235. unsigned short* outptr = dst.row<unsigned short>(y);
  236. for (int x = 0; x < outw; x++)
  237. {
  238. int in_x = std::min((int)(x * ws), (w - 1));
  239. uint16x4_t _p = vld1_u16(ptr + in_x * 4);
  240. vst1_u16(outptr, _p);
  241. outptr += 4;
  242. }
  243. }
  244. }
  245. }
  246. if (resize_type == 2) // bilinear
  247. {
  248. int* buf = new int[outw + outh + outw * 2 + outh * 2];
  249. int* xofs = buf; //new int[outw];
  250. int* yofs = buf + outw; //new int[outh];
  251. float* alpha = (float*)(buf + outw + outh); //new float[outw * 2];
  252. float* beta = (float*)(buf + outw + outh + outw * 2); //new float[outh * 2];
  253. linear_coeffs(w, outw, xofs, alpha);
  254. linear_coeffs(h, outh, yofs, beta);
  255. #pragma omp parallel for num_threads(opt.num_threads)
  256. for (int q = 0; q < channels; q++)
  257. {
  258. const Mat src = bottom_blob.channel(q);
  259. Mat dst = top_blob.channel(q);
  260. resize_bilinear_image_pack4_bf16s(src, dst, alpha, xofs, beta, yofs);
  261. }
  262. delete[] buf;
  263. }
  264. if (resize_type == 3) // bicubic
  265. {
  266. int* buf = new int[outw + outh + outw * 4 + outh * 4];
  267. int* xofs = buf; //new int[outw];
  268. int* yofs = buf + outw; //new int[outh];
  269. float* alpha = (float*)(buf + outw + outh); //new float[outw * 4];
  270. float* beta = (float*)(buf + outw + outh + outw * 4); //new float[outh * 4];
  271. cubic_coeffs(w, outw, xofs, alpha);
  272. cubic_coeffs(h, outh, yofs, beta);
  273. #pragma omp parallel for num_threads(opt.num_threads)
  274. for (int q = 0; q < channels; q++)
  275. {
  276. const Mat src = bottom_blob.channel(q);
  277. Mat dst = top_blob.channel(q);
  278. resize_bicubic_image_pack4_bf16s(src, dst, alpha, xofs, beta, yofs);
  279. }
  280. delete[] buf;
  281. }
  282. return 0;
  283. }
  284. #endif // __ARM_NEON
  285. if (resize_type == 1) // nearest
  286. {
  287. const float hs = outh ? h / (float)outh : 1.f / height_scale;
  288. const float ws = outw ? w / (float)outw : 1.f / width_scale;
  289. #pragma omp parallel for num_threads(opt.num_threads)
  290. for (int q = 0; q < channels; q++)
  291. {
  292. const Mat src = bottom_blob.channel(q);
  293. Mat dst = top_blob.channel(q);
  294. for (int y = 0; y < outh; y++)
  295. {
  296. int in_y = std::min((int)(y * hs), (h - 1));
  297. const unsigned short* ptr = src.row<const unsigned short>(in_y);
  298. unsigned short* outptr = dst.row<unsigned short>(y);
  299. for (int x = 0; x < outw; x++)
  300. {
  301. int in_x = std::min((int)(x * ws), (w - 1));
  302. *outptr++ = ptr[in_x];
  303. }
  304. }
  305. }
  306. }
  307. if (resize_type == 2) // bilinear
  308. {
  309. int* buf = new int[outw + outh + outw * 2 + outh * 2];
  310. int* xofs = buf; //new int[outw];
  311. int* yofs = buf + outw; //new int[outh];
  312. float* alpha = (float*)(buf + outw + outh); //new float[outw * 2];
  313. float* beta = (float*)(buf + outw + outh + outw * 2); //new float[outh * 2];
  314. linear_coeffs(w, outw, xofs, alpha);
  315. linear_coeffs(h, outh, yofs, beta);
  316. #pragma omp parallel for num_threads(opt.num_threads)
  317. for (int q = 0; q < channels; q++)
  318. {
  319. const Mat src = bottom_blob.channel(q);
  320. Mat dst = top_blob.channel(q);
  321. resize_bilinear_image_bf16s(src, dst, alpha, xofs, beta, yofs);
  322. }
  323. delete[] buf;
  324. }
  325. if (resize_type == 3) // bicubic
  326. {
  327. int* buf = new int[outw + outh + outw * 4 + outh * 4];
  328. int* xofs = buf; //new int[outw];
  329. int* yofs = buf + outw; //new int[outh];
  330. float* alpha = (float*)(buf + outw + outh); //new float[outw * 4];
  331. float* beta = (float*)(buf + outw + outh + outw * 4); //new float[outh * 4];
  332. cubic_coeffs(w, outw, xofs, alpha);
  333. cubic_coeffs(h, outh, yofs, beta);
  334. #pragma omp parallel for num_threads(opt.num_threads)
  335. for (int q = 0; q < channels; q++)
  336. {
  337. const Mat src = bottom_blob.channel(q);
  338. Mat dst = top_blob.channel(q);
  339. resize_bicubic_image_bf16s(src, dst, alpha, xofs, beta, yofs);
  340. }
  341. delete[] buf;
  342. }
  343. return 0;
  344. }
  345. } // namespace ncnn