You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

requantize_arm.cpp 12 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325
  1. // SenseNets is pleased to support the open source community by supporting ncnn available.
  2. //
  3. // Copyright (C) 2019 SenseNets Technology Ltd. All rights reserved.
  4. //
  5. // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6. // in compliance with the License. You may obtain a copy of the License at
  7. //
  8. // https://opensource.org/licenses/BSD-3-Clause
  9. //
  10. // Unless required by applicable law or agreed to in writing, software distributed
  11. // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
  12. // CONDITIONS OF ANY KIND, either express or implied. See the License for the
  13. // specific language governing permissions and limitations under the License.
  14. #include "requantize_arm.h"
  15. #include <math.h>
  16. #if __ARM_NEON
  17. #include <arm_neon.h>
  18. #endif // __ARM_NEON
  19. namespace ncnn {
  20. DEFINE_LAYER_CREATOR(Requantize_arm)
  // Rounds v to the nearest integer (halfway cases away from zero, as roundf
  // does) and saturates the result to the signed 8-bit range [-128, 127].
  //
  // The range check is performed on the float value *before* converting to an
  // integer type: converting a float that does not fit in int (or a NaN) is
  // undefined behaviour, and on some targets yields INT_MIN, which would turn a
  // huge positive input into -128 instead of 127.
  // NaN has no meaningful quantized value; it is mapped to 0.
  static inline signed char float2int8(float v)
  {
      const float rounded = roundf(v);

      if (rounded != rounded) return 0; // NaN compares unequal to itself
      if (rounded > 127.f) return 127;
      if (rounded < -128.f) return -128;

      // rounded is now an integral value in [-128, 127], so the conversion is exact.
      return static_cast<signed char>(static_cast<int>(rounded));
  }
  28. int Requantize_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
  29. {
  30. int dims = bottom_blob.dims;
  31. if (dims == 1)
  32. {
  33. int w = bottom_blob.w;
  34. const int* intptr = bottom_blob;
  35. signed char * ptr = top_blob;
  36. if (bias_term)
  37. {
  38. if (bias_data_size > 1)
  39. {
  40. #pragma omp parallel for num_threads(opt.num_threads)
  41. for (int i=0; i<w; i++)
  42. {
  43. ptr[i] = float2int8(((intptr[i] * scale_in) + bias_data[i]) * scale_out);
  44. if (fusion_relu && ptr[i] < 0)
  45. ptr[i] = 0;
  46. }
  47. }
  48. else
  49. {
  50. float bias = bias_data[0];
  51. #pragma omp parallel for num_threads(opt.num_threads)
  52. for (int i=0; i<w; i++)
  53. {
  54. ptr[i] = float2int8(((intptr[i] * scale_in) + bias) * scale_out);
  55. if (fusion_relu && ptr[i] < 0)
  56. ptr[i] = 0;
  57. }
  58. }
  59. }
  60. else
  61. {
  62. #pragma omp parallel for num_threads(opt.num_threads)
  63. for (int i=0; i<w; i++)
  64. {
  65. ptr[i] = float2int8(intptr[i] * scale_in * scale_out);
  66. if (fusion_relu && ptr[i] < 0)
  67. ptr[i] = 0;
  68. }
  69. }
  70. }
  71. if (dims == 2)
  72. {
  73. int w = bottom_blob.w;
  74. int h = bottom_blob.h;
  75. if (bias_term)
  76. {
  77. #pragma omp parallel for num_threads(opt.num_threads)
  78. for (int i=0; i<h; i++)
  79. {
  80. const int* intptr = bottom_blob.row<const int>(i);
  81. signed char* ptr = top_blob.row<signed char>(i);
  82. float bias = bias_data_size > 1 ? bias_data[i] : bias_data[0];
  83. for (int j=0; j<w; j++)
  84. {
  85. ptr[j] = float2int8(((intptr[j] * scale_in) + bias) * scale_out);
  86. if (fusion_relu && ptr[j] < 0)
  87. ptr[j] = 0;
  88. }
  89. }
  90. }
  91. else
  92. {
  93. #pragma omp parallel for num_threads(opt.num_threads)
  94. for (int i=0; i<h; i++)
  95. {
  96. const int* intptr = bottom_blob.row<const int>(i);
  97. signed char* ptr = top_blob.row<signed char>(i);
  98. for (int j=0; j<w; j++)
  99. {
  100. ptr[j] = float2int8(intptr[j] * scale_in * scale_out);
  101. if (fusion_relu && ptr[j] < 0)
  102. ptr[j] = 0;
  103. }
  104. }
  105. }
  106. }
  107. if (dims == 3)
  108. {
  109. int w = bottom_blob.w;
  110. int h = bottom_blob.h;
  111. int channels = bottom_blob.c;
  112. int size = w * h;
  113. if (bias_term)
  114. {
  115. #pragma omp parallel for num_threads(opt.num_threads)
  116. for (int q=0; q<channels; q++)
  117. {
  118. const int* intptr = bottom_blob.channel(q);
  119. signed char* ptr = top_blob.channel(q);
  120. float bias = bias_data_size > 1 ? bias_data[q] : bias_data[0];
  121. #if __ARM_NEON
  122. int nn = size >> 3;
  123. int remain = size & 7;
  124. #if __aarch64__
  125. for (; nn>0; nn--)
  126. {
  127. ptr[0] = float2int8(((intptr[0] * scale_in) + bias) * scale_out);
  128. ptr[1] = float2int8(((intptr[1] * scale_in) + bias) * scale_out);
  129. ptr[2] = float2int8(((intptr[2] * scale_in) + bias) * scale_out);
  130. ptr[3] = float2int8(((intptr[3] * scale_in) + bias) * scale_out);
  131. ptr[4] = float2int8(((intptr[4] * scale_in) + bias) * scale_out);
  132. ptr[5] = float2int8(((intptr[5] * scale_in) + bias) * scale_out);
  133. ptr[6] = float2int8(((intptr[6] * scale_in) + bias) * scale_out);
  134. ptr[7] = float2int8(((intptr[7] * scale_in) + bias) * scale_out);
  135. ptr += 8;
  136. intptr += 8;
  137. }
  138. #else
  139. if (nn > 0)
  140. {
  141. asm volatile(
  142. "pld [%1, #256] \n"
  143. "vld1.s32 {d0-d3}, [%1:128]! \n" //q0-q1 data
  144. "vdup.f32 q10, %6 \n" //q10 scale_in
  145. "vdup.f32 q11, %7 \n" //q11 scale_out
  146. "vdup.f32 q12, %8 \n" //q12 bias
  147. "0: \n"
  148. // top_s32 -> top_f32
  149. "vcvt.f32.s32 q0, q0 \n"
  150. "vcvt.f32.s32 q1, q1 \n"
  151. // top_f32 = top_f32 * scale_int
  152. "vmul.f32 q0, q0, q10 \n"
  153. "vmul.f32 q1, q1, q10 \n"
  154. // top_f32 = top_f32 + bias
  155. "vadd.f32 q0, q0, q12 \n"
  156. "vadd.f32 q1, q1, q12 \n"
  157. // top_f32 = top_f32 * scale_out
  158. "vmul.f32 q0, q0, q11 \n"
  159. "vmul.f32 q1, q1, q11 \n"
  160. // top_f32 -> top_s32
  161. "vcvtr.s32.f32 s0, s0 \n"
  162. "vcvtr.s32.f32 s1, s1 \n"
  163. "vcvtr.s32.f32 s2, s2 \n"
  164. "vcvtr.s32.f32 s3, s3 \n"
  165. "vcvtr.s32.f32 s4, s4 \n"
  166. "vcvtr.s32.f32 s5, s5 \n"
  167. "vcvtr.s32.f32 s6, s6 \n"
  168. "vcvtr.s32.f32 s7, s7 \n"
  169. // top_s32 -> top_s16
  170. "vqmovn.s32 d4, q0 \n"
  171. "vqmovn.s32 d5, q1 \n"
  172. "pld [%1, #256] \n"
  173. "vld1.s32 {d0-d3}, [%1:128]! \n" //q0-q1 data
  174. // top_s16 -> top_s8
  175. "vqmovn.s16 d4, q2 \n"
  176. // save top_s8
  177. "vst1.8 {d4}, [%2:64]! \n"
  178. "subs %0, #1 \n"
  179. "bne 0b \n"
  180. "sub %1, #32 \n"
  181. : "=r"(nn), // %0
  182. "=r"(intptr), // %1
  183. "=r"(ptr) // %2
  184. : "0"(nn),
  185. "1"(intptr),
  186. "2"(ptr),
  187. "r"(scale_in), // %6
  188. "r"(scale_out), // %7
  189. "r"(bias) // %8
  190. : "cc", "memory", "q0", "q1", "q2", "q10", "q11", "q12"
  191. );
  192. }
  193. #endif // __aarch64__
  194. #else
  195. int remain = size;
  196. #endif // __ARM_NEON
  197. for (; remain > 0; remain--)
  198. {
  199. *ptr = float2int8(((*intptr * scale_in) + bias) * scale_out);
  200. intptr++;
  201. ptr ++;
  202. }
  203. }
  204. }
  205. else
  206. {
  207. #pragma omp parallel for num_threads(opt.num_threads)
  208. for (int q=0; q<channels; q++)
  209. {
  210. const int* intptr = bottom_blob.channel(q);
  211. signed char* ptr = top_blob.channel(q);
  212. #if __ARM_NEON
  213. int nn = size >> 3;
  214. int remain = size & 7;
  215. #if __aarch64__
  216. //TODO
  217. for (; nn>0; nn--)
  218. {
  219. ptr[0] = float2int8(intptr[0] * scale_in * scale_out);
  220. ptr[1] = float2int8(intptr[1] * scale_in * scale_out);
  221. ptr[2] = float2int8(intptr[2] * scale_in * scale_out);
  222. ptr[3] = float2int8(intptr[3] * scale_in * scale_out);
  223. ptr[4] = float2int8(intptr[4] * scale_in * scale_out);
  224. ptr[5] = float2int8(intptr[5] * scale_in * scale_out);
  225. ptr[6] = float2int8(intptr[6] * scale_in * scale_out);
  226. ptr[7] = float2int8(intptr[7] * scale_in * scale_out);
  227. ptr += 8;
  228. intptr += 8;
  229. }
  230. #else
  231. if (nn > 0)
  232. {
  233. asm volatile(
  234. "pld [%1, #256] \n"
  235. "vld1.s32 {d0-d3}, [%1:128]! \n" //q0-q1 data
  236. "vdup.f32 q10, %6 \n" //q10 scale_in
  237. "vdup.f32 q11, %7 \n" //q11 scale_out
  238. "0: \n"
  239. // top_s32 -> top_f32
  240. "vcvt.f32.s32 q0, q0 \n"
  241. "vcvt.f32.s32 q1, q1 \n"
  242. // top_f32 = top_f32 * scale_int
  243. "vmul.f32 q0, q0, q10 \n"
  244. "vmul.f32 q1, q1, q10 \n"
  245. // top_f32 = top_f32 * scale_out
  246. "vmul.f32 q0, q0, q11 \n"
  247. "vmul.f32 q1, q1, q11 \n"
  248. // top_f32 -> top_s32
  249. "vcvtr.s32.f32 s0, s0 \n"
  250. "vcvtr.s32.f32 s1, s1 \n"
  251. "vcvtr.s32.f32 s2, s2 \n"
  252. "vcvtr.s32.f32 s3, s3 \n"
  253. "vcvtr.s32.f32 s4, s4 \n"
  254. "vcvtr.s32.f32 s5, s5 \n"
  255. "vcvtr.s32.f32 s6, s6 \n"
  256. "vcvtr.s32.f32 s7, s7 \n"
  257. // top_s32 -> top_s16
  258. "vqmovn.s32 d4, q0 \n"
  259. "vqmovn.s32 d5, q1 \n"
  260. "pld [%1, #256] \n"
  261. "vld1.s32 {d0-d3}, [%1:128]! \n" //q0-q1 data
  262. // top_s16 -> top_s8
  263. "vqmovn.s16 d4, q2 \n"
  264. // save top_s8
  265. "vst1.8 {d4}, [%2:64]! \n"
  266. "subs %0, #1 \n"
  267. "bne 0b \n"
  268. "sub %1, #32 \n"
  269. : "=r"(nn), // %0
  270. "=r"(intptr), // %1
  271. "=r"(ptr) // %2
  272. : "0"(nn),
  273. "1"(intptr),
  274. "2"(ptr),
  275. "r"(scale_in), // %6
  276. "r"(scale_out) // %7
  277. : "cc", "memory", "q0", "q1", "q2", "q10", "q11"
  278. );
  279. }
  280. #endif // __aarch64__
  281. #else
  282. int remain = size;
  283. #endif // __ARM_NEON
  284. for (; remain > 0; remain--)
  285. {
  286. *ptr = float2int8(*intptr * scale_in * scale_out);
  287. intptr++;
  288. ptr ++;
  289. }
  290. }
  291. }
  292. }
  293. return 0;
  294. }
  295. } // namespace ncnn