You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

dequantize_arm.cpp 9.9 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289
  1. // Tencent is pleased to support the open source community by making ncnn available.
  2. //
  3. // Copyright (C) 2019 BUG1989. All rights reserved.
  4. // Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
  5. //
  6. // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  7. // in compliance with the License. You may obtain a copy of the License at
  8. //
  9. // https://opensource.org/licenses/BSD-3-Clause
  10. //
  11. // Unless required by applicable law or agreed to in writing, software distributed
  12. // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
  13. // CONDITIONS OF ANY KIND, either express or implied. See the License for the
  14. // specific language governing permissions and limitations under the License.
  15. #include "dequantize_arm.h"
  16. namespace ncnn {
  17. DEFINE_LAYER_CREATOR(Dequantize_arm)
  18. int Dequantize_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
  19. {
  20. int dims = bottom_top_blob.dims;
  21. if (dims == 1)
  22. {
  23. int w = bottom_top_blob.w;
  24. int* intptr = bottom_top_blob;
  25. float* ptr = bottom_top_blob;
  26. if (bias_term)
  27. {
  28. #pragma omp parallel for num_threads(opt.num_threads)
  29. for (int i=0; i<w; i++)
  30. {
  31. ptr[i] = intptr[i] * scale + bias_data[i];
  32. }
  33. }
  34. else
  35. {
  36. #pragma omp parallel for num_threads(opt.num_threads)
  37. for (int i=0; i<w; i++)
  38. {
  39. ptr[i] = intptr[i] * scale;
  40. }
  41. }
  42. }
  43. if (dims == 2)
  44. {
  45. int w = bottom_top_blob.w;
  46. int h = bottom_top_blob.h;
  47. if (bias_term)
  48. {
  49. #pragma omp parallel for num_threads(opt.num_threads)
  50. for (int i=0; i<h; i++)
  51. {
  52. const int* intptr = bottom_top_blob.row<const int>(i);
  53. float* ptr = bottom_top_blob.row(i);
  54. float bias = bias_data_size > 1 ? bias_data[i] : bias_data[0];
  55. for (int j=0; j<w; j++)
  56. {
  57. ptr[j] = intptr[j] * scale + bias;
  58. }
  59. }
  60. }
  61. else
  62. {
  63. #pragma omp parallel for num_threads(opt.num_threads)
  64. for (int i=0; i<h; i++)
  65. {
  66. const int* intptr = bottom_top_blob.row<const int>(i);
  67. float* ptr = bottom_top_blob.row(i);
  68. for (int j=0; j<w; j++)
  69. {
  70. ptr[j] = intptr[j] * scale;
  71. }
  72. }
  73. }
  74. }
  75. if (dims == 3)
  76. {
  77. int w = bottom_top_blob.w;
  78. int h = bottom_top_blob.h;
  79. int channels = bottom_top_blob.c;
  80. int size = w * h;
  81. if (bias_term)
  82. {
  83. #pragma omp parallel for num_threads(opt.num_threads)
  84. for (int q=0; q<channels; q++)
  85. {
  86. int* intptr = bottom_top_blob.channel(q);
  87. float* ptr = bottom_top_blob.channel(q);
  88. float bias = bias_data[q];
  89. #if __ARM_NEON
  90. int nn = size >> 3;
  91. int remain = size & 7;
  92. #else
  93. int remain = size;
  94. #endif // __ARM_NEON
  95. #if __ARM_NEON
  96. #if __aarch64__
  97. if (nn > 0)
  98. {
  99. asm volatile(
  100. "dup v2.4s, %w6 \n" // scale
  101. "dup v3.4s, %w7 \n" // bias
  102. "0: \n"
  103. "prfm pldl1keep, [%1, #128] \n"
  104. "ld1 {v0.4s, v1.4s}, [%1], #32 \n" // data
  105. // top_s32 -> top_f32
  106. "scvtf v5.4s, v0.4s \n"
  107. "scvtf v6.4s, v1.4s \n"
  108. // top_f32 = top_f32 * scale_out
  109. "fmul v5.4s, v5.4s, v2.4s \n"
  110. "fmul v6.4s, v6.4s, v2.4s \n"
  111. // top_f32 = top_f32 + bias_tm
  112. "fadd v5.4s, v5.4s, v3.4s \n"
  113. "fadd v6.4s, v6.4s, v3.4s \n"
  114. // save top_f32
  115. "st1 {v5.4s, v6.4s}, [%2], #32 \n"
  116. "subs %w0, %w0, #1 \n"
  117. "bne 0b \n"
  118. : "=r"(nn), // %0
  119. "=r"(intptr), // %1
  120. "=r"(ptr) // %2
  121. : "0"(nn),
  122. "1"(intptr),
  123. "2"(ptr),
  124. "r"(scale), // %6
  125. "r"(bias) // %7
  126. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
  127. );
  128. }
  129. #else
  130. if (nn > 0)
  131. {
  132. asm volatile(
  133. "pld [%1, #256] \n"
  134. "vld1.s32 {d0-d3}, [%1]! \n" //q0-q1 data
  135. "vdup.f32 q10, %6 \n" //q10 scale
  136. "vdup.f32 q12, %7 \n" //q12 bias
  137. "0: \n"
  138. "vcvt.f32.s32 q0, q0 \n"
  139. "vcvt.f32.s32 q1, q1 \n"
  140. "vmul.f32 q0,q0,q10 \n"
  141. "vmul.f32 q1,q1,q10 \n"
  142. "vadd.f32 q2,q0,q12 \n"
  143. "vadd.f32 q3,q1,q12 \n"
  144. "pld [%1, #256] \n"
  145. "vld1.s32 {d0-d3}, [%1]! \n"
  146. "vst1.f32 {d4-d7}, [%2]! \n"
  147. "subs %0, #1 \n"
  148. "bne 0b \n"
  149. "sub %1, #32 \n"
  150. : "=r"(nn), // %0
  151. "=r"(intptr), // %1
  152. "=r"(ptr) // %2
  153. : "0"(nn),
  154. "1"(intptr),
  155. "2"(ptr),
  156. "r"(scale), // %6
  157. "r"(bias) // %7
  158. : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q12"
  159. );
  160. }
  161. #endif // __aarch64__
  162. #endif // __ARM_NEON
  163. for (; remain>0; remain--)
  164. {
  165. *ptr = *intptr * scale + bias;
  166. intptr++;
  167. ptr++;
  168. }
  169. }
  170. }
  171. else
  172. {
  173. #pragma omp parallel for num_threads(opt.num_threads)
  174. for (int q=0; q<channels; q++)
  175. {
  176. int* intptr = bottom_top_blob.channel(q);
  177. float* ptr = bottom_top_blob.channel(q);
  178. #if __ARM_NEON
  179. int nn = size >> 3;
  180. int remain = size & 7;
  181. #else
  182. int remain = size;
  183. #endif // __ARM_NEON
  184. #if __ARM_NEON
  185. #if __aarch64__
  186. if (nn > 0)
  187. {
  188. asm volatile(
  189. "dup v2.4s, %w6 \n" // scale
  190. "0: \n"
  191. "prfm pldl1keep, [%1, #128] \n"
  192. "ld1 {v0.4s, v1.4s}, [%1], #32 \n" // data
  193. // top_s32 -> top_f32
  194. "scvtf v5.4s, v0.4s \n"
  195. "scvtf v6.4s, v1.4s \n"
  196. // top_f32 = top_f32 * scale_out
  197. "fmul v5.4s, v5.4s, v2.4s \n"
  198. "fmul v6.4s, v6.4s, v2.4s \n"
  199. // save top_f32
  200. "st1 {v5.4s, v6.4s}, [%2], #32 \n"
  201. "subs %w0, %w0, #1 \n"
  202. "bne 0b \n"
  203. : "=r"(nn), // %0
  204. "=r"(intptr), // %1
  205. "=r"(ptr) // %2
  206. : "0"(nn),
  207. "1"(intptr),
  208. "2"(ptr),
  209. "r"(scale) // %6
  210. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
  211. );
  212. }
  213. #else
  214. if (nn > 0)
  215. {
  216. asm volatile(
  217. "pld [%1, #256] \n"
  218. "vld1.s32 {d0-d3}, [%1]! \n" //q0-q1 data
  219. "vdup.f32 q10, %6 \n" //q10 scale
  220. "0: \n"
  221. "vcvt.f32.s32 q0, q0 \n"
  222. "vcvt.f32.s32 q1, q1 \n"
  223. "vmul.f32 q2,q0,q10 \n"
  224. "vmul.f32 q3,q1,q10 \n"
  225. "pld [%1, #256] \n"
  226. "vld1.s32 {d0-d3}, [%1]! \n"
  227. "vst1.f32 {d4-d7}, [%2]! \n"
  228. "subs %0, #1 \n"
  229. "bne 0b \n"
  230. "sub %1, #32 \n"
  231. : "=r"(nn), // %0
  232. "=r"(intptr), // %1
  233. "=r"(ptr) // %2
  234. : "0"(nn),
  235. "1"(intptr),
  236. "2"(ptr),
  237. "r"(scale) // %6
  238. : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q12"
  239. );
  240. }
  241. #endif // __aarch64__
  242. #endif // __ARM_NEON
  243. for (; remain>0; remain--)
  244. {
  245. *ptr = *intptr * scale;
  246. intptr++;
  247. ptr++;
  248. }
  249. }
  250. }
  251. }
  252. return 0;
  253. }
  254. } // namespace ncnn