You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

quantize_arm.cpp 6.2 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193
  1. // Tencent is pleased to support the open source community by making ncnn available.
  2. //
  3. // Copyright (C) 2018 SenseNets Technology Ltd. All rights reserved.
  4. // Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
  5. //
  6. // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  7. // in compliance with the License. You may obtain a copy of the License at
  8. //
  9. // https://opensource.org/licenses/BSD-3-Clause
  10. //
  11. // Unless required by applicable law or agreed to in writing, software distributed
  12. // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
  13. // CONDITIONS OF ANY KIND, either express or implied. See the License for the
  14. // specific language governing permissions and limitations under the License.
  15. #include "quantize_arm.h"
  16. #include <math.h>
  17. namespace ncnn {
  18. DEFINE_LAYER_CREATOR(Quantize_arm)
  // Round a float to the nearest integer (ties away from zero, as roundf does)
  // and saturate the result to the signed 8-bit range [-128, 127].
  //
  // The clamp is done on the float value *before* converting to an integer:
  // converting an out-of-range or NaN float to int is undefined behaviour, so
  // clamping after the conversion cannot be relied on for large inputs.
  // NaN maps to 0.
  static inline signed char float2int8(float v)
  {
      // NaN is the only value that compares unequal to itself.
      if (v != v) return 0;

      const float rounded = roundf(v);

      if (rounded >= 127.f) return 127;
      if (rounded <= -128.f) return -128;

      // rounded is now an integral value strictly inside (-128, 127).
      return (signed char)(int)rounded;
  }
  26. int Quantize_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
  27. {
  28. int dims = bottom_blob.dims;
  29. if (dims == 1)
  30. {
  31. int w = bottom_blob.w;
  32. top_blob.create(w, (size_t)1u, opt.blob_allocator);
  33. if (top_blob.empty())
  34. return -100;
  35. const float* ptr = bottom_blob;
  36. signed char* outptr = top_blob;
  37. #pragma omp parallel for num_threads(opt.num_threads)
  38. for (int i=0; i<w; i++)
  39. {
  40. outptr[i] = float2int8(ptr[i] * scale);
  41. }
  42. }
  43. if (dims == 2)
  44. {
  45. int w = bottom_blob.w;
  46. int h = bottom_blob.h;
  47. int size = w * h;
  48. top_blob.create(w, h, (size_t)1u, opt.blob_allocator);
  49. if (top_blob.empty())
  50. return -100;
  51. const float* ptr = bottom_blob;
  52. signed char* outptr = top_blob;
  53. #pragma omp parallel for num_threads(opt.num_threads)
  54. for (int i=0; i<size; i++)
  55. {
  56. outptr[i] = float2int8(ptr[i] * scale);
  57. }
  58. }
  59. if (dims == 3)
  60. {
  61. int w = bottom_blob.w;
  62. int h = bottom_blob.h;
  63. int channels = bottom_blob.c;
  64. int size = w * h;
  65. top_blob.create(w, h, channels, (size_t)1u, opt.blob_allocator);
  66. if (top_blob.empty())
  67. return -100;
  68. #pragma omp parallel for num_threads(opt.num_threads)
  69. for (int q=0; q<channels; q++)
  70. {
  71. const float* ptr = bottom_blob.channel(q);
  72. signed char* outptr = top_blob.channel(q);
  73. #if __ARM_NEON
  74. int nn = size >> 3;
  75. int remain = size & 7;
  76. #else
  77. int remain = size;
  78. #endif // __ARM_NEON
  79. #if __ARM_NEON
  80. #if __aarch64__
  81. float32x4_t _scale = vdupq_n_f32(scale);
  82. if (nn > 0)
  83. {
  84. asm volatile(
  85. "dup v2.4s, %w6 \n" //scale
  86. "0: \n"
  87. "prfm pldl1keep, [%1, #128] \n"
  88. "ld1 {v0.4s, v1.4s}, [%1], #32 \n" //data
  89. // bottom_f32 = bottom_f32 * scale
  90. "fmul v3.4s, v0.4s, v2.4s \n"
  91. "fmul v4.4s, v1.4s, v2.4s \n"
  92. // top_f32 -> top_s32
  93. "fcvtas v5.4s, v3.4s \n"
  94. "fcvtas v6.4s, v4.4s \n"
  95. // top_s32 -> top_s16
  96. "sqxtn v7.4h, v5.4s \n"
  97. "sqxtn2 v7.8h, v6.4s \n"
  98. // top_s16 -> top_s8
  99. "sqxtn v8.8b, v7.8h \n"
  100. // save top_s8
  101. "st1 {v8.8b}, [%2], #8 \n"
  102. "subs %w0, %w0, #1 \n"
  103. "bne 0b \n"
  104. : "=r"(nn), // %0
  105. "=r"(ptr), // %1
  106. "=r"(outptr) // %2
  107. : "0"(nn),
  108. "1"(ptr),
  109. "2"(outptr),
  110. "r"(_scale) // %6
  111. : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8"
  112. );
  113. }
  114. #else
  115. if (nn > 0)
  116. {
  117. asm volatile(
  118. "pld [%1, #256] \n"
  119. "vld1.f32 {d0-d3}, [%1]! \n"
  120. "vdup.32 q10, %6 \n"
  121. "0: \n"
  122. "vmul.f32 q0,q0,q10 \n"
  123. "vmul.f32 q1,q1,q10 \n"
  124. "vcvtr.s32.f32 s0,s0 \n"
  125. "vcvtr.s32.f32 s1,s1 \n"
  126. "vcvtr.s32.f32 s2,s2 \n"
  127. "vcvtr.s32.f32 s3,s3 \n"
  128. "vcvtr.s32.f32 s4,s4 \n"
  129. "vcvtr.s32.f32 s5,s5 \n"
  130. "vcvtr.s32.f32 s6,s6 \n"
  131. "vcvtr.s32.f32 s7,s7 \n"
  132. "vqmovn.s32 d4,q0 \n"
  133. "vqmovn.s32 d5,q1 \n"
  134. "pld [%1, #256] \n"
  135. "vld1.f32 {d0-d3}, [%1]! \n"
  136. "vqmovn.s16 d4, q2 \n"
  137. "vst1.8 {d4}, [%2]! \n"
  138. "subs %0, #1 \n"
  139. "bne 0b \n"
  140. "sub %1, #32 \n"
  141. : "=r"(nn), // %0
  142. "=r"(ptr), // %1
  143. "=r"(outptr) // %2
  144. : "0"(nn),
  145. "1"(ptr),
  146. "2"(outptr),
  147. "r"(scale) // %6
  148. : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q10", "q11"
  149. );
  150. }
  151. #endif // __aarch64__
  152. #endif // __ARM_NEON
  153. for (; remain>0; remain--)
  154. {
  155. *outptr = float2int8(*ptr * scale);
  156. ptr++;
  157. outptr++;
  158. }
  159. }
  160. }
  161. return 0;
  162. }
  163. } // namespace ncnn