 new int8 implementation, better accuracy (#749)
* add the armv7a conv3x3s1 implement without overflow,remove old codes
* fix the bug of conv3x3s2 packed int8
* new int8 implementation, weights quantized per channel, better accuracy
* fix the bug of conv3x3s1 packed int8 neon
* add the naive c fp32 and int8 winograd F(2,3)
* add the neon intrinsic int8 winograd F(2,3)
* optimize the armv7a int8 winograd F(2,3) with neon assembly
* optimize the armv7a int8 winograd F(2,3) input transform with assembly.
* add the requantize layer and int8 relu implement.
* add graph optimize conv1x1s2 -> conv1x1s1,begin optimize int8 aarch64.
* fix int8 bugs
* add the c naive im2col with sgemm
* add aarch64 int8 winograd f23, conv3x3s2 naive implement
* add the int8 sgemm conv7x7s2 on x86/armv7a platform
* optimize the int8 sgemm by neon intrinsic and packed kernel
* optimize the int8 sgemm with packed data
* optimize the int8 sgemm with armv7a neon assembly
* add the int8 sgemm on arm64-v8a platform
* prepare to merge latest code from master
* add the int8 param files
* In the Class Net,add the fuse_network method
7 years ago |
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325 |
- // SenseNets is pleased to support the open source community by supporting ncnn available.
- //
- // Copyright (C) 2019 SenseNets Technology Ltd. All rights reserved.
- //
- // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
- // in compliance with the License. You may obtain a copy of the License at
- //
- // https://opensource.org/licenses/BSD-3-Clause
- //
- // Unless required by applicable law or agreed to in writing, software distributed
- // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
- // CONDITIONS OF ANY KIND, either express or implied. See the License for the
- // specific language governing permissions and limitations under the License.
-
- #include "requantize_arm.h"
-
- #include <math.h>
-
- #if __ARM_NEON
- #include <arm_neon.h>
- #endif // __ARM_NEON
-
- namespace ncnn {
-
- DEFINE_LAYER_CREATOR(Requantize_arm)
-
// Round a float to the nearest integer (ties away from zero) and saturate the
// result to the signed 8-bit range [-128, 127].
//
// The range check is performed on the float itself, before any conversion to
// an integer type: converting a float whose value does not fit in int is
// undefined behaviour, so the saturation must happen first. NaN maps to 0.
static inline signed char float2int8(float v)
{
    if (v != v) // only NaN compares unequal to itself
        return 0;
    if (v >= 127.f)
        return 127;
    if (v <= -128.f)
        return -128;

    // here -128 < v < 127, so the rounded value always fits
    return (signed char)(int)roundf(v);
}
-
- int Requantize_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
- {
- int dims = bottom_blob.dims;
-
- if (dims == 1)
- {
- int w = bottom_blob.w;
-
- const int* intptr = bottom_blob;
- signed char * ptr = top_blob;
-
- if (bias_term)
- {
- if (bias_data_size > 1)
- {
- #pragma omp parallel for num_threads(opt.num_threads)
- for (int i=0; i<w; i++)
- {
- ptr[i] = float2int8(((intptr[i] * scale_in) + bias_data[i]) * scale_out);
- if (fusion_relu && ptr[i] < 0)
- ptr[i] = 0;
- }
- }
- else
- {
- float bias = bias_data[0];
- #pragma omp parallel for num_threads(opt.num_threads)
- for (int i=0; i<w; i++)
- {
- ptr[i] = float2int8(((intptr[i] * scale_in) + bias) * scale_out);
- if (fusion_relu && ptr[i] < 0)
- ptr[i] = 0;
- }
- }
- }
- else
- {
- #pragma omp parallel for num_threads(opt.num_threads)
- for (int i=0; i<w; i++)
- {
- ptr[i] = float2int8(intptr[i] * scale_in * scale_out);
- if (fusion_relu && ptr[i] < 0)
- ptr[i] = 0;
- }
- }
- }
-
- if (dims == 2)
- {
- int w = bottom_blob.w;
- int h = bottom_blob.h;
-
- if (bias_term)
- {
- #pragma omp parallel for num_threads(opt.num_threads)
- for (int i=0; i<h; i++)
- {
- const int* intptr = bottom_blob.row<const int>(i);
- signed char* ptr = top_blob.row<signed char>(i);
-
- float bias = bias_data_size > 1 ? bias_data[i] : bias_data[0];
-
- for (int j=0; j<w; j++)
- {
- ptr[j] = float2int8(((intptr[j] * scale_in) + bias) * scale_out);
- if (fusion_relu && ptr[j] < 0)
- ptr[j] = 0;
- }
- }
- }
- else
- {
- #pragma omp parallel for num_threads(opt.num_threads)
- for (int i=0; i<h; i++)
- {
- const int* intptr = bottom_blob.row<const int>(i);
- signed char* ptr = top_blob.row<signed char>(i);
-
- for (int j=0; j<w; j++)
- {
- ptr[j] = float2int8(intptr[j] * scale_in * scale_out);
- if (fusion_relu && ptr[j] < 0)
- ptr[j] = 0;
- }
- }
- }
- }
-
- if (dims == 3)
- {
- int w = bottom_blob.w;
- int h = bottom_blob.h;
- int channels = bottom_blob.c;
- int size = w * h;
-
- if (bias_term)
- {
- #pragma omp parallel for num_threads(opt.num_threads)
- for (int q=0; q<channels; q++)
- {
- const int* intptr = bottom_blob.channel(q);
- signed char* ptr = top_blob.channel(q);
-
- float bias = bias_data_size > 1 ? bias_data[q] : bias_data[0];
-
- #if __ARM_NEON
- int nn = size >> 3;
- int remain = size & 7;
-
- #if __aarch64__
- for (; nn>0; nn--)
- {
- ptr[0] = float2int8(((intptr[0] * scale_in) + bias) * scale_out);
- ptr[1] = float2int8(((intptr[1] * scale_in) + bias) * scale_out);
- ptr[2] = float2int8(((intptr[2] * scale_in) + bias) * scale_out);
- ptr[3] = float2int8(((intptr[3] * scale_in) + bias) * scale_out);
- ptr[4] = float2int8(((intptr[4] * scale_in) + bias) * scale_out);
- ptr[5] = float2int8(((intptr[5] * scale_in) + bias) * scale_out);
- ptr[6] = float2int8(((intptr[6] * scale_in) + bias) * scale_out);
- ptr[7] = float2int8(((intptr[7] * scale_in) + bias) * scale_out);
-
- ptr += 8;
- intptr += 8;
- }
- #else
- if (nn > 0)
- {
- asm volatile(
- "pld [%1, #256] \n"
- "vld1.s32 {d0-d3}, [%1:128]! \n" //q0-q1 data
- "vdup.f32 q10, %6 \n" //q10 scale_in
- "vdup.f32 q11, %7 \n" //q11 scale_out
- "vdup.f32 q12, %8 \n" //q12 bias
- "0: \n"
- // top_s32 -> top_f32
- "vcvt.f32.s32 q0, q0 \n"
- "vcvt.f32.s32 q1, q1 \n"
- // top_f32 = top_f32 * scale_int
- "vmul.f32 q0, q0, q10 \n"
- "vmul.f32 q1, q1, q10 \n"
- // top_f32 = top_f32 + bias
- "vadd.f32 q0, q0, q12 \n"
- "vadd.f32 q1, q1, q12 \n"
- // top_f32 = top_f32 * scale_out
- "vmul.f32 q0, q0, q11 \n"
- "vmul.f32 q1, q1, q11 \n"
- // top_f32 -> top_s32
- "vcvtr.s32.f32 s0, s0 \n"
- "vcvtr.s32.f32 s1, s1 \n"
- "vcvtr.s32.f32 s2, s2 \n"
- "vcvtr.s32.f32 s3, s3 \n"
- "vcvtr.s32.f32 s4, s4 \n"
- "vcvtr.s32.f32 s5, s5 \n"
- "vcvtr.s32.f32 s6, s6 \n"
- "vcvtr.s32.f32 s7, s7 \n"
- // top_s32 -> top_s16
- "vqmovn.s32 d4, q0 \n"
- "vqmovn.s32 d5, q1 \n"
- "pld [%1, #256] \n"
- "vld1.s32 {d0-d3}, [%1:128]! \n" //q0-q1 data
- // top_s16 -> top_s8
- "vqmovn.s16 d4, q2 \n"
- // save top_s8
- "vst1.8 {d4}, [%2:64]! \n"
- "subs %0, #1 \n"
- "bne 0b \n"
- "sub %1, #32 \n"
- : "=r"(nn), // %0
- "=r"(intptr), // %1
- "=r"(ptr) // %2
- : "0"(nn),
- "1"(intptr),
- "2"(ptr),
- "r"(scale_in), // %6
- "r"(scale_out), // %7
- "r"(bias) // %8
- : "cc", "memory", "q0", "q1", "q2", "q10", "q11", "q12"
- );
- }
- #endif // __aarch64__
- #else
- int remain = size;
- #endif // __ARM_NEON
-
- for (; remain > 0; remain--)
- {
- *ptr = float2int8(((*intptr * scale_in) + bias) * scale_out);
-
- intptr++;
- ptr ++;
- }
- }
- }
- else
- {
- #pragma omp parallel for num_threads(opt.num_threads)
- for (int q=0; q<channels; q++)
- {
- const int* intptr = bottom_blob.channel(q);
- signed char* ptr = top_blob.channel(q);
-
- #if __ARM_NEON
- int nn = size >> 3;
- int remain = size & 7;
-
- #if __aarch64__
- //TODO
- for (; nn>0; nn--)
- {
- ptr[0] = float2int8(intptr[0] * scale_in * scale_out);
- ptr[1] = float2int8(intptr[1] * scale_in * scale_out);
- ptr[2] = float2int8(intptr[2] * scale_in * scale_out);
- ptr[3] = float2int8(intptr[3] * scale_in * scale_out);
- ptr[4] = float2int8(intptr[4] * scale_in * scale_out);
- ptr[5] = float2int8(intptr[5] * scale_in * scale_out);
- ptr[6] = float2int8(intptr[6] * scale_in * scale_out);
- ptr[7] = float2int8(intptr[7] * scale_in * scale_out);
-
- ptr += 8;
- intptr += 8;
- }
- #else
- if (nn > 0)
- {
- asm volatile(
- "pld [%1, #256] \n"
- "vld1.s32 {d0-d3}, [%1:128]! \n" //q0-q1 data
- "vdup.f32 q10, %6 \n" //q10 scale_in
- "vdup.f32 q11, %7 \n" //q11 scale_out
- "0: \n"
- // top_s32 -> top_f32
- "vcvt.f32.s32 q0, q0 \n"
- "vcvt.f32.s32 q1, q1 \n"
- // top_f32 = top_f32 * scale_int
- "vmul.f32 q0, q0, q10 \n"
- "vmul.f32 q1, q1, q10 \n"
- // top_f32 = top_f32 * scale_out
- "vmul.f32 q0, q0, q11 \n"
- "vmul.f32 q1, q1, q11 \n"
- // top_f32 -> top_s32
- "vcvtr.s32.f32 s0, s0 \n"
- "vcvtr.s32.f32 s1, s1 \n"
- "vcvtr.s32.f32 s2, s2 \n"
- "vcvtr.s32.f32 s3, s3 \n"
- "vcvtr.s32.f32 s4, s4 \n"
- "vcvtr.s32.f32 s5, s5 \n"
- "vcvtr.s32.f32 s6, s6 \n"
- "vcvtr.s32.f32 s7, s7 \n"
- // top_s32 -> top_s16
- "vqmovn.s32 d4, q0 \n"
- "vqmovn.s32 d5, q1 \n"
- "pld [%1, #256] \n"
- "vld1.s32 {d0-d3}, [%1:128]! \n" //q0-q1 data
- // top_s16 -> top_s8
- "vqmovn.s16 d4, q2 \n"
- // save top_s8
- "vst1.8 {d4}, [%2:64]! \n"
- "subs %0, #1 \n"
- "bne 0b \n"
- "sub %1, #32 \n"
- : "=r"(nn), // %0
- "=r"(intptr), // %1
- "=r"(ptr) // %2
- : "0"(nn),
- "1"(intptr),
- "2"(ptr),
- "r"(scale_in), // %6
- "r"(scale_out) // %7
- : "cc", "memory", "q0", "q1", "q2", "q10", "q11"
- );
- }
- #endif // __aarch64__
- #else
- int remain = size;
- #endif // __ARM_NEON
-
- for (; remain > 0; remain--)
- {
- *ptr = float2int8(*intptr * scale_in * scale_out);
-
- intptr++;
- ptr ++;
- }
- }
- }
- }
-
- return 0;
- }
-
- } // namespace ncnn
|