|
|
|
@@ -30,42 +30,212 @@ static void convdw3x3s1_int8_neon(const Mat &bottom_blob, Mat &top_blob, const M |
|
|
|
{ |
|
|
|
Mat out = top_blob.channel(p); |
|
|
|
|
|
|
|
const signed char *kernel0 = (const signed char *)_kernel + p * 9; |
|
|
|
const signed char* kernel = (const signed char *)_kernel + p*9; |
|
|
|
|
|
|
|
int* outptr0 = out; |
|
|
|
int* outptr0n = outptr0 + outw; |
|
|
|
|
|
|
|
const signed char* img0 = bottom_blob.channel(p); |
|
|
|
|
|
|
|
const signed char* r0 = img0; |
|
|
|
const signed char* r1 = img0 + w; |
|
|
|
const signed char* r2 = img0 + w*2; |
|
|
|
const signed char* r3 = img0 + w*3; |
|
|
|
|
|
|
|
int i = 0; |
|
|
|
|
|
|
|
int8x8_t _k0 = vdup_n_s8(kernel[0]); |
|
|
|
int8x8_t _k1 = vdup_n_s8(kernel[1]); |
|
|
|
int8x8_t _k2 = vdup_n_s8(kernel[2]); |
|
|
|
|
|
|
|
int8x8_t _k3 = vdup_n_s8(kernel[3]); |
|
|
|
int8x8_t _k4 = vdup_n_s8(kernel[4]); |
|
|
|
int8x8_t _k5 = vdup_n_s8(kernel[5]); |
|
|
|
|
|
|
|
int *outptr = out; |
|
|
|
int8x8_t _k6 = vdup_n_s8(kernel[6]); |
|
|
|
int8x8_t _k7 = vdup_n_s8(kernel[7]); |
|
|
|
int8x8_t _k8 = vdup_n_s8(kernel[8]); |
|
|
|
|
|
|
|
const signed char *img0 = bottom_blob.channel(p); |
|
|
|
for (; i+1 < outh; i+=2) |
|
|
|
{ |
|
|
|
int nn = outw >> 3; |
|
|
|
int remain = outw & 7; |
|
|
|
|
|
|
|
const signed char *r0 = img0; |
|
|
|
const signed char *r1 = img0 + w; |
|
|
|
const signed char *r2 = img0 + w * 2; |
|
|
|
for (; nn >0; nn--) |
|
|
|
{ |
|
|
|
int8x8_t _r0 = vld1_s8(r0); |
|
|
|
int8x8_t _r0n = vld1_s8(r0+8); |
|
|
|
int8x8_t _r01 = vext_s8(_r0, _r0n, 1); |
|
|
|
int8x8_t _r02 = vext_s8(_r0, _r0n, 2); |
|
|
|
|
|
|
|
int16x8_t _sum0 = vmull_s8(_r0, _k0); |
|
|
|
_sum0 = vmlal_s8(_sum0, _r01, _k1); |
|
|
|
_sum0 = vmlal_s8(_sum0, _r02, _k2); |
|
|
|
|
|
|
|
int8x8_t _r1 = vld1_s8(r1); |
|
|
|
int8x8_t _r1n = vld1_s8(r1+8); |
|
|
|
int8x8_t _r11 = vext_s8(_r1, _r1n, 1); |
|
|
|
int8x8_t _r12 = vext_s8(_r1, _r1n, 2); |
|
|
|
_sum0 = vmlal_s8(_sum0, _r1, _k3); |
|
|
|
_sum0 = vmlal_s8(_sum0, _r11, _k4); |
|
|
|
_sum0 = vmlal_s8(_sum0, _r12, _k5); |
|
|
|
|
|
|
|
int16x8_t _sum1 = vmull_s8(_r1, _k0); |
|
|
|
_sum1 = vmlal_s8(_sum1, _r11, _k1); |
|
|
|
_sum1 = vmlal_s8(_sum1, _r12, _k2); |
|
|
|
|
|
|
|
int8x8_t _r2 = vld1_s8(r2); |
|
|
|
int8x8_t _r2n = vld1_s8(r2+8); |
|
|
|
int8x8_t _r21 = vext_s8(_r2, _r2n, 1); |
|
|
|
int8x8_t _r22 = vext_s8(_r2, _r2n, 2); |
|
|
|
_sum0 = vmlal_s8(_sum0, _r2, _k6); |
|
|
|
_sum0 = vmlal_s8(_sum0, _r21, _k7); |
|
|
|
_sum0 = vmlal_s8(_sum0, _r22, _k8); |
|
|
|
|
|
|
|
_sum1 = vmlal_s8(_sum1, _r2, _k3); |
|
|
|
_sum1 = vmlal_s8(_sum1, _r21, _k4); |
|
|
|
_sum1 = vmlal_s8(_sum1, _r22, _k5); |
|
|
|
|
|
|
|
int8x8_t _r3 = vld1_s8(r3); |
|
|
|
int8x8_t _r3n = vld1_s8(r3+8); |
|
|
|
int8x8_t _r31 = vext_s8(_r3, _r3n, 1); |
|
|
|
int8x8_t _r32 = vext_s8(_r3, _r3n, 2); |
|
|
|
_sum1 = vmlal_s8(_sum1, _r3, _k6); |
|
|
|
_sum1 = vmlal_s8(_sum1, _r31, _k7); |
|
|
|
_sum1 = vmlal_s8(_sum1, _r32, _k8); |
|
|
|
|
|
|
|
int32x4_t sum0_s32 = vmovl_s16(vget_low_s16(_sum0)); |
|
|
|
int32x4_t sum0n_s32 = vmovl_s16(vget_high_s16(_sum0)); |
|
|
|
|
|
|
|
vst1q_s32(outptr0, sum0_s32); |
|
|
|
vst1q_s32(outptr0+4, sum0n_s32); |
|
|
|
|
|
|
|
int32x4_t sum1_s32 = vmovl_s16(vget_low_s16(_sum1)); |
|
|
|
int32x4_t sum1n_s32 = vmovl_s16(vget_high_s16(_sum1)); |
|
|
|
|
|
|
|
vst1q_s32(outptr0n, sum1_s32); |
|
|
|
vst1q_s32(outptr0n+4, sum1n_s32); |
|
|
|
|
|
|
|
r0 += 8; |
|
|
|
r1 += 8; |
|
|
|
r2 += 8; |
|
|
|
r3 += 8; |
|
|
|
outptr0 += 8; |
|
|
|
outptr0n += 8; |
|
|
|
} |
|
|
|
|
|
|
|
for (; remain>0; remain--) |
|
|
|
{ |
|
|
|
// TODO: vectorize this scalar remainder loop with NEON |
|
|
|
|
|
|
|
int sum0 = 0; |
|
|
|
int sum0n = 0; |
|
|
|
|
|
|
|
sum0 += (int)r0[0] * kernel[0]; |
|
|
|
sum0 += (int)r0[1] * kernel[1]; |
|
|
|
sum0 += (int)r0[2] * kernel[2]; |
|
|
|
sum0 += (int)r1[0] * kernel[3]; |
|
|
|
sum0 += (int)r1[1] * kernel[4]; |
|
|
|
sum0 += (int)r1[2] * kernel[5]; |
|
|
|
sum0 += (int)r2[0] * kernel[6]; |
|
|
|
sum0 += (int)r2[1] * kernel[7]; |
|
|
|
sum0 += (int)r2[2] * kernel[8]; |
|
|
|
|
|
|
|
sum0n += (int)r1[0] * kernel[0]; |
|
|
|
sum0n += (int)r1[1] * kernel[1]; |
|
|
|
sum0n += (int)r1[2] * kernel[2]; |
|
|
|
sum0n += (int)r2[0] * kernel[3]; |
|
|
|
sum0n += (int)r2[1] * kernel[4]; |
|
|
|
sum0n += (int)r2[2] * kernel[5]; |
|
|
|
sum0n += (int)r3[0] * kernel[6]; |
|
|
|
sum0n += (int)r3[1] * kernel[7]; |
|
|
|
sum0n += (int)r3[2] * kernel[8]; |
|
|
|
|
|
|
|
*outptr0 = sum0; |
|
|
|
*outptr0n = sum0n; |
|
|
|
|
|
|
|
r0++; |
|
|
|
r1++; |
|
|
|
r2++; |
|
|
|
r3++; |
|
|
|
outptr0++; |
|
|
|
outptr0n++; |
|
|
|
} |
|
|
|
|
|
|
|
r0 += 2 + w; |
|
|
|
r1 += 2 + w; |
|
|
|
r2 += 2 + w; |
|
|
|
r3 += 2 + w; |
|
|
|
|
|
|
|
outptr0 += outw; |
|
|
|
outptr0n += outw; |
|
|
|
} |
|
|
|
|
|
|
|
int i = 0; |
|
|
|
for (; i < outh; i++) |
|
|
|
{ |
|
|
|
int remain = outw; |
|
|
|
int nn = outw >> 3; |
|
|
|
int remain = outw & 7; |
|
|
|
|
|
|
|
for (; nn >0; nn--) |
|
|
|
{ |
|
|
|
int8x8_t _r0 = vld1_s8(r0); |
|
|
|
int8x8_t _r0n = vld1_s8(r0+8); |
|
|
|
int8x8_t _r01 = vext_s8(_r0, _r0n, 1); |
|
|
|
int8x8_t _r02 = vext_s8(_r0, _r0n, 2); |
|
|
|
|
|
|
|
int16x8_t _sum0 = vmull_s8(_r0, _k0); |
|
|
|
_sum0 = vmlal_s8(_sum0, _r01, _k1); |
|
|
|
_sum0 = vmlal_s8(_sum0, _r02, _k2); |
|
|
|
|
|
|
|
int8x8_t _r1 = vld1_s8(r1); |
|
|
|
int8x8_t _r1n = vld1_s8(r1+8); |
|
|
|
int8x8_t _r11 = vext_s8(_r1, _r1n, 1); |
|
|
|
int8x8_t _r12 = vext_s8(_r1, _r1n, 2); |
|
|
|
_sum0 = vmlal_s8(_sum0, _r1, _k3); |
|
|
|
_sum0 = vmlal_s8(_sum0, _r11, _k4); |
|
|
|
_sum0 = vmlal_s8(_sum0, _r12, _k5); |
|
|
|
|
|
|
|
int8x8_t _r2 = vld1_s8(r2); |
|
|
|
int8x8_t _r2n = vld1_s8(r2+8); |
|
|
|
int8x8_t _r21 = vext_s8(_r2, _r2n, 1); |
|
|
|
int8x8_t _r22 = vext_s8(_r2, _r2n, 2); |
|
|
|
_sum0 = vmlal_s8(_sum0, _r2, _k6); |
|
|
|
_sum0 = vmlal_s8(_sum0, _r21, _k7); |
|
|
|
_sum0 = vmlal_s8(_sum0, _r22, _k8); |
|
|
|
|
|
|
|
int32x4_t sum0_s32 = vmovl_s16(vget_low_s16(_sum0)); |
|
|
|
int32x4_t sum0n_s32 = vmovl_s16(vget_high_s16(_sum0)); |
|
|
|
|
|
|
|
vst1q_s32(outptr0, sum0_s32); |
|
|
|
vst1q_s32(outptr0+4, sum0n_s32); |
|
|
|
|
|
|
|
r0 += 8; |
|
|
|
r1 += 8; |
|
|
|
r2 += 8; |
|
|
|
outptr0 += 8; |
|
|
|
} |
|
|
|
|
|
|
|
for (; remain > 0; remain--) |
|
|
|
for (; remain>0; remain--) |
|
|
|
{ |
|
|
|
int sum = 0; |
|
|
|
|
|
|
|
sum += (int)r0[0] * (int)kernel0[0]; |
|
|
|
sum += (int)r0[1] * (int)kernel0[1]; |
|
|
|
sum += (int)r0[2] * (int)kernel0[2]; |
|
|
|
sum += (int)r1[0] * (int)kernel0[3]; |
|
|
|
sum += (int)r1[1] * (int)kernel0[4]; |
|
|
|
sum += (int)r1[2] * (int)kernel0[5]; |
|
|
|
sum += (int)r2[0] * (int)kernel0[6]; |
|
|
|
sum += (int)r2[1] * (int)kernel0[7]; |
|
|
|
sum += (int)r2[2] * (int)kernel0[8]; |
|
|
|
sum += (int)r0[0] * kernel[0]; |
|
|
|
sum += (int)r0[1] * kernel[1]; |
|
|
|
sum += (int)r0[2] * kernel[2]; |
|
|
|
sum += (int)r1[0] * kernel[3]; |
|
|
|
sum += (int)r1[1] * kernel[4]; |
|
|
|
sum += (int)r1[2] * kernel[5]; |
|
|
|
sum += (int)r2[0] * kernel[6]; |
|
|
|
sum += (int)r2[1] * kernel[7]; |
|
|
|
sum += (int)r2[2] * kernel[8]; |
|
|
|
|
|
|
|
*outptr = sum; |
|
|
|
*outptr0 = sum; |
|
|
|
|
|
|
|
r0++; |
|
|
|
r1++; |
|
|
|
r2++; |
|
|
|
outptr++; |
|
|
|
} |
|
|
|
outptr0++; |
|
|
|
} |
|
|
|
|
|
|
|
r0 += 2; |
|
|
|
r1 += 2; |
|
|
|
@@ -82,42 +252,95 @@ static void convdw3x3s2_int8_neon(const Mat &bottom_blob, Mat &top_blob, const M |
|
|
|
int outh = top_blob.h; |
|
|
|
int outch = top_blob.c; |
|
|
|
|
|
|
|
const int tailstep = w - 2 * outw + w; |
|
|
|
const int tailstep = w - 2*outw + w; |
|
|
|
|
|
|
|
#pragma omp parallel for num_threads(opt.num_threads) |
|
|
|
for (int p = 0; p < outch; p++) |
|
|
|
for (int p=0; p<outch; p++) |
|
|
|
{ |
|
|
|
Mat out = top_blob.channel(p); |
|
|
|
|
|
|
|
const signed char *kernel0 = (const signed char *)_kernel + p * 9; |
|
|
|
const signed char* kernel = (const signed char*)_kernel + p*9; |
|
|
|
|
|
|
|
int *outptr = out; |
|
|
|
int* outptr = out; |
|
|
|
|
|
|
|
const signed char *img0 = bottom_blob.channel(p); |
|
|
|
const signed char* img = bottom_blob.channel(p); |
|
|
|
|
|
|
|
const signed char *r0 = img0; |
|
|
|
const signed char *r1 = img0 + w; |
|
|
|
const signed char *r2 = img0 + w * 2; |
|
|
|
const signed char* r0 = img; |
|
|
|
const signed char* r1 = img + w; |
|
|
|
const signed char* r2 = img + w*2; |
|
|
|
|
|
|
|
int i = 0; |
|
|
|
|
|
|
|
int8x8_t _k0 = vdup_n_s8(kernel[0]); |
|
|
|
int8x8_t _k1 = vdup_n_s8(kernel[1]); |
|
|
|
int8x8_t _k2 = vdup_n_s8(kernel[2]); |
|
|
|
int8x8_t _k3 = vdup_n_s8(kernel[3]); |
|
|
|
int8x8_t _k4 = vdup_n_s8(kernel[4]); |
|
|
|
int8x8_t _k5 = vdup_n_s8(kernel[5]); |
|
|
|
int8x8_t _k6 = vdup_n_s8(kernel[6]); |
|
|
|
int8x8_t _k7 = vdup_n_s8(kernel[7]); |
|
|
|
int8x8_t _k8 = vdup_n_s8(kernel[8]); |
|
|
|
|
|
|
|
for (; i < outh; i++) |
|
|
|
{ |
|
|
|
int remain = outw; |
|
|
|
{ |
|
|
|
int nn = outw >> 3; |
|
|
|
int remain = outw & 7; |
|
|
|
|
|
|
|
for (; remain > 0; remain--) |
|
|
|
for (; nn > 0; nn--) |
|
|
|
{ |
|
|
|
int sum = 0; |
|
|
|
int8x8x2_t _r0 = vld2_s8(r0); |
|
|
|
int8x8x2_t _r0n = vld2_s8(r0+16); |
|
|
|
int8x8_t _r00 = _r0.val[0]; |
|
|
|
int8x8_t _r01 = _r0.val[1]; |
|
|
|
int8x8_t _r02 = vext_s8(_r00, _r0n.val[0], 1); |
|
|
|
|
|
|
|
int16x8_t _sum = vmull_s8(_r00, _k0); |
|
|
|
_sum = vmlal_s8(_sum, _r01, _k1); |
|
|
|
_sum = vmlal_s8(_sum, _r02, _k2); |
|
|
|
|
|
|
|
int8x8x2_t _r1 = vld2_s8(r1); |
|
|
|
int8x8x2_t _r1n = vld2_s8(r1+16); |
|
|
|
int8x8_t _r10 = _r1.val[0]; |
|
|
|
int8x8_t _r11 = _r1.val[1]; |
|
|
|
int8x8_t _r12 = vext_s8(_r10, _r1n.val[0], 1); |
|
|
|
_sum = vmlal_s8(_sum, _r10, _k3); |
|
|
|
_sum = vmlal_s8(_sum, _r11, _k4); |
|
|
|
_sum = vmlal_s8(_sum, _r12, _k5); |
|
|
|
|
|
|
|
int8x8x2_t _r2 = vld2_s8(r2); |
|
|
|
int8x8x2_t _r2n = vld2_s8(r2+16); |
|
|
|
int8x8_t _r20 = _r2.val[0]; |
|
|
|
int8x8_t _r21 = _r2.val[1]; |
|
|
|
int8x8_t _r22 = vext_s8(_r20, _r2n.val[0], 1); |
|
|
|
_sum = vmlal_s8(_sum, _r20, _k6); |
|
|
|
_sum = vmlal_s8(_sum, _r21, _k7); |
|
|
|
_sum = vmlal_s8(_sum, _r22, _k8); |
|
|
|
|
|
|
|
int32x4_t sum0_s32 = vmovl_s16(vget_low_s16(_sum)); |
|
|
|
int32x4_t sum0n_s32 = vmovl_s16(vget_high_s16(_sum)); |
|
|
|
|
|
|
|
vst1q_s32(outptr, sum0_s32); |
|
|
|
vst1q_s32(outptr+4, sum0n_s32); |
|
|
|
|
|
|
|
r0 += 16; |
|
|
|
r1 += 16; |
|
|
|
r2 += 16; |
|
|
|
outptr += 8; |
|
|
|
} |
|
|
|
|
|
|
|
sum += (int)r0[0] * (int)kernel0[0]; |
|
|
|
sum += (int)r0[1] * (int)kernel0[1]; |
|
|
|
sum += (int)r0[2] * (int)kernel0[2]; |
|
|
|
sum += (int)r1[0] * (int)kernel0[3]; |
|
|
|
sum += (int)r1[1] * (int)kernel0[4]; |
|
|
|
sum += (int)r1[2] * (int)kernel0[5]; |
|
|
|
sum += (int)r2[0] * (int)kernel0[6]; |
|
|
|
sum += (int)r2[1] * (int)kernel0[7]; |
|
|
|
sum += (int)r2[2] * (int)kernel0[8]; |
|
|
|
for (; remain>0; remain--) |
|
|
|
{ |
|
|
|
int sum = 0; |
|
|
|
|
|
|
|
sum += (int)r0[0] * kernel[0]; |
|
|
|
sum += (int)r0[1] * kernel[1]; |
|
|
|
sum += (int)r0[2] * kernel[2]; |
|
|
|
sum += (int)r1[0] * kernel[3]; |
|
|
|
sum += (int)r1[1] * kernel[4]; |
|
|
|
sum += (int)r1[2] * kernel[5]; |
|
|
|
sum += (int)r2[0] * kernel[6]; |
|
|
|
sum += (int)r2[1] * kernel[7]; |
|
|
|
sum += (int)r2[2] * kernel[8]; |
|
|
|
|
|
|
|
*outptr = sum; |
|
|
|
|
|
|
|
|